diff --git a/incorporealcms/markdown.py b/incorporealcms/markdown.py index 8a36784..2442357 100644 --- a/incorporealcms/markdown.py +++ b/incorporealcms/markdown.py @@ -13,6 +13,7 @@ import os import re import markdown +from bs4 import BeautifulSoup from markupsafe import Markup from incorporealcms import jinja_env @@ -82,7 +83,19 @@ def parse_md(path: str, pages_root: str): logger.debug("file metadata: %s", md.Meta) rel_path = os.path.relpath(path, pages_root) - page_name = get_meta_str(md, 'title') if md.Meta.get('title') else instance_resource_path_to_request_path(rel_path) + + soup = BeautifulSoup(content, features='lxml') + + # page title precedence: 'title' metadata, then the text of the first h1, then the request path + if md.Meta.get('title'): + page_name = get_meta_str(md, 'title') + else: + h1_tag = soup.find('h1') + # get_text() returns a plain str and also covers an h1 with nested markup, where .string is None + page_name = h1_tag.get_text().strip() if h1_tag else None + if not page_name: + page_name = instance_resource_path_to_request_path(rel_path) + page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX logger.debug("title (potentially derived): %s", page_title) diff --git a/pyproject.toml b/pyproject.toml index 76231fb..f6841f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ {name = "Brian S. 
Stephan", email = "bss@incorporeal.org"}, ] requires-python = ">=3.10" -dependencies = ["feedgen", "jinja2", "Markdown", "termcolor"] +dependencies = ["beautifulsoup4", "feedgen", "jinja2", "Markdown", "termcolor"] dynamic = ["version"] classifiers = [ "Programming Language :: Python :: 3", diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 474b354..8348fd1 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -14,6 +14,8 @@ authlib==1.6.6 # via safety bandit==1.9.3 # via incorporeal-cms (pyproject.toml) +beautifulsoup4==4.14.3 + # via incorporeal-cms (pyproject.toml) boolean-py==5.0 # via license-expression build==1.4.0 @@ -266,6 +268,8 @@ six==1.17.0 # via python-dateutil snowballstemmer==3.0.1 # via pydocstyle +soupsieve==2.8.3 + # via beautifulsoup4 stevedore==5.6.0 # via bandit tenacity==9.1.2 @@ -286,6 +290,7 @@ typer==0.21.1 # via safety typing-extensions==4.15.0 # via + # beautifulsoup4 # mypy # pydantic # pydantic-core diff --git a/requirements/requirements.txt b/requirements/requirements.txt index fc1ddf7..07701d8 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -4,6 +4,8 @@ # # pip-compile --output-file=requirements/requirements.txt # +beautifulsoup4==4.14.3 + # via incorporeal-cms (pyproject.toml) feedgen==1.0.0 # via incorporeal-cms (pyproject.toml) jinja2==3.1.6 @@ -18,5 +20,9 @@ python-dateutil==2.9.0.post0 # via feedgen six==1.17.0 # via python-dateutil +soupsieve==2.8.3 + # via beautifulsoup4 termcolor==3.3.0 # via incorporeal-cms (pyproject.toml) +typing-extensions==4.15.0 + # via beautifulsoup4 diff --git a/tests/instance/pages/no-title-or-h1.md b/tests/instance/pages/no-title-or-h1.md new file mode 100644 index 0000000..e211573 --- /dev/null +++ b/tests/instance/pages/no-title-or-h1.md @@ -0,0 +1 @@ +there's just some words here but no title tag or h1 diff --git a/tests/instance/pages/no-title-subdir/index.md 
b/tests/instance/pages/no-title-subdir/index.md new file mode 100644 index 0000000..e211573 --- /dev/null +++ b/tests/instance/pages/no-title-subdir/index.md @@ -0,0 +1 @@ +there's just some words here but no title tag or h1 diff --git a/tests/instance/pages/no-title-subdir/no-title-or-h1.md b/tests/instance/pages/no-title-subdir/no-title-or-h1.md new file mode 100644 index 0000000..e211573 --- /dev/null +++ b/tests/instance/pages/no-title-subdir/no-title-or-h1.md @@ -0,0 +1 @@ +there's just some words here but no title tag or h1 diff --git a/tests/test_markdown.py b/tests/test_markdown.py index 71932a0..9122265 100644 --- a/tests/test_markdown.py +++ b/tests/test_markdown.py @@ -146,11 +146,33 @@ def test_parse_md_metadata_forced_no_title(): assert page_title == 'example.org' -def test_parse_md_metadata_no_title_so_path(): +def test_parse_md_metadata_no_title_so_h1(): """Test the direct results of parsing a markdown file.""" content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'subdir/index.md'), PAGES_DIR) - assert page_name == '/subdir/' - assert page_title == '/subdir/ - example.org' + assert page_name == 'another page' + assert page_title == 'another page - example.org' + + +def test_parse_md_metadata_no_title_or_h1_so_path(): + """Test that a page with neither title metadata nor an h1 is named by its request path.""" + content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-or-h1.md'), PAGES_DIR) + assert page_name == '/no-title-or-h1' + assert page_title == '/no-title-or-h1 - example.org' + + +def test_parse_md_metadata_no_title_or_h1_so_path_dir(): + """Test that a directory index with neither title metadata nor an h1 is named by its directory path.""" + content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/index.md'), PAGES_DIR) + assert page_name == '/no-title-subdir/' + assert page_title == '/no-title-subdir/ - example.org' + + +def test_parse_md_metadata_no_title_or_h1_so_path_dir_file(): + """Test that a page in a subdirectory with neither title metadata nor an h1 
is named by its full request path.""" + content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/no-title-or-h1.md'), + PAGES_DIR) + assert page_name == '/no-title-subdir/no-title-or-h1' + assert page_title == '/no-title-subdir/no-title-or-h1 - example.org' def test_parse_md_no_file():