use beautifulsoup to derive title from HTML h1

Signed-off-by: Brian S. Stephan <bss@incorporeal.org>
2026-01-28 14:08:48 -06:00 · 2026-01-28 14:08:48 -06:00 · 20673c178a
commit 20673c178a
parent 3ca13cc6f8
8 changed files with 53 additions and 5 deletions
--- a/incorporealcms/markdown.py
+++ b/incorporealcms/markdown.py
@ -13,6 +13,7 @@ import os
 import re

 import markdown
+from bs4 import BeautifulSoup
 from markupsafe import Markup

 from incorporealcms import jinja_env
@ -82,7 +83,18 @@ def parse_md(path: str, pages_root: str):
    logger.debug("file metadata: %s", md.Meta)

    rel_path = os.path.relpath(path, pages_root)
-    page_name = get_meta_str(md, 'title') if md.Meta.get('title') else instance_resource_path_to_request_path(rel_path)
+
+    soup = BeautifulSoup(content, features='lxml')
+
+    # get the page title first from the markdown tags, second from the first h1, last from the path
+    page_name = None
+    if md.Meta.get('title'):
+        page_name = get_meta_str(md, 'title')
+    elif h1_tag := soup.find('h1'):
+        page_name = h1_tag.string
+    elif not page_name:
+        page_name = instance_resource_path_to_request_path(rel_path)
+
    page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX
    logger.debug("title (potentially derived): %s", page_title)

--- a/pyproject.toml
+++ b/pyproject.toml
@ -11,7 +11,7 @@ authors = [
 	{name = "Brian S. Stephan", email = "bss@incorporeal.org"},
 ]
 requires-python = ">=3.10"
-dependencies = ["feedgen", "jinja2", "Markdown", "termcolor"]
+dependencies = ["beautifulsoup4", "feedgen", "jinja2", "Markdown", "termcolor"]
 dynamic = ["version"]
 classifiers = [
 	"Programming Language :: Python :: 3",
--- a/requirements/requirements-dev.txt
+++ b/requirements/requirements-dev.txt
@ -14,6 +14,8 @@ authlib==1.6.6
    # via safety
 bandit==1.9.3
    # via incorporeal-cms (pyproject.toml)
+beautifulsoup4==4.14.3
+    # via incorporeal-cms (pyproject.toml)
 boolean-py==5.0
    # via license-expression
 build==1.4.0
@ -266,6 +268,8 @@ six==1.17.0
    # via python-dateutil
 snowballstemmer==3.0.1
    # via pydocstyle
+soupsieve==2.8.3
+    # via beautifulsoup4
 stevedore==5.6.0
    # via bandit
 tenacity==9.1.2
@ -286,6 +290,7 @@ typer==0.21.1
    # via safety
 typing-extensions==4.15.0
    # via
+    #   beautifulsoup4
    #   mypy
    #   pydantic
    #   pydantic-core
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@ -4,6 +4,8 @@
 #
 #    pip-compile --output-file=requirements/requirements.txt
 #
+beautifulsoup4==4.14.3
+    # via incorporeal-cms (pyproject.toml)
 feedgen==1.0.0
    # via incorporeal-cms (pyproject.toml)
 jinja2==3.1.6
@ -18,5 +20,9 @@ python-dateutil==2.9.0.post0
    # via feedgen
 six==1.17.0
    # via python-dateutil
+soupsieve==2.8.3
+    # via beautifulsoup4
 termcolor==3.3.0
    # via incorporeal-cms (pyproject.toml)
+typing-extensions==4.15.0
+    # via beautifulsoup4
--- a/tests/instance/pages/no-title-or-h1.md
+++ b/tests/instance/pages/no-title-or-h1.md
@ -0,0 +1 @@
+there's just some words here but no title tag or h1
--- a/tests/instance/pages/no-title-subdir/index.md
+++ b/tests/instance/pages/no-title-subdir/index.md
@ -0,0 +1 @@
+there's just some words here but no title tag or h1
--- a/tests/instance/pages/no-title-subdir/no-title-or-h1.md
+++ b/tests/instance/pages/no-title-subdir/no-title-or-h1.md
@ -0,0 +1 @@
+there's just some words here but no title tag or h1
--- a/tests/test_markdown.py
+++ b/tests/test_markdown.py
@ -146,11 +146,33 @@ def test_parse_md_metadata_forced_no_title():
    assert page_title == 'example.org'


-def test_parse_md_metadata_no_title_so_path():
+def test_parse_md_metadata_no_title_so_h1():
    """Test the direct results of parsing a markdown file."""
    content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'subdir/index.md'), PAGES_DIR)
-    assert page_name == '/subdir/'
-    assert page_title == '/subdir/ - example.org'
+    assert page_name == 'another page'
+    assert page_title == 'another page - example.org'
+
+
+def test_parse_md_metadata_no_title_or_h1_so_path():
+    """Test that a top-level page without title metadata or an h1 is titled by its request path."""
+    content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-or-h1.md'), PAGES_DIR)
+    assert page_name == '/no-title-or-h1'
+    assert page_title == '/no-title-or-h1 - example.org'
+
+
+def test_parse_md_metadata_no_title_or_h1_so_path_dir():
+    """Test that a directory index page without title metadata or an h1 is titled by its directory path."""
+    content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/index.md'), PAGES_DIR)
+    assert page_name == '/no-title-subdir/'
+    assert page_title == '/no-title-subdir/ - example.org'
+
+
+def test_parse_md_metadata_no_title_or_h1_so_path_dir_file():
+    """Test that a subdirectory page without title metadata or an h1 is titled by its full request path."""
+    content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/no-title-or-h1.md'),
+                                                         PAGES_DIR)
+    assert page_name == '/no-title-subdir/no-title-or-h1'
+    assert page_title == '/no-title-subdir/no-title-or-h1 - example.org'


 def test_parse_md_no_file():
				`@ -0,0 +1 @@`
				`there's just some words here but no title tag or h1`