use BeautifulSoup to derive the page title from the first HTML h1

Signed-off-by: Brian S. Stephan <bss@incorporeal.org>
2026-01-28 14:08:48 -06:00
parent 3ca13cc6f8
commit 20673c178a
8 changed files with 53 additions and 5 deletions
--- a/incorporealcms/markdown.py
+++ b/incorporealcms/markdown.py
@@ -13,6 +13,7 @@ import os
 import re

 import markdown
+from bs4 import BeautifulSoup
 from markupsafe import Markup

 from incorporealcms import jinja_env
@@ -82,7 +83,18 @@ def parse_md(path: str, pages_root: str):
    logger.debug("file metadata: %s", md.Meta)

    rel_path = os.path.relpath(path, pages_root)
-    page_name = get_meta_str(md, 'title') if md.Meta.get('title') else instance_resource_path_to_request_path(rel_path)
+
+    soup = BeautifulSoup(content, features='lxml')
+
+    # get the page title first from the markdown 'title' metadata, second from the first h1, last from the path
+    page_name = None
+    if md.Meta.get('title'):
+        page_name = get_meta_str(md, 'title')
+    elif h1_tag := soup.find('h1'):
+        page_name = h1_tag.string
+    elif not page_name:
+        page_name = instance_resource_path_to_request_path(rel_path)
+
    page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX
    logger.debug("title (potentially derived): %s", page_title)