use beautifulsoup to derive title from HTML h1

Signed-off-by: Brian S. Stephan <bss@incorporeal.org>
This commit is contained in:
Brian S. Stephan 2026-01-28 14:08:48 -06:00
parent 3ca13cc6f8
commit 20673c178a
Signed by: bss
GPG Key ID: 3DE06D3180895FCB
8 changed files with 53 additions and 5 deletions

View File

@ -13,6 +13,7 @@ import os
import re import re
import markdown import markdown
from bs4 import BeautifulSoup
from markupsafe import Markup from markupsafe import Markup
from incorporealcms import jinja_env from incorporealcms import jinja_env
@ -82,7 +83,18 @@ def parse_md(path: str, pages_root: str):
logger.debug("file metadata: %s", md.Meta) logger.debug("file metadata: %s", md.Meta)
rel_path = os.path.relpath(path, pages_root) rel_path = os.path.relpath(path, pages_root)
page_name = get_meta_str(md, 'title') if md.Meta.get('title') else instance_resource_path_to_request_path(rel_path)
soup = BeautifulSoup(content, features='lxml')
# Resolve the page name in priority order: explicit markdown metadata first,
# then the text of the first <h1> in the rendered HTML, and finally the
# request path derived from the file's location under pages_root.
if md.Meta.get('title'):
    # whatever get_meta_str returns is kept as-is (even a falsy value), so the
    # metadata branch behaves exactly as it did before the h1 lookup existed
    page_name = get_meta_str(md, 'title')
else:
    h1_tag = soup.find('h1')
    # Tag.string is None when the heading contains nested markup (for example
    # "<h1>Hello <em>world</em></h1>"), which previously left page_name as None
    # without ever reaching the path fallback. get_text() flattens the whole
    # heading and returns a plain str.
    h1_text = h1_tag.get_text().strip() if h1_tag else ''
    # an absent or empty heading falls through to the path-derived name
    page_name = h1_text or instance_resource_path_to_request_path(rel_path)
page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX
logger.debug("title (potentially derived): %s", page_title) logger.debug("title (potentially derived): %s", page_title)

View File

@ -11,7 +11,7 @@ authors = [
{name = "Brian S. Stephan", email = "bss@incorporeal.org"}, {name = "Brian S. Stephan", email = "bss@incorporeal.org"},
] ]
requires-python = ">=3.10" requires-python = ">=3.10"
dependencies = ["feedgen", "jinja2", "Markdown", "termcolor"] dependencies = ["beautifulsoup4", "feedgen", "jinja2", "Markdown", "termcolor"]
dynamic = ["version"] dynamic = ["version"]
classifiers = [ classifiers = [
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",

View File

@ -14,6 +14,8 @@ authlib==1.6.6
# via safety # via safety
bandit==1.9.3 bandit==1.9.3
# via incorporeal-cms (pyproject.toml) # via incorporeal-cms (pyproject.toml)
beautifulsoup4==4.14.3
# via incorporeal-cms (pyproject.toml)
boolean-py==5.0 boolean-py==5.0
# via license-expression # via license-expression
build==1.4.0 build==1.4.0
@ -266,6 +268,8 @@ six==1.17.0
# via python-dateutil # via python-dateutil
snowballstemmer==3.0.1 snowballstemmer==3.0.1
# via pydocstyle # via pydocstyle
soupsieve==2.8.3
# via beautifulsoup4
stevedore==5.6.0 stevedore==5.6.0
# via bandit # via bandit
tenacity==9.1.2 tenacity==9.1.2
@ -286,6 +290,7 @@ typer==0.21.1
# via safety # via safety
typing-extensions==4.15.0 typing-extensions==4.15.0
# via # via
# beautifulsoup4
# mypy # mypy
# pydantic # pydantic
# pydantic-core # pydantic-core

View File

@ -4,6 +4,8 @@
# #
# pip-compile --output-file=requirements/requirements.txt # pip-compile --output-file=requirements/requirements.txt
# #
beautifulsoup4==4.14.3
# via incorporeal-cms (pyproject.toml)
feedgen==1.0.0 feedgen==1.0.0
# via incorporeal-cms (pyproject.toml) # via incorporeal-cms (pyproject.toml)
jinja2==3.1.6 jinja2==3.1.6
@ -18,5 +20,9 @@ python-dateutil==2.9.0.post0
# via feedgen # via feedgen
six==1.17.0 six==1.17.0
# via python-dateutil # via python-dateutil
soupsieve==2.8.3
# via beautifulsoup4
termcolor==3.3.0 termcolor==3.3.0
# via incorporeal-cms (pyproject.toml) # via incorporeal-cms (pyproject.toml)
typing-extensions==4.15.0
# via beautifulsoup4

View File

@ -0,0 +1 @@
there's just some words here but no title tag or h1

View File

@ -0,0 +1 @@
there's just some words here but no title tag or h1

View File

@ -0,0 +1 @@
there's just some words here but no title tag or h1

View File

@ -146,11 +146,33 @@ def test_parse_md_metadata_forced_no_title():
assert page_title == 'example.org' assert page_title == 'example.org'
def test_parse_md_metadata_no_title_so_path(): def test_parse_md_metadata_no_title_so_h1():
"""Test the direct results of parsing a markdown file.""" """Test the direct results of parsing a markdown file."""
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'subdir/index.md'), PAGES_DIR) content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'subdir/index.md'), PAGES_DIR)
assert page_name == '/subdir/' assert page_name == 'another page'
assert page_title == '/subdir/ - example.org' assert page_title == 'another page - example.org'
def test_parse_md_metadata_no_title_or_h1_so_path():
    """Check that a top-level page with no title metadata and no h1 is named after its request path."""
    source_file = os.path.join(PAGES_DIR, 'no-title-or-h1.md')
    # only the derived name and title matter here; the other return values are ignored
    _, _, page_name, page_title, _ = parse_md(source_file, PAGES_DIR)
    assert page_name == '/no-title-or-h1'
    assert page_title == '/no-title-or-h1 - example.org'
def test_parse_md_metadata_no_title_or_h1_so_path_dir():
    """Check that a directory index with no title metadata and no h1 is named after its directory path."""
    source_file = os.path.join(PAGES_DIR, 'no-title-subdir/index.md')
    # only the derived name and title matter here; the other return values are ignored
    _, _, page_name, page_title, _ = parse_md(source_file, PAGES_DIR)
    assert page_name == '/no-title-subdir/'
    assert page_title == '/no-title-subdir/ - example.org'
def test_parse_md_metadata_no_title_or_h1_so_path_dir_file():
    """Check that a page inside a subdirectory with no title metadata and no h1 is named after its full path."""
    source_file = os.path.join(PAGES_DIR, 'no-title-subdir/no-title-or-h1.md')
    # only the derived name and title matter here; the other return values are ignored
    _, _, page_name, page_title, _ = parse_md(source_file, PAGES_DIR)
    assert page_name == '/no-title-subdir/no-title-or-h1'
    assert page_title == '/no-title-subdir/no-title-or-h1 - example.org'
def test_parse_md_no_file(): def test_parse_md_no_file():