Use BeautifulSoup to derive the page title from the first HTML h1 when no title metadata is set
Signed-off-by: Brian S. Stephan <bss@incorporeal.org>
This commit is contained in:
parent
3ca13cc6f8
commit
20673c178a
@ -13,6 +13,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
import markdown
|
import markdown
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from markupsafe import Markup
|
from markupsafe import Markup
|
||||||
|
|
||||||
from incorporealcms import jinja_env
|
from incorporealcms import jinja_env
|
||||||
@ -82,7 +83,18 @@ def parse_md(path: str, pages_root: str):
|
|||||||
logger.debug("file metadata: %s", md.Meta)
|
logger.debug("file metadata: %s", md.Meta)
|
||||||
|
|
||||||
rel_path = os.path.relpath(path, pages_root)
|
rel_path = os.path.relpath(path, pages_root)
|
||||||
page_name = get_meta_str(md, 'title') if md.Meta.get('title') else instance_resource_path_to_request_path(rel_path)
|
|
||||||
|
soup = BeautifulSoup(content, features='lxml')
|
||||||
|
|
||||||
|
# derive the page name: first from the markdown metadata 'title', second from the first h1, last from the request path
|
||||||
|
page_name = None
|
||||||
|
if md.Meta.get('title'):
|
||||||
|
page_name = get_meta_str(md, 'title')
|
||||||
|
elif h1_tag := soup.find('h1'):
|
||||||
|
page_name = h1_tag.string
|
||||||
|
elif not page_name:
|
||||||
|
page_name = instance_resource_path_to_request_path(rel_path)
|
||||||
|
|
||||||
page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX
|
page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX
|
||||||
logger.debug("title (potentially derived): %s", page_title)
|
logger.debug("title (potentially derived): %s", page_title)
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,7 @@ authors = [
|
|||||||
{name = "Brian S. Stephan", email = "bss@incorporeal.org"},
|
{name = "Brian S. Stephan", email = "bss@incorporeal.org"},
|
||||||
]
|
]
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.10"
|
||||||
dependencies = ["feedgen", "jinja2", "Markdown", "termcolor"]
|
dependencies = ["beautifulsoup4", "feedgen", "jinja2", "Markdown", "termcolor"]
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
classifiers = [
|
classifiers = [
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
|
|||||||
@ -14,6 +14,8 @@ authlib==1.6.6
|
|||||||
# via safety
|
# via safety
|
||||||
bandit==1.9.3
|
bandit==1.9.3
|
||||||
# via incorporeal-cms (pyproject.toml)
|
# via incorporeal-cms (pyproject.toml)
|
||||||
|
beautifulsoup4==4.14.3
|
||||||
|
# via incorporeal-cms (pyproject.toml)
|
||||||
boolean-py==5.0
|
boolean-py==5.0
|
||||||
# via license-expression
|
# via license-expression
|
||||||
build==1.4.0
|
build==1.4.0
|
||||||
@ -266,6 +268,8 @@ six==1.17.0
|
|||||||
# via python-dateutil
|
# via python-dateutil
|
||||||
snowballstemmer==3.0.1
|
snowballstemmer==3.0.1
|
||||||
# via pydocstyle
|
# via pydocstyle
|
||||||
|
soupsieve==2.8.3
|
||||||
|
# via beautifulsoup4
|
||||||
stevedore==5.6.0
|
stevedore==5.6.0
|
||||||
# via bandit
|
# via bandit
|
||||||
tenacity==9.1.2
|
tenacity==9.1.2
|
||||||
@ -286,6 +290,7 @@ typer==0.21.1
|
|||||||
# via safety
|
# via safety
|
||||||
typing-extensions==4.15.0
|
typing-extensions==4.15.0
|
||||||
# via
|
# via
|
||||||
|
# beautifulsoup4
|
||||||
# mypy
|
# mypy
|
||||||
# pydantic
|
# pydantic
|
||||||
# pydantic-core
|
# pydantic-core
|
||||||
|
|||||||
@ -4,6 +4,8 @@
|
|||||||
#
|
#
|
||||||
# pip-compile --output-file=requirements/requirements.txt
|
# pip-compile --output-file=requirements/requirements.txt
|
||||||
#
|
#
|
||||||
|
beautifulsoup4==4.14.3
|
||||||
|
# via incorporeal-cms (pyproject.toml)
|
||||||
feedgen==1.0.0
|
feedgen==1.0.0
|
||||||
# via incorporeal-cms (pyproject.toml)
|
# via incorporeal-cms (pyproject.toml)
|
||||||
jinja2==3.1.6
|
jinja2==3.1.6
|
||||||
@ -18,5 +20,9 @@ python-dateutil==2.9.0.post0
|
|||||||
# via feedgen
|
# via feedgen
|
||||||
six==1.17.0
|
six==1.17.0
|
||||||
# via python-dateutil
|
# via python-dateutil
|
||||||
|
soupsieve==2.8.3
|
||||||
|
# via beautifulsoup4
|
||||||
termcolor==3.3.0
|
termcolor==3.3.0
|
||||||
# via incorporeal-cms (pyproject.toml)
|
# via incorporeal-cms (pyproject.toml)
|
||||||
|
typing-extensions==4.15.0
|
||||||
|
# via beautifulsoup4
|
||||||
|
|||||||
1
tests/instance/pages/no-title-or-h1.md
Normal file
1
tests/instance/pages/no-title-or-h1.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
there's just some words here but no title tag or h1
|
||||||
1
tests/instance/pages/no-title-subdir/index.md
Normal file
1
tests/instance/pages/no-title-subdir/index.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
there's just some words here but no title tag or h1
|
||||||
1
tests/instance/pages/no-title-subdir/no-title-or-h1.md
Normal file
1
tests/instance/pages/no-title-subdir/no-title-or-h1.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
there's just some words here but no title tag or h1
|
||||||
@ -146,11 +146,33 @@ def test_parse_md_metadata_forced_no_title():
|
|||||||
assert page_title == 'example.org'
|
assert page_title == 'example.org'
|
||||||
|
|
||||||
|
|
||||||
def test_parse_md_metadata_no_title_so_path():
|
def test_parse_md_metadata_no_title_so_h1():
|
||||||
"""Test the direct results of parsing a markdown file."""
|
"""Test the direct results of parsing a markdown file."""
|
||||||
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'subdir/index.md'), PAGES_DIR)
|
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'subdir/index.md'), PAGES_DIR)
|
||||||
assert page_name == '/subdir/'
|
assert page_name == 'another page'
|
||||||
assert page_title == '/subdir/ - example.org'
|
assert page_title == 'another page - example.org'
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_md_metadata_no_title_or_h1_so_path():
|
||||||
|
"""Test that a page with no title metadata or h1 is named after its request path."""
|
||||||
|
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-or-h1.md'), PAGES_DIR)
|
||||||
|
assert page_name == '/no-title-or-h1'
|
||||||
|
assert page_title == '/no-title-or-h1 - example.org'
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_md_metadata_no_title_or_h1_so_path_dir():
|
||||||
|
"""Test that a directory index page with no title metadata or h1 is named after its directory path."""
|
||||||
|
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/index.md'), PAGES_DIR)
|
||||||
|
assert page_name == '/no-title-subdir/'
|
||||||
|
assert page_title == '/no-title-subdir/ - example.org'
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_md_metadata_no_title_or_h1_so_path_dir_file():
|
||||||
|
"""Test that a page in a subdirectory with no title metadata or h1 is named after its full request path."""
|
||||||
|
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/no-title-or-h1.md'),
|
||||||
|
PAGES_DIR)
|
||||||
|
assert page_name == '/no-title-subdir/no-title-or-h1'
|
||||||
|
assert page_title == '/no-title-subdir/no-title-or-h1 - example.org'
|
||||||
|
|
||||||
|
|
||||||
def test_parse_md_no_file():
|
def test_parse_md_no_file():
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user