use Beautiful Soup to parse the description from the first paragraph

Signed-off-by: Brian S. Stephan <bss@incorporeal.org>
This commit is contained in:
Brian S. Stephan 2026-01-28 14:27:21 -06:00
parent 20673c178a
commit 8238787900
Signed by: bss
GPG Key ID: 3DE06D3180895FCB
4 changed files with 44 additions and 15 deletions

View File

@ -51,7 +51,7 @@ def generate_feed(feed_type: str, instance_dir: str, dest_dir: str) -> None:
# get the actual file to parse it # get the actual file to parse it
resolved_path = os.path.relpath(os.path.realpath(feed_entry_path), pages_dir) resolved_path = os.path.relpath(os.path.realpath(feed_entry_path), pages_dir)
try: try:
content, md, page_name, page_title, mtime = parse_md(os.path.join(pages_dir, resolved_path), pages_dir) content, md, page_name, page_title, _, mtime = parse_md(os.path.join(pages_dir, resolved_path), pages_dir)
link = f'https://{Config.DOMAIN_NAME}{instance_resource_path_to_request_path(resolved_path)}' link = f'https://{Config.DOMAIN_NAME}{instance_resource_path_to_request_path(resolved_path)}'
except (OSError, ValueError, TypeError): except (OSError, ValueError, TypeError):
logger.exception("error loading/rendering markdown!") logger.exception("error loading/rendering markdown!")

View File

@ -95,10 +95,18 @@ def parse_md(path: str, pages_root: str):
elif not page_name: elif not page_name:
page_name = instance_resource_path_to_request_path(rel_path) page_name = instance_resource_path_to_request_path(rel_path)
# get the page description from the markdown tags or first paragraph
page_description = None
if md.Meta.get('description'):
page_description = get_meta_str(md, 'description')
elif p_tag := soup.find('p'):
if page_description := p_tag.string:
page_description = page_description.replace('\n', ' ')
page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX
logger.debug("title (potentially derived): %s", page_title) logger.debug("title (potentially derived): %s", page_title)
return content, md, page_name, page_title, mtime return content, md, page_name, page_title, page_description, mtime
def handle_markdown_file_path(path: str, pages_root: str) -> str: def handle_markdown_file_path(path: str, pages_root: str) -> str:
@ -109,7 +117,7 @@ def handle_markdown_file_path(path: str, pages_root: str) -> str:
pages_root: the absolute path to the pages/ dir, which the path should be within. necessary for pages_root: the absolute path to the pages/ dir, which the path should be within. necessary for
proper resolution of resolving parent pages (which needs to know when to stop) proper resolution of resolving parent pages (which needs to know when to stop)
""" """
content, md, page_name, page_title, mtime = parse_md(path, pages_root) content, md, page_name, page_title, page_description, mtime = parse_md(path, pages_root)
relative_path = os.path.relpath(path, pages_root) relative_path = os.path.relpath(path, pages_root)
parent_navs = generate_parent_navs(relative_path, pages_root) parent_navs = generate_parent_navs(relative_path, pages_root)
extra_footer = get_meta_str(md, 'footer') if md.Meta.get('footer') else None extra_footer = get_meta_str(md, 'footer') if md.Meta.get('footer') else None
@ -123,7 +131,7 @@ def handle_markdown_file_path(path: str, pages_root: str) -> str:
template = jinja_env.get_template(template_name) template = jinja_env.get_template(template_name)
return template.render(title=page_title, return template.render(title=page_title,
config=Config, config=Config,
description=get_meta_str(md, 'description'), description=page_description,
image=Config.BASE_HOST + get_meta_str(md, 'image'), image=Config.BASE_HOST + get_meta_str(md, 'image'),
content=content, content=content,
base_url=Config.BASE_HOST + instance_resource_path_to_request_path(relative_path), base_url=Config.BASE_HOST + instance_resource_path_to_request_path(relative_path),

View File

@ -0,0 +1,6 @@
# rambling test for inferred description
this is a long string of text where
I am typing a lot over multiple lines
this second paragraph shouldn't be in the metadata

View File

@ -134,62 +134,77 @@ def test_request_path_to_breadcrumb_display_patterns():
def test_parse_md_metadata(): def test_parse_md_metadata():
"""Test the direct results of parsing a markdown file.""" """Test the direct results of parsing a markdown file."""
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'more-metadata.md'), PAGES_DIR) content, md, page_name, page_title, page_desc, mtime = parse_md(
os.path.join(PAGES_DIR, 'more-metadata.md'),
PAGES_DIR
)
assert page_name == 'title for the page' assert page_name == 'title for the page'
assert page_title == 'title for the page - example.org' assert page_title == 'title for the page - example.org'
assert page_desc == 'description of this page made even longer'
def test_parse_md_metadata_forced_no_title(): def test_parse_md_metadata_forced_no_title():
"""Test the direct results of parsing a markdown file.""" """Test the direct results of parsing a markdown file."""
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'forced-no-title.md'), PAGES_DIR) content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'forced-no-title.md'), PAGES_DIR)
assert page_name == '' assert page_name == ''
assert page_title == 'example.org' assert page_title == 'example.org'
def test_parse_md_metadata_no_title_so_h1(): def test_parse_md_metadata_no_title_so_h1():
"""Test the direct results of parsing a markdown file.""" """Test the direct results of parsing a markdown file."""
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'subdir/index.md'), PAGES_DIR) content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'subdir/index.md'), PAGES_DIR)
assert page_name == 'another page' assert page_name == 'another page'
assert page_title == 'another page - example.org' assert page_title == 'another page - example.org'
def test_parse_md_metadata_no_title_or_h1_so_path(): def test_parse_md_metadata_no_title_or_h1_so_path():
"""Test the direct results of parsing a markdown file.""" """Test the direct results of parsing a markdown file."""
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-or-h1.md'), PAGES_DIR) content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-or-h1.md'), PAGES_DIR)
assert page_name == '/no-title-or-h1' assert page_name == '/no-title-or-h1'
assert page_title == '/no-title-or-h1 - example.org' assert page_title == '/no-title-or-h1 - example.org'
def test_parse_md_metadata_no_title_or_h1_so_path_dir(): def test_parse_md_metadata_no_title_or_h1_so_path_dir():
"""Test the direct results of parsing a markdown file.""" """Test the direct results of parsing a markdown file."""
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/index.md'), PAGES_DIR) content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/index.md'),
PAGES_DIR)
assert page_name == '/no-title-subdir/' assert page_name == '/no-title-subdir/'
assert page_title == '/no-title-subdir/ - example.org' assert page_title == '/no-title-subdir/ - example.org'
def test_parse_md_metadata_no_title_or_h1_so_path_dir_file(): def test_parse_md_metadata_no_title_or_h1_so_path_dir_file():
"""Test the direct results of parsing a markdown file.""" """Test the direct results of parsing a markdown file."""
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/no-title-or-h1.md'), content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR,
'no-title-subdir/no-title-or-h1.md'),
PAGES_DIR) PAGES_DIR)
assert page_name == '/no-title-subdir/no-title-or-h1' assert page_name == '/no-title-subdir/no-title-or-h1'
assert page_title == '/no-title-subdir/no-title-or-h1 - example.org' assert page_title == '/no-title-subdir/no-title-or-h1 - example.org'
def test_parse_md_derive_description_from_p():
"""Test that we can get a description from the first paragraph in the file."""
content, md, page_name, page_title, page_desc, mtime = parse_md(
os.path.join(PAGES_DIR, 'rambling.md'),
PAGES_DIR
)
assert page_desc == 'this is a long string of text where I am typing a lot over multiple lines'
def test_parse_md_no_file(): def test_parse_md_no_file():
"""Test the direct results of parsing a markdown file.""" """Test the direct results of parsing a markdown file."""
with pytest.raises(FileNotFoundError): with pytest.raises(FileNotFoundError):
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'nope.md'), PAGES_DIR) content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'nope.md'), PAGES_DIR)
def test_parse_md_bad_file(): def test_parse_md_bad_file():
"""Test the direct results of parsing a markdown file.""" """Test the direct results of parsing a markdown file."""
with pytest.raises(ValueError): with pytest.raises(ValueError):
content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'actually-a-png.md'), PAGES_DIR) content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'actually-a-png.md'), PAGES_DIR)
def test_md_extension_in_source_link_is_stripped(): def test_md_extension_in_source_link_is_stripped():
"""Test that if a foo.md file link is specified in the Markdown, it is foo in the HTML.""" """Test that if a foo.md file link is specified in the Markdown, it is foo in the HTML."""
content, _, _, _, _ = parse_md(os.path.join(PAGES_DIR, 'file-with-md-link.md'), PAGES_DIR) content, _, _, _, _, _ = parse_md(os.path.join(PAGES_DIR, 'file-with-md-link.md'), PAGES_DIR)
assert '<a href="foo">Foo</a>' in content assert '<a href="foo">Foo</a>' in content
assert '<a href="foo#anchor">Anchored Foo</a>' in content assert '<a href="foo#anchor">Anchored Foo</a>' in content
assert '<a href="sub/foo">Sub Foo</a>' in content assert '<a href="sub/foo">Sub Foo</a>' in content
@ -198,7 +213,7 @@ def test_md_extension_in_source_link_is_stripped():
def test_index_in_source_link_is_stripped(): def test_index_in_source_link_is_stripped():
"""Test that if a index.md file link is specified in the Markdown, it is just the dir in the HTML.""" """Test that if a index.md file link is specified in the Markdown, it is just the dir in the HTML."""
content, _, _, _, _ = parse_md(os.path.join(PAGES_DIR, 'file-with-index.md-link.md'), PAGES_DIR) content, _, _, _, _, _ = parse_md(os.path.join(PAGES_DIR, 'file-with-index.md-link.md'), PAGES_DIR)
assert '<a href="cool/">Cool</a>' in content assert '<a href="cool/">Cool</a>' in content
assert '<a href="cool/#anchor">Anchored Cool</a>' in content assert '<a href="cool/#anchor">Anchored Cool</a>' in content
assert '<a href=".">This Index</a>' in content assert '<a href=".">This Index</a>' in content