use Beautiful Soup to parse the description from the first paragraph

Signed-off-by: Brian S. Stephan <bss@incorporeal.org>
2026-01-28 14:27:21 -06:00
parent 20673c178a
commit 8238787900
4 changed files with 44 additions and 15 deletions
--- a/incorporealcms/feed.py
+++ b/incorporealcms/feed.py
@@ -51,7 +51,7 @@ def generate_feed(feed_type: str, instance_dir: str, dest_dir: str) -> None:
        # get the actual file to parse it
        resolved_path = os.path.relpath(os.path.realpath(feed_entry_path), pages_dir)
        try:
-            content, md, page_name, page_title, mtime = parse_md(os.path.join(pages_dir, resolved_path), pages_dir)
+            content, md, page_name, page_title, _, mtime = parse_md(os.path.join(pages_dir, resolved_path), pages_dir)
            link = f'https://{Config.DOMAIN_NAME}{instance_resource_path_to_request_path(resolved_path)}'
        except (OSError, ValueError, TypeError):
            logger.exception("error loading/rendering markdown!")
--- a/incorporealcms/markdown.py
+++ b/incorporealcms/markdown.py
@@ -95,10 +95,18 @@ def parse_md(path: str, pages_root: str):
    elif not page_name:
        page_name = instance_resource_path_to_request_path(rel_path)

+    # get the page description from the markdown tags or first paragraph
+    page_description = None
+    if md.Meta.get('description'):
+        page_description = get_meta_str(md, 'description')
+    elif p_tag := soup.find('p'):
+        if page_description := p_tag.string:
+            page_description = page_description.replace('\n', ' ')
+
    page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX
    logger.debug("title (potentially derived): %s", page_title)

-    return content, md, page_name, page_title, mtime
+    return content, md, page_name, page_title, page_description, mtime


 def handle_markdown_file_path(path: str, pages_root: str) -> str:
@@ -109,7 +117,7 @@ def handle_markdown_file_path(path: str, pages_root: str) -> str:
        pages_root: the absolute path to the pages/ dir, which the path should be within. necessary for
                    proper resolution of resolving parent pages (which needs to know when to stop)
    """
-    content, md, page_name, page_title, mtime = parse_md(path, pages_root)
+    content, md, page_name, page_title, page_description, mtime = parse_md(path, pages_root)
    relative_path = os.path.relpath(path, pages_root)
    parent_navs = generate_parent_navs(relative_path, pages_root)
    extra_footer = get_meta_str(md, 'footer') if md.Meta.get('footer') else None
@@ -123,7 +131,7 @@ def handle_markdown_file_path(path: str, pages_root: str) -> str:
    template = jinja_env.get_template(template_name)
    return template.render(title=page_title,
                           config=Config,
-                           description=get_meta_str(md, 'description'),
+                           description=page_description,
                           image=Config.BASE_HOST + get_meta_str(md, 'image'),
                           content=content,
                           base_url=Config.BASE_HOST + instance_resource_path_to_request_path(relative_path),