From 82387879006c6d7fc1920e9c369dc8f3b81410fe Mon Sep 17 00:00:00 2001 From: "Brian S. Stephan" Date: Wed, 28 Jan 2026 14:27:21 -0600 Subject: [PATCH] use beautifulsoup to parse the description from the first paragraph Signed-off-by: Brian S. Stephan --- incorporealcms/feed.py | 2 +- incorporealcms/markdown.py | 14 +++++++++--- tests/instance/pages/rambling.md | 6 ++++++ tests/test_markdown.py | 37 ++++++++++++++++++++++---------- 4 files changed, 44 insertions(+), 15 deletions(-) create mode 100644 tests/instance/pages/rambling.md diff --git a/incorporealcms/feed.py b/incorporealcms/feed.py index 5a80071..d5d74ed 100644 --- a/incorporealcms/feed.py +++ b/incorporealcms/feed.py @@ -51,7 +51,7 @@ def generate_feed(feed_type: str, instance_dir: str, dest_dir: str) -> None: # get the actual file to parse it resolved_path = os.path.relpath(os.path.realpath(feed_entry_path), pages_dir) try: - content, md, page_name, page_title, mtime = parse_md(os.path.join(pages_dir, resolved_path), pages_dir) + content, md, page_name, page_title, _, mtime = parse_md(os.path.join(pages_dir, resolved_path), pages_dir) link = f'https://{Config.DOMAIN_NAME}{instance_resource_path_to_request_path(resolved_path)}' except (OSError, ValueError, TypeError): logger.exception("error loading/rendering markdown!") diff --git a/incorporealcms/markdown.py b/incorporealcms/markdown.py index 2442357..9527d19 100644 --- a/incorporealcms/markdown.py +++ b/incorporealcms/markdown.py @@ -95,10 +95,18 @@ def parse_md(path: str, pages_root: str): elif not page_name: page_name = instance_resource_path_to_request_path(rel_path) + # get the page description from the markdown tags or first paragraph + page_description = None + if md.Meta.get('description'): + page_description = get_meta_str(md, 'description') + elif p_tag := soup.find('p'): + if page_description := p_tag.string: + page_description = page_description.replace('\n', ' ') + page_title = f'{page_name} - {Config.TITLE_SUFFIX}' if page_name else Config.TITLE_SUFFIX logger.debug("title (potentially derived): %s", page_title) - return content, md, page_name, page_title, mtime + return content, md, page_name, page_title, page_description, mtime def handle_markdown_file_path(path: str, pages_root: str) -> str: @@ -109,7 +117,7 @@ def handle_markdown_file_path(path: str, pages_root: str) -> str: pages_root: the absolute path to the pages/ dir, which the path should be within. necessary for proper resolution of resolving parent pages (which needs to know when to stop) """ - content, md, page_name, page_title, mtime = parse_md(path, pages_root) + content, md, page_name, page_title, page_description, mtime = parse_md(path, pages_root) relative_path = os.path.relpath(path, pages_root) parent_navs = generate_parent_navs(relative_path, pages_root) extra_footer = get_meta_str(md, 'footer') if md.Meta.get('footer') else None @@ -123,7 +131,7 @@ def handle_markdown_file_path(path: str, pages_root: str) -> str: template = jinja_env.get_template(template_name) return template.render(title=page_title, config=Config, - description=get_meta_str(md, 'description'), + description=page_description, image=Config.BASE_HOST + get_meta_str(md, 'image'), content=content, base_url=Config.BASE_HOST + instance_resource_path_to_request_path(relative_path), diff --git a/tests/instance/pages/rambling.md b/tests/instance/pages/rambling.md new file mode 100644 index 0000000..6f6fd57 --- /dev/null +++ b/tests/instance/pages/rambling.md @@ -0,0 +1,6 @@ +# rambling test for inferred description + +this is a long string of text where +I am typing a lot over multiple lines + +this second paragraph shouldn't be in the metadata diff --git a/tests/test_markdown.py b/tests/test_markdown.py index 9122265..c5a9dfd 100644 --- a/tests/test_markdown.py +++ b/tests/test_markdown.py @@ -134,62 +134,77 @@ def test_request_path_to_breadcrumb_display_patterns(): def test_parse_md_metadata(): """Test the direct results of parsing a markdown file.""" - content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'more-metadata.md'), PAGES_DIR) + content, md, page_name, page_title, page_desc, mtime = parse_md( + os.path.join(PAGES_DIR, 'more-metadata.md'), + PAGES_DIR + ) assert page_name == 'title for the page' assert page_title == 'title for the page - example.org' + assert page_desc == 'description of this page made even longer' def test_parse_md_metadata_forced_no_title(): """Test the direct results of parsing a markdown file.""" - content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'forced-no-title.md'), PAGES_DIR) + content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'forced-no-title.md'), PAGES_DIR) assert page_name == '' assert page_title == 'example.org' def test_parse_md_metadata_no_title_so_h1(): """Test the direct results of parsing a markdown file.""" - content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'subdir/index.md'), PAGES_DIR) + content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'subdir/index.md'), PAGES_DIR) assert page_name == 'another page' assert page_title == 'another page - example.org' def test_parse_md_metadata_no_title_or_h1_so_path(): """Test the direct results of parsing a markdown file.""" - content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-or-h1.md'), PAGES_DIR) + content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-or-h1.md'), PAGES_DIR) assert page_name == '/no-title-or-h1' assert page_title == '/no-title-or-h1 - example.org' def test_parse_md_metadata_no_title_or_h1_so_path_dir(): """Test the direct results of parsing a markdown file.""" - content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/index.md'), PAGES_DIR) + content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/index.md'), + PAGES_DIR) assert page_name == '/no-title-subdir/' assert page_title == '/no-title-subdir/ - example.org' def test_parse_md_metadata_no_title_or_h1_so_path_dir_file(): """Test the direct results of parsing a markdown file.""" - content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'no-title-subdir/no-title-or-h1.md'), - PAGES_DIR) + content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, + 'no-title-subdir/no-title-or-h1.md'), + PAGES_DIR) assert page_name == '/no-title-subdir/no-title-or-h1' assert page_title == '/no-title-subdir/no-title-or-h1 - example.org' +def test_parse_md_derive_description_from_p(): + """Test that we can get a description from the first paragraph in the file.""" + content, md, page_name, page_title, page_desc, mtime = parse_md( + os.path.join(PAGES_DIR, 'rambling.md'), + PAGES_DIR + ) + assert page_desc == 'this is a long string of text where I am typing a lot over multiple lines' + + def test_parse_md_no_file(): """Test the direct results of parsing a markdown file.""" with pytest.raises(FileNotFoundError): - content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'nope.md'), PAGES_DIR) + content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'nope.md'), PAGES_DIR) def test_parse_md_bad_file(): """Test the direct results of parsing a markdown file.""" with pytest.raises(ValueError): - content, md, page_name, page_title, mtime = parse_md(os.path.join(PAGES_DIR, 'actually-a-png.md'), PAGES_DIR) + content, md, page_name, page_title, _, mtime = parse_md(os.path.join(PAGES_DIR, 'actually-a-png.md'), PAGES_DIR) def test_md_extension_in_source_link_is_stripped(): """Test that if a foo.md file link is specified in the Markdown, it is foo in the HTML.""" - content, _, _, _, _ = parse_md(os.path.join(PAGES_DIR, 'file-with-md-link.md'), PAGES_DIR) + content, _, _, _, _, _ = parse_md(os.path.join(PAGES_DIR, 'file-with-md-link.md'), PAGES_DIR) assert 'Foo' in content assert 'Anchored Foo' in content assert 'Sub Foo' in content @@ -198,7 +213,7 @@ def test_md_extension_in_source_link_is_stripped(): def test_index_in_source_link_is_stripped(): """Test that if a index.md file link is specified in the Markdown, it is just the dir in the HTML.""" - content, _, _, _, _ = parse_md(os.path.join(PAGES_DIR, 'file-with-index.md-link.md'), PAGES_DIR) + content, _, _, _, _, _ = parse_md(os.path.join(PAGES_DIR, 'file-with-index.md-link.md'), PAGES_DIR) assert 'Cool' in content assert 'Anchored Cool' in content assert 'This Index' in content