From 69f2852a7f2e8e74492528461ac4c4678ce787b6 Mon Sep 17 00:00:00 2001 From: Xeek <6032840+x3ek@users.noreply.github.com> Date: Fri, 3 Apr 2026 08:58:36 -0500 Subject: [PATCH 1/4] feat(seo): add sitemap.xml and robots.txt generation (#48, #60) Add /sitemap.xml with loc+lastmod for homepage, post index, all published posts, and public pages. Add /robots.txt allowing all crawlers, disallowing admin/auth/health/webhooks paths, with Sitemap directive. Extract get_all_pages() helper in content service and refactor feed.py to use shared get_all_posts(). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/squishmark/main.py | 3 +- src/squishmark/routers/feed.py | 21 +- src/squishmark/routers/seo.py | 112 +++++++++++ src/squishmark/services/content.py | 32 ++- tests/test_seo.py | 299 +++++++++++++++++++++++++++++ 5 files changed, 446 insertions(+), 21 deletions(-) create mode 100644 src/squishmark/routers/seo.py create mode 100644 tests/test_seo.py diff --git a/src/squishmark/main.py b/src/squishmark/main.py index 2fb1edf..b757d56 100644 --- a/src/squishmark/main.py +++ b/src/squishmark/main.py @@ -14,7 +14,7 @@ from squishmark.config import get_settings from squishmark.models.content import Config from squishmark.models.db import close_db, get_db_session, init_db -from squishmark.routers import admin, auth, feed, pages, posts, webhooks +from squishmark.routers import admin, auth, feed, pages, posts, seo, webhooks from squishmark.services.analytics import AnalyticsService from squishmark.services.github import get_github_service, shutdown_github_service from squishmark.services.markdown import get_markdown_service @@ -296,6 +296,7 @@ async def livereload_ws(websocket: WebSocket) -> None: app.include_router(admin.router) app.include_router(webhooks.router) app.include_router(feed.router) + app.include_router(seo.router) app.include_router(posts.router) app.include_router(pages.router) # Catch-all for static pages, must be last diff --git a/src/squishmark/routers/feed.py 
b/src/squishmark/routers/feed.py index 79d7792..1f28c7e 100644 --- a/src/squishmark/routers/feed.py +++ b/src/squishmark/routers/feed.py @@ -8,6 +8,7 @@ from squishmark.models.content import Config, Post from squishmark.services.cache import get_cache +from squishmark.services.content import get_all_posts from squishmark.services.github import get_github_service from squishmark.services.markdown import get_markdown_service @@ -91,24 +92,8 @@ async def atom_feed() -> Response: config = Config.from_dict(config_data) markdown_service = get_markdown_service(config) - # Fetch all published posts - post_files = await github_service.list_directory("posts") - posts: list[Post] = [] - for path in post_files: - if not path.endswith(".md"): - continue - file = await github_service.get_file(path) - if file is None: - continue - post = markdown_service.parse_post(path, file.content) - if not post.draft: - posts.append(post) - - # Newest first - posts.sort(key=lambda p: (p.date is not None, p.date), reverse=True) - - # Limit to 20 most recent - posts = posts[:20] + posts = await get_all_posts(github_service, markdown_service) + posts = posts[:20] # Limit to 20 most recent xml_bytes = _build_atom_feed(config, posts) await cache.set(FEED_CACHE_KEY, xml_bytes) diff --git a/src/squishmark/routers/seo.py b/src/squishmark/routers/seo.py new file mode 100644 index 0000000..cd2e1f8 --- /dev/null +++ b/src/squishmark/routers/seo.py @@ -0,0 +1,112 @@ +"""SEO routes: sitemap.xml and robots.txt.""" + +from xml.etree.ElementTree import Element, SubElement, tostring + +from fastapi import APIRouter +from fastapi.responses import Response + +from squishmark.models.content import Config, Page, Post +from squishmark.services.cache import get_cache +from squishmark.services.content import get_all_pages, get_all_posts +from squishmark.services.github import get_github_service +from squishmark.services.markdown import get_markdown_service + +router = APIRouter(tags=["seo"]) + +SITEMAP_NS = 
"http://www.sitemaps.org/schemas/sitemap/0.9" +SITEMAP_CACHE_KEY = "seo:sitemap" +ROBOTS_CACHE_KEY = "seo:robots" + + +def _build_sitemap(config: Config, posts: list[Post], pages: list[Page]) -> bytes: + """Build a sitemap.xml from config, posts, and pages.""" + site_url = config.site.url.rstrip("/") if config.site.url else "" + + urlset = Element("urlset", xmlns=SITEMAP_NS) + + # Homepage + url_el = SubElement(urlset, "url") + SubElement(url_el, "loc").text = f"{site_url}/" + if posts and posts[0].date: + SubElement(url_el, "lastmod").text = posts[0].date.isoformat() + + # Post index + url_el = SubElement(urlset, "url") + SubElement(url_el, "loc").text = f"{site_url}/posts" + + # Individual posts + for post in posts: + url_el = SubElement(urlset, "url") + SubElement(url_el, "loc").text = f"{site_url}{post.url}" + if post.date: + SubElement(url_el, "lastmod").text = post.date.isoformat() + + # Public pages only (not unlisted or hidden) + for page in pages: + if page.visibility != "public": + continue + url_el = SubElement(urlset, "url") + SubElement(url_el, "loc").text = f"{site_url}{page.url}" + + return b'\n' + tostring(urlset, encoding="unicode").encode("utf-8") + + +def _build_robots_txt(config: Config) -> str: + """Build robots.txt content.""" + site_url = config.site.url.rstrip("/") if config.site.url else "" + + lines = [ + "User-agent: *", + "Allow: /", + "", + "Disallow: /admin/*", + "Disallow: /auth/*", + "Disallow: /health", + "Disallow: /webhooks/*", + ] + + if site_url: + lines.append("") + lines.append(f"Sitemap: {site_url}/sitemap.xml") + + return "\n".join(lines) + "\n" + + +@router.get("/sitemap.xml") +async def sitemap_xml() -> Response: + """Serve the XML sitemap.""" + cache = get_cache() + + cached = await cache.get(SITEMAP_CACHE_KEY) + if cached is not None: + return Response(content=cached, media_type="application/xml; charset=utf-8") + + github_service = get_github_service() + config_data = await github_service.get_config() + config = 
Config.from_dict(config_data) + markdown_service = get_markdown_service(config) + + posts = await get_all_posts(github_service, markdown_service) + pages = await get_all_pages(github_service, markdown_service) + + xml_bytes = _build_sitemap(config, posts, pages) + await cache.set(SITEMAP_CACHE_KEY, xml_bytes) + return Response(content=xml_bytes, media_type="application/xml; charset=utf-8") + + +@router.get("/robots.txt") +async def robots_txt() -> Response: + """Serve robots.txt.""" + cache = get_cache() + + cached = await cache.get(ROBOTS_CACHE_KEY) + if cached is not None: + return Response(content=cached, media_type="text/plain; charset=utf-8") + + github_service = get_github_service() + config_data = await github_service.get_config() + config = Config.from_dict(config_data) + + content = _build_robots_txt(config) + await cache.set(ROBOTS_CACHE_KEY, content) + return Response(content=content, media_type="text/plain; charset=utf-8") diff --git a/src/squishmark/services/content.py b/src/squishmark/services/content.py index c0bf9f2..8264509 100644 --- a/src/squishmark/services/content.py +++ b/src/squishmark/services/content.py @@ -1,6 +1,6 @@ -"""Shared content helpers for fetching and filtering posts.""" +"""Shared content helpers for fetching and filtering posts and pages.""" -from squishmark.models.content import Post, SiteConfig +from squishmark.models.content import Page, Post, SiteConfig from squishmark.services.github import GitHubService from squishmark.services.markdown import MarkdownService @@ -51,3 +51,31 @@ def get_featured_posts(posts: list[Post], site_config: SiteConfig) -> list[Post] ), ) return featured[: site_config.featured_max] + + +async def get_all_pages( + github_service: GitHubService, + markdown_service: MarkdownService, + include_hidden: bool = False, +) -> list[Page]: + """Fetch and parse all pages from the content repository.""" + page_files = await github_service.list_directory("pages") + + pages: list[Page] = [] + for path in 
page_files: + if not path.endswith(".md"): + continue + + file = await github_service.get_file(path) + if file is None: + continue + + page = markdown_service.parse_page(path, file.content) + + # Skip hidden pages unless requested + if page.visibility == "hidden" and not include_hidden: + continue + + pages.append(page) + + return pages diff --git a/tests/test_seo.py b/tests/test_seo.py new file mode 100644 index 0000000..ddc950d --- /dev/null +++ b/tests/test_seo.py @@ -0,0 +1,299 @@ +"""Tests for SEO routes: sitemap.xml and robots.txt.""" + +import datetime +from unittest.mock import AsyncMock, MagicMock, patch +from xml.etree.ElementTree import fromstring + +import pytest + +from squishmark.models.content import Config, Page, Post +from squishmark.routers.seo import _build_robots_txt, _build_sitemap + +SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9" + + +def _ns(tag: str) -> str: + """Prefix a tag with the sitemap namespace.""" + return f"{{{SITEMAP_NS}}}{tag}" + + +@pytest.fixture +def sample_config() -> Config: + return Config.from_dict( + { + "site": { + "title": "Test Blog", + "description": "A test blog", + "author": "Test Author", + "url": "https://example.com", + }, + } + ) + + +@pytest.fixture +def sample_config_no_url() -> Config: + return Config.from_dict( + { + "site": { + "title": "Test Blog", + }, + } + ) + + +@pytest.fixture +def sample_posts() -> list[Post]: + return [ + Post( + slug="post-one", + title="Post One", + date=datetime.date(2026, 2, 15), + html="

<p>Content one</p>

", + ), + Post( + slug="post-two", + title="Post Two", + date=datetime.date(2026, 2, 10), + html="

<p>Content two</p>

", + ), + ] + + +@pytest.fixture +def sample_pages() -> list[Page]: + return [ + Page(slug="about", title="About", visibility="public"), + Page(slug="secret", title="Secret", visibility="unlisted"), + Page(slug="hidden-page", title="Hidden", visibility="hidden"), + ] + + +class TestBuildSitemap: + def test_valid_xml_structure(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + + assert xml_bytes.startswith(b'') + root = fromstring(xml_bytes) + assert root.tag == _ns("urlset") + + def test_homepage_entry(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + + # First URL should be homepage + loc = urls[0].find(_ns("loc")).text + assert loc == "https://example.com/" + + def test_homepage_lastmod_from_newest_post(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + homepage = root.findall(_ns("url"))[0] + lastmod = homepage.find(_ns("lastmod")).text + assert lastmod == "2026-02-15" + + def test_post_index_entry(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] + assert "https://example.com/posts" in locs + + def test_posts_included_with_lastmod(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + + post_urls = {u.find(_ns("loc")).text: u for u in urls} + + post_one = post_urls["https://example.com/posts/post-one"] + assert post_one.find(_ns("lastmod")).text == "2026-02-15" + + post_two = post_urls["https://example.com/posts/post-two"] + assert 
post_two.find(_ns("lastmod")).text == "2026-02-10" + + def test_public_pages_included(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] + assert "https://example.com/about" in locs + + def test_unlisted_pages_excluded(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] + assert "https://example.com/secret" not in locs + + def test_hidden_pages_excluded(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] + assert "https://example.com/hidden-page" not in locs + + def test_empty_content(self, sample_config): + xml_bytes = _build_sitemap(sample_config, [], []) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + # Only homepage and post index + assert len(urls) == 2 + locs = [u.find(_ns("loc")).text for u in urls] + assert "https://example.com/" in locs + assert "https://example.com/posts" in locs + + def test_no_site_url_uses_relative_paths(self, sample_config_no_url, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config_no_url, sample_posts, sample_pages) + root = fromstring(xml_bytes) + locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] + assert "/" in locs + assert "/posts" in locs + assert "/posts/post-one" in locs + + def test_post_without_date_has_no_lastmod(self, sample_config): + post = Post(slug="no-date", title="No Date", html="

<p>Hi</p>

") + xml_bytes = _build_sitemap(sample_config, [post], []) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + post_url = [u for u in urls if u.find(_ns("loc")).text == "https://example.com/posts/no-date"][0] + assert post_url.find(_ns("lastmod")) is None + + def test_no_priority_or_changefreq(self, sample_config, sample_posts, sample_pages): + """Sitemap should not include priority or changefreq elements.""" + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + for url in root.findall(_ns("url")): + assert url.find(_ns("priority")) is None + assert url.find(_ns("changefreq")) is None + + +class TestBuildRobotsTxt: + def test_allows_all_crawlers(self, sample_config): + content = _build_robots_txt(sample_config) + assert "User-agent: *" in content + assert "Allow: /" in content + + def test_disallow_admin_paths(self, sample_config): + content = _build_robots_txt(sample_config) + assert "Disallow: /admin/*" in content + assert "Disallow: /auth/*" in content + assert "Disallow: /health" in content + assert "Disallow: /webhooks/*" in content + + def test_sitemap_directive_with_url(self, sample_config): + content = _build_robots_txt(sample_config) + assert "Sitemap: https://example.com/sitemap.xml" in content + + def test_no_sitemap_without_url(self, sample_config_no_url): + content = _build_robots_txt(sample_config_no_url) + assert "Sitemap" not in content + + def test_no_static_disallow(self, sample_config): + """Static files should not be blocked.""" + content = _build_robots_txt(sample_config) + assert "/static" not in content + + +class TestSitemapEndpoint: + @pytest.mark.asyncio + async def test_returns_xml_content_type(self): + mock_github = AsyncMock() + mock_github.get_config.return_value = {"site": {"title": "Test"}} + mock_github.list_directory.return_value = [] + + with ( + patch("squishmark.routers.seo.get_github_service", return_value=mock_github), + patch("squishmark.routers.seo.get_cache") 
as mock_cache_fn, + ): + mock_cache = AsyncMock() + mock_cache.get.return_value = None + mock_cache_fn.return_value = mock_cache + + from squishmark.routers.seo import sitemap_xml + + response = await sitemap_xml() + + assert "application/xml" in response.media_type + + @pytest.mark.asyncio + async def test_cached_response_returned(self): + cached_xml = b'cached' + + with patch("squishmark.routers.seo.get_cache") as mock_cache_fn: + mock_cache = AsyncMock() + mock_cache.get.return_value = cached_xml + mock_cache_fn.return_value = mock_cache + + from squishmark.routers.seo import sitemap_xml + + response = await sitemap_xml() + + assert response.body == cached_xml + + @pytest.mark.asyncio + async def test_drafts_excluded(self): + mock_github = AsyncMock() + mock_github.get_config.return_value = { + "site": {"title": "Test", "url": "https://example.com"}, + } + mock_github.list_directory.side_effect = [ + # posts directory + ["posts/2026-01-01-published.md", "posts/2026-01-02-draft.md"], + # pages directory + [], + ] + mock_github.get_file.side_effect = [ + MagicMock(content="---\ntitle: Published\ndate: 2026-01-01\n---\nContent."), + MagicMock(content="---\ntitle: Draft\ndate: 2026-01-02\ndraft: true\n---\nDraft."), + ] + + with ( + patch("squishmark.routers.seo.get_github_service", return_value=mock_github), + patch("squishmark.routers.seo.get_cache") as mock_cache_fn, + ): + mock_cache = AsyncMock() + mock_cache.get.return_value = None + mock_cache_fn.return_value = mock_cache + + from squishmark.routers.seo import sitemap_xml + + response = await sitemap_xml() + + root = fromstring(response.body) + locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] + assert "https://example.com/posts/published" in locs + assert "https://example.com/posts/draft" not in locs + + +class TestRobotsEndpoint: + @pytest.mark.asyncio + async def test_returns_text_content_type(self): + mock_github = AsyncMock() + mock_github.get_config.return_value = {"site": {"title": 
"Test"}} + + with ( + patch("squishmark.routers.seo.get_github_service", return_value=mock_github), + patch("squishmark.routers.seo.get_cache") as mock_cache_fn, + ): + mock_cache = AsyncMock() + mock_cache.get.return_value = None + mock_cache_fn.return_value = mock_cache + + from squishmark.routers.seo import robots_txt + + response = await robots_txt() + + assert "text/plain" in response.media_type + + @pytest.mark.asyncio + async def test_cached_response_returned(self): + cached_txt = "User-agent: *\nAllow: /\n" + + with patch("squishmark.routers.seo.get_cache") as mock_cache_fn: + mock_cache = AsyncMock() + mock_cache.get.return_value = cached_txt + mock_cache_fn.return_value = mock_cache + + from squishmark.routers.seo import robots_txt + + response = await robots_txt() + + assert response.body.decode() == cached_txt From 6807bbc535c3652f98d5b3d9661cc9e582b571d6 Mon Sep 17 00:00:00 2001 From: Xeek <6032840+x3ek@users.noreply.github.com> Date: Fri, 3 Apr 2026 09:12:25 -0500 Subject: [PATCH 2/4] fix(seo): add lastmod to post index entry in sitemap Co-Authored-By: Claude Opus 4.6 (1M context) --- src/squishmark/routers/seo.py | 2 ++ tests/test_seo.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/squishmark/routers/seo.py b/src/squishmark/routers/seo.py index cd2e1f8..a0ff7b1 100644 --- a/src/squishmark/routers/seo.py +++ b/src/squishmark/routers/seo.py @@ -33,6 +33,8 @@ def _build_sitemap(config: Config, posts: list[Post], pages: list[Page]) -> byte # Post index url_el = SubElement(urlset, "url") SubElement(url_el, "loc").text = f"{site_url}/posts" + if posts and posts[0].date: + SubElement(url_el, "lastmod").text = posts[0].date.isoformat() # Individual posts for post in posts: diff --git a/tests/test_seo.py b/tests/test_seo.py index ddc950d..a615844 100644 --- a/tests/test_seo.py +++ b/tests/test_seo.py @@ -96,8 +96,9 @@ def test_homepage_lastmod_from_newest_post(self, sample_config, sample_posts, sa def test_post_index_entry(self, 
sample_config, sample_posts, sample_pages): xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) root = fromstring(xml_bytes) - locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] - assert "https://example.com/posts" in locs + urls = root.findall(_ns("url")) + post_index = [u for u in urls if u.find(_ns("loc")).text == "https://example.com/posts"][0] + assert post_index.find(_ns("lastmod")).text == "2026-02-15" def test_posts_included_with_lastmod(self, sample_config, sample_posts, sample_pages): xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) From 1c5d43b8b850d9911ac8a301d150c598e3fb4853 Mon Sep 17 00:00:00 2001 From: Xeek <6032840+x3ek@users.noreply.github.com> Date: Fri, 3 Apr 2026 09:20:14 -0500 Subject: [PATCH 3/4] feat(seo): add date field to Page model and lastmod to page sitemap entries Pages can now have an optional date in frontmatter. When present, it appears as lastmod in the sitemap. Pages without dates simply omit lastmod, which is valid per the sitemap spec. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/squishmark/models/content.py | 1 + src/squishmark/routers/seo.py | 2 ++ src/squishmark/services/markdown.py | 1 + tests/test_seo.py | 17 +++++++++++++---- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/squishmark/models/content.py b/src/squishmark/models/content.py index c54099b..9caf07a 100644 --- a/src/squishmark/models/content.py +++ b/src/squishmark/models/content.py @@ -68,6 +68,7 @@ class Page(BaseModel): slug: str title: str + date: datetime.date | None = None description: str = "" content: str = "" # Raw markdown html: str = "" # Rendered HTML diff --git a/src/squishmark/routers/seo.py b/src/squishmark/routers/seo.py index a0ff7b1..d6dcf6e 100644 --- a/src/squishmark/routers/seo.py +++ b/src/squishmark/routers/seo.py @@ -49,6 +49,8 @@ def _build_sitemap(config: Config, posts: list[Post], pages: list[Page]) -> byte continue url_el = SubElement(urlset, "url") SubElement(url_el, "loc").text = f"{site_url}{page.url}" + if page.date: + SubElement(url_el, "lastmod").text = page.date.isoformat() return b'\n' + tostring(urlset, encoding="unicode").encode("utf-8") diff --git a/src/squishmark/services/markdown.py b/src/squishmark/services/markdown.py index 0bc939c..b1affbb 100644 --- a/src/squishmark/services/markdown.py +++ b/src/squishmark/services/markdown.py @@ -219,6 +219,7 @@ def parse_page(self, path: str, content: str) -> Page: return Page( slug=slug, title=frontmatter.title, + date=frontmatter.date, description=description, content=markdown_content, html=html, diff --git a/tests/test_seo.py b/tests/test_seo.py index a615844..5eaca1c 100644 --- a/tests/test_seo.py +++ b/tests/test_seo.py @@ -63,7 +63,7 @@ def sample_posts() -> list[Post]: @pytest.fixture def sample_pages() -> list[Page]: return [ - Page(slug="about", title="About", visibility="public"), + Page(slug="about", title="About", visibility="public", date=datetime.date(2026, 1, 20)), Page(slug="secret", title="Secret", 
visibility="unlisted"), Page(slug="hidden-page", title="Hidden", visibility="hidden"), ] @@ -113,11 +113,20 @@ def test_posts_included_with_lastmod(self, sample_config, sample_posts, sample_p post_two = post_urls["https://example.com/posts/post-two"] assert post_two.find(_ns("lastmod")).text == "2026-02-10" - def test_public_pages_included(self, sample_config, sample_posts, sample_pages): + def test_public_pages_included_with_lastmod(self, sample_config, sample_posts, sample_pages): xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) root = fromstring(xml_bytes) - locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] - assert "https://example.com/about" in locs + urls = root.findall(_ns("url")) + about = [u for u in urls if u.find(_ns("loc")).text == "https://example.com/about"][0] + assert about.find(_ns("lastmod")).text == "2026-01-20" + + def test_page_without_date_has_no_lastmod(self, sample_config, sample_posts): + page = Page(slug="no-date", title="No Date", visibility="public") + xml_bytes = _build_sitemap(sample_config, sample_posts, [page]) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + page_url = [u for u in urls if u.find(_ns("loc")).text == "https://example.com/no-date"][0] + assert page_url.find(_ns("lastmod")) is None def test_unlisted_pages_excluded(self, sample_config, sample_posts, sample_pages): xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) From 856a865ad942c99b025770f5f138f620eb5b88ce Mon Sep 17 00:00:00 2001 From: Xeek <6032840+x3ek@users.noreply.github.com> Date: Fri, 3 Apr 2026 09:29:36 -0500 Subject: [PATCH 4/4] refactor(seo): extract _add_url helper to reduce duplication in sitemap builder Co-Authored-By: Claude Opus 4.6 (1M context) --- src/squishmark/routers/seo.py | 38 ++++++++++++++--------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/src/squishmark/routers/seo.py b/src/squishmark/routers/seo.py index d6dcf6e..a54e13b 100644 --- 
a/src/squishmark/routers/seo.py +++ b/src/squishmark/routers/seo.py @@ -1,5 +1,6 @@ """SEO routes: sitemap.xml and robots.txt.""" +import datetime from xml.etree.ElementTree import Element, SubElement, tostring from fastapi import APIRouter @@ -18,39 +19,30 @@ ROBOTS_CACHE_KEY = "seo:robots" +def _add_url(urlset: Element, loc: str, lastmod: datetime.date | None = None) -> None: + """Append a <url> entry to the sitemap urlset.""" + url_el = SubElement(urlset, "url") + SubElement(url_el, "loc").text = loc + if lastmod: + SubElement(url_el, "lastmod").text = lastmod.isoformat() + + def _build_sitemap(config: Config, posts: list[Post], pages: list[Page]) -> bytes: """Build a sitemap.xml from config, posts, and pages.""" site_url = config.site.url.rstrip("/") if config.site.url else "" + newest_post_date = posts[0].date if posts else None urlset = Element("urlset", xmlns=SITEMAP_NS) - # Homepage - url_el = SubElement(urlset, "url") - SubElement(url_el, "loc").text = f"{site_url}/" - if posts and posts[0].date: - SubElement(url_el, "lastmod").text = posts[0].date.isoformat() - - # Post index - url_el = SubElement(urlset, "url") - SubElement(url_el, "loc").text = f"{site_url}/posts" - if posts and posts[0].date: - SubElement(url_el, "lastmod").text = posts[0].date.isoformat() + _add_url(urlset, f"{site_url}/", newest_post_date) + _add_url(urlset, f"{site_url}/posts", newest_post_date) - # Individual posts for post in posts: - url_el = SubElement(urlset, "url") - SubElement(url_el, "loc").text = f"{site_url}{post.url}" - if post.date: - SubElement(url_el, "lastmod").text = post.date.isoformat() + _add_url(urlset, f"{site_url}{post.url}", post.date) - # Public pages only (not unlisted or hidden) for page in pages: - if page.visibility != "public": - continue - url_el = SubElement(urlset, "url") - SubElement(url_el, "loc").text = f"{site_url}{page.url}" - if page.date: - SubElement(url_el, "lastmod").text = page.date.isoformat() + if page.visibility == "public": + _add_url(urlset, 
f"{site_url}{page.url}", page.date) return b'\n' + tostring(urlset, encoding="unicode").encode("utf-8")