diff --git a/src/squishmark/main.py b/src/squishmark/main.py index 2fb1edf..b757d56 100644 --- a/src/squishmark/main.py +++ b/src/squishmark/main.py @@ -14,7 +14,7 @@ from squishmark.config import get_settings from squishmark.models.content import Config from squishmark.models.db import close_db, get_db_session, init_db -from squishmark.routers import admin, auth, feed, pages, posts, webhooks +from squishmark.routers import admin, auth, feed, pages, posts, seo, webhooks from squishmark.services.analytics import AnalyticsService from squishmark.services.github import get_github_service, shutdown_github_service from squishmark.services.markdown import get_markdown_service @@ -296,6 +296,7 @@ async def livereload_ws(websocket: WebSocket) -> None: app.include_router(admin.router) app.include_router(webhooks.router) app.include_router(feed.router) + app.include_router(seo.router) app.include_router(posts.router) app.include_router(pages.router) # Catch-all for static pages, must be last diff --git a/src/squishmark/models/content.py b/src/squishmark/models/content.py index c54099b..9caf07a 100644 --- a/src/squishmark/models/content.py +++ b/src/squishmark/models/content.py @@ -68,6 +68,7 @@ class Page(BaseModel): slug: str title: str + date: datetime.date | None = None description: str = "" content: str = "" # Raw markdown html: str = "" # Rendered HTML diff --git a/src/squishmark/routers/feed.py b/src/squishmark/routers/feed.py index 79d7792..1f28c7e 100644 --- a/src/squishmark/routers/feed.py +++ b/src/squishmark/routers/feed.py @@ -8,6 +8,7 @@ from squishmark.models.content import Config, Post from squishmark.services.cache import get_cache +from squishmark.services.content import get_all_posts from squishmark.services.github import get_github_service from squishmark.services.markdown import get_markdown_service @@ -91,24 +92,8 @@ async def atom_feed() -> Response: config = Config.from_dict(config_data) markdown_service = get_markdown_service(config) - # Fetch all 
published posts - post_files = await github_service.list_directory("posts") - posts: list[Post] = [] - for path in post_files: - if not path.endswith(".md"): - continue - file = await github_service.get_file(path) - if file is None: - continue - post = markdown_service.parse_post(path, file.content) - if not post.draft: - posts.append(post) - - # Newest first - posts.sort(key=lambda p: (p.date is not None, p.date), reverse=True) - - # Limit to 20 most recent - posts = posts[:20] + posts = await get_all_posts(github_service, markdown_service) + posts = posts[:20] # Limit to 20 most recent xml_bytes = _build_atom_feed(config, posts) await cache.set(FEED_CACHE_KEY, xml_bytes) diff --git a/src/squishmark/routers/seo.py b/src/squishmark/routers/seo.py new file mode 100644 index 0000000..a54e13b --- /dev/null +++ b/src/squishmark/routers/seo.py @@ -0,0 +1,108 @@ +"""SEO routes: sitemap.xml and robots.txt.""" + +import datetime +from xml.etree.ElementTree import Element, SubElement, tostring + +from fastapi import APIRouter +from fastapi.responses import Response + +from squishmark.models.content import Config, Page, Post +from squishmark.services.cache import get_cache +from squishmark.services.content import get_all_pages, get_all_posts +from squishmark.services.github import get_github_service +from squishmark.services.markdown import get_markdown_service + +router = APIRouter(tags=["seo"]) + +SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9" +SITEMAP_CACHE_KEY = "seo:sitemap" +ROBOTS_CACHE_KEY = "seo:robots" + + +def _add_url(urlset: Element, loc: str, lastmod: datetime.date | None = None) -> None: + """Append a entry to the sitemap urlset.""" + url_el = SubElement(urlset, "url") + SubElement(url_el, "loc").text = loc + if lastmod: + SubElement(url_el, "lastmod").text = lastmod.isoformat() + + +def _build_sitemap(config: Config, posts: list[Post], pages: list[Page]) -> bytes: + """Build a sitemap.xml from config, posts, and pages.""" + site_url = 
config.site.url.rstrip("/") if config.site.url else "" + newest_post_date = posts[0].date if posts else None + + urlset = Element("urlset", xmlns=SITEMAP_NS) + + _add_url(urlset, f"{site_url}/", newest_post_date) + _add_url(urlset, f"{site_url}/posts", newest_post_date) + + for post in posts: + _add_url(urlset, f"{site_url}{post.url}", post.date) + + for page in pages: + if page.visibility == "public": + _add_url(urlset, f"{site_url}{page.url}", page.date) + + return b'\n' + tostring(urlset, encoding="unicode").encode("utf-8") + + +def _build_robots_txt(config: Config) -> str: + """Build robots.txt content.""" + site_url = config.site.url.rstrip("/") if config.site.url else "" + + lines = [ + "User-agent: *", + "Allow: /", + "", + "Disallow: /admin/*", + "Disallow: /auth/*", + "Disallow: /health", + "Disallow: /webhooks/*", + ] + + if site_url: + lines.append("") + lines.append(f"Sitemap: {site_url}/sitemap.xml") + + return "\n".join(lines) + "\n" + + +@router.get("/sitemap.xml") +async def sitemap_xml() -> Response: + """Serve the XML sitemap.""" + cache = get_cache() + + cached = await cache.get(SITEMAP_CACHE_KEY) + if cached is not None: + return Response(content=cached, media_type="application/xml; charset=utf-8") + + github_service = get_github_service() + config_data = await github_service.get_config() + config = Config.from_dict(config_data) + markdown_service = get_markdown_service(config) + + posts = await get_all_posts(github_service, markdown_service) + pages = await get_all_pages(github_service, markdown_service) + + xml_bytes = _build_sitemap(config, posts, pages) + await cache.set(SITEMAP_CACHE_KEY, xml_bytes) + return Response(content=xml_bytes, media_type="application/xml; charset=utf-8") + + +@router.get("/robots.txt") +async def robots_txt() -> Response: + """Serve robots.txt.""" + cache = get_cache() + + cached = await cache.get(ROBOTS_CACHE_KEY) + if cached is not None: + return Response(content=cached, media_type="text/plain; charset=utf-8") + 
+ github_service = get_github_service() + config_data = await github_service.get_config() + config = Config.from_dict(config_data) + + content = _build_robots_txt(config) + await cache.set(ROBOTS_CACHE_KEY, content) + return Response(content=content, media_type="text/plain; charset=utf-8") diff --git a/src/squishmark/services/content.py b/src/squishmark/services/content.py index c0bf9f2..8264509 100644 --- a/src/squishmark/services/content.py +++ b/src/squishmark/services/content.py @@ -1,6 +1,6 @@ -"""Shared content helpers for fetching and filtering posts.""" +"""Shared content helpers for fetching and filtering posts and pages.""" -from squishmark.models.content import Post, SiteConfig +from squishmark.models.content import Page, Post, SiteConfig from squishmark.services.github import GitHubService from squishmark.services.markdown import MarkdownService @@ -51,3 +51,31 @@ def get_featured_posts(posts: list[Post], site_config: SiteConfig) -> list[Post] ), ) return featured[: site_config.featured_max] + + +async def get_all_pages( + github_service: GitHubService, + markdown_service: MarkdownService, + include_hidden: bool = False, +) -> list[Page]: + """Fetch and parse all pages from the content repository.""" + page_files = await github_service.list_directory("pages") + + pages: list[Page] = [] + for path in page_files: + if not path.endswith(".md"): + continue + + file = await github_service.get_file(path) + if file is None: + continue + + page = markdown_service.parse_page(path, file.content) + + # Skip hidden pages unless requested + if page.visibility == "hidden" and not include_hidden: + continue + + pages.append(page) + + return pages diff --git a/src/squishmark/services/markdown.py b/src/squishmark/services/markdown.py index 0bc939c..b1affbb 100644 --- a/src/squishmark/services/markdown.py +++ b/src/squishmark/services/markdown.py @@ -219,6 +219,7 @@ def parse_page(self, path: str, content: str) -> Page: return Page( slug=slug, title=frontmatter.title, + 
date=frontmatter.date, description=description, content=markdown_content, html=html, diff --git a/tests/test_seo.py b/tests/test_seo.py new file mode 100644 index 0000000..5eaca1c --- /dev/null +++ b/tests/test_seo.py @@ -0,0 +1,309 @@ +"""Tests for SEO routes: sitemap.xml and robots.txt.""" + +import datetime +from unittest.mock import AsyncMock, MagicMock, patch +from xml.etree.ElementTree import fromstring + +import pytest + +from squishmark.models.content import Config, Page, Post +from squishmark.routers.seo import _build_robots_txt, _build_sitemap + +SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9" + + +def _ns(tag: str) -> str: + """Prefix a tag with the sitemap namespace.""" + return f"{{{SITEMAP_NS}}}{tag}" + + +@pytest.fixture +def sample_config() -> Config: + return Config.from_dict( + { + "site": { + "title": "Test Blog", + "description": "A test blog", + "author": "Test Author", + "url": "https://example.com", + }, + } + ) + + +@pytest.fixture +def sample_config_no_url() -> Config: + return Config.from_dict( + { + "site": { + "title": "Test Blog", + }, + } + ) + + +@pytest.fixture +def sample_posts() -> list[Post]: + return [ + Post( + slug="post-one", + title="Post One", + date=datetime.date(2026, 2, 15), + html="
<p>Content one</p>
", + ), + Post( + slug="post-two", + title="Post Two", + date=datetime.date(2026, 2, 10), + html="
<p>Content two</p>
", + ), + ] + + +@pytest.fixture +def sample_pages() -> list[Page]: + return [ + Page(slug="about", title="About", visibility="public", date=datetime.date(2026, 1, 20)), + Page(slug="secret", title="Secret", visibility="unlisted"), + Page(slug="hidden-page", title="Hidden", visibility="hidden"), + ] + + +class TestBuildSitemap: + def test_valid_xml_structure(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + + assert xml_bytes.startswith(b'') + root = fromstring(xml_bytes) + assert root.tag == _ns("urlset") + + def test_homepage_entry(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + + # First URL should be homepage + loc = urls[0].find(_ns("loc")).text + assert loc == "https://example.com/" + + def test_homepage_lastmod_from_newest_post(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + homepage = root.findall(_ns("url"))[0] + lastmod = homepage.find(_ns("lastmod")).text + assert lastmod == "2026-02-15" + + def test_post_index_entry(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + post_index = [u for u in urls if u.find(_ns("loc")).text == "https://example.com/posts"][0] + assert post_index.find(_ns("lastmod")).text == "2026-02-15" + + def test_posts_included_with_lastmod(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + + post_urls = {u.find(_ns("loc")).text: u for u in urls} + + post_one = post_urls["https://example.com/posts/post-one"] + assert 
post_one.find(_ns("lastmod")).text == "2026-02-15" + + post_two = post_urls["https://example.com/posts/post-two"] + assert post_two.find(_ns("lastmod")).text == "2026-02-10" + + def test_public_pages_included_with_lastmod(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + about = [u for u in urls if u.find(_ns("loc")).text == "https://example.com/about"][0] + assert about.find(_ns("lastmod")).text == "2026-01-20" + + def test_page_without_date_has_no_lastmod(self, sample_config, sample_posts): + page = Page(slug="no-date", title="No Date", visibility="public") + xml_bytes = _build_sitemap(sample_config, sample_posts, [page]) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + page_url = [u for u in urls if u.find(_ns("loc")).text == "https://example.com/no-date"][0] + assert page_url.find(_ns("lastmod")) is None + + def test_unlisted_pages_excluded(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] + assert "https://example.com/secret" not in locs + + def test_hidden_pages_excluded(self, sample_config, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] + assert "https://example.com/hidden-page" not in locs + + def test_empty_content(self, sample_config): + xml_bytes = _build_sitemap(sample_config, [], []) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + # Only homepage and post index + assert len(urls) == 2 + locs = [u.find(_ns("loc")).text for u in urls] + assert "https://example.com/" in locs + assert "https://example.com/posts" in locs + + def 
test_no_site_url_uses_relative_paths(self, sample_config_no_url, sample_posts, sample_pages): + xml_bytes = _build_sitemap(sample_config_no_url, sample_posts, sample_pages) + root = fromstring(xml_bytes) + locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] + assert "/" in locs + assert "/posts" in locs + assert "/posts/post-one" in locs + + def test_post_without_date_has_no_lastmod(self, sample_config): + post = Post(slug="no-date", title="No Date", html="
<p>Hi</p>
") + xml_bytes = _build_sitemap(sample_config, [post], []) + root = fromstring(xml_bytes) + urls = root.findall(_ns("url")) + post_url = [u for u in urls if u.find(_ns("loc")).text == "https://example.com/posts/no-date"][0] + assert post_url.find(_ns("lastmod")) is None + + def test_no_priority_or_changefreq(self, sample_config, sample_posts, sample_pages): + """Sitemap should not include priority or changefreq elements.""" + xml_bytes = _build_sitemap(sample_config, sample_posts, sample_pages) + root = fromstring(xml_bytes) + for url in root.findall(_ns("url")): + assert url.find(_ns("priority")) is None + assert url.find(_ns("changefreq")) is None + + +class TestBuildRobotsTxt: + def test_allows_all_crawlers(self, sample_config): + content = _build_robots_txt(sample_config) + assert "User-agent: *" in content + assert "Allow: /" in content + + def test_disallow_admin_paths(self, sample_config): + content = _build_robots_txt(sample_config) + assert "Disallow: /admin/*" in content + assert "Disallow: /auth/*" in content + assert "Disallow: /health" in content + assert "Disallow: /webhooks/*" in content + + def test_sitemap_directive_with_url(self, sample_config): + content = _build_robots_txt(sample_config) + assert "Sitemap: https://example.com/sitemap.xml" in content + + def test_no_sitemap_without_url(self, sample_config_no_url): + content = _build_robots_txt(sample_config_no_url) + assert "Sitemap" not in content + + def test_no_static_disallow(self, sample_config): + """Static files should not be blocked.""" + content = _build_robots_txt(sample_config) + assert "/static" not in content + + +class TestSitemapEndpoint: + @pytest.mark.asyncio + async def test_returns_xml_content_type(self): + mock_github = AsyncMock() + mock_github.get_config.return_value = {"site": {"title": "Test"}} + mock_github.list_directory.return_value = [] + + with ( + patch("squishmark.routers.seo.get_github_service", return_value=mock_github), + patch("squishmark.routers.seo.get_cache") 
as mock_cache_fn, + ): + mock_cache = AsyncMock() + mock_cache.get.return_value = None + mock_cache_fn.return_value = mock_cache + + from squishmark.routers.seo import sitemap_xml + + response = await sitemap_xml() + + assert "application/xml" in response.media_type + + @pytest.mark.asyncio + async def test_cached_response_returned(self): + cached_xml = b'cached' + + with patch("squishmark.routers.seo.get_cache") as mock_cache_fn: + mock_cache = AsyncMock() + mock_cache.get.return_value = cached_xml + mock_cache_fn.return_value = mock_cache + + from squishmark.routers.seo import sitemap_xml + + response = await sitemap_xml() + + assert response.body == cached_xml + + @pytest.mark.asyncio + async def test_drafts_excluded(self): + mock_github = AsyncMock() + mock_github.get_config.return_value = { + "site": {"title": "Test", "url": "https://example.com"}, + } + mock_github.list_directory.side_effect = [ + # posts directory + ["posts/2026-01-01-published.md", "posts/2026-01-02-draft.md"], + # pages directory + [], + ] + mock_github.get_file.side_effect = [ + MagicMock(content="---\ntitle: Published\ndate: 2026-01-01\n---\nContent."), + MagicMock(content="---\ntitle: Draft\ndate: 2026-01-02\ndraft: true\n---\nDraft."), + ] + + with ( + patch("squishmark.routers.seo.get_github_service", return_value=mock_github), + patch("squishmark.routers.seo.get_cache") as mock_cache_fn, + ): + mock_cache = AsyncMock() + mock_cache.get.return_value = None + mock_cache_fn.return_value = mock_cache + + from squishmark.routers.seo import sitemap_xml + + response = await sitemap_xml() + + root = fromstring(response.body) + locs = [u.find(_ns("loc")).text for u in root.findall(_ns("url"))] + assert "https://example.com/posts/published" in locs + assert "https://example.com/posts/draft" not in locs + + +class TestRobotsEndpoint: + @pytest.mark.asyncio + async def test_returns_text_content_type(self): + mock_github = AsyncMock() + mock_github.get_config.return_value = {"site": {"title": 
"Test"}} + + with ( + patch("squishmark.routers.seo.get_github_service", return_value=mock_github), + patch("squishmark.routers.seo.get_cache") as mock_cache_fn, + ): + mock_cache = AsyncMock() + mock_cache.get.return_value = None + mock_cache_fn.return_value = mock_cache + + from squishmark.routers.seo import robots_txt + + response = await robots_txt() + + assert "text/plain" in response.media_type + + @pytest.mark.asyncio + async def test_cached_response_returned(self): + cached_txt = "User-agent: *\nAllow: /\n" + + with patch("squishmark.routers.seo.get_cache") as mock_cache_fn: + mock_cache = AsyncMock() + mock_cache.get.return_value = cached_txt + mock_cache_fn.return_value = mock_cache + + from squishmark.routers.seo import robots_txt + + response = await robots_txt() + + assert response.body.decode() == cached_txt