diff --git a/.changeset/per-collection-sitemaps.md b/.changeset/per-collection-sitemaps.md new file mode 100644 index 000000000..262d09790 --- /dev/null +++ b/.changeset/per-collection-sitemaps.md @@ -0,0 +1,7 @@ +--- +"emdash": minor +--- + +Per-collection sitemaps with sitemap index and lastmod + +`/sitemap.xml` now serves a `<sitemapindex>` with one child sitemap per SEO-enabled collection. Each collection's sitemap is at `/sitemap-{collection}.xml` with `<lastmod>` on both index entries and individual URLs. Uses the collection's `url_pattern` for correct URL building. diff --git a/packages/core/src/api/handlers/index.ts b/packages/core/src/api/handlers/index.ts index 292fbb35a..1c317fcb5 100644 --- a/packages/core/src/api/handlers/index.ts +++ b/packages/core/src/api/handlers/index.ts @@ -84,7 +84,12 @@ export { } from "./schema.js"; // SEO handlers -export { handleSitemapData, type SitemapContentEntry, type SitemapDataResponse } from "./seo.js"; +export { + handleSitemapData, + type SitemapCollectionData, + type SitemapContentEntry, + type SitemapDataResponse, +} from "./seo.js"; // Plugin handlers export { diff --git a/packages/core/src/api/handlers/seo.ts b/packages/core/src/api/handlers/seo.ts index 4c4840571..82df1e6a2 100644 --- a/packages/core/src/api/handlers/seo.ts +++ b/packages/core/src/api/handlers/seo.ts @@ -12,48 +12,65 @@ import type { ApiResult } from "../types.js"; /** Raw content data for sitemap generation — the route builds the actual URLs */ export interface SitemapContentEntry { - /** Collection slug (e.g., "post", "page") */ - collection: string; - /** Content slug or ID */ - identifier: string; + /** Content ID (ULID) */ + id: string; + /** Content slug, or null when the entry has no slug */ + slug: string | null; /** ISO date of last modification */ updatedAt: string; } -export interface SitemapDataResponse { +/** Per-collection sitemap data with entries and URL pattern */ +export interface SitemapCollectionData { + /** Collection slug (e.g., "post", "page") 
*/ + collection: string; + /** URL pattern with {slug} placeholder, or null for default /{collection}/{slug} */ + urlPattern: string | null; + /** Most recent updated_at across all entries (for sitemap index lastmod) */ + lastmod: string; + /** Individual content entries */ entries: SitemapContentEntry[]; } +export interface SitemapDataResponse { + collections: SitemapCollectionData[]; +} + /** Maximum entries per sitemap (per spec) */ const SITEMAP_MAX_ENTRIES = 50_000; /** * Collect all published, indexable content across SEO-enabled collections - * for sitemap generation. + * for sitemap generation, grouped by collection. * * Only includes content from collections with `has_seo = 1`. * Excludes content with `seo_no_index = 1` in the `_emdash_seo` table. * - * Returns raw data (collection + identifier + date). The caller (route) - * is responsible for building absolute URLs — this handler does NOT + * Returns raw data grouped per collection. The caller (route) is + * responsible for building absolute URLs — this handler does NOT * assume a URL structure. */ export async function handleSitemapData( db: Kysely, + /** When set, only return data for this collection. */ + collectionSlug?: string, ): Promise> { try { - // Find all SEO-enabled collections - const collections = await db + // Find SEO-enabled collections (optionally filtered) + let query = db .selectFrom("_emdash_collections") - .select(["slug"]) - .where("has_seo", "=", 1) - .execute(); + .select(["slug", "url_pattern"]) + .where("has_seo", "=", 1); + + if (collectionSlug) { + query = query.where("slug", "=", collectionSlug); + } - const entries: SitemapContentEntry[] = []; + const collections = await query.execute(); - for (const col of collections) { - if (entries.length >= SITEMAP_MAX_ENTRIES) break; + const result: SitemapCollectionData[] = []; + for (const col of collections) { // Validate the slug before using it as a table name identifier. 
// Should always pass (slugs are validated on creation), but // guards against corrupted DB data. @@ -65,7 +82,6 @@ export async function handleSitemapData( } const tableName = `ec_${col.slug}`; - const remaining = SITEMAP_MAX_ENTRIES - entries.length; // Query published, non-deleted content. // LEFT JOIN _emdash_seo to check noindex flag. @@ -87,16 +103,27 @@ export async function handleSitemapData( AND c.deleted_at IS NULL AND (s.seo_no_index IS NULL OR s.seo_no_index = 0) ORDER BY c.updated_at DESC - LIMIT ${remaining} + LIMIT ${SITEMAP_MAX_ENTRIES} `.execute(db); + if (rows.rows.length === 0) continue; + + const entries: SitemapContentEntry[] = []; for (const row of rows.rows) { entries.push({ - collection: col.slug, - identifier: row.slug || row.id, + id: row.id, + slug: row.slug, updatedAt: row.updated_at, }); } + + result.push({ + collection: col.slug, + urlPattern: col.url_pattern, + // Rows are ordered by updated_at DESC, so first row is the latest + lastmod: rows.rows[0].updated_at, + entries, + }); } catch (err) { // Table missing or query error — skip this collection console.warn(`[SITEMAP] Failed to query collection "${col.slug}":`, err); @@ -104,7 +131,7 @@ export async function handleSitemapData( } } - return { success: true, data: { entries } }; + return { success: true, data: { collections: result } }; } catch (error) { console.error("[SITEMAP_ERROR]", error); return { diff --git a/packages/core/src/astro/integration/routes.ts b/packages/core/src/astro/integration/routes.ts index a3eb49d53..ac4bd1c86 100644 --- a/packages/core/src/astro/integration/routes.ts +++ b/packages/core/src/astro/integration/routes.ts @@ -664,6 +664,11 @@ export function injectCoreRoutes(injectRoute: InjectRoute): void { entrypoint: resolveRoute("sitemap.xml.ts"), }); + injectRoute({ + pattern: "/sitemap-[collection].xml", + entrypoint: resolveRoute("sitemap-[collection].xml.ts"), + }); + injectRoute({ pattern: "/robots.txt", entrypoint: resolveRoute("robots.txt.ts"), diff 
--git a/packages/core/src/astro/middleware.ts b/packages/core/src/astro/middleware.ts index 67475220d..fd1b41501 100644 --- a/packages/core/src/astro/middleware.ts +++ b/packages/core/src/astro/middleware.ts @@ -177,6 +177,7 @@ function setBaselineSecurityHeaders(response: Response): void { /** Public routes that require the runtime (sitemap, robots.txt, etc.) */ const PUBLIC_RUNTIME_ROUTES = new Set(["/sitemap.xml", "/robots.txt"]); +const SITEMAP_COLLECTION_RE = /^\/sitemap-[a-z][a-z0-9_]*\.xml$/; export const onRequest = defineMiddleware(async (context, next) => { const { request, locals, cookies } = context; @@ -185,7 +186,8 @@ export const onRequest = defineMiddleware(async (context, next) => { // Process /_emdash routes and public routes with an active session // (logged-in editors need the runtime for toolbar/visual editing on public pages) const isEmDashRoute = url.pathname.startsWith("/_emdash"); - const isPublicRuntimeRoute = PUBLIC_RUNTIME_ROUTES.has(url.pathname); + const isPublicRuntimeRoute = + PUBLIC_RUNTIME_ROUTES.has(url.pathname) || SITEMAP_COLLECTION_RE.test(url.pathname); // Check for edit mode cookie - editors viewing public pages need the runtime // so auth middleware can verify their session for visual editing diff --git a/packages/core/src/astro/routes/sitemap-[collection].xml.ts b/packages/core/src/astro/routes/sitemap-[collection].xml.ts new file mode 100644 index 000000000..4c8505488 --- /dev/null +++ b/packages/core/src/astro/routes/sitemap-[collection].xml.ts @@ -0,0 +1,104 @@ +/** + * Per-collection sitemap endpoint + * + * GET /sitemap-{collection}.xml - Sitemap for a single content collection. + * + * Uses the collection's url_pattern to build URLs. Falls back to + * /{collection}/{slug} when no pattern is configured. 
+ */ + +import type { APIRoute } from "astro"; + +import { handleSitemapData } from "#api/handlers/seo.js"; +import { getSiteSettingsWithDb } from "#settings/index.js"; + +export const prerender = false; + +const TRAILING_SLASH_RE = /\/$/; +const AMP_RE = /&/g; +const LT_RE = /</g; +const GT_RE = />/g; +const QUOT_RE = /"/g; +const APOS_RE = /'/g; +const SLUG_PLACEHOLDER = "{slug}"; +const ID_PLACEHOLDER = "{id}"; + +export const GET: APIRoute = async ({ params, locals, url }) => { + const { emdash } = locals; + const collectionSlug = params.collection; + + if (!emdash?.db || !collectionSlug) { + return new Response("", { + status: 500, + headers: { "Content-Type": "application/xml" }, + }); + } + + try { + const settings = await getSiteSettingsWithDb(emdash.db); + const siteUrl = (settings.url || url.origin).replace(TRAILING_SLASH_RE, ""); + + const result = await handleSitemapData(emdash.db, collectionSlug); + + if (!result.success || !result.data) { + return new Response("", { + status: 500, + headers: { "Content-Type": "application/xml" }, + }); + } + + const col = result.data.collections[0]; + if (!col) { + return new Response("", { + status: 404, + headers: { "Content-Type": "application/xml" }, + }); + } + + const lines: string[] = [ + '<?xml version="1.0" encoding="UTF-8"?>', + '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">', + ]; + + for (const entry of col.entries) { + const slug = entry.slug || entry.id; + const path = col.urlPattern + ? 
col.urlPattern + .replace(SLUG_PLACEHOLDER, encodeURIComponent(slug)) + .replace(ID_PLACEHOLDER, encodeURIComponent(entry.id)) + : `/${encodeURIComponent(col.collection)}/${encodeURIComponent(slug)}`; + + const loc = `${siteUrl}${path}`; + + lines.push("  <url>"); + lines.push(`    <loc>${escapeXml(loc)}</loc>`); + lines.push(`    <lastmod>${escapeXml(entry.updatedAt)}</lastmod>`); + lines.push("  </url>"); + } + + lines.push("</urlset>"); + + return new Response(lines.join("\n"), { + status: 200, + headers: { + "Content-Type": "application/xml; charset=utf-8", + "Cache-Control": "public, max-age=3600", + }, + }); + } catch { + return new Response("", { + status: 500, + headers: { "Content-Type": "application/xml" }, + }); + } +}; + +/** Escape the five XML special characters (& < > " ') as entity references; & is replaced first so the other entities are not double-escaped */ +function escapeXml(str: string): string { + return str + .replace(AMP_RE, "&amp;") + .replace(LT_RE, "&lt;") + .replace(GT_RE, "&gt;") + .replace(QUOT_RE, "&quot;") + .replace(APOS_RE, "&apos;"); +} diff --git a/packages/core/src/astro/routes/sitemap.xml.ts b/packages/core/src/astro/routes/sitemap.xml.ts index 9fb853515..8a641fdef 100644 --- a/packages/core/src/astro/routes/sitemap.xml.ts +++ b/packages/core/src/astro/routes/sitemap.xml.ts @@ -1,13 +1,11 @@ /** - * Sitemap XML endpoint + * Sitemap index endpoint * - * GET /sitemap.xml - Auto-generated sitemap from published content + * GET /sitemap.xml - Sitemap index listing one sitemap per collection. * - * Includes all published, non-noindex content across all collections. - * The site URL is read from site settings or the request URL origin. - * - * Default URL pattern: /{collection}/{slug-or-id}. Users can override - * by creating their own /sitemap.xml route in their Astro project. + * Each collection with published, indexable content gets its own + * child sitemap at /sitemap-{collection}.xml. The index includes + * a <lastmod> per child derived from the most recently updated entry. 
*/ import type { APIRoute } from "astro"; @@ -35,7 +33,6 @@ export const GET: APIRoute = async ({ locals, url }) => { } try { - // Determine site URL from settings or request origin const settings = await getSiteSettingsWithDb(emdash.db); const siteUrl = (settings.url || url.origin).replace(TRAILING_SLASH_RE, ""); @@ -48,28 +45,22 @@ export const GET: APIRoute = async ({ locals, url }) => { }); } - const entries = result.data.entries; + const { collections } = result.data; - // Build XML const lines: string[] = [ '<?xml version="1.0" encoding="UTF-8"?>', - '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">', + '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">', ]; - for (const entry of entries) { - // Default URL pattern: /{collection}/{identifier} - // Encode path segments to handle slugs with spaces/unicode/reserved chars - const loc = `${siteUrl}/${encodeURIComponent(entry.collection)}/${encodeURIComponent(entry.identifier)}`; - - lines.push("  <url>"); + for (const col of collections) { + const loc = `${siteUrl}/sitemap-${encodeURIComponent(col.collection)}.xml`; + lines.push("  <sitemap>"); lines.push(`    <loc>${escapeXml(loc)}</loc>`); - lines.push(`    <lastmod>${escapeXml(entry.updatedAt)}</lastmod>`); - lines.push("    <changefreq>weekly</changefreq>"); - lines.push("    <priority>0.7</priority>"); - lines.push("  </url>"); + lines.push(`    <lastmod>${escapeXml(col.lastmod)}</lastmod>`); + lines.push("  </sitemap>"); } - lines.push("</urlset>"); + lines.push("</sitemapindex>"); return new Response(lines.join("\n"), { status: 200, diff --git a/packages/core/tests/integration/seo/seo.test.ts b/packages/core/tests/integration/seo/seo.test.ts index 67e51c15b..cf5289456 100644 --- a/packages/core/tests/integration/seo/seo.test.ts +++ b/packages/core/tests/integration/seo/seo.test.ts @@ -801,6 +801,18 @@ describe("SEO", () => { }); describe("handleSitemapData", () => { + /** Flatten the per-collection response into a flat list with collection tag. 
*/ + function flatEntries(data: { + collections: Array<{ + collection: string; + entries: Array<{ id: string; slug: string | null; updatedAt: string }>; + }>; + }) { + return data.collections.flatMap((c) => + c.entries.map((e) => ({ collection: c.collection, ...e })), + ); + } + it("should return published content from SEO-enabled collections", async () => { await repo.create({ type: "post", @@ -819,9 +831,10 @@ describe("SEO", () => { const result = await handleSitemapData(db); expect(result.success).toBe(true); - expect(result.data!.entries).toHaveLength(1); - expect(result.data!.entries[0]!.collection).toBe("post"); - expect(result.data!.entries[0]!.identifier).toBe("published-post"); + const entries = flatEntries(result.data!); + expect(entries).toHaveLength(1); + expect(entries[0]!.collection).toBe("post"); + expect(entries[0]!.slug).toBe("published-post"); }); it("should exclude noindex content from sitemap", async () => { @@ -845,8 +858,9 @@ describe("SEO", () => { const result = await handleSitemapData(db); expect(result.success).toBe(true); - expect(result.data!.entries).toHaveLength(1); - expect(result.data!.entries[0]!.identifier).toBe("visible-post"); + const entries = flatEntries(result.data!); + expect(entries).toHaveLength(1); + expect(entries[0]!.slug).toBe("visible-post"); }); it("should exclude deleted content from sitemap", async () => { @@ -862,7 +876,8 @@ describe("SEO", () => { const result = await handleSitemapData(db); expect(result.success).toBe(true); - expect(result.data!.entries).toHaveLength(0); + const entries = flatEntries(result.data!); + expect(entries).toHaveLength(0); }); it("should include content from multiple SEO-enabled collections", async () => { @@ -883,11 +898,12 @@ describe("SEO", () => { const result = await handleSitemapData(db); expect(result.success).toBe(true); - expect(result.data!.entries).toHaveLength(2); + expect(result.data!.collections).toHaveLength(2); - const identifiers = result.data!.entries.map((e) => 
`${e.collection}/${e.identifier}`); - expect(identifiers).toContain("post/my-post"); - expect(identifiers).toContain("page/about"); + const entries = flatEntries(result.data!); + const slugs = entries.map((e) => `${e.collection}/${e.slug}`); + expect(slugs).toContain("post/my-post"); + expect(slugs).toContain("page/about"); }); it("should exclude content from non-SEO collections", async () => { @@ -920,11 +936,11 @@ describe("SEO", () => { const result = await handleSitemapData(db); expect(result.success).toBe(true); - expect(result.data!.entries).toHaveLength(1); - expect(result.data!.entries[0]!.collection).toBe("post"); + expect(result.data!.collections).toHaveLength(1); + expect(result.data!.collections[0]!.collection).toBe("post"); }); - it("should use ID when slug is null", async () => { + it("should return null slug and valid id when slug is null", async () => { const created = await repo.create({ type: "post", data: { title: "No Slug Post" }, @@ -934,11 +950,13 @@ describe("SEO", () => { const result = await handleSitemapData(db); expect(result.success).toBe(true); - expect(result.data!.entries[0]!.collection).toBe("post"); - expect(result.data!.entries[0]!.identifier).toBe(created.id); + const entries = flatEntries(result.data!); + expect(entries[0]!.collection).toBe("post"); + expect(entries[0]!.slug).toBeNull(); + expect(entries[0]!.id).toBe(created.id); }); - it("should include updatedAt from updated_at", async () => { + it("should include updatedAt and lastmod", async () => { await repo.create({ type: "post", slug: "test", @@ -948,12 +966,55 @@ describe("SEO", () => { const result = await handleSitemapData(db); - expect(result.data!.entries[0]!.updatedAt).toBeDefined(); - // Should be a valid date string - expect(new Date(result.data!.entries[0]!.updatedAt).getTime()).not.toBeNaN(); + const col = result.data!.collections[0]!; + expect(col.lastmod).toBeDefined(); + expect(new Date(col.lastmod).getTime()).not.toBeNaN(); + 
expect(col.entries[0]!.updatedAt).toBeDefined(); + expect(new Date(col.entries[0]!.updatedAt).getTime()).not.toBeNaN(); + }); + + it("should include urlPattern from collection", async () => { + await db + .updateTable("_emdash_collections") + .set({ url_pattern: "/blog/{slug}" }) + .where("slug", "=", "post") + .execute(); + + await repo.create({ + type: "post", + slug: "test", + data: { title: "Test" }, + status: "published", + }); + + const result = await handleSitemapData(db); + + expect(result.data!.collections[0]!.urlPattern).toBe("/blog/{slug}"); + }); + + it("should filter by collection when collectionSlug is provided", async () => { + await repo.create({ + type: "post", + slug: "my-post", + data: { title: "A Post" }, + status: "published", + }); + + await repo.create({ + type: "page", + slug: "about", + data: { title: "About Us" }, + status: "published", + }); + + const result = await handleSitemapData(db, "post"); + + expect(result.success).toBe(true); + expect(result.data!.collections).toHaveLength(1); + expect(result.data!.collections[0]!.collection).toBe("post"); }); - it("should return empty entries when no SEO-enabled collections exist", async () => { + it("should return empty collections when no SEO-enabled collections exist", async () => { // Disable SEO on all collections await db.updateTable("_emdash_collections").set({ has_seo: 0 }).execute(); @@ -967,14 +1028,14 @@ describe("SEO", () => { const result = await handleSitemapData(db); expect(result.success).toBe(true); - expect(result.data!.entries).toEqual([]); + expect(result.data!.collections).toEqual([]); }); - it("should return empty entries for empty database", async () => { + it("should return empty collections for empty database", async () => { const result = await handleSitemapData(db); expect(result.success).toBe(true); - expect(result.data!.entries).toEqual([]); + expect(result.data!.collections).toEqual([]); }); });