From 59849e81732122b7e6f6d94c2cdf124ccd8039a5 Mon Sep 17 00:00:00 2001 From: Gracjan Sadowicz Date: Fri, 19 Jun 2026 07:36:26 +0200 Subject: [PATCH] RDoc-3857 Sitemap should contain only canonical URLs Non-current doc versions (active 7.1/6.2 and legacy) canonicalize to the current version, so listing them in the sitemap contradicts their own canonical tag. Exclude every built version except current via the docs preset's sitemap ignorePatterns, and switch the post-build sitemap splitter from a legacy denylist to a current-version allowlist so both layers enforce the same invariant. --- docusaurus.config.ts | 18 ++++++---- scripts/split-sitemap.ts | 8 ++--- src/lib/split-sitemap/__tests__/split.test.ts | 33 ++++++++++++------- src/lib/split-sitemap/lib/split.ts | 29 ++++++++-------- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/docusaurus.config.ts b/docusaurus.config.ts index 28d5eb98c7..e6a40cf547 100644 --- a/docusaurus.config.ts +++ b/docusaurus.config.ts @@ -2,11 +2,13 @@ import { themes as prismThemes } from "prism-react-renderer"; import type { Config } from "@docusaurus/types"; import type * as Preset from "@docusaurus/preset-classic"; // eslint-disable-next-line @typescript-eslint/no-require-imports -const { CURRENT_VERSION, ACTIVE_VERSIONS, LEGACY_VERSIONS } = require("./scripts/lib/version-policy.js") as { - CURRENT_VERSION: string; - ACTIVE_VERSIONS: string[]; - LEGACY_VERSIONS: string[]; -}; +const { CURRENT_VERSION, ACTIVE_VERSIONS, LEGACY_VERSIONS, BUILT_VERSIONS } = + require("./scripts/lib/version-policy.js") as { + CURRENT_VERSION: string; + ACTIVE_VERSIONS: string[]; + LEGACY_VERSIONS: string[]; + BUILT_VERSIONS: string[]; + }; // This runs in Node.js - Don't use client-side code here (browser APIs, JSX...) @@ -27,6 +29,10 @@ const legacyVersionsAsNoIndex: Record = Object.fromEn LEGACY_VERSIONS.map((v) => [v, { noIndex: true }]) ); +// Versions whose docs are canonical and thus indexed in the sitemap. Only the current +// version is canonical; every other built version canonicalizes to it. +const sitemapVersions = [CURRENT_VERSION]; + const config: Config = { title: "RavenDB Documentation", tagline: "High-performance NoSQL database that just works.", @@ -115,7 +121,7 @@ const config: Config = { lastmod: "date", changefreq: null, priority: null, - ignorePatterns: LEGACY_VERSIONS.map((v) => `/${v}/**`), + ignorePatterns: BUILT_VERSIONS.filter((v) => !sitemapVersions.includes(v)).map((v) => `/${v}/**`), }, googleTagManager: { containerId: "GTM-TDH4JWF2", diff --git a/scripts/split-sitemap.ts b/scripts/split-sitemap.ts index 862ca9483f..5f1d9b2681 100644 --- a/scripts/split-sitemap.ts +++ b/scripts/split-sitemap.ts @@ -16,7 +16,7 @@ import path from "node:path"; import { fileURLToPath } from "node:url"; import { splitSitemap, type SplitSucceeded } from "../src/lib/split-sitemap/lib/split.js"; -import { LEGACY_VERSIONS } from "./lib/version-policy.js"; +import { CURRENT_VERSION } from "./lib/version-policy.js"; const BASE_URL = "https://docs.ravendb.net"; @@ -25,7 +25,7 @@ const __dirname = path.dirname(__filename); const buildDir = process.argv[2] ?? path.join(__dirname, "..", "build"); -const result = splitSitemap({ buildDir, legacyVersions: LEGACY_VERSIONS, baseUrl: BASE_URL }); +const result = splitSitemap({ buildDir, currentVersion: CURRENT_VERSION, baseUrl: BASE_URL }); if (result.skipped) { console.log(`[split-sitemap] skipped: ${result.reason}`); @@ -33,12 +33,12 @@ if (result.skipped) { } else { // Discriminated-union narrowing on boolean literals requires strictNullChecks, which // this project's tsconfig does not enable. process.exit() above makes the cast safe. - const { files, includedUrls, skippedLegacyUrls } = result as SplitSucceeded; + const { files, includedUrls, skippedVersionUrls } = result as SplitSucceeded; for (const { name, urls } of files) { console.log(`[split-sitemap] ${name}: ${urls} URLs`); } console.log( `[split-sitemap] split into ${files.length} sub-sitemaps ` + - `(${includedUrls} URLs included, ${skippedLegacyUrls} legacy URLs excluded)` + `(${includedUrls} URLs included, ${skippedVersionUrls} non-current-version URLs excluded)` ); } diff --git a/src/lib/split-sitemap/__tests__/split.test.ts b/src/lib/split-sitemap/__tests__/split.test.ts index 454e216487..da94182e8f 100644 --- a/src/lib/split-sitemap/__tests__/split.test.ts +++ b/src/lib/split-sitemap/__tests__/split.test.ts @@ -7,7 +7,7 @@ import path from "node:path"; import { splitSitemap } from "../lib/split.js"; const BASE_URL = "https://docs.ravendb.net"; -const LEGACY = ["4.2", "5.4"]; +const CURRENT = "7.2"; function urlBlock(loc: string): string { return `${loc}weekly`; @@ -33,12 +33,12 @@ function withTempBuildDir(body: (dir: string) => void): void { test("splitSitemap skips when sitemap.xml is absent", () => { withTempBuildDir((dir) => { - const result = splitSitemap({ buildDir: dir, legacyVersions: LEGACY, baseUrl: BASE_URL }); + const result = splitSitemap({ buildDir: dir, currentVersion: CURRENT, baseUrl: BASE_URL }); assert.equal(result.skipped, true); }); }); -test("splitSitemap groups URLs by section and version", () => { +test("splitSitemap groups URLs by section and keeps only the current doc version", () => { withTempBuildDir((dir) => { const urls = [ `${BASE_URL}/7.2/foo`, @@ -51,15 +51,15 @@ test("splitSitemap groups URLs by section and version", () => { `${BASE_URL}/search`, ]; fs.writeFileSync(path.join(dir, "sitemap.xml"), buildSitemap(urls)); - const result = splitSitemap({ buildDir: dir, legacyVersions: LEGACY, baseUrl: BASE_URL }); + const result = splitSitemap({ buildDir: dir, currentVersion: CURRENT, baseUrl: BASE_URL }); assert.equal(result.skipped, false); if (result.skipped) { return; } const names = result.files.map((f) => f.name).sort(); + // 6.2 is not the current version, so it gets no sitemap file. assert.deepEqual(names, [ "sitemap-cloud.xml", - "sitemap-docs-6.2.xml", "sitemap-docs-7.2.xml", "sitemap-guides.xml", "sitemap-misc.xml", @@ -72,18 +72,29 @@ test("splitSitemap groups URLs by section and version", () => { }); }); -test("splitSitemap excludes legacy-version URLs", () => { +test("splitSitemap excludes every non-current doc version (active and legacy)", () => { withTempBuildDir((dir) => { - const urls = [`${BASE_URL}/7.2/ok`, `${BASE_URL}/4.2/legacy`, `${BASE_URL}/5.4/also-legacy`]; + const urls = [ + `${BASE_URL}/7.2/ok`, + `${BASE_URL}/7.1/active`, // active, non-current → canonical points at 7.2 + `${BASE_URL}/6.2/active`, // active, non-current → canonical points at 7.2 + `${BASE_URL}/4.2/legacy`, + `${BASE_URL}/5.4/also-legacy`, + ]; fs.writeFileSync(path.join(dir, "sitemap.xml"), buildSitemap(urls)); - const result = splitSitemap({ buildDir: dir, legacyVersions: LEGACY, baseUrl: BASE_URL }); + const result = splitSitemap({ buildDir: dir, currentVersion: CURRENT, baseUrl: BASE_URL }); assert.equal(result.skipped, false); if (result.skipped) { return; } assert.equal(result.includedUrls, 1); - assert.equal(result.skippedLegacyUrls, 2); - assert.ok(!result.files.some((f) => f.name.includes("4.2") || f.name.includes("5.4"))); + assert.equal(result.skippedVersionUrls, 4); + assert.ok( + !result.files.some( + (f) => + f.name.includes("7.1") || f.name.includes("6.2") || f.name.includes("4.2") || f.name.includes("5.4") + ) + ); }); }); @@ -91,7 +102,7 @@ test("splitSitemap replaces sitemap.xml with a sitemapindex referencing each sub withTempBuildDir((dir) => { const urls = [`${BASE_URL}/7.2/foo`, `${BASE_URL}/cloud/x`]; fs.writeFileSync(path.join(dir, "sitemap.xml"), buildSitemap(urls)); - splitSitemap({ buildDir: dir, legacyVersions: LEGACY, baseUrl: BASE_URL }); + splitSitemap({ buildDir: dir, currentVersion: CURRENT, baseUrl: BASE_URL }); const indexXml = fs.readFileSync(path.join(dir, "sitemap.xml"), "utf8"); assert.match(indexXml, / entries. No trailing slash. */ baseUrl: string; } @@ -41,7 +42,7 @@ export interface SplitSucceeded { skipped: false; files: { name: string; urls: number }[]; includedUrls: number; - skippedLegacyUrls: number; + skippedVersionUrls: number; } const SECTION_MAP: Record = { @@ -58,7 +59,7 @@ const URLSET_OPEN = 'xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" ' + 'xmlns:video="http://www.google.com/schemas/sitemap-video/1.1">'; -function getSitemapFile(loc: string, legacySet: Set, baseUrl: string): string | null { +function getSitemapFile(loc: string, currentVersion: string, baseUrl: string): string | null { const prefix = `${baseUrl}/`; const urlPath = loc.startsWith(prefix) ? loc.slice(prefix.length) : loc.replace(/^\//, ""); const firstSegment = urlPath.split("/")[0]; @@ -67,7 +68,8 @@ function getSitemapFile(loc: string, legacySet: Set, baseUrl: string): s return SECTION_MAP[firstSegment]; } if (/^\d+\.\d+$/.test(firstSegment)) { - if (legacySet.has(firstSegment)) { + // Allowlist: keep only the current (canonical) version; drop all others. + if (firstSegment !== currentVersion) { return null; } return `sitemap-docs-${firstSegment}.xml`; @@ -76,7 +78,7 @@ function getSitemapFile(loc: string, legacySet: Set, baseUrl: string): s } export function splitSitemap(options: SplitOptions): SplitResult | SplitSucceeded { - const { buildDir, legacyVersions, baseUrl } = options; + const { buildDir, currentVersion, baseUrl } = options; const sitemapPath = path.join(buildDir, "sitemap.xml"); if (!fs.existsSync(sitemapPath)) { @@ -89,18 +91,17 @@ export function splitSitemap(options: SplitOptions): SplitResult | SplitSucceede return { skipped: true, reason: "sitemap.xml contains no URLs" }; } - const legacySet = new Set(legacyVersions); const groups: Record = {}; - let skippedLegacy = 0; + let skippedVersions = 0; for (const block of urlBlocks) { const locMatch = block.match(/(.*?)<\/loc>/); if (!locMatch) { continue; } - const file = getSitemapFile(locMatch[1], legacySet, baseUrl); + const file = getSitemapFile(locMatch[1], currentVersion, baseUrl); if (!file) { - skippedLegacy++; + skippedVersions++; continue; } (groups[file] ??= []).push(block); @@ -131,7 +132,7 @@ export function splitSitemap(options: SplitOptions): SplitResult | SplitSucceede return { skipped: false, files: result, - includedUrls: urlBlocks.length - skippedLegacy, - skippedLegacyUrls: skippedLegacy, + includedUrls: urlBlocks.length - skippedVersions, + skippedVersionUrls: skippedVersions, }; }