feat: add SEO pre-rendering, meta tags, sitemap, and robots.txt

caio-pizzol · caio-pizzol · commit 9b59825d9714 · 2026-03-17T06:08:25.000-03:00
Build-time pre-rendering generates static HTML for all 10 routes so
search engines can index doc pages without executing JavaScript.
Each page gets a unique title, meta description, canonical URL,
Open Graph tags, Twitter card, and JSON-LD structured data.
sitemap.xml is generated by the pre-render script at build time;
robots.txt is a static file served from public/.
diff --git a/apps/web/package.json b/apps/web/package.json
@@ -5,7 +5,7 @@
   "type": "module",
   "scripts": {
     "dev": "vite",
-    "build": "tsc && vite build",
+    "build": "tsc && vite build && bun scripts/prerender.ts",
     "preview": "vite preview",
     "typecheck": "tsc --noEmit",
     "deploy": "bun run build && wrangler pages deploy dist --project-name=ooxml-dev"
diff --git a/apps/web/public/robots.txt b/apps/web/public/robots.txt
@@ -0,0 +1,29 @@
+User-agent: *
+Allow: /
+
+Sitemap: https://ooxml.dev/sitemap.xml
+
+# Block AI training crawlers
+User-agent: GPTBot
+Disallow: /
+
+User-agent: CCBot
+Disallow: /
+
+User-agent: Google-Extended
+Disallow: /
+
+User-agent: Bytespider
+Disallow: /
+
+User-agent: ClaudeBot
+Disallow: /
+
+User-agent: Amazonbot
+Disallow: /
+
+User-agent: Applebot-Extended
+Disallow: /
+
+User-agent: meta-externalagent
+Disallow: /
diff --git a/apps/web/scripts/prerender.ts b/apps/web/scripts/prerender.ts
@@ -0,0 +1,244 @@
+/**
+ * Build-time pre-rendering script.
+ *
+ * Runs after `vite build` to generate static HTML for each route.
+ * This makes doc pages crawlable by search engines without SSR.
+ *
+ * Usage: bun apps/web/scripts/prerender.ts
+ */
+
+import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { dirname, resolve } from "node:path";
+import { type DocPage, docs } from "../src/data/docs";
+import { getAllPaths, getSeoMeta } from "../src/data/seo";
+
+// Build output directory, resolved relative to this script's directory.
+const DIST = resolve(import.meta.dir, "../dist");
+// Site origin prepended to route paths for canonical, Open Graph and sitemap URLs.
+const SITE_URL = "https://ooxml.dev";
+
+// Read the built index.html as template
+// (loaded once, up front: the "/" route later overwrites this same file).
+const template = readFileSync(resolve(DIST, "index.html"), "utf-8");
+
+// --- Content block → HTML converters ---
+
+/**
+ * Escapes `&`, `<`, `>` and `"` so that `str` can be placed in HTML text
+ * or inside a double-quoted attribute value.
+ *
+ * @param str - Raw text.
+ * @returns `str` with the four characters replaced by their entities.
+ */
+function escapeHtml(str: string): string {
+	const entities: Record<string, string> = {
+		"&": "&amp;",
+		"<": "&lt;",
+		">": "&gt;",
+		'"': "&quot;",
+	};
+	// Single pass: each source character is mapped exactly once.
+	return str.replace(/[&<>"]/g, (ch) => entities[ch] ?? ch);
+}
+
+/**
+ * Renders a link label: backtick code spans become `<code>` elements and
+ * everything else is HTML-escaped.
+ */
+function linkLabelToHtml(label: string): string {
+	// split() with a capture group puts the code-span contents at odd indices.
+	return label
+		.split(/`([^`]+)`/)
+		.map((part, i) => (i % 2 === 1 ? `<code>${escapeHtml(part)}</code>` : escapeHtml(part)))
+		.join("");
+}
+
+/**
+ * Converts the inline markdown subset used in doc text to HTML.
+ *
+ * Supported syntax: `[label](url)` links and backtick code spans. All other
+ * text is HTML-escaped, so literal `<`, `>` and `&` in prose cannot produce
+ * stray markup in the pre-rendered page.
+ *
+ * @param text - Source text containing optional inline markdown.
+ * @returns An HTML fragment.
+ */
+function inlineMarkdownToHtml(text: string): string {
+	// One alternation so links and code spans are found in a single
+	// left-to-right pass; generated markup is never re-scanned, which avoids
+	// double-escaping and keeps a link written inside backticks literal.
+	const token = /\[([^\]]+)\]\(([^)]+)\)|`([^`]+)`/g;
+	let html = "";
+	let cursor = 0;
+	let match: RegExpExecArray | null = token.exec(text);
+	while (match !== null) {
+		const [, label, url, code] = match;
+		// Plain text between the previous token and this one.
+		html += escapeHtml(text.slice(cursor, match.index));
+		html +=
+			code !== undefined
+				? `<code>${escapeHtml(code)}</code>`
+				: `<a href="${escapeHtml(url ?? "")}">${linkLabelToHtml(label ?? "")}</a>`;
+		cursor = token.lastIndex;
+		match = token.exec(text);
+	}
+	return html + escapeHtml(text.slice(cursor));
+}
+
+/**
+ * Renders one content block of a doc page as an HTML fragment.
+ *
+ * Headings, code and preview blocks are escaped verbatim; paragraphs, notes
+ * and table cells go through `inlineMarkdownToHtml`. Block types not listed
+ * here render as an empty string.
+ *
+ * @param block - A single entry of `DocPage["content"]`.
+ * @returns The HTML for the block.
+ */
+function contentBlockToHtml(block: DocPage["content"][number]): string {
+	switch (block.type) {
+		case "heading": {
+			const name = `h${block.level}`;
+			const inner = escapeHtml(block.text);
+			return `<${name}>${inner}</${name}>`;
+		}
+		case "paragraph": {
+			const inner = inlineMarkdownToHtml(block.text);
+			return `<p>${inner}</p>`;
+		}
+		case "code": {
+			const source = escapeHtml(block.code);
+			return `<pre><code>${source}</code></pre>`;
+		}
+		case "preview": {
+			const source = escapeHtml(block.xml);
+			return `<pre><code>${source}</code></pre>`;
+		}
+		case "note": {
+			const heading = `<strong>${escapeHtml(block.title)}</strong>`;
+			// The application label is optional and rendered in parentheses.
+			const appLabel = block.app ? ` <em>(${escapeHtml(block.app)})</em>` : "";
+			const body = `<p>${inlineMarkdownToHtml(block.text)}</p>`;
+			return `<div>${heading}${appLabel}${body}</div>`;
+		}
+		case "table": {
+			const headCells = block.headers.map((h) => `<th>${inlineMarkdownToHtml(h)}</th>`).join("");
+			const bodyRows = block.rows
+				.map((row) => {
+					const cells = row.map((cell) => `<td>${inlineMarkdownToHtml(cell)}</td>`).join("");
+					return `<tr>${cells}</tr>`;
+				})
+				.join("");
+			return `<table><thead><tr>${headCells}</tr></thead><tbody>${bodyRows}</tbody></table>`;
+		}
+		default:
+			return "";
+	}
+}
+
+/**
+ * Renders a whole doc page as an `<article>`: optional badge, title,
+ * optional description, then every content block, one fragment per line.
+ *
+ * @param page - The doc page to render.
+ * @returns The article HTML, fragments joined with newlines.
+ */
+function docPageToHtml(page: DocPage): string {
+	const badge = page.badge ? [`<span>${escapeHtml(page.badge)}</span>`] : [];
+	const description = page.description ? [`<p>${escapeHtml(page.description)}</p>`] : [];
+	const blocks = page.content.map((block) => contentBlockToHtml(block));
+
+	return [
+		`<article>`,
+		...badge,
+		`<h1>${escapeHtml(page.title)}</h1>`,
+		...description,
+		...blocks,
+		`</article>`,
+	].join("\n");
+}
+
+// --- Static HTML for non-doc pages ---
+
+/** Static fallback markup for the home page ("/"). */
+function homePageHtml(): string {
+	const lines = [
+		"<main>",
+		"<h1>ooxml.dev</h1>",
+		"<p>The OOXML spec, explained by people who actually implemented it.</p>",
+		"<p>Interactive examples, real-world gotchas, live previews, and AI-powered search.</p>",
+		'<a href="/docs">Browse Reference</a>',
+		"</main>",
+	];
+	return lines.join("\n");
+}
+
+/** Static fallback markup for the MCP page ("/mcp"). */
+function mcpPageHtml(): string {
+	const lines = [
+		"<main>",
+		"<h1>Search the ECMA-376 spec with AI</h1>",
+		"<p>Connect your MCP-compatible client to search 18,000+ specification chunks using natural language queries.</p>",
+		"<h2>Available Tools</h2>",
+		"<ul>",
+		"<li><strong>search_ecma_spec</strong> — Semantic search across the specification.</li>",
+		"<li><strong>get_section</strong> — Retrieve a specific section by ID.</li>",
+		"<li><strong>list_parts</strong> — Browse the specification structure.</li>",
+		"</ul>",
+		"<h2>What is MCP?</h2>",
+		"<p>The Model Context Protocol (MCP) is an open standard that lets AI assistants connect to external data sources and tools.</p>",
+		"</main>",
+	];
+	return lines.join("\n");
+}
+
+/** Static fallback markup for the spec explorer page ("/spec"). */
+function specPageHtml(): string {
+	const lines = [
+		"<main>",
+		"<h1>ECMA-376 Spec Explorer</h1>",
+		"<p>Search and browse the ECMA-376 Office Open XML specification with semantic search and PDF viewer.</p>",
+		"</main>",
+	];
+	return lines.join("\n");
+}
+
+// --- Meta tags and JSON-LD ---
+
+/**
+ * Serializes `data` as a JSON-LD `<script>` tag.
+ *
+ * `<` is written as the JSON escape `\u003c` so that a string value containing
+ * `</script>` or `<!--` cannot terminate or corrupt the script element. The
+ * parsed JSON value is unchanged.
+ */
+function jsonLdScriptTag(data: unknown): string {
+	const json = JSON.stringify(data).replace(/</g, "\\u003c");
+	return `<script type="application/ld+json">${json}</script>`;
+}
+
+/**
+ * Builds the `<head>` tags for a route: title, description, canonical link,
+ * Open Graph and Twitter tags, plus JSON-LD for article pages and the home page.
+ *
+ * The tags are joined with a newline; `renderPage` splits on newlines and
+ * expects the `<title>` tag to be the first line.
+ *
+ * @param path - Route path starting with "/", e.g. "/docs/paragraphs".
+ * @returns The head tags, one per line.
+ */
+function buildHead(path: string): string {
+	const seo = getSeoMeta(path);
+	const url = `${SITE_URL}${path}`;
+	// Attribute-safe form of the URL; the raw form is still used inside JSON-LD.
+	const urlAttr = escapeHtml(url);
+	const title = escapeHtml(seo.title);
+	const description = escapeHtml(seo.description);
+
+	const meta = [
+		`<title>${title}</title>`,
+		`<meta name="description" content="${description}"/>`,
+		`<link rel="canonical" href="${urlAttr}"/>`,
+		`<meta property="og:title" content="${title}"/>`,
+		`<meta property="og:description" content="${description}"/>`,
+		`<meta property="og:url" content="${urlAttr}"/>`,
+		`<meta property="og:type" content="${seo.type}"/>`,
+		`<meta property="og:site_name" content="ooxml.dev"/>`,
+		`<meta name="twitter:card" content="summary"/>`,
+		`<meta name="twitter:title" content="${title}"/>`,
+		`<meta name="twitter:description" content="${description}"/>`,
+	];
+
+	// JSON-LD structured data
+	if (seo.type === "article") {
+		meta.push(
+			jsonLdScriptTag({
+				"@context": "https://schema.org",
+				"@type": "TechArticle",
+				// Strip the " | site" and " — subtitle" suffixes from the page title.
+				headline: seo.title.split(" | ")[0].split(" — ")[0],
+				description: seo.description,
+				url,
+				author: { "@type": "Organization", name: "SuperDoc", url: "https://superdoc.dev" },
+				publisher: { "@type": "Organization", name: "ooxml.dev" },
+				about: {
+					"@type": "Thing",
+					name: "Office Open XML",
+					sameAs: "https://en.wikipedia.org/wiki/Office_Open_XML",
+				},
+			}),
+		);
+	} else if (path === "/") {
+		meta.push(
+			jsonLdScriptTag({
+				"@context": "https://schema.org",
+				"@type": "WebSite",
+				name: "ooxml.dev",
+				url: SITE_URL,
+				description: seo.description,
+				potentialAction: {
+					"@type": "SearchAction",
+					target: { "@type": "EntryPoint", urlTemplate: `${SITE_URL}/spec?q={search_term}` },
+					"query-input": "required name=search_term",
+				},
+			}),
+		);
+	}
+
+	return meta.join("\n    ");
+}
+
+// --- Generate HTML for a given path ---
+
+/**
+ * Returns the static body markup for a route.
+ *
+ * The three non-doc routes have hand-written markup; "/docs" maps to the
+ * "index" doc and "/docs/<slug>" to the doc with that slug. Unknown routes
+ * yield an empty string.
+ *
+ * @param path - Route path starting with "/".
+ * @returns The HTML to place inside the root element.
+ */
+function getContentHtml(path: string): string {
+	switch (path) {
+		case "/":
+			return homePageHtml();
+		case "/mcp":
+			return mcpPageHtml();
+		case "/spec":
+			return specPageHtml();
+	}
+
+	// Doc pages
+	const slug = path === "/docs" ? "index" : path.replace("/docs/", "");
+	const page = docs[slug];
+	return page ? docPageToHtml(page) : "";
+}
+
+/**
+ * Produces the full HTML document for a route from the built `index.html`
+ * template: replaces the `<title>`, injects the remaining head tags before
+ * `</head>`, and places the static content inside the root element.
+ *
+ * Every substitution uses a replacer function. A plain replacement string
+ * would have `$$`, `$&`, `` $` `` and `$'` interpreted as replacement patterns,
+ * corrupting doc content that contains those sequences.
+ *
+ * @param path - Route path starting with "/".
+ * @returns The pre-rendered HTML document.
+ */
+function renderPage(path: string): string {
+	// buildHead puts one tag per line with the <title> first; trim so the
+	// injected tags get a single, uniform indent.
+	const [titleTag = "", ...otherTags] = buildHead(path)
+		.split("\n")
+		.map((line) => line.trim());
+	const content = getContentHtml(path);
+
+	// Replace <title> tag
+	let html = template.replace(/<title>[^<]*<\/title>/, () => titleTag);
+
+	// Inject remaining meta tags before </head>
+	html = html.replace("</head>", () => `    ${otherTags.join("\n    ")}\n  </head>`);
+
+	// Inject content into <div id="root">
+	const rootPlaceholder = '<div id="root"></div>';
+	if (!html.includes(rootPlaceholder)) {
+		// Without the placeholder the page would be written with no content.
+		console.warn(`  ! ${path}: root placeholder not found in template; content not injected`);
+	}
+	html = html.replace(rootPlaceholder, () => `<div id="root">${content}</div>`);
+
+	return html;
+}
+
+// --- Sitemap generation ---
+
+/**
+ * Generates the sitemap XML for the given routes.
+ *
+ * The home page gets priority 1.0 and a weekly change frequency; routes under
+ * "/docs/" get 0.8 and all others 0.7, both monthly. `<loc>` values are XML
+ * entity-escaped so a path containing `&`, `<`, `>` or quotes still yields a
+ * well-formed document.
+ *
+ * @param paths - Route paths, each starting with "/".
+ * @returns The complete sitemap document.
+ */
+function generateSitemap(paths: string[]): string {
+	const xmlEntities: Record<string, string> = {
+		"&": "&amp;",
+		"<": "&lt;",
+		">": "&gt;",
+		'"': "&quot;",
+		"'": "&apos;",
+	};
+	const xmlEscape = (value: string): string =>
+		value.replace(/[&<>"']/g, (ch) => xmlEntities[ch] ?? ch);
+
+	const urls = paths.map((path) => {
+		const priority = path === "/" ? "1.0" : path.startsWith("/docs/") ? "0.8" : "0.7";
+		const changefreq = path === "/" ? "weekly" : "monthly";
+		return `  <url>
+    <loc>${xmlEscape(`${SITE_URL}${path}`)}</loc>
+    <changefreq>${changefreq}</changefreq>
+    <priority>${priority}</priority>
+  </url>`;
+	});
+
+	return `<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+${urls.join("\n")}
+</urlset>`;
+}
+
+// --- Main ---
+
+const routes = getAllPaths();
+
+/** Maps a route to the index.html file written for it inside the build output. */
+const outputFileFor = (route: string): string =>
+	route === "/" ? resolve(DIST, "index.html") : resolve(DIST, `${route.slice(1)}/index.html`);
+
+// Write one pre-rendered index.html per route, creating directories as needed.
+routes.forEach((route) => {
+	const html = renderPage(route);
+	const target = outputFileFor(route);
+
+	mkdirSync(dirname(target), { recursive: true });
+	writeFileSync(target, html);
+	console.log(`  ✓ ${route}`);
+});
+
+// Generate sitemap
+writeFileSync(resolve(DIST, "sitemap.xml"), generateSitemap(routes));
+console.log(`  ✓ /sitemap.xml`);
+
+console.log(`\nPre-rendered ${routes.length} pages + sitemap.`);
diff --git a/apps/web/src/data/seo.ts b/apps/web/src/data/seo.ts
@@ -0,0 +1,63 @@
+import { docs } from "./docs";
+
+/** SEO metadata for a single route. */
+export interface SeoMeta {
+	/** Full page title. */
+	title: string;
+	/** Short page summary. */
+	description: string;
+	/** Page kind; doc pages are "article", the other static pages "website". */
+	type: "website" | "article";
+}
+
+// Hand-written metadata keyed by exact route path. `getSeoMeta` checks this
+// table first and only then falls back to a lookup in `docs`.
+const staticPages: Record<string, SeoMeta> = {
+	"/": {
+		title: "ooxml.dev — The Implementer's Guide to OOXML (ECMA-376)",
+		description:
+			"Interactive OOXML reference with live previews, implementation notes, and real-world gotchas. Built by the SuperDoc team.",
+		type: "website",
+	},
+	"/mcp": {
+		title: "ECMA-376 MCP Server — Search the OOXML Spec with AI | ooxml.dev",
+		description:
+			"Connect your AI assistant to search 18,000+ OOXML specification chunks. Works with Claude Code, Cursor, and any MCP-compatible client.",
+		type: "website",
+	},
+	"/spec": {
+		title: "ECMA-376 Spec Explorer | ooxml.dev",
+		description:
+			"Search and browse the ECMA-376 Office Open XML specification with semantic search and PDF viewer.",
+		type: "website",
+	},
+	"/docs": {
+		title: "OOXML Reference — Getting Started | ooxml.dev",
+		description:
+			"Learn the basics of OOXML (Office Open XML) and how to use this interactive reference.",
+		type: "article",
+	},
+};
+
+/**
+ * Returns the SEO metadata for a route.
+ *
+ * Lookup order: the hand-written `staticPages` table, then the doc whose slug
+ * follows "/docs/", then the home page metadata as a fallback.
+ *
+ * Trailing slashes are ignored, so "/docs/foo/" resolves like "/docs/foo".
+ * This matters because pre-rendered pages are written as `<route>/index.html`
+ * and may be requested under a trailing-slash URL. Only own properties of the
+ * lookup tables are consulted, so a slug such as "constructor" cannot resolve
+ * to an inherited object member.
+ *
+ * @param path - Route path, e.g. "/", "/mcp" or "/docs/paragraphs".
+ * @returns Metadata for the route, or the home page metadata if unknown.
+ */
+export function getSeoMeta(path: string): SeoMeta {
+	const hasOwn = (table: object, key: string): boolean =>
+		Object.prototype.hasOwnProperty.call(table, key);
+
+	// "/" (or any run of only slashes) normalizes to the root route.
+	const route = path.replace(/\/+$/, "") || "/";
+
+	if (hasOwn(staticPages, route)) {
+		return staticPages[route];
+	}
+
+	const slug = route.replace("/docs/", "");
+	const page = hasOwn(docs, slug) ? docs[slug] : undefined;
+	if (page) {
+		const badge = page.badge ? ` (${page.badge})` : "";
+		return {
+			title: `${page.title}${badge} — ${page.description || "OOXML Reference"} | ooxml.dev`,
+			description:
+				page.description || `${page.title} — interactive OOXML reference with live previews.`,
+			type: "article",
+		};
+	}
+
+	return staticPages["/"];
+}
+
+/**
+ * Lists every route to pre-render: the four fixed routes followed by one
+ * "/docs/<slug>" route per doc. The "index" doc is skipped because it is
+ * already served at "/docs".
+ *
+ * @returns Route paths in pre-render order.
+ */
+export function getAllPaths(): string[] {
+	const docRoutes = Object.keys(docs)
+		.filter((slug) => slug !== "index")
+		.map((slug) => `/docs/${slug}`);
+	return ["/", "/mcp", "/spec", "/docs", ...docRoutes];
+}
diff --git a/apps/web/src/hooks/useDocumentTitle.ts b/apps/web/src/hooks/useDocumentTitle.ts
@@ -0,0 +1,7 @@
+import { useEffect } from "react";
+
+/**
+ * React hook that sets `document.title` to `title`.
+ *
+ * The assignment runs inside an effect whose only dependency is `title`.
+ *
+ * @param title - Title to apply to the document.
+ */
+export function useDocumentTitle(title: string) {
+	useEffect(() => {
+		document.title = title;
+	}, [title]);
+}
diff --git a/apps/web/src/pages/Home.tsx b/apps/web/src/pages/Home.tsx
@@ -1,8 +1,11 @@
 import { Link } from "react-router-dom";
 import { Footer } from "../components/Footer";
 import { Navbar } from "../components/Navbar";
+import { getSeoMeta } from "../data/seo";
+import { useDocumentTitle } from "../hooks/useDocumentTitle";
 
 export function Home() {
+	useDocumentTitle(getSeoMeta("/").title);
 	return (
 		<div className="min-h-screen bg-[var(--color-bg-primary)]">
 			<Navbar maxWidth />
diff --git a/apps/web/src/pages/Mcp.tsx b/apps/web/src/pages/Mcp.tsx
@@ -1,6 +1,8 @@
 import { useState } from "react";
 import { Link } from "react-router-dom";
 import { Navbar } from "../components/Navbar";
+import { getSeoMeta } from "../data/seo";
+import { useDocumentTitle } from "../hooks/useDocumentTitle";
 
 const MCP_ENDPOINT = `${import.meta.env.VITE_API_URL}/mcp`;
 const CLAUDE_COMMAND = `claude mcp add --transport http ecma-spec ${MCP_ENDPOINT}`;
@@ -31,6 +33,7 @@ const EXAMPLE_QUERIES = [
 type TabId = "claude" | "cursor" | "other";
 
 export function Mcp() {
+	useDocumentTitle(getSeoMeta("/mcp").title);
 	const [copiedEndpoint, setCopiedEndpoint] = useState(false);
 	const [copiedCommand, setCopiedCommand] = useState(false);
 	const [activeTab, setActiveTab] = useState<TabId>("claude");
diff --git a/apps/web/src/pages/SpecExplorer.tsx b/apps/web/src/pages/SpecExplorer.tsx
diff --git a/apps/web/src/pages/docs/Page.tsx b/apps/web/src/pages/docs/Page.tsx