diff --git a/README.md b/README.md index 7ca2540..b8c69b1 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ -# @llmstxt — Generate `llms.txt` and `llms-full.txt` for your website +# @llmstxt — `llms.txt`, `llms-full.txt`, and Markdown for agents -Generate `llms.txt` (a table of contents) and `llms-full.txt` (full content) from your app’s pages so LLMs can quickly understand what your site covers. +Generate `llms.txt` (table of contents) and `llms-full.txt` (full content) from your app’s pages, and optionally serve **any page as Markdown** via content negotiation (`Accept: text/markdown`). - Standard: [llmstxt.org](https://llmstxt.org) -- Works with: Next.js App Router (built-in), plus any Node.js framework via `@llmstxt/core` +- Works with: Next.js App Router (`@llmstxt/next`), any Node.js framework (`@llmstxt/core`), and Next.js middleware (`@llmstxt/middleware`) ## Packages @@ -11,14 +11,16 @@ Generate `llms.txt` (a table of contents) and `llms-full.txt` (full content) fro |---|---| | [`@llmstxt/core`](./packages/core) | Scan your `app/` directory and generate `llms.txt` + `llms-full.txt` | | [`@llmstxt/next`](./packages/next) | Next.js App Router route handlers for `/llms.txt` and `/llms-full.txt` | +| [`@llmstxt/middleware`](./packages/middleware) | Next.js middleware: any page responds as Markdown on `Accept: text/markdown` | ## Keywords (SEO) -llms.txt, llms-full.txt, llmstxt, Next.js, App Router, AI, LLM, SEO, website indexing, documentation, crawlable content, sitemap alternative +llms.txt, llms-full.txt, llmstxt, markdown for agents, content negotiation, Next.js, middleware, App Router, AI, LLM, SEO, website indexing, documentation, crawlable content, sitemap alternative ## Table of Contents - [Quick Start — Next.js (App Router)](#quick-start--nextjs-app-router) +- [Quick Start — Markdown for Agents (Next.js Middleware)](#quick-start--markdown-for-agents-nextjs-middleware) - [Quick Start — Other Frameworks](#quick-start--other-frameworks-express-hono-bun-etc) - [How It Works](#how-it-works) - [API Reference (`@llmstxt/core`)](#api-reference-llmstxtcore) @@ -94,6 +96,39 @@ export const GET = createLlmsFullTxtHandler({ --- +## Quick Start — Markdown for Agents (Next.js Middleware) + +Serve the Markdown version of **any page** when a client sends `Accept: text/markdown`. + +### Install + +```bash +npm install @llmstxt/middleware +``` + +### Add middleware + +```ts +// middleware.ts (project root) +export { middleware, config } from '@llmstxt/middleware' +``` + +This returns: +- `Content-Type: text/markdown; charset=utf-8` +- `Vary: Accept` (so CDNs cache HTML and Markdown separately) +- `x-markdown-tokens` (token estimate for context sizing) +- `Content-Signal` (content usage preferences) + +Test it: + +```bash +curl https://yoursite.com/blog/my-post -H "Accept: text/markdown" +``` + +For best Markdown quality in production, plug in `@mozilla/readability` + `turndown` (see `packages/middleware/README.md`). + +--- + ## Quick Start — Other Frameworks (Express, Hono, Bun, etc.) ```bash @@ -152,6 +187,9 @@ Example output: ### `llms-full.txt` (Full content) Same scan, but also fetches and converts each page to Markdown (or plain text by default). This produces one big file containing your site’s content for ingestion. +### `Accept: text/markdown` (Markdown for agents) +`@llmstxt/middleware` adds content negotiation to your Next.js app: when a client requests any URL with `Accept: text/markdown`, it re-fetches that page as HTML, converts it to Markdown, and returns it with `Content-Type: text/markdown`, `Vary: Accept`, and `x-markdown-tokens`. + ## Why this helps SEO (and AI discoverability) Search engines don’t directly “rank” `llms.txt`, but having clean, crawlable pages with accurate titles/descriptions helps both humans and models. diff --git a/package-lock.json b/package-lock.json index 7fcb1e2..cf6564a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1857,6 +1857,10 @@ "resolved": "packages/core", "link": true }, + "node_modules/@llmstxt/middleware": { + "resolved": "packages/middleware", + "link": true + }, "node_modules/@llmstxt/next": { "resolved": "packages/next", "link": true @@ -14456,6 +14460,17 @@ "node": ">=18.0.0" } }, + "packages/middleware": { + "name": "@llmstxt/middleware", + "version": "0.1.0", + "license": "MIT", + "engines": { + "node": ">=18.0.0" + }, + "peerDependencies": { + "next": ">=13.0.0" + } + }, "packages/next": { "name": "@llmstxt/next", "version": "0.1.0", @@ -15757,6 +15772,10 @@ "@llmstxt/core": { "version": "file:packages/core" }, + "@llmstxt/middleware": { + "version": "file:packages/middleware", + "requires": {} + }, "@llmstxt/next": { "version": "file:packages/next", "requires": { diff --git a/package.json b/package.json index de770e5..f92f1e5 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@llmstxt/monorepo", "private": true, - "description": "Monorepo for @llmstxt packages.", + "description": "Monorepo for @llmstxt (llms.txt, llms-full.txt, and Markdown for agents).", "license": "MIT", "workspaces": [ "packages/*" diff --git a/packages/middleware/README.md b/packages/middleware/README.md new file mode 100644 index 0000000..97f86f4 --- /dev/null +++ b/packages/middleware/README.md @@ -0,0 +1,89 @@ +# `@llmstxt/middleware` + +Next.js middleware that serves **any page as Markdown** when the request includes `Accept: text/markdown`. + +This mirrors Cloudflare's "Markdown for Agents" content-negotiation pattern, but runs entirely inside your own Next.js app. + +--- + +## Install + +```bash +npm install @llmstxt/middleware +``` + +--- + +## Usage (one line) + +```ts +// middleware.ts (project root) +export { middleware, config } from '@llmstxt/middleware' +``` + +Test it: + +```bash +curl https://yoursite.com/blog/my-post -H "Accept: text/markdown" +``` + +--- + +## Response headers + +- `Content-Type: text/markdown; charset=utf-8` +- `Vary: Accept` +- `x-markdown-tokens: ` +- `Content-Signal: ` + +--- + +## With options + +```ts +// middleware.ts +import { createMarkdownMiddleware } from '@llmstxt/middleware' + +export const middleware = createMarkdownMiddleware({ + // Opt-out of AI training while allowing agents to read + contentSignal: 'ai-train=no, search=yes, ai-input=yes', + + // Better HTML→Markdown output (optional) + htmlToMarkdown: async (html) => { + const TurndownService = (await import('turndown')).default + const td = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' }) + return td.turndown(html) + }, +}) + +export const config = { + matcher: ['/((?!_next/static|_next/image|favicon.ico).*)'], +} +``` + +--- + +## Better Markdown quality (Readability + Turndown) + +```bash +npm install turndown @mozilla/readability jsdom +``` + +```ts +import { createMarkdownMiddleware } from '@llmstxt/middleware' +import TurndownService from 'turndown' +import { Readability } from '@mozilla/readability' +import { JSDOM } from 'jsdom' + +export const middleware = createMarkdownMiddleware({ + htmlToMarkdown: async (html, url) => { + const dom = new JSDOM(html, { url }) + const article = new Readability(dom.window.document).parse() + if (!article) return '' + + const td = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' }) + return `# ${article.title}\n\n${td.turndown(article.content)}` + }, +}) +``` + diff --git a/packages/middleware/package.json b/packages/middleware/package.json new file mode 100644 index 0000000..e308ee8 --- /dev/null +++ b/packages/middleware/package.json @@ -0,0 +1,49 @@ +{ + "name": "@llmstxt/middleware", + "version": "0.1.0", + "description": "Next.js middleware that serves Markdown when requests send Accept: text/markdown.", + "license": "MIT", + "author": "Recordly", + "repository": { + "type": "git", + "url": "https://github.com/Muhammad-Hashim/llmstxt.git", + "directory": "packages/middleware" + }, + "homepage": "https://github.com/Muhammad-Hashim/llmstxt#readme", + "keywords": [ + "llmstxt", + "markdown", + "markdown for agents", + "content negotiation", + "accept header", + "nextjs", + "middleware", + "ai", + "llm", + "seo" + ], + "type": "commonjs", + "main": "./lib/index.js", + "types": "./lib/index.d.ts", + "readme": "README.md", + "files": [ + "lib/**/*" + ], + "exports": { + ".": { + "types": "./lib/index.d.ts", + "default": "./lib/index.js" + } + }, + "peerDependencies": { + "next": ">=13.0.0" + }, + "scripts": { + "build": "tsc -p tsconfig.build.json", + "clean": "node ../../scripts/rmrf.js lib" + }, + "engines": { + "node": ">=18.0.0" + } +} + diff --git a/packages/middleware/src/index.ts b/packages/middleware/src/index.ts new file mode 100644 index 0000000..7530a27 --- /dev/null +++ b/packages/middleware/src/index.ts @@ -0,0 +1,352 @@ +import type { NextRequest } from 'next/server'; + +export type MarkdownMiddlewareOptions = { + /** + * URL patterns to skip — middleware won't attempt conversion for these. + * Supports simple glob patterns, e.g. `_next/*` or `*.png`. + */ + exclude?: string[]; + + /** + * Custom HTML→Markdown converter. + * If omitted, the built-in dependency-free converter is used. + */ + htmlToMarkdown?: (html: string, url: string) => string | Promise; + + /** + * Custom token estimator for the `x-markdown-tokens` header. + * Default: `Math.ceil(text.length / 4)` (rough GPT-style approximation). + */ + estimateTokens?: (markdown: string) => number; + + /** + * Value for the `Content-Signal` header. See https://contentsignals.org + * @default 'ai-train=yes, search=yes, ai-input=yes' + */ + contentSignal?: string; + + /** + * Timeout in ms for fetching the HTML version of the page. + * @default 10000 + */ + fetchTimeoutMs?: number; + + /** + * Include a YAML front-matter block at the top of every Markdown response. + * @default true + */ + includeFrontMatter?: boolean; + + /** + * Override Cache-Control for Markdown responses. + * @default 'private, max-age=0, no-store' + */ + cacheControl?: string; +}; + +// --------------------------------------------------------------------------- +// Built-in HTML → Markdown converter (dependency-free) +// --------------------------------------------------------------------------- + +function builtinHtmlToMarkdown(html: string): string { + return ( + html + // Remove noisy sections + .replace(/]*>[\s\S]*?<\/script>/gi, '') + .replace(/]*>[\s\S]*?<\/style>/gi, '') + .replace(/]*>[\s\S]*?<\/svg>/gi, '') + .replace(/]*>[\s\S]*?<\/nav>/gi, '') + .replace(/]*>[\s\S]*?<\/footer>/gi, '') + .replace(/]*>[\s\S]*?<\/header>/gi, '') + .replace(/]*>[\s\S]*?<\/aside>/gi, '') + .replace(//g, '') + + // Headings + .replace( + /]*>([\s\S]*?)<\/h\1>/gi, + (_match: string, level: string, inner: string) => { + const text = stripTags(inner).trim(); + return text ? `\n${'#'.repeat(Number(level))} ${text}\n\n` : '\n'; + } + ) + + // Code blocks + .replace( + /]*>\s*]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, + (_match: string, code: string) => + `\n\`\`\`\n${decodeEntities(code.trim())}\n\`\`\`\n\n` + ) + .replace( + /]*>([\s\S]*?)<\/code>/gi, + (_match: string, code: string) => + `\`${decodeEntities(stripTags(code))}\`` + ) + + // Links + .replace( + /]*href=["']([^"']*)["'][^>]*>([\s\S]*?)<\/a>/gi, + (_match: string, href: string, text: string) => { + const label = stripTags(text).trim(); + return label ? `[${label}](${href})` : href; + } + ) + + // Lists + .replace( + /]*>([\s\S]*?)<\/li>/gi, + (_match: string, inner: string) => { + const text = stripTags(inner).trim(); + return text ? `- ${text}\n` : ''; + } + ) + .replace(/]*>|<\/ul>|]*>|<\/ol>/gi, '\n') + + // Blockquotes + .replace( + /]*>([\s\S]*?)<\/blockquote>/gi, + (_match: string, inner: string) => { + const text = stripTags(inner).trim(); + if (!text) return '\n'; + return `\n${text + .split('\n') + .map((l: string) => `> ${l}`) + .join('\n')}\n\n`; + } + ) + + // Paragraphs and breaks + .replace(/]*>([\s\S]*?)<\/p>/gi, (_match: string, inner: string) => { + const text = stripTags(inner).trim(); + return text ? `\n${text}\n\n` : '\n'; + }) + .replace(//gi, '\n') + .replace(//gi, '\n---\n\n') + + // Emphasis + .replace( + /]*>([\s\S]*?)<\/strong>/gi, + (_, t) => `**${stripTags(t).trim()}**` + ) + .replace( + /]*>([\s\S]*?)<\/b>/gi, + (_, t) => `**${stripTags(t).trim()}**` + ) + .replace( + /]*>([\s\S]*?)<\/em>/gi, + (_, t) => `*${stripTags(t).trim()}*` + ) + .replace( + /]*>([\s\S]*?)<\/i>/gi, + (_, t) => `*${stripTags(t).trim()}*` + ) + + // Strip remaining tags + .replace(/<[^>]+>/g, '') + + // Decode entities + normalize whitespace + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/ /g, ' ') + .replace(/[ \t]+\n/g, '\n') + .replace(/\n{4,}/g, '\n\n\n') + .trim() + ); +} + +function stripTags(input: string): string { + return input.replace(/<[^>]+>/g, ''); +} + +function decodeEntities(str: string): string { + return str + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/ /g, ' '); +} + +function extractMeta(html: string): { title: string; description: string } { + const titleMatch = /]*>([^<]+)<\/title>/i.exec(html); + const descMatch = + /]+name=["']description["'][^>]+content=["']([^"']+)["']/i.exec( + html + ) ?? + /]+content=["']([^"']+)["'][^>]+name=["']description["']/i.exec( + html + ); + + return { + title: titleMatch ? decodeEntities(titleMatch[1].trim()) : '', + description: descMatch ? decodeEntities(descMatch[1].trim()) : '', + }; +} + +function defaultEstimateTokens(text: string): number { + return Math.ceil(text.length / 4); +} + +// --------------------------------------------------------------------------- +// Excludes / Accept negotiation +// --------------------------------------------------------------------------- + +const DEFAULT_EXCLUDE = [ + '_next/*', + 'api/*', + '*.ico', + '*.png', + '*.jpg', + '*.jpeg', + '*.gif', + '*.svg', + '*.webp', + '*.woff', + '*.woff2', + '*.ttf', + '*.css', + '*.js', + '*.map', + 'llms.txt', + 'llms-full.txt', +]; + +function globToRegExp(pattern: string): RegExp { + const escaped = pattern + .replace(/[.+^${}()|[\]\\]/g, '\\$&') + .replace(/\*/g, '.*'); + return new RegExp(`^${escaped}$`); +} + +function matchesExclude(pathname: string, patterns: string[]): boolean { + const clean = pathname.startsWith('/') ? pathname.slice(1) : pathname; + return patterns.some(pattern => { + const normalized = pattern.startsWith('/') ? pattern.slice(1) : pattern; + return globToRegExp(normalized).test(clean); + }); +} + +function wantsMarkdown(request: NextRequest): boolean { + const accept = request.headers.get('accept') ?? ''; + if (!accept) return false; + + return accept.split(',').some(part => { + const [type, ...params] = part.trim().toLowerCase().split(';'); + const qParam = params.find(p => p.trim().startsWith('q=')); + const q = qParam ? Number(qParam.trim().slice(2)) : 1; + if (!Number.isFinite(q) || q <= 0) return false; + return type === 'text/markdown' || type === 'text/*'; + }); +} + +// --------------------------------------------------------------------------- +// Middleware factory +// --------------------------------------------------------------------------- + +export function createMarkdownMiddleware( + options: MarkdownMiddlewareOptions = {} +): (request: NextRequest) => Promise { + const { + exclude = DEFAULT_EXCLUDE, + htmlToMarkdown = builtinHtmlToMarkdown, + estimateTokens = defaultEstimateTokens, + contentSignal = 'ai-train=yes, search=yes, ai-input=yes', + fetchTimeoutMs = 10_000, + includeFrontMatter = true, + cacheControl = 'private, max-age=0, no-store', + } = options; + + return async function middleware(request: NextRequest): Promise { + const { NextResponse } = await import('next/server'); + + if (request.method !== 'GET' && request.method !== 'HEAD') { + return NextResponse.next(); + } + + if (!wantsMarkdown(request)) { + return NextResponse.next(); + } + + const { pathname } = request.nextUrl; + if (matchesExclude(pathname, exclude)) { + return NextResponse.next(); + } + + try { + const htmlUrl = request.nextUrl.clone(); + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), fetchTimeoutMs); + + const htmlResponse = await fetch(htmlUrl.toString(), { + signal: controller.signal, + headers: { + Accept: 'text/html', + Cookie: request.headers.get('cookie') ?? '', + Authorization: request.headers.get('authorization') ?? '', + 'Accept-Language': request.headers.get('accept-language') ?? '', + 'User-Agent': 'llmstxt-middleware/0.1.0', + }, + redirect: 'manual', + }); + + clearTimeout(timeoutId); + + if (htmlResponse.status >= 300 && htmlResponse.status < 400) { + return htmlResponse as unknown as Response; + } + + if (!htmlResponse.ok) { + return NextResponse.next(); + } + + const contentType = htmlResponse.headers.get('content-type') ?? ''; + if (!contentType.includes('text/html')) { + return NextResponse.next(); + } + + const html = await htmlResponse.text(); + const { title, description } = extractMeta(html); + + let markdown = await htmlToMarkdown(html, request.nextUrl.href); + + if (includeFrontMatter) { + const fm: string[] = ['---']; + if (title) fm.push(`title: "${title.replace(/"/g, '\\"')}"`); + fm.push(`url: "${request.nextUrl.href}"`); + if (description) + fm.push(`description: "${description.replace(/"/g, '\\"')}"`); + fm.push(`generated: "${new Date().toISOString()}"`); + fm.push('---\n'); + markdown = `${fm.join('\n')}\n${markdown}`; + } + + const tokenCount = estimateTokens(markdown); + + return new Response(request.method === 'HEAD' ? null : markdown, { + status: 200, + headers: { + 'Content-Type': 'text/markdown; charset=utf-8', + Vary: 'Accept', + 'x-markdown-tokens': String(tokenCount), + 'Content-Signal': contentSignal, + 'Cache-Control': cacheControl, + }, + }); + } catch { + return NextResponse.next(); + } + }; +} + +export const middleware = createMarkdownMiddleware(); + +export const config = { + matcher: [ + '/((?!_next/static|_next/image|favicon.ico|.*\\.(?:svg|png|jpg|jpeg|gif|webp|ico|css|js|woff|woff2|ttf|map)$).*)', + ], +}; + +export { builtinHtmlToMarkdown, defaultEstimateTokens }; diff --git a/packages/middleware/tsconfig.build.json b/packages/middleware/tsconfig.build.json new file mode 100644 index 0000000..6feccc5 --- /dev/null +++ b/packages/middleware/tsconfig.build.json @@ -0,0 +1,13 @@ +{ + "extends": "./tsconfig.json", + "compilerOptions": { + "rootDir": "./src", + "outDir": "./lib", + "declaration": true, + "declarationMap": true, + "sourceMap": true, + "noEmit": false + }, + "include": ["src/**/*.ts"] +} + diff --git a/packages/middleware/tsconfig.json b/packages/middleware/tsconfig.json new file mode 100644 index 0000000..f7a8479 --- /dev/null +++ b/packages/middleware/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + "rootDir": ".", + "outDir": "./lib" + }, + "include": ["src/**/*.ts"] +} + diff --git a/packages/next/README.md b/packages/next/README.md index 984fd34..ebcfcd4 100644 --- a/packages/next/README.md +++ b/packages/next/README.md @@ -46,3 +46,12 @@ export const GET = createLlmsTxtHandler({ }) ``` +## Used with `@llmstxt/middleware` (optional) + +If you also want every page to respond with Markdown on `Accept: text/markdown`, add: + +```ts +// middleware.ts +export { middleware, config } from '@llmstxt/middleware' +``` +