From c05f62e9548d219308abf45462fe2969a834eef1 Mon Sep 17 00:00:00 2001 From: Ross Date: Wed, 31 Dec 2025 11:47:11 -0500 Subject: [PATCH 1/5] Feat: add AI artifacts to build and copy to markdown button --- README.md | 9 + bun.lock | 8 + package.json | 4 +- scripts/fetchedAddressData.json | 2 +- scripts/generateAiArtifacts.ts | 387 +++++++++++++++++++++ src/components/DocMarkdownCopyButton.tsx | 178 ++++++++++ src/css/docCopyButton.module.css | 36 ++ src/theme/DocItem/Content/index.js | 31 ++ src/theme/DocItem/Layout/index.js | 77 +++- src/theme/DocItem/Layout/styles.module.css | 34 ++ 10 files changed, 760 insertions(+), 6 deletions(-) create mode 100644 scripts/generateAiArtifacts.ts create mode 100644 src/components/DocMarkdownCopyButton.tsx create mode 100644 src/css/docCopyButton.module.css create mode 100644 src/theme/DocItem/Content/index.js create mode 100644 src/theme/DocItem/Layout/styles.module.css diff --git a/README.md b/README.md index e491ec2b3..5c0ca6343 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,15 @@ This command generates static content into the `build` directory and can be serv bun run build ``` +## AI / agent exports + +The build also generates AI-friendly artifacts that will be served by the hosted site: + +- `build/llms.txt` (served as `/llms.txt`) +- `build/ai/manifest.json` (served as `/ai/manifest.json`) +- `build/ai/docs.jsonl` (served as `/ai/docs.jsonl`) +- `build/ai/raw/` (served as `/ai/raw/`) + ## Configure .env The docs site pulls data from on-chain smart contracts, so an API key is necessary. The default is an Alchemy API key so the easiest thing to do is get a free api key from them at https://www.alchemy.com/pricing. diff --git a/bun.lock b/bun.lock index 135a28eb8..9cdcefee7 100644 --- a/bun.lock +++ b/bun.lock @@ -27,6 +27,8 @@ "remark-math": "^6.0.0", "solc": "^0.8.31", "solidity-docgen": "^0.5.17", + "turndown": "^7.1.2", + "turndown-plugin-gfm": "^1.0.2", "viem": "^2.41.2", }, "devDependencies": { @@ -546,6 +548,8 @@ "@mermaid-js/parser": ["@mermaid-js/parser@0.6.2", "", { "dependencies": { "langium": "3.3.1" } }, "sha512-+PO02uGF6L6Cs0Bw8RpGhikVvMWEysfAyl27qTlroUB8jSWr1lL0Sf6zi78ZxlSnmgSY2AMMKVgghnN9jTtwkQ=="], + "@mixmark-io/domino": ["@mixmark-io/domino@2.2.0", "", {}, "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw=="], + "@module-federation/error-codes": ["@module-federation/error-codes@0.21.4", "", {}, "sha512-ClpL5MereWNXh+EgDjz7w4RrC1JlisQTvXDa1gLxpviHafzNDfdViVmuhi9xXVuj+EYo8KU70Y999KHhk9424Q=="], "@module-federation/runtime": ["@module-federation/runtime@0.21.4", "", { "dependencies": { "@module-federation/error-codes": "0.21.4", "@module-federation/runtime-core": "0.21.4", "@module-federation/sdk": "0.21.4" } }, "sha512-wgvGqryurVEvkicufJmTG0ZehynCeNLklv8kIk5BLIsWYSddZAE+xe4xov1kgH5fIJQAoQNkRauFFjVNlHoAkA=="], @@ -2830,6 +2834,10 @@ "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], + "turndown": ["turndown@7.2.2", "", { "dependencies": { "@mixmark-io/domino": "^2.2.0" } }, "sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ=="], + + "turndown-plugin-gfm": ["turndown-plugin-gfm@1.0.2", "", {}, "sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg=="], + "type-fest": ["type-fest@0.6.0", "", {}, "sha512-q+MB8nYR1KDLrgr4G5yemftpMC7/QLqVndBmEEdqzmNj5dcFOO4Oo8qlwZE3ULT3+Zim1F8Kq4cBnikNhlCMlg=="], "type-is": ["type-is@1.6.18", 
"", { "dependencies": { "media-typer": "0.3.0", "mime-types": "~2.1.24" } }, "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g=="], diff --git a/package.json b/package.json index adf5237c5..96b4f478f 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,7 @@ "get-branch-name": "git rev-parse --abbrev-ref HEAD > .branch-name", "start": "bun run runAddressCheck && bun run get-branch-name && BRANCH_NAME=$(cat .branch-name) IS_DEV=true docusaurus start", "start-no-check": "bun run get-branch-name && BRANCH_NAME=$(cat .branch-name) IS_DEV=true docusaurus start", - "build": "docusaurus build", + "build": "docusaurus build && bun scripts/generateAiArtifacts.ts", "swizzle": "docusaurus swizzle", "deploy": "docusaurus deploy", "clear": "docusaurus clear", @@ -45,6 +45,8 @@ "remark-math": "^6.0.0", "solc": "^0.8.31", "solidity-docgen": "^0.5.17", + "turndown": "^7.1.2", + "turndown-plugin-gfm": "^1.0.2", "viem": "^2.41.2" }, "browserslist": { diff --git a/scripts/fetchedAddressData.json b/scripts/fetchedAddressData.json index 87a536075..dd268c53d 100644 --- a/scripts/fetchedAddressData.json +++ b/scripts/fetchedAddressData.json @@ -1,5 +1,5 @@ { - "timeLastChecked": 1766510970, + "timeLastChecked": 1767199169, "addressesData": { "v3ContractAddresses": { "topLevel": { diff --git a/scripts/generateAiArtifacts.ts b/scripts/generateAiArtifacts.ts new file mode 100644 index 000000000..98557655f --- /dev/null +++ b/scripts/generateAiArtifacts.ts @@ -0,0 +1,387 @@ +import fs from 'fs' +import path from 'path' +import crypto from 'crypto' + +type Heading = { + level: number + id?: string + text: string +} + +type DocRecordV1 = { + schemaVersion: 1 + url: string + route: string + title: string + headings: Heading[] + text: string + sha256: string + updatedAt: string + source: { + htmlPath: string + } +} + +type ManifestV1 = { + schemaVersion: 1 + generatedAt: string + siteOrigin: string + docCount: number + docsJsonlPath: string + rawDocsPath: string +} + +function ensureDir(dirPath: string) { + fs.mkdirSync(dirPath, { recursive: true }) +} + +function decodeHtmlEntities(input: string) { + const named: Record = { + amp: '&', + lt: '<', + gt: '>', + quot: '"', + apos: "'", + nbsp: ' ', + } + + return input + .replace(/&([a-zA-Z]+);/g, (match, name: string) => named[name] ?? match) + .replace(/&#x([0-9a-fA-F]+);/g, (_, hex: string) => + String.fromCodePoint(Number.parseInt(hex, 16)) + ) + .replace(/&#([0-9]+);/g, (_, num: string) => + String.fromCodePoint(Number.parseInt(num, 10)) + ) +} + +function stripTags(input: string) { + return input.replace(/<[^>]+>/g, '') +} + +function normalizeText(input: string) { + return input + .replace(/\r\n/g, '\n') + .replace(/\u200B|\u200C|\u200D|\uFEFF/g, '') + .replace(/[ \t]+\n/g, '\n') + .replace(/\n{3,}/g, '\n\n') + .replace(/[ \t]{2,}/g, ' ') + .trim() +} + +function extractAttr(tagAttrs: string, name: string) { + const re = new RegExp( + `${name}=(?:"([^"]+)"|'([^']+)'|([^\\s>]+))`, + 'i' + ) + const match = tagAttrs.match(re) + return match?.[1] ?? match?.[2] ?? 
match?.[3] +} + +function absoluteUrl(origin: string, href: string, pageUrl?: string) { + if (!href) return href + if (href.startsWith('http://') || href.startsWith('https://')) return href + if (href.startsWith('//')) return `https:${href}` + if (href.startsWith('/')) return `${origin}${href}` + if (href.startsWith('#') && pageUrl) return `${pageUrl}${href}` + return href +} + +function extractCanonicalUrl(html: string) { + const match = html.match(/]*rel=canonical[^>]*>/i) + if (!match) return undefined + const href = extractAttr(match[0], 'href') + return href +} + +function extractDocHtml(html: string) { + const start = html.indexOf('
') + if (start < 0) return undefined + + const divTagRe = /<\/?div\b[^>]*>/gi + divTagRe.lastIndex = start + + let depth = 0 + let contentStart = -1 + + let match: RegExpExecArray | null + while ((match = divTagRe.exec(html))) { + const tag = match[0] ?? '' + const isOpen = /^= 0) { + const contentEnd = match.index + return html.slice(contentStart, contentEnd) + } + } + + return undefined +} + +function extractHeadings(docHtml: string) { + const headings: Heading[] = [] + const re = /]*)>([\s\S]*?)<\/h\1>/gi + let match: RegExpExecArray | null + + while ((match = re.exec(docHtml))) { + const level = Number.parseInt(match[1] ?? '0', 10) + const attrs = match[2] ?? '' + const inner = (match[3] ?? '').replace( + /]*class=hash-link[^>]*>[\s\S]*?<\/a>/gi, + '' + ) + const text = normalizeText(decodeHtmlEntities(stripTags(inner))) + if (!text) continue + const id = extractAttr(attrs, 'id') + headings.push({ level, id, text }) + } + + return headings +} + +function extractTitle(html: string, docHtml?: string) { + if (docHtml) { + const h1 = docHtml.match(/]*>([\s\S]*?)<\/h1>/i) + if (h1?.[1]) return normalizeText(decodeHtmlEntities(stripTags(h1[1]))) + } + const ogTitle = html.match(/]*property=og:title[^>]*>/i) + if (ogTitle) { + const content = extractAttr(ogTitle[0], 'content') + if (content) return normalizeText(decodeHtmlEntities(content)) + } + const title = html.match(/]*>([\s\S]*?)<\/title>/i) + if (title?.[1]) return normalizeText(decodeHtmlEntities(stripTags(title[1]))) + return 'Untitled' +} + +function htmlToPlainText(docHtml: string, origin: string, pageUrl: string) { + let s = docHtml + + s = s.replace( + /]*class=hash-link[^>]*>[\s\S]*?<\/a>/gi, + '' + ) + + s = s.replace( + /]*>\s*]*)>([\s\S]*?)<\/code>\s*<\/pre>/gi, + (_, codeAttrs: string, codeInner: string) => { + const classAttr = extractAttr(codeAttrs ?? '', 'class') ?? '' + const langMatch = classAttr.match(/language-([a-zA-Z0-9_-]+)/) + const lang = langMatch?.[1] ?? '' + const code = normalizeText(decodeHtmlEntities(stripTags(codeInner))) + const fence = lang ? `\n\n\`\`\`${lang}\n${code}\n\`\`\`\n\n` : `\n\n\`\`\`\n${code}\n\`\`\`\n\n` + return fence + } + ) + + s = s.replace(/]*)>/gi, (_, attrs: string) => { + const alt = decodeHtmlEntities(extractAttr(attrs ?? '', 'alt') ?? 'image') + const src = extractAttr(attrs ?? '', 'src') ?? '' + const abs = absoluteUrl(origin, src, pageUrl) + return `\n\n![${alt}](${abs})\n\n` + }) + + s = s.replace(/]*)>([\s\S]*?)<\/h\1>/gi, (_, lvl: string, _attrs: string, inner: string) => { + const level = Number.parseInt(lvl, 10) + const text = normalizeText(decodeHtmlEntities(stripTags(inner))) + if (!text) return '' + const hashes = '#'.repeat(Math.min(Math.max(level, 1), 6)) + return `\n\n${hashes} ${text}\n\n` + }) + + s = s.replace(/]*)>([\s\S]*?)<\/a>/gi, (_, attrs: string, inner: string) => { + const text = normalizeText(decodeHtmlEntities(stripTags(inner))) + const href = extractAttr(attrs ?? '', 'href') ?? 
'' + if (!href) return text + const abs = absoluteUrl(origin, href, pageUrl) + if (!text) return abs + if (text === abs) return abs + return `${text} (${abs})` + }) + + s = s.replace(/]*>([\s\S]*?)<\/code>/gi, (_, inner: string) => { + const text = normalizeText(decodeHtmlEntities(stripTags(inner))) + if (!text) return '' + return `\`${text}\`` + }) + + s = s.replace(/<\/(p|div|section|article)>/gi, '\n\n') + s = s.replace(/<(p|div|section|article)[^>]*>/gi, '\n\n') + s = s.replace(//gi, '\n') + + s = s.replace(/]*>/gi, '\n- ') + s = s.replace(/<\/li>/gi, '\n') + + s = s.replace(/<\/tr>/gi, '\n') + s = s.replace(/<\/t[hd]>/gi, ' | ') + + s = decodeHtmlEntities(stripTags(s)) + return normalizeText(s) +} + +function sha256(input: string) { + return crypto.createHash('sha256').update(input).digest('hex') +} + +function walkFiles(dirPath: string) { + const results: string[] = [] + const stack = [dirPath] + + while (stack.length) { + const current = stack.pop() + if (!current) continue + const entries = fs.readdirSync(current, { withFileTypes: true }) + for (const entry of entries) { + const full = path.join(current, entry.name) + if (entry.isDirectory()) stack.push(full) + else if (entry.isFile()) results.push(full) + } + } + + return results +} + +function copyRawDocs(docsDir: string, rawOutDir: string) { + if (!fs.existsSync(docsDir)) return + const files = walkFiles(docsDir) + for (const src of files) { + const ext = path.extname(src).toLowerCase() + if (ext !== '.md' && ext !== '.mdx') continue + const rel = path.relative(docsDir, src) + const dest = path.join(rawOutDir, rel) + ensureDir(path.dirname(dest)) + fs.copyFileSync(src, dest) + } +} + +function main() { + const workspaceRoot = process.cwd() + const buildDir = path.join(workspaceRoot, 'build') + const outDir = path.join(buildDir, 'ai') + const rawOutDir = path.join(outDir, 'raw') + const docsDir = path.join(workspaceRoot, 'docs') + + if (!fs.existsSync(buildDir)) { + throw new Error( + `Missing ${buildDir}. Run \`bun run build\` before generating AI artifacts.` + ) + } + + const siteOrigin = (process.env.DOCS_URL ?? 'https://docs.yearn.fi').replace( + /\/+$/, + '' + ) + + fs.rmSync(outDir, { recursive: true, force: true }) + ensureDir(outDir) + ensureDir(rawOutDir) + + const htmlFiles = walkFiles(buildDir).filter((p) => { + if (!p.endsWith('index.html')) return false + const rel = path.relative(buildDir, p).replace(/\\/g, '/') + if (rel.startsWith('assets/') || rel.startsWith('fonts/')) return false + return true + }) + + const docsJsonlPath = path.join(outDir, 'docs.jsonl') + const docsJsonlStream = fs.createWriteStream(docsJsonlPath, { + encoding: 'utf-8', + }) + + let docCount = 0 + for (const htmlPath of htmlFiles) { + const html = fs.readFileSync(htmlPath, 'utf-8') + if (!html.includes('docs-doc-page')) continue + if (!html.includes('theme-doc-markdown')) continue + + const canonical = extractCanonicalUrl(html) + const pageUrl = + canonical && canonical.startsWith('http') + ? canonical + : canonical + ? absoluteUrl(siteOrigin, canonical) + : undefined + + const relHtmlPath = path.relative(buildDir, htmlPath).replace(/\\/g, '/') + + const route = + pageUrl && pageUrl.startsWith(siteOrigin) + ? 
pageUrl.slice(siteOrigin.length) || '/' + : (() => { + const rel = relHtmlPath.replace(/index\.html$/i, '') + return `/${rel}`.replace(/\\/g, '/').replace(/\/+$/, '') || '/' + })() + + const docHtml = extractDocHtml(html) + if (!docHtml) continue + + const title = extractTitle(html, docHtml) + const headings = extractHeadings(docHtml) + const absUrl = pageUrl ?? absoluteUrl(siteOrigin, route) + const text = htmlToPlainText(docHtml, siteOrigin, absUrl) + const stat = fs.statSync(htmlPath) + + const record: DocRecordV1 = { + schemaVersion: 1, + url: absUrl, + route, + title, + headings, + text, + sha256: sha256(text), + updatedAt: stat.mtime.toISOString(), + source: { htmlPath: relHtmlPath }, + } + + docsJsonlStream.write(`${JSON.stringify(record)}\n`) + docCount += 1 + } + + docsJsonlStream.end() + + copyRawDocs(docsDir, rawOutDir) + + const manifest: ManifestV1 = { + schemaVersion: 1, + generatedAt: new Date().toISOString(), + siteOrigin, + docCount, + docsJsonlPath: '/ai/docs.jsonl', + rawDocsPath: '/ai/raw/', + } + fs.writeFileSync( + path.join(outDir, 'manifest.json'), + `${JSON.stringify(manifest, null, 2)}\n` + ) + + fs.writeFileSync( + path.join(buildDir, 'llms.txt'), + [ + '# Yearn Docs (docs.yearn.fi)', + '', + 'AI-readable exports:', + '- Manifest: /ai/manifest.json', + '- Plaintext corpus (JSONL): /ai/docs.jsonl', + '- Raw docs sources (MD/MDX): /ai/raw/ (mirrors repository `docs/`)', + '', + 'Notes:', + '- Prefer citing canonical page URLs on https://docs.yearn.fi', + '- Use plaintext for retrieval; fall back to raw MDX/MD for exact formatting/quotes', + '', + ].join('\n') + ) + + // eslint-disable-next-line no-console + console.log(`Generated AI artifacts: ${docCount} docs -> ${path.relative(workspaceRoot, outDir)}`) +} + +main() diff --git a/src/components/DocMarkdownCopyButton.tsx b/src/components/DocMarkdownCopyButton.tsx new file mode 100644 index 000000000..5d7dc8408 --- /dev/null +++ b/src/components/DocMarkdownCopyButton.tsx @@ -0,0 +1,178 @@ +import React, { useCallback, useEffect, useRef, useState } from 'react' +import { Button } from '@site/src/components/shadcn/button/button' +import styles from '@site/src/css/docCopyButton.module.css' +import { Check, Copy, AlertTriangle } from 'lucide-react' + +const RESET_DELAY_MS = 2200 + +const cleanupDocContent = (root: HTMLElement) => { + const selectorsToRemove = [ + '.hash-link', + '.theme-code-block__copy-button', + '.theme-code-block__button', + 'button', + 'svg', + 'style', + 'script', + 'input', + 'textarea', + 'select', + ] + + root.querySelectorAll(selectorsToRemove.join(',')).forEach((node) => { + node.remove() + }) +} + +const getCodeLanguage = (codeEl: HTMLElement | null) => { + if (!codeEl) return '' + const direct = + codeEl.getAttribute('data-language') || + codeEl.parentElement?.getAttribute('data-language') + if (direct) return direct + const className = codeEl.getAttribute('class') || '' + const match = className.match(/language-([^\s]+)/) + return match ? 
match[1] : '' +} + +const toMarkdown = async (root: HTMLElement) => { + const [turndownModule, gfmModule] = await Promise.all([ + import('turndown'), + import('turndown-plugin-gfm'), + ]) + const TurndownService = + (turndownModule as any).default || (turndownModule as any) + const turndownService = new TurndownService({ + codeBlockStyle: 'fenced', + headingStyle: 'atx', + bulletListMarker: '-', + }) + const gfm = + (gfmModule as any).gfm || (gfmModule as any).default || (gfmModule as any) + if (gfm) { + turndownService.use(gfm) + } + + turndownService.addRule('fencedCodeBlockWithLanguage', { + filter: (node) => { + if (!(node instanceof HTMLElement)) return false + return ( + node.nodeName === 'PRE' && + node.firstElementChild?.nodeName === 'CODE' + ) + }, + replacement: (_content, node) => { + const pre = node as HTMLElement + const code = pre.querySelector('code') + const language = getCodeLanguage(code) + const text = code?.textContent || pre.textContent || '' + const trimmed = text.replace(/\n$/, '') + const fence = '```' + return `\n\n${fence}${language ? language : ''}\n${trimmed}\n${fence}\n\n` + }, + }) + + const markdown = turndownService.turndown(root) + return markdown.trim() +} + +const writeToClipboard = async (text: string) => { + if (navigator.clipboard && window.isSecureContext) { + await navigator.clipboard.writeText(text) + return + } + + const textarea = document.createElement('textarea') + textarea.value = text + textarea.setAttribute('readonly', '') + textarea.style.position = 'fixed' + textarea.style.left = '-9999px' + textarea.style.top = '0' + document.body.appendChild(textarea) + textarea.focus() + textarea.select() + const success = document.execCommand('copy') + textarea.remove() + if (!success) { + throw new Error('Clipboard copy failed') + } +} + +const DocMarkdownCopyButton = () => { + const [status, setStatus] = useState<'idle' | 'copying' | 'copied' | 'error'>( + 'idle' + ) + const timeoutRef = useRef(null) + + useEffect(() => { + return () => { + if (timeoutRef.current) { + window.clearTimeout(timeoutRef.current) + } + } + }, []) + + const resetStatus = () => { + if (timeoutRef.current) { + window.clearTimeout(timeoutRef.current) + } + timeoutRef.current = window.setTimeout(() => { + setStatus('idle') + }, RESET_DELAY_MS) + } + + const handleCopy = useCallback(async () => { + try { + setStatus('copying') + const docContent = document.querySelector( + '.theme-doc-markdown' + ) as HTMLElement | null + if (!docContent) { + throw new Error('Doc content not found') + } + const clone = docContent.cloneNode(true) as HTMLElement + cleanupDocContent(clone) + const markdown = await toMarkdown(clone) + await writeToClipboard(markdown) + setStatus('copied') + resetStatus() + } catch (error) { + console.error('Failed to copy markdown', error) + setStatus('error') + resetStatus() + } + }, []) + + const label = + status === 'copying' + ? 'Copying...' + : status === 'copied' + ? 'Copied markdown' + : status === 'error' + ? 'Copy failed' + : 'Copy page as Markdown' + + const Icon = + status === 'copied' + ? Check + : status === 'error' + ? AlertTriangle + : Copy + + return ( +
+ +
+ ) +} + +export default DocMarkdownCopyButton diff --git a/src/css/docCopyButton.module.css b/src/css/docCopyButton.module.css new file mode 100644 index 000000000..0e2c09eae --- /dev/null +++ b/src/css/docCopyButton.module.css @@ -0,0 +1,36 @@ +.copyToolbar { + display: flex; + justify-content: flex-start; + align-items: center; + margin: 0; + padding-left: var( + --doc-copy-padding-horizontal, + calc(var(--ifm-toc-padding-horizontal) * 2) + ); + padding-right: var( + --doc-copy-padding-horizontal, + calc(var(--ifm-toc-padding-horizontal) * 2) + ); +} + +.copyButton { + gap: 0.5rem; + max-width: none; + border: none; + background: transparent; + font-weight: 400; + padding: 0.25rem 0; + color: var(--ifm-font-color-base); + text-decoration: none; +} + +.copyButton.copyButton:hover { + background: transparent; + color: var(--ifm-color-primary); + text-decoration: none; +} + +.copyButton.copyButton:focus-visible { + box-shadow: none; + text-decoration: underline; +} diff --git a/src/theme/DocItem/Content/index.js b/src/theme/DocItem/Content/index.js new file mode 100644 index 000000000..14ab46956 --- /dev/null +++ b/src/theme/DocItem/Content/index.js @@ -0,0 +1,31 @@ +import React from 'react' +import clsx from 'clsx' +import { ThemeClassNames } from '@docusaurus/theme-common' +import { useDoc } from '@docusaurus/plugin-content-docs/client' +import Heading from '@theme/Heading' +import MDXContent from '@theme/MDXContent' + +const useSyntheticTitle = () => { + const { metadata, frontMatter, contentTitle } = useDoc() + const shouldRender = + !frontMatter.hide_title && typeof contentTitle === 'undefined' + if (!shouldRender) { + return null + } + return metadata.title +} + +export default function DocItemContent({ children }) { + const syntheticTitle = useSyntheticTitle() + + return ( +
+ {syntheticTitle && ( +
+ {syntheticTitle} +
+ )} + {children} +
+ ) +} diff --git a/src/theme/DocItem/Layout/index.js b/src/theme/DocItem/Layout/index.js index 5ae7953b8..f8ae7200e 100644 --- a/src/theme/DocItem/Layout/index.js +++ b/src/theme/DocItem/Layout/index.js @@ -1,16 +1,85 @@ import React from 'react' -import Layout from '@theme-original/DocItem/Layout' +import clsx from 'clsx' +import { useWindowSize } from '@docusaurus/theme-common' import { useDoc } from '@docusaurus/plugin-content-docs/client' +import DocItemPaginator from '@theme/DocItem/Paginator' +import DocVersionBanner from '@theme/DocVersionBanner' +import DocVersionBadge from '@theme/DocVersionBadge' +import DocItemFooter from '@theme/DocItem/Footer' +import DocItemTOCMobile from '@theme/DocItem/TOC/Mobile' +import DocItemTOCDesktop from '@theme/DocItem/TOC/Desktop' +import DocItemContent from '@theme/DocItem/Content' +import DocBreadcrumbs from '@theme/DocBreadcrumbs' +import ContentVisibility from '@theme/ContentVisibility' import { ContractDataProvider } from '@site/src/context/ContractDataContext' +import DocMarkdownCopyButton from '@site/src/components/DocMarkdownCopyButton' +import styles from './styles.module.css' -export default function LayoutWrapper(props) { +const useDocTOC = () => { + const { frontMatter, toc } = useDoc() + const windowSize = useWindowSize() + + const hidden = frontMatter.hide_table_of_contents + const canRender = !hidden && toc.length > 0 + + const mobile = canRender ? : undefined + + const desktop = + canRender && (windowSize === 'desktop' || windowSize === 'ssr') ? ( + + ) : undefined + + return { + hidden, + mobile, + desktop, + } +} + +const DocItemLayoutContent = ({ children }) => { + const docTOC = useDocTOC() + const { metadata } = useDoc() + + return ( +
+
+ + +
+
+ + + {docTOC.mobile} + {docTOC.mobile && ( +
+ +
+ )} + {children} + +
+ +
+
+ {docTOC.desktop && ( +
+ + {docTOC.desktop} +
+ )} +
+ ) +} + +export default function DocItemLayoutWrapper(props) { const { frontMatter } = useDoc() + const content = return frontMatter.rpcCalls ? ( - + {content} ) : ( - + content ) } diff --git a/src/theme/DocItem/Layout/styles.module.css b/src/theme/DocItem/Layout/styles.module.css new file mode 100644 index 000000000..bf14f3665 --- /dev/null +++ b/src/theme/DocItem/Layout/styles.module.css @@ -0,0 +1,34 @@ +.docItemContainer header + *, +.docItemContainer article > *:first-child { + margin-top: 0; +} + +@media (min-width: 997px) { + .docItemCol { + max-width: 75% !important; + } +} + +.docSidebar { + display: flex; + flex-direction: column; + gap: 0.75rem; +} + +@media (max-width: 996px) { + .docSidebar { + display: none; + } + + .mobileCopyButton { + margin-top: -0.5rem; + margin-bottom: 1rem; + --doc-copy-padding-horizontal: var(--ifm-toc-padding-horizontal); + } +} + +@media (min-width: 997px) { + .mobileCopyButton { + display: none; + } +} From 9d282b853fe9b25237d42e3aab8711d2d1e2aa07 Mon Sep 17 00:00:00 2001 From: Ross Date: Fri, 2 Jan 2026 10:40:47 -0500 Subject: [PATCH 2/5] Feat: update schema version and enhance AI document retrieval capabilities --- scripts/generateAiArtifacts.ts | 77 +++++++++++++++++++-------- skills/yearn-docs-site-query/SKILL.md | 69 ++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 22 deletions(-) create mode 100644 skills/yearn-docs-site-query/SKILL.md diff --git a/scripts/generateAiArtifacts.ts b/scripts/generateAiArtifacts.ts index 98557655f..098e69986 100644 --- a/scripts/generateAiArtifacts.ts +++ b/scripts/generateAiArtifacts.ts @@ -9,8 +9,9 @@ type Heading = { } type DocRecordV1 = { - schemaVersion: 1 + schemaVersion: 2 url: string + canonicalUrl?: string route: string title: string headings: Heading[] @@ -19,11 +20,12 @@ type DocRecordV1 = { updatedAt: string source: { htmlPath: string + rawPath?: string } } type ManifestV1 = { - schemaVersion: 1 + schemaVersion: 2 generatedAt: string siteOrigin: string docCount: number @@ -94,6 +96,21 @@ function extractCanonicalUrl(html: string) { return href } +function resolveSiteOrigin() { + const explicit = process.env.DOCS_URL?.trim() + if (explicit) return explicit.replace(/\/+$/, '') + + const vercel = process.env.VERCEL_URL?.trim() + if (vercel) { + const withProto = vercel.startsWith('http://') || vercel.startsWith('https://') + ? vercel + : `https://${vercel}` + return withProto.replace(/\/+$/, '') + } + + return 'https://docs.yearn.fi' +} + function extractDocHtml(html: string) { const start = html.indexOf('
') if (start < 0) return undefined @@ -263,6 +280,26 @@ function copyRawDocs(docsDir: string, rawOutDir: string) { } } +function findRawSourceRelativePath(docsDir: string, relHtmlPath: string) { + const rel = relHtmlPath.replace(/\\/g, '/') + + const candidates: string[] = [] + if (rel.endsWith('/index.html')) { + const dir = rel.slice(0, -'/index.html'.length) + candidates.push(`${dir}.md`, `${dir}.mdx`, `${dir}/index.md`, `${dir}/index.mdx`) + } else if (rel.endsWith('index.html')) { + const base = rel.slice(0, -'index.html'.length).replace(/\/+$/, '') + candidates.push(`${base}.md`, `${base}.mdx`, `${base}/index.md`, `${base}/index.mdx`) + } + + for (const candidate of candidates) { + const full = path.join(docsDir, candidate) + if (fs.existsSync(full) && fs.statSync(full).isFile()) return candidate + } + + return undefined +} + function main() { const workspaceRoot = process.cwd() const buildDir = path.join(workspaceRoot, 'build') @@ -276,10 +313,7 @@ function main() { ) } - const siteOrigin = (process.env.DOCS_URL ?? 'https://docs.yearn.fi').replace( - /\/+$/, - '' - ) + const siteOrigin = resolveSiteOrigin() fs.rmSync(outDir, { recursive: true, force: true }) ensureDir(outDir) @@ -303,19 +337,13 @@ function main() { if (!html.includes('docs-doc-page')) continue if (!html.includes('theme-doc-markdown')) continue - const canonical = extractCanonicalUrl(html) - const pageUrl = - canonical && canonical.startsWith('http') - ? canonical - : canonical - ? absoluteUrl(siteOrigin, canonical) - : undefined + const canonicalUrl = extractCanonicalUrl(html) const relHtmlPath = path.relative(buildDir, htmlPath).replace(/\\/g, '/') const route = - pageUrl && pageUrl.startsWith(siteOrigin) - ? pageUrl.slice(siteOrigin.length) || '/' + canonicalUrl && canonicalUrl.startsWith(siteOrigin) + ? canonicalUrl.slice(siteOrigin.length) || '/' : (() => { const rel = relHtmlPath.replace(/index\.html$/i, '') return `/${rel}`.replace(/\\/g, '/').replace(/\/+$/, '') || '/' @@ -326,20 +354,23 @@ function main() { const title = extractTitle(html, docHtml) const headings = extractHeadings(docHtml) - const absUrl = pageUrl ?? absoluteUrl(siteOrigin, route) - const text = htmlToPlainText(docHtml, siteOrigin, absUrl) + const url = absoluteUrl(siteOrigin, route) + const rawRel = findRawSourceRelativePath(docsDir, relHtmlPath) + const rawPath = rawRel ? `/ai/raw/${rawRel}` : undefined + const text = htmlToPlainText(docHtml, siteOrigin, url) const stat = fs.statSync(htmlPath) const record: DocRecordV1 = { - schemaVersion: 1, - url: absUrl, + schemaVersion: 2, + url, + canonicalUrl: canonicalUrl && canonicalUrl.startsWith('http') ? 
canonicalUrl : undefined, route, title, headings, text, sha256: sha256(text), updatedAt: stat.mtime.toISOString(), - source: { htmlPath: relHtmlPath }, + source: { htmlPath: relHtmlPath, rawPath }, } docsJsonlStream.write(`${JSON.stringify(record)}\n`) @@ -351,7 +382,7 @@ function main() { copyRawDocs(docsDir, rawOutDir) const manifest: ManifestV1 = { - schemaVersion: 1, + schemaVersion: 2, generatedAt: new Date().toISOString(), siteOrigin, docCount, @@ -366,7 +397,9 @@ function main() { fs.writeFileSync( path.join(buildDir, 'llms.txt'), [ - '# Yearn Docs (docs.yearn.fi)', + '# Yearn Docs', + '', + `Site: ${siteOrigin}`, '', 'AI-readable exports:', '- Manifest: /ai/manifest.json', diff --git a/skills/yearn-docs-site-query/SKILL.md b/skills/yearn-docs-site-query/SKILL.md new file mode 100644 index 000000000..e9de1bcc2 --- /dev/null +++ b/skills/yearn-docs-site-query/SKILL.md @@ -0,0 +1,69 @@ +--- +name: yearn-docs-site-query +description: Query and cite Yearn documentation via the hosted docs site's AI exports (llms.txt, /ai/manifest.json, /ai/docs.jsonl, and optional /ai/raw/). Use when answering questions from docs.yearn.fi (or a Vercel preview) without cloning the repo, and when you need reliable retrieval + citations from the published docs. +--- + +# Yearn Docs: hosted retrieval + +## Inputs + +- `BASE`: The docs site origin to query (e.g. a Vercel preview or `https://docs.yearn.fi`). + +Example: + +```text +BASE=https://yearn-docs-git-feat-copy-page-content-to-markdown-yearn.vercel.app +``` + +## Endpoints (relative to `BASE`) + +- `GET /llms.txt` — human/agent pointer file. +- `GET /ai/manifest.json` — machine-readable pointers (paths, counts, origin). +- `GET /ai/docs.jsonl` — plaintext corpus (JSON Lines: 1 JSON object per doc page). +- `GET /ai/raw/...` — optional raw `.md`/`.mdx` source mirror (if present in corpus records). + +## Retrieval workflow + +1. Fetch `GET {BASE}/ai/manifest.json` and `GET {BASE}/llms.txt`. +2. Download `GET {BASE}/ai/docs.jsonl` and parse it as JSONL (stream line-by-line). +3. Rank candidate records for the user query: + - Tokenize the query and score matches. + - Boost matches in `title` and `headings[].text` over matches in `text`. +4. Select the top records (e.g. 3–8), then select the best sections: + - Split `text` by headings and keep the most relevant chunks. +5. Answer using only the selected chunks (don’t hallucinate fields not present in the docs). + +## Record shape (docs.jsonl) + +Expect at least: + +- `title`: page title +- `headings`: array of `{ level, id?, text }` +- `text`: extracted plaintext (may include fenced code blocks) +- `route`: the path on the site host (e.g. `/developers/addresses`) +- `url`: canonical URL for the page (often on `https://docs.yearn.fi/...`) +- `source.htmlPath`: where it came from in the static build + +Optional/newer: + +- `canonicalUrl` +- `source.rawPath`: path under `/ai/raw/...` for exact `.md`/`.mdx` + +## Citing and linking + +- Prefer citing the canonical URL in the record: + - Use `canonicalUrl` if present, else `url`. +- If `url` points to the wrong origin for the environment you’re using, cite `BASE + route`. + +## Exact-source fallback (for precise quotes/formatting) + +When you need exact formatting, tables, or the original MDX: + +1. If `source.rawPath` exists: `GET {BASE}{source.rawPath}` and quote from that. +2. Otherwise fetch the rendered page: `GET {BASE}{route}` and extract the relevant section from HTML. + +## Caching + +- Cache `GET {BASE}/ai/docs.jsonl` per session. 
+- If you can, revalidate with `ETag`/`If-None-Match` on subsequent runs. + From e989e59c163510f86dc026ccf62c21247adee2fc Mon Sep 17 00:00:00 2001 From: Ross Date: Fri, 2 Jan 2026 10:41:51 -0500 Subject: [PATCH 3/5] update docs base URL in skill --- skills/yearn-docs-site-query/SKILL.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/skills/yearn-docs-site-query/SKILL.md b/skills/yearn-docs-site-query/SKILL.md index e9de1bcc2..e3d82085f 100644 --- a/skills/yearn-docs-site-query/SKILL.md +++ b/skills/yearn-docs-site-query/SKILL.md @@ -12,7 +12,7 @@ description: Query and cite Yearn documentation via the hosted docs site's AI ex Example: ```text -BASE=https://yearn-docs-git-feat-copy-page-content-to-markdown-yearn.vercel.app +BASE=https://docs.yearn.fi ``` ## Endpoints (relative to `BASE`) @@ -66,4 +66,3 @@ When you need exact formatting, tables, or the original MDX: - Cache `GET {BASE}/ai/docs.jsonl` per session. - If you can, revalidate with `ETag`/`If-None-Match` on subsequent runs. - From cde68ce2a2412235cdef3af74ecd3ac09bef1a99 Mon Sep 17 00:00:00 2001 From: Ross Date: Fri, 2 Jan 2026 12:32:13 -0500 Subject: [PATCH 4/5] improve skill --- skills/yearn-docs-site-query/SKILL.md | 54 +- .../scripts/yearn_docs_query.py | 526 ++++++++++++++++++ 2 files changed, 579 insertions(+), 1 deletion(-) create mode 100644 skills/yearn-docs-site-query/scripts/yearn_docs_query.py diff --git a/skills/yearn-docs-site-query/SKILL.md b/skills/yearn-docs-site-query/SKILL.md index e3d82085f..09695d34c 100644 --- a/skills/yearn-docs-site-query/SKILL.md +++ b/skills/yearn-docs-site-query/SKILL.md @@ -7,7 +7,7 @@ description: Query and cite Yearn documentation via the hosted docs site's AI ex ## Inputs -- `BASE`: The docs site origin to query (e.g. a Vercel preview or `https://docs.yearn.fi`). +- `BASE` (optional): The docs site origin to query (e.g. a Vercel preview). Defaults to `https://docs.yearn.fi`. Example: @@ -15,6 +15,58 @@ Example: BASE=https://docs.yearn.fi ``` +Tip: if you use a bash one-liner like `BASE=... python3 ... --base "$BASE" ...`, `$BASE` expands before the assignment takes effect. Prefer either `--base 'https://...'` directly, or `export BASE=...` first, or omit `--base` and rely on the default. + +## One-approval workflow (recommended) + +To avoid repeated network approvals in restricted environments, use the bundled helper. 
It auto-checks for updates (conditional request) and only downloads `docs.jsonl` when it has changed, so you typically need only **one approval per invocation**: + +```bash +python3 skills/yearn-docs-site-query/scripts/yearn_docs_query.py --base "$BASE" search "yCHAD multisig signers" +``` + +If you want absolutely no network calls, force offline mode (no network at all): + +```bash +python3 skills/yearn-docs-site-query/scripts/yearn_docs_query.py --base "$BASE" --offline search "veYFI gauge" +``` + +To fetch a page by route: + +```bash +python3 skills/yearn-docs-site-query/scripts/yearn_docs_query.py --base "$BASE" get /developers/security/multisig +``` + +To clear cached files for that `BASE`: + +```bash +python3 skills/yearn-docs-site-query/scripts/yearn_docs_query.py --base "$BASE" cleanup +``` + +To see whether your cached data is stale (age/etag/manifest info): + +```bash +python3 skills/yearn-docs-site-query/scripts/yearn_docs_query.py --base "$BASE" status +``` + +To check the remote for updates and refresh the cache if needed (will require network approval in restricted environments): + +```bash +python3 skills/yearn-docs-site-query/scripts/yearn_docs_query.py --base "$BASE" --check-updates status +``` + +To skip update checks and use cached data (if present): + +```bash +python3 skills/yearn-docs-site-query/scripts/yearn_docs_query.py --base "$BASE" --no-auto-update search "multisig signers" +``` + +To force a full re-download (even if unchanged): + +```bash +python3 skills/yearn-docs-site-query/scripts/yearn_docs_query.py --base "$BASE" --refresh status +``` + ## Endpoints (relative to `BASE`) - `GET /llms.txt` — human/agent pointer file. diff --git a/skills/yearn-docs-site-query/scripts/yearn_docs_query.py b/skills/yearn-docs-site-query/scripts/yearn_docs_query.py new file mode 100644 index 000000000..f288c9cbe --- /dev/null +++ b/skills/yearn-docs-site-query/scripts/yearn_docs_query.py @@ -0,0 +1,526 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import pathlib +import re +import sys +import time +import urllib.error +import urllib.request +from dataclasses import dataclass +from typing import Any, Dict, Iterable, List, Optional, Tuple + + +def _normalize_base(base: str) -> str: + base = base.strip() + if not base: + raise ValueError("BASE is empty") + if not base.startswith("http://") and not base.startswith("https://"): + base = "https://" + base + return base.rstrip("/") + + +def _sha256_hex(s: str) -> str: + return hashlib.sha256(s.encode("utf-8")).hexdigest() + + +def _default_cache_dir() -> pathlib.Path: + # Prefer XDG cache. + xdg = os.environ.get("XDG_CACHE_HOME") + if xdg: + return pathlib.Path(xdg) / "yearn-docs-site-query" + + home = os.path.expanduser("~") + if home and home != "~": + return pathlib.Path(home) / ".cache" / "yearn-docs-site-query" + + # Fallback: local relative cache. 
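+    # (resolves relative to the current working directory, used only when neither XDG_CACHE_HOME nor a usable HOME is available)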
+ return pathlib.Path(".yearn-docs-cache") + + +def _read_text(path: pathlib.Path) -> Optional[str]: + try: + return path.read_text("utf-8") + except FileNotFoundError: + return None + + +def _write_text(path: pathlib.Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(text, encoding="utf-8") + + +def _read_json(path: pathlib.Path) -> Optional[Dict[str, Any]]: + raw = _read_text(path) + if raw is None: + return None + return json.loads(raw) + + +def _write_json(path: pathlib.Path, obj: Dict[str, Any]) -> None: + _write_text(path, json.dumps(obj, indent=2, sort_keys=True) + "\n") + + +def _http_get( + url: str, + *, + headers: Optional[Dict[str, str]] = None, + timeout_s: int = 30, +) -> Tuple[int, Dict[str, str], bytes]: + req = urllib.request.Request(url, headers=headers or {}, method="GET") + try: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + status = getattr(resp, "status", 200) + resp_headers = {k.lower(): v for k, v in resp.headers.items()} + body = resp.read() + return status, resp_headers, body + except urllib.error.HTTPError as e: + status = e.code + resp_headers = {k.lower(): v for k, v in e.headers.items()} if e.headers else {} + body = e.read() if hasattr(e, "read") else b"" + return status, resp_headers, body + + +def _save_bytes(path: pathlib.Path, data: bytes) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(data) + + +def _load_bytes(path: pathlib.Path) -> Optional[bytes]: + try: + return path.read_bytes() + except FileNotFoundError: + return None + + +def _tokenize(query: str) -> List[str]: + query = query.lower().strip() + if not query: + return [] + return [t for t in re.split(r"[^\w]+", query) if t] + + +@dataclass +class SearchHit: + score: int + title: str + url: str + route: str + snippet: str + + +def _score_record(record: Dict[str, Any], tokens: List[str]) -> Tuple[int, str]: + title = (record.get("title") or "").lower() + headings = " ".join((h.get("text") or "") for h in (record.get("headings") or [])).lower() + text = (record.get("text") or "").lower() + + score = 0 + coverage = 0 + for tok in tokens: + in_title = title.count(tok) + in_headings = headings.count(tok) + in_text = text.count(tok) + + if in_title or in_headings or in_text: + coverage += 1 + + score += 10 * in_title + score += 6 * in_headings + score += 1 * in_text + + # Prefer documents that match more distinct query tokens, and penalize partial matches. 
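+    # Net effect: score = 10*title_hits + 6*heading_hits + 1*text_hits + 200*matched_query_tokens - 300*missing_query_tokens.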
+ score += 200 * coverage + missing = max(0, len(tokens) - coverage) + score -= 300 * missing + + snippet = "" + if tokens: + tok = tokens[0] + idx = text.find(tok) + if idx >= 0: + start = max(0, idx - 120) + end = min(len(text), idx + 240) + snippet = (record.get("text") or "")[start:end].replace("\n", " ").strip() + return score, snippet + + +def _iter_jsonl(path: pathlib.Path) -> Iterable[Dict[str, Any]]: + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + yield json.loads(line) + + +def _fetch_corpus( + *, + base: str, + cache_dir: pathlib.Path, + refresh: bool, + offline: bool, + check_updates: bool, + auto_update: bool, +) -> Tuple[pathlib.Path, Dict[str, Any]]: + base_key = _sha256_hex(base)[:12] + root = cache_dir / base_key + root.mkdir(parents=True, exist_ok=True) + + manifest_path = root / "manifest.json" + docs_path = root / "docs.jsonl" + meta_path = root / "meta.json" + + meta = _read_json(meta_path) or {} + + if offline: + manifest = _read_json(manifest_path) + if manifest is None: + raise SystemExit(f"Offline mode: missing cached manifest at {manifest_path}") + if not docs_path.exists(): + raise SystemExit(f"Offline mode: missing cached corpus at {docs_path}") + return docs_path, manifest + + # Cache-first: avoid network if we already have a corpus and aren't asked to check/refresh. + cached_manifest = _read_json(manifest_path) + if ( + not refresh + and not check_updates + and not auto_update + and cached_manifest is not None + and docs_path.exists() + ): + return docs_path, cached_manifest + + # Determine the docs.jsonl URL without necessarily fetching the manifest. + docs_rel = (cached_manifest or {}).get("docsJsonlPath") or meta.get("docsRel") or "/ai/docs.jsonl" + if not str(docs_rel).startswith("/"): + docs_rel = "/" + str(docs_rel) + docs_url = f"{base}{docs_rel}" + + # Auto-update: do a conditional GET against docs.jsonl (ETag / Last-Modified) and download only if changed. + if auto_update and not refresh and docs_path.exists(): + headers: Dict[str, str] = {} + if meta.get("etag"): + headers["If-None-Match"] = meta["etag"] + if meta.get("lastModified"): + headers["If-Modified-Since"] = meta["lastModified"] + + if headers: + status, resp_headers, body = _http_get(docs_url, headers=headers, timeout_s=60) + if status == 304: + meta["checkedAt"] = int(time.time()) + _write_json(meta_path, meta) + return docs_path, cached_manifest or {} + if status == 200: + _save_bytes(docs_path, body) + meta["etag"] = resp_headers.get("etag", meta.get("etag")) + meta["lastModified"] = resp_headers.get("last-modified", meta.get("lastModified")) + meta["fetchedAt"] = int(time.time()) + meta["checkedAt"] = meta["fetchedAt"] + meta["docsUrl"] = docs_url + meta["docsRel"] = docs_rel + _write_json(meta_path, meta) + return docs_path, cached_manifest or {} + + # If the update check fails, fall back to cached data instead of failing the whole query. 
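+            # Reached when the conditional GET returns anything other than 200/304; warn and keep serving the cached corpus.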
+ if cached_manifest is not None and docs_path.exists(): + print(f"warning: update check failed ({status}); using cached corpus", file=sys.stderr) + meta["checkedAt"] = int(time.time()) + _write_json(meta_path, meta) + return docs_path, cached_manifest + + manifest_url = f"{base}/ai/manifest.json" + manifest: Dict[str, Any] + if check_updates or refresh or cached_manifest is None: + status, _headers, body = _http_get(manifest_url, timeout_s=30) + if status != 200: + raise SystemExit(f"Failed to fetch manifest ({status}): {manifest_url}") + _save_bytes(manifest_path, body) + manifest = json.loads(body.decode("utf-8")) + meta["manifestGeneratedAt"] = manifest.get("generatedAt") + meta["manifestDocCount"] = manifest.get("docCount") + meta["manifestUrl"] = manifest_url + else: + manifest = cached_manifest + + docs_rel = manifest.get("docsJsonlPath") or docs_rel or "/ai/docs.jsonl" + if not str(docs_rel).startswith("/"): + docs_rel = "/" + str(docs_rel) + docs_url = f"{base}{docs_rel}" + meta["docsUrl"] = docs_url + meta["docsRel"] = docs_rel + + if not refresh and docs_path.exists(): + etag = meta.get("etag") + if etag: + status2, headers2, body2 = _http_get( + docs_url, + headers={"If-None-Match": etag}, + timeout_s=60, + ) + if status2 == 304: + meta["checkedAt"] = int(time.time()) + _write_json(meta_path, meta) + return docs_path, manifest + if status2 == 200: + _save_bytes(docs_path, body2) + meta["etag"] = headers2.get("etag", etag) + meta["lastModified"] = headers2.get("last-modified", meta.get("lastModified")) + meta["fetchedAt"] = int(time.time()) + meta["checkedAt"] = meta["fetchedAt"] + _write_json(meta_path, meta) + return docs_path, manifest + raise SystemExit(f"Failed to refresh corpus ({status2}): {docs_url}") + + # No etag: keep cached unless forced. + return docs_path, manifest + + status3, headers3, body3 = _http_get(docs_url, timeout_s=120) + if status3 != 200: + raise SystemExit(f"Failed to fetch corpus ({status3}): {docs_url}") + _save_bytes(docs_path, body3) + meta["etag"] = headers3.get("etag") + meta["lastModified"] = headers3.get("last-modified") + meta["fetchedAt"] = int(time.time()) + meta["checkedAt"] = meta["fetchedAt"] + _write_json(meta_path, meta) + return docs_path, manifest + + +def _cmd_search(args: argparse.Namespace) -> int: + base = _normalize_base(args.base) + cache_dir = pathlib.Path(args.cache_dir) if args.cache_dir else _default_cache_dir() + + docs_path, _manifest = _fetch_corpus( + base=base, + cache_dir=cache_dir, + refresh=args.refresh, + offline=args.offline, + check_updates=args.check_updates, + auto_update=args.auto_update, + ) + + tokens = _tokenize(args.query) + if not tokens: + print("Empty query.") + return 2 + + hits: List[SearchHit] = [] + for record in _iter_jsonl(docs_path): + score, snippet = _score_record(record, tokens) + if score <= 0: + continue + hits.append( + SearchHit( + score=score, + title=record.get("title") or "", + url=record.get("url") or "", + route=record.get("route") or "", + snippet=snippet, + ) + ) + + hits.sort(key=lambda h: h.score, reverse=True) + hits = hits[: args.max_results] + + if args.format == "json": + print( + json.dumps( + [h.__dict__ for h in hits], + indent=2, + ) + ) + return 0 + + for i, h in enumerate(hits, 1): + print(f"{i}. 
{h.title}") + print(f" url: {h.url}") + print(f" route: {h.route}") + print(f" score: {h.score}") + if h.snippet: + print(f" snippet: {h.snippet}") + return 0 + + +def _cmd_get(args: argparse.Namespace) -> int: + base = _normalize_base(args.base) + cache_dir = pathlib.Path(args.cache_dir) if args.cache_dir else _default_cache_dir() + + docs_path, _manifest = _fetch_corpus( + base=base, + cache_dir=cache_dir, + refresh=args.refresh, + offline=args.offline, + check_updates=args.check_updates, + auto_update=args.auto_update, + ) + + target_route = args.route.strip() + if not target_route.startswith("/"): + target_route = "/" + target_route + + for record in _iter_jsonl(docs_path): + if (record.get("route") or "") == target_route: + if args.format == "json": + print(json.dumps(record, indent=2)) + else: + print(record.get("text") or "") + return 0 + + print(f"Not found: {target_route}") + return 1 + + +def _cmd_cleanup(args: argparse.Namespace) -> int: + base = _normalize_base(args.base) + cache_dir = pathlib.Path(args.cache_dir) if args.cache_dir else _default_cache_dir() + base_key = _sha256_hex(base)[:12] + root = cache_dir / base_key + if root.exists(): + for p in sorted(root.rglob("*"), reverse=True): + if p.is_file(): + p.unlink() + elif p.is_dir(): + try: + p.rmdir() + except OSError: + pass + try: + root.rmdir() + except OSError: + pass + print(f"Removed cache: {root}") + else: + print(f"No cache found: {root}") + return 0 + + +def _cmd_status(args: argparse.Namespace) -> int: + base = _normalize_base(args.base) + cache_dir = pathlib.Path(args.cache_dir) if args.cache_dir else _default_cache_dir() + base_key = _sha256_hex(base)[:12] + root = cache_dir / base_key + manifest_path = root / "manifest.json" + docs_path = root / "docs.jsonl" + meta_path = root / "meta.json" + + meta = _read_json(meta_path) or {} + manifest = _read_json(manifest_path) or {} + + def fmt_age(ts: Optional[int]) -> str: + if not ts: + return "unknown" + age = int(time.time()) - int(ts) + if age < 60: + return f"{age}s" + if age < 3600: + return f"{age//60}m" + if age < 86400: + return f"{age//3600}h" + return f"{age//86400}d" + + exists = root.exists() and docs_path.exists() and manifest_path.exists() + print(f"base: {base}") + print(f"cacheDir: {root}") + print(f"cached: {exists}") + if exists: + print(f"etag: {meta.get('etag') or ''}") + print(f"fetchedAt: {meta.get('fetchedAt') or ''} (age {fmt_age(meta.get('fetchedAt'))})") + if meta.get("checkedAt"): + print(f"checkedAt: {meta.get('checkedAt')} (age {fmt_age(meta.get('checkedAt'))})") + if manifest.get("generatedAt"): + print(f"manifest.generatedAt: {manifest.get('generatedAt')}") + if manifest.get("docCount") is not None: + print(f"manifest.docCount: {manifest.get('docCount')}") + + if args.check_updates: + # Touch the network once and update cache if needed. + _fetch_corpus( + base=base, + cache_dir=cache_dir, + refresh=args.refresh, + offline=False, + check_updates=True, + auto_update=False, + ) + print("updateCheck: done") + return 0 + + +def main(argv: List[str]) -> int: + parser = argparse.ArgumentParser( + prog="yearn_docs_query", + description="Fetch and query Yearn hosted docs AI corpus (docs.jsonl) with local caching.", + ) + parser.add_argument( + "--base", + default=os.environ.get("BASE", "https://docs.yearn.fi"), + help="Docs site origin (e.g. https://docs.yearn.fi or a Vercel preview). 
Defaults to https://docs.yearn.fi (or BASE env var if set).", + ) + parser.add_argument( + "--cache-dir", + default="", + help="Override cache directory (default: XDG cache or ~/.cache/yearn-docs-site-query).", + ) + parser.add_argument( + "--offline", + action="store_true", + help="Do not make network requests; require cached files to exist.", + ) + parser.add_argument( + "--refresh", + action="store_true", + help="Force refresh of cached corpus (ignore existing cache / etag).", + ) + parser.add_argument( + "--check-updates", + action="store_true", + help="Revalidate cached corpus against the remote (may download if changed).", + ) + parser.add_argument( + "--auto-update", + action="store_true", + default=True, + help="Check remote docs.jsonl via conditional request and download only if changed (default: on).", + ) + parser.add_argument( + "--no-auto-update", + dest="auto_update", + action="store_false", + help="Do not check the remote; use cached corpus if present (unless --refresh/--check-updates).", + ) + + sub = parser.add_subparsers(dest="cmd", required=True) + + p_search = sub.add_parser("search", help="Search the corpus and print best matches.") + p_search.add_argument("query", help="Search query.") + p_search.add_argument("--max-results", type=int, default=8) + p_search.add_argument("--format", choices=["text", "json"], default="text") + p_search.set_defaults(func=_cmd_search) + + p_get = sub.add_parser("get", help="Get a page by route and print its text (or full JSON).") + p_get.add_argument("route", help="Route path, e.g. /developers/security/multisig") + p_get.add_argument("--format", choices=["text", "json"], default="text") + p_get.set_defaults(func=_cmd_get) + + p_cleanup = sub.add_parser("cleanup", help="Delete cached corpus for this BASE.") + p_cleanup.set_defaults(func=_cmd_cleanup) + + p_status = sub.add_parser("status", help="Show cache status; optionally check for updates.") + p_status.set_defaults(func=_cmd_status) + + args = parser.parse_args(argv) + # If the user explicitly passed an empty --base (common shell one-liner footgun), + # fall back to the BASE env var. + if not (args.base or "").strip(): + args.base = os.environ.get("BASE", "https://docs.yearn.fi") + + return int(args.func(args)) + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) From 90965305aae394a5c2ac7068a014ec69cf74537f Mon Sep 17 00:00:00 2001 From: Ross Date: Fri, 2 Jan 2026 14:35:21 -0500 Subject: [PATCH 5/5] chore: move skill to public as it is for outside users to use to access these docs --- {skills => static/skills}/yearn-docs-site-query/SKILL.md | 0 .../skills}/yearn-docs-site-query/scripts/yearn_docs_query.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {skills => static/skills}/yearn-docs-site-query/SKILL.md (100%) rename {skills => static/skills}/yearn-docs-site-query/scripts/yearn_docs_query.py (100%) diff --git a/skills/yearn-docs-site-query/SKILL.md b/static/skills/yearn-docs-site-query/SKILL.md similarity index 100% rename from skills/yearn-docs-site-query/SKILL.md rename to static/skills/yearn-docs-site-query/SKILL.md diff --git a/skills/yearn-docs-site-query/scripts/yearn_docs_query.py b/static/skills/yearn-docs-site-query/scripts/yearn_docs_query.py similarity index 100% rename from skills/yearn-docs-site-query/scripts/yearn_docs_query.py rename to static/skills/yearn-docs-site-query/scripts/yearn_docs_query.py