diff --git a/.gitignore b/.gitignore index dcb1cd7..c495598 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,7 @@ dev/ PLAN.md # XSD/spec artifacts: pulled by scripts/fetch-xsd.ts; never committed. -data/xsd-cache/ \ No newline at end of file +data/xsd-cache/ + +# MS-OI29500 ingest cache (Microsoft Learn markdown pages). +data/ms-oi29500-cache/ \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index cdb8617..6fe4dbf 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -33,10 +33,11 @@ apps/ packages/ shared/ Database client, embedding client, types scripts/ - ingest-pdf/ ECMA PDF -> spec_content (semantic search corpus) - ingest-xsd/ ECMA XSDs -> schema graph (structural query corpus) - sources-sync.ts data/sources.json -> reference_sources - db-migrate.ts Apply db/migrations/*.sql in order + ingest-ecma-376-pdfs/ ECMA-376 PDFs -> spec_content (semantic search corpus) + ingest-ecma-376-xsds/ ECMA-376 XSDs -> schema graph (structural query corpus) + ingest-ms-oi29500/ MS-OI29500 Learn pages -> behavior_notes (Word/Office implementation behavior) + sources-sync.ts data/sources.json -> reference_sources + db-migrate.ts Apply db/migrations/*.sql in order db/ schema.sql PostgreSQL + pgvector + XSD schema graph migrations/ Numbered, idempotent SQL migrations @@ -136,7 +137,7 @@ PDF → extract (Python) → chunk (6KB) → embed (Voyage) → upload (PostgreS bun run pdf:ingest 1 ./pdfs/ECMA-376-Part1.pdf # full pipeline for one part ``` -See `scripts/ingest-pdf/README.md`. +See `scripts/ingest-ecma-376-pdfs/README.md`. **XSD (structural corpus, into `xsd_*` tables)**: @@ -149,7 +150,7 @@ bun run xsd:fetch # URL + sha256 from data/so bun run xsd:ingest ``` -See `scripts/ingest-xsd/README.md`. +See `scripts/ingest-ecma-376-xsds/README.md`. 
## Database diff --git a/apps/mcp-server/src/index.ts b/apps/mcp-server/src/index.ts index 41fa8bf..5ec2e8f 100644 --- a/apps/mcp-server/src/index.ts +++ b/apps/mcp-server/src/index.ts @@ -124,7 +124,7 @@ export default { await writer.write(encoder.encode(":keepalive\n\n")); } } catch { - // Client disconnected — stream closed + // Client disconnected - stream closed } })(), ); diff --git a/apps/mcp-server/src/mcp.ts b/apps/mcp-server/src/mcp.ts index 0621c4c..5834ce6 100644 --- a/apps/mcp-server/src/mcp.ts +++ b/apps/mcp-server/src/mcp.ts @@ -140,7 +140,7 @@ function handleInitialize(id: number | string | null): JsonRpcResponse { version: "0.1.0", }, instructions: - "OOXML (ECMA-376 / Office Open XML) reference server. Two tool families: prose search over the spec PDFs (ooxml_search, ooxml_section, ooxml_parts) and deterministic schema lookup over the parsed XSDs (ooxml_element, ooxml_type, ooxml_children, ooxml_attributes, ooxml_enum, ooxml_namespace).", + "OOXML (ECMA-376 / Office Open XML) reference server. 
Three tool families: prose search over the spec PDFs (ooxml_search, ooxml_section, ooxml_parts), deterministic schema lookup over the parsed XSDs (ooxml_element, ooxml_type, ooxml_children, ooxml_attributes, ooxml_enum, ooxml_namespace), and Office implementation behavior notes from MS-OI29500 (ooxml_behavior, plus inline notes on ooxml_element/ooxml_type when the symbol has any).", }, }; } diff --git a/apps/mcp-server/src/ooxml-queries.ts b/apps/mcp-server/src/ooxml-queries.ts index 07a83df..332027b 100644 --- a/apps/mcp-server/src/ooxml-queries.ts +++ b/apps/mcp-server/src/ooxml-queries.ts @@ -18,6 +18,8 @@ type Sql = any; */ const COMMON_PREFIXES: Record = { w: "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + x: "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + p: "http://schemas.openxmlformats.org/presentationml/2006/main", r: "http://schemas.openxmlformats.org/officeDocument/2006/relationships", s: "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", m: "http://schemas.openxmlformats.org/officeDocument/2006/math", @@ -70,7 +72,7 @@ export function parseQName(raw: string): QNameParseResult { if (!namespace) { return { ok: false, - reason: `unknown prefix '${prefix}'. Use a known prefix (w, r, s, m, a, wp, pic, c, dgm), or Clark form {namespace}localName.`, + reason: `unknown prefix '${prefix}'. 
Use a known prefix (w, x, p, r, s, m, a, wp, pic, c, dgm), or Clark form {namespace}localName.`, }; } return { ok: true, qname: { namespace, localName, rawPrefix: prefix } }; @@ -613,3 +615,330 @@ export async function getNamespaceInfo(sql: Sql, uri: string): Promise { + const rows = await sql` + SELECT bn.id, bn.symbol_id, bn.app, bn.version_scope, bn.claim_type, bn.summary, + bn.standard_text, bn.behavior_text, bn.confidence, bn.resolution_confidence, + bn.section_id, bn.source_anchor, bn.source_commit, bn.target_ref, bn.claim_label, + src.name AS source_name, src.url AS source_url + FROM behavior_notes bn + LEFT JOIN reference_sources src ON src.id = bn.source_id + WHERE bn.symbol_id = ${symbolId} + ORDER BY bn.source_id, bn.source_anchor, bn.claim_index + `; + // biome-ignore lint/suspicious/noExplicitAny: row shape is loose. + return (rows as any[]).map(rowToBehaviorNote); +} + +// --- word_observations / verification layer --------------------------------- + +export interface NoteVerification { + noteId: number; + status: "confirmed" | "refined" | "contradicted" | "not_reproducible"; + observationFinding: string; + scenario: string; + fixtureName: string | null; + wordVersion: string | null; + beforeXml: string | null; + afterXml: string | null; + notes: string | null; +} + +/** Fetch the latest verification per note for a list of note ids. Notes + * without any observation simply don't appear in the result. 
*/ +export async function fetchVerifications( + sql: Sql, + noteIds: number[], +): Promise> { + const map = new Map(); + if (noteIds.length === 0) return map; + const rows = await sql` + SELECT DISTINCT ON (bno.behavior_note_id) + bno.behavior_note_id AS note_id, + bno.status, + bno.notes, + obs.scenario, + obs.finding, + obs.before_xml, + obs.after_xml, + fix.name AS fixture_name, + fix.word_version + FROM behavior_note_observations bno + JOIN word_observations obs ON obs.id = bno.observation_id + LEFT JOIN word_fixtures fix ON fix.id = obs.fixture_id + WHERE bno.behavior_note_id = ANY(${noteIds}::int[]) + ORDER BY bno.behavior_note_id, bno.created_at DESC + `; + // biome-ignore lint/suspicious/noExplicitAny: row shape is loose. + for (const r of rows as any[]) { + map.set(r.note_id as number, { + noteId: r.note_id as number, + status: r.status as NoteVerification["status"], + observationFinding: r.finding as string, + scenario: r.scenario as string, + fixtureName: r.fixture_name as string | null, + wordVersion: r.word_version as string | null, + beforeXml: r.before_xml as string | null, + afterXml: r.after_xml as string | null, + notes: r.notes as string | null, + }); + } + return map; +} + +export interface WordObservation { + id: number; + fixtureId: number | null; + fixtureName: string | null; + wordVersion: string | null; + scenario: string; + finding: string; + beforeXml: string | null; + afterXml: string | null; + observedAt: string; + linkedNotes: Array<{ + noteId: number; + status: NoteVerification["status"]; + notes: string | null; + sectionId: string | null; + sourceAnchor: string | null; + }>; +} + +export interface WordObservationFilter { + fixtureName?: string; + scenario?: string; + query?: string; + status?: string; + limit?: number; +} + +export async function fetchWordObservations( + sql: Sql, + filter: WordObservationFilter, +): Promise { + const limit = filter.limit ?? 30; + const queryPattern = filter.query ? 
`%${filter.query}%` : null; + // Status filter has to apply BEFORE the LIMIT — otherwise an old + // confirmed observation can be excluded by the limit when newer + // unstatused observations sit ahead of it. Use an EXISTS join. + const obsRows = await sql` + SELECT obs.id, obs.fixture_id, obs.scenario, obs.finding, + obs.before_xml, obs.after_xml, obs.observed_at, + fix.name AS fixture_name, fix.word_version + FROM word_observations obs + LEFT JOIN word_fixtures fix ON fix.id = obs.fixture_id + WHERE (${filter.fixtureName ?? null}::text IS NULL OR fix.name = ${filter.fixtureName ?? null}) + AND (${filter.scenario ?? null}::text IS NULL OR obs.scenario = ${filter.scenario ?? null}) + AND (${queryPattern}::text IS NULL + OR obs.finding ILIKE ${queryPattern} + OR obs.before_xml ILIKE ${queryPattern} + OR obs.after_xml ILIKE ${queryPattern}) + AND (${filter.status ?? null}::text IS NULL + OR EXISTS ( + SELECT 1 FROM behavior_note_observations bno + WHERE bno.observation_id = obs.id + AND bno.status = ${filter.status ?? null} + )) + ORDER BY obs.observed_at DESC + LIMIT ${limit} + `; + // biome-ignore lint/suspicious/noExplicitAny: row shape is loose. + const observations = obsRows as any[]; + if (observations.length === 0) return []; + + const obsIds = observations.map((r) => r.id as number); + const linkRows = await sql` + SELECT bno.observation_id, bno.behavior_note_id, bno.status, bno.notes, + bn.section_id, bn.source_anchor + FROM behavior_note_observations bno + LEFT JOIN behavior_notes bn ON bn.id = bno.behavior_note_id + WHERE bno.observation_id = ANY(${obsIds}::int[]) + ORDER BY bno.observation_id + `; + const byObs = new Map(); + // biome-ignore lint/suspicious/noExplicitAny: row shape is loose. 
+ for (const r of linkRows as any[]) { + const oid = r.observation_id as number; + if (!byObs.has(oid)) byObs.set(oid, []); + byObs.get(oid)?.push({ + noteId: r.behavior_note_id as number, + status: r.status as NoteVerification["status"], + notes: r.notes as string | null, + sectionId: r.section_id as string | null, + sourceAnchor: r.source_anchor as string | null, + }); + } + + return observations.map((r) => ({ + id: r.id as number, + fixtureId: r.fixture_id as number | null, + fixtureName: r.fixture_name as string | null, + wordVersion: r.word_version as string | null, + scenario: r.scenario as string, + finding: r.finding as string, + beforeXml: r.before_xml as string | null, + afterXml: r.after_xml as string | null, + observedAt: r.observed_at as string, + linkedNotes: byObs.get(r.id as number) ?? [], + })); +} + +export interface BehaviorNoteFilter { + /** Resolve a qname to a name+namespace and find notes on top-level OR + * local xsd_symbols rows with that (vocabulary, name). */ + symbolName?: string; + symbolNamespace?: string; + /** Substring match on bn.section_id (e.g. '17.4.37' or '2.1.149'). */ + sectionId?: string; + /** Exact match on bn.source_anchor. */ + sourceAnchor?: string; + /** Free-text ILIKE search across standard_text + behavior_text + summary. */ + query?: string; + app?: string; + claimType?: string; + limit?: number; +} + +/** + * Dedicated-tool integration: flexible filter that handles the common + * `ooxml_behavior` query patterns. At least one of (symbolName, sectionId, + * sourceAnchor, query) should be set; the SQL tolerates all-null but will + * return everything in that case (LIMIT-bounded). + */ +export async function fetchBehaviorNotes( + sql: Sql, + filter: BehaviorNoteFilter, +): Promise { + const limit = filter.limit ?? 50; + const queryPattern = filter.query ? `%${filter.query}%` : null; + const sectionPattern = filter.sectionId ? 
`%${filter.sectionId}%` : null; + // Resolve symbolName → set of xsd_symbols.id (top-level + local). When no + // name is given we look at all rows; the IN-list trick uses an empty array + // fallback so the optional-name case works. + let symbolIds: number[] | null = null; + if (filter.symbolName) { + const ns = filter.symbolNamespace ?? DEFAULT_NAMESPACE; + // Same profile-scoped pattern as lookupSymbol et al. so we don't pull + // symbol IDs from a future profile (e.g. strict) when behavior_notes + // only attach to transitional rows. + const symRows = await sql` + SELECT DISTINCT s.id + FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + JOIN xsd_namespaces ns ON ns.id = sp.namespace_id + JOIN xsd_profiles p ON p.id = sp.profile_id + WHERE s.local_name = ${filter.symbolName} + AND ns.uri = ${ns} + AND p.name = ${"transitional"} + `; + // biome-ignore lint/suspicious/noExplicitAny: row shape is loose. + symbolIds = (symRows as any[]).map((r) => r.id as number); + if (symbolIds.length === 0) symbolIds = [-1]; // no match → empty result + } + + // Word-boundary regex for the unresolved-notes fallback. ILIKE substring + // match would let qname=foo pull in notes whose target_ref says "foobar". + // MS-OI29500 names are all word-chars (letters/digits/underscore); we + // require a non-word-char (or string boundary) on either side. + const targetRefPattern = filter.symbolName + ? 
`(^|[^A-Za-z0-9_])${escapeRegex(filter.symbolName)}([^A-Za-z0-9_]|$)` + : null; + + const rows = await sql` + SELECT bn.id, bn.symbol_id, bn.app, bn.version_scope, bn.claim_type, bn.summary, + bn.standard_text, bn.behavior_text, bn.confidence, bn.resolution_confidence, + bn.section_id, bn.source_anchor, bn.source_commit, bn.target_ref, bn.claim_label, + src.name AS source_name, src.url AS source_url + FROM behavior_notes bn + LEFT JOIN reference_sources src ON src.id = bn.source_id + WHERE + -- name filter: either symbol_id matches, OR target_ref mentions the + -- name with a non-word-char delimiter on each side (so 'tbl' doesn't + -- match 'tblPr'). + (${symbolIds === null}::boolean + OR bn.symbol_id = ANY(${symbolIds ?? []}::int[]) + OR (${targetRefPattern}::text IS NOT NULL + AND bn.target_ref IS NOT NULL + AND bn.target_ref ~ ${targetRefPattern})) + AND (${sectionPattern}::text IS NULL OR bn.section_id ILIKE ${sectionPattern}) + AND (${filter.sourceAnchor ?? null}::text IS NULL OR bn.source_anchor = ${filter.sourceAnchor ?? null}) + AND (${queryPattern}::text IS NULL + OR bn.standard_text ILIKE ${queryPattern} + OR bn.behavior_text ILIKE ${queryPattern} + OR bn.summary ILIKE ${queryPattern}) + AND (${filter.app ?? null}::text IS NULL OR bn.app = ${filter.app ?? null}) + AND (${filter.claimType ?? null}::text IS NULL OR bn.claim_type = ${filter.claimType ?? null}) + ORDER BY bn.source_id, bn.source_anchor, bn.claim_index + LIMIT ${limit} + `; + // biome-ignore lint/suspicious/noExplicitAny: row shape is loose. 
+ return (rows as any[]).map(rowToBehaviorNote); +} diff --git a/apps/mcp-server/src/ooxml-tools.ts b/apps/mcp-server/src/ooxml-tools.ts index 1608cc0..30a2962 100644 --- a/apps/mcp-server/src/ooxml-tools.ts +++ b/apps/mcp-server/src/ooxml-tools.ts @@ -13,8 +13,13 @@ import { neon } from "@neondatabase/serverless"; import type { ToolDef } from "./mcp"; import { type AttrEntry, + type BehaviorNote, type ChildEdge, type EnumEntry, + fetchBehaviorNotes, + fetchBehaviorNotesBySymbol, + fetchVerifications, + fetchWordObservations, getAttributes, getChildren, getEnums, @@ -24,8 +29,10 @@ import { lookupSymbolByTypeRef, lookupType, type NamespaceInfo, + type NoteVerification, parseQName, type SymbolHit, + type WordObservation, } from "./ooxml-queries"; export const DEFAULT_PROFILE = "transitional"; @@ -119,6 +126,73 @@ export const OOXML_TOOL_DEFS: ToolDef[] = [ required: ["uri"], }, }, + { + name: "ooxml_implementation_notes", + description: + "Microsoft-documented Office implementation notes from MS-OI29500. These are claims Microsoft has published about how Word / Excel / PowerPoint diverge from the spec — they are NOT necessarily verified against the live Word binary. Each row carries a citation back to its source page; some rows also carry linked observations (see ooxml_word_behavior) that confirm, refine, or contradict the claim against an authored fixture. Filter by element/type qname, MS section ID (e.g. '17.4.37' or '2.1.149'), source page GUID, free-text query, app, or claim_type. At least one filter is required. Most entries attach to local element decls and are reachable only through this tool, not via ooxml_element.", + inputSchema: { + type: "object" as const, + properties: { + qname: { + type: "string", + description: + "Element/type qname like 'w:tbl' or 'CT_Tbl'. 
Searches behavior notes attached to top-level OR local symbols with this name, plus notes whose target_ref mentions it.", + }, + section_id: { + type: "string", + description: + "Substring of bn.section_id, e.g. '17.4.37' (ECMA section) or '2.1.149' (MS-OI29500 entry id).", + }, + source_anchor: { + type: "string", + description: "MS-OI29500 page GUID (exact match).", + }, + query: { + type: "string", + description: + "Free-text ILIKE search across the standard text, the Word-behavior text, and the rendered summary.", + }, + app: { + type: "string", + description: "Filter by app: 'Word', 'Excel', 'PowerPoint', or 'Office'.", + }, + claim_type: { + type: "string", + description: + "Filter by claim_type: ignores, requires_despite_optional, writes, reads_but_does_not_write, repairs, layout_behavior, does_not_support, varies_from_spec.", + }, + limit: { type: "number", description: "Max results (default 50)." }, + }, + }, + }, + { + name: "ooxml_word_behavior", + description: + "Ground-truth observations of how Word ACTUALLY behaves, captured from authored Word fixtures (not Microsoft's documented claims). Each observation records a 'before' and 'after' XML fragment plus a finding string, and is optionally linked to one or more documented notes from ooxml_implementation_notes with a status (confirmed / refined / contradicted / not_reproducible). Use this when you need verified facts rather than documented intent. Filter by fixture name, scenario, free-text query, or verification status.", + inputSchema: { + type: "object" as const, + properties: { + fixture_name: { + type: "string", + description: "Exact fixture name, e.g. 'arabic-bold-test'.", + }, + scenario: { + type: "string", + description: "Scenario tag, e.g. 
'authored', 'open-and-save', 'open-and-render'.", + }, + query: { + type: "string", + description: "Free-text search across the finding and the before/after XML fragments.", + }, + status: { + type: "string", + description: + "Filter to observations linked to a note with this status: 'confirmed', 'refined', 'contradicted', 'not_reproducible'.", + }, + limit: { type: "number", description: "Max results (default 30)." }, + }, + }, + }, ]; export type OoxmlToolName = @@ -127,7 +201,9 @@ export type OoxmlToolName = | "ooxml_children" | "ooxml_attributes" | "ooxml_enum" - | "ooxml_namespace"; + | "ooxml_namespace" + | "ooxml_implementation_notes" + | "ooxml_word_behavior"; const OOXML_TOOL_NAMES: ReadonlySet = new Set(OOXML_TOOL_DEFS.map((t) => t.name)); @@ -175,7 +251,12 @@ export async function runOoxmlTool( profile, ); } - return formatSymbolReport("Element", hit, profile); + const notes = await fetchBehaviorNotesBySymbol(sql, hit.id); + const verifications = await fetchVerifications( + sql, + notes.map((n) => n.id), + ); + return formatSymbolReport("Element", hit, profile, notes, verifications); } case "ooxml_type": { @@ -188,10 +269,17 @@ export async function runOoxmlTool( profile, ); } + const notes = await fetchBehaviorNotesBySymbol(sql, hit.id); + const verifications = await fetchVerifications( + sql, + notes.map((n) => n.id), + ); return formatSymbolReport( hit.kind === "simpleType" ? "SimpleType" : "ComplexType", hit, profile, + notes, + verifications, ); } @@ -264,6 +352,65 @@ export async function runOoxmlTool( return formatNamespaceReport(info); } + case "ooxml_implementation_notes": { + // fall through to existing handler logic; verifications are fetched + // after the notes query so we can render the [confirmed]/[refined]/etc + // badges in the dedicated tool's output too. 
+ const filter: Parameters[1] = { + app: args.app as string | undefined, + claimType: args.claim_type as string | undefined, + sourceAnchor: args.source_anchor as string | undefined, + sectionId: args.section_id as string | undefined, + query: args.query as string | undefined, + limit: args.limit as number | undefined, + }; + const qname = args.qname as string | undefined; + if (qname) { + const q = parseQName(qname); + if (!q.ok) return formatNotFound(`could not parse qname: ${q.reason}`); + filter.symbolName = q.qname.localName; + filter.symbolNamespace = q.qname.namespace; + } + if ( + !filter.symbolName && + !filter.sectionId && + !filter.sourceAnchor && + !filter.query && + !filter.app && + !filter.claimType + ) { + return [ + "## Missing filter", + "", + "`ooxml_implementation_notes` needs at least one of:", + "- `qname` - element/type name like 'w:tbl' or 'CT_Tbl'", + "- `section_id` - substring like '17.4.37' or '2.1.149'", + "- `source_anchor` - MS-OI29500 page GUID", + "- `query` - free-text search", + "- `app` - 'Word', 'Excel', 'PowerPoint', or 'Office'", + "- `claim_type` - e.g. 
'does_not_support', 'varies_from_spec'", + ].join("\n"); + } + const notes = await fetchBehaviorNotes(sql, filter); + const verifications = await fetchVerifications( + sql, + notes.map((n) => n.id), + ); + return formatBehaviorReport(notes, filter, qname, verifications); + } + + case "ooxml_word_behavior": { + const filter: Parameters[1] = { + fixtureName: args.fixture_name as string | undefined, + scenario: args.scenario as string | undefined, + query: args.query as string | undefined, + status: args.status as string | undefined, + limit: args.limit as number | undefined, + }; + const obs = await fetchWordObservations(sql, filter); + return formatObservationsReport(obs, filter); + } + default: { const _exhaustive: never = name; throw new Error(`Unhandled OOXML tool: ${_exhaustive}`); @@ -273,7 +420,13 @@ export async function runOoxmlTool( // --- Formatting -------------------------------------------------------- -function formatSymbolReport(label: string, hit: SymbolHit, profile: string): string { +function formatSymbolReport( + label: string, + hit: SymbolHit, + profile: string, + notes: BehaviorNote[] = [], + verifications: Map = new Map(), +): string { const lines: string[] = []; lines.push(`## ${label}: ${hit.localName}`); lines.push(""); @@ -284,6 +437,150 @@ function formatSymbolReport(label: string, hit: SymbolHit, profile: string): str lines.push(`- namespace: ${hit.namespaceUri}`); if (hit.typeRef) lines.push(`- type_ref: ${hit.typeRef}`); if (hit.sourceName) lines.push(`- source: ${hit.sourceName}`); + if (notes.length > 0) { + lines.push(""); + appendBehaviorSection(lines, notes, verifications); + } + return lines.join("\n"); +} + +/** + * Per-page Learn URL for a behavior-note source. The `reference_sources.url` + * stored in the manifest is the doc landing page (with its own GUID). Naively + * appending `/` produced `.../ms-oi29500//`, + * which 404s. Each known source maps to a fixed page-base path; we append + * source_anchor to that. 
+ */ +const SOURCE_PAGE_BASE: Record = { + "ms-oi29500": "https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500", +}; + +function buildNoteUrl(sourceName: string | null, anchor: string | null): string | null { + if (!sourceName || !anchor) return null; + const base = SOURCE_PAGE_BASE[sourceName]; + return base ? `${base}/${anchor}` : null; +} + +/** + * Append a "Documented behavior notes" section to a structural report. These + * are claims Microsoft has documented; rows linked to a Word fixture + * observation get a [confirmed] / [refined] / [contradicted] / + * [not_reproducible] tag, otherwise [unverified]. + */ +function appendBehaviorSection( + lines: string[], + notes: BehaviorNote[], + verifications: Map, +): void { + lines.push(`## Documented behavior notes (${notes.length}, MS-OI29500)`); + lines.push(""); + lines.push( + "_Microsoft-documented claims. Rows tagged [unverified] have not been checked against authored Word fixtures; use ooxml_word_behavior to see which ones are._", + ); + lines.push(""); + const byAnchor = new Map(); + for (const n of notes) { + const k = n.sourceAnchor ?? "(no anchor)"; + if (!byAnchor.has(k)) byAnchor.set(k, []); + byAnchor.get(k)?.push(n); + } + for (const [_anchor, group] of byAnchor) { + const first = group[0]; + const heading = first.sectionId ? `${first.sectionId}` : (first.sourceAnchor ?? "(no anchor)"); + const src = first.sourceName ? ` - ${first.sourceName}` : ""; + lines.push(`### ${heading}${src}`); + const url = buildNoteUrl(first.sourceName, first.sourceAnchor); + if (url) lines.push(`(${url})`); + for (const n of group) { + const labelTag = n.claimLabel ? `${n.claimLabel}.` : "-"; + const scopeTag = n.versionScope ? ` _scope: ${n.versionScope}_` : ""; + const v = verifications.get(n.id); + const verifyTag = v ? `[${v.status}]` : "[unverified]"; + if (n.standardText) lines.push(`${labelTag} *${n.standardText}*`); + const claim = n.behaviorText ?? 
n.summary; + lines.push(` - ${claim} \`(${n.app}, ${n.claimType})\` ${verifyTag}${scopeTag}`); + if (v) { + lines.push(` observation: ${v.observationFinding}`); + if (v.fixtureName) { + const wv = v.wordVersion ? `, ${v.wordVersion}` : ""; + lines.push(` fixture: ${v.fixtureName}${wv}`); + } + } + } + lines.push(""); + } +} + +function formatBehaviorReport( + notes: BehaviorNote[], + filter: { + symbolName?: string; + sectionId?: string; + sourceAnchor?: string; + query?: string; + app?: string; + claimType?: string; + }, + qname: string | undefined, + verifications: Map = new Map(), +): string { + const lines: string[] = []; + const filterDesc: string[] = []; + if (qname) filterDesc.push(`qname=${qname}`); + if (filter.sectionId) filterDesc.push(`section=${filter.sectionId}`); + if (filter.sourceAnchor) filterDesc.push(`anchor=${filter.sourceAnchor}`); + if (filter.query) filterDesc.push(`query="${filter.query}"`); + if (filter.app) filterDesc.push(`app=${filter.app}`); + if (filter.claimType) filterDesc.push(`claim_type=${filter.claimType}`); + lines.push(`## Documented implementation notes (MS-OI29500) - ${filterDesc.join(", ")}`); + lines.push(""); + if (notes.length === 0) { + lines.push("_no matching notes._"); + return lines.join("\n"); + } + appendBehaviorSection(lines, notes, verifications); + return lines.join("\n"); +} + +function formatObservationsReport( + observations: WordObservation[], + filter: { fixtureName?: string; scenario?: string; query?: string; status?: string }, +): string { + const filterDesc: string[] = []; + if (filter.fixtureName) filterDesc.push(`fixture=${filter.fixtureName}`); + if (filter.scenario) filterDesc.push(`scenario=${filter.scenario}`); + if (filter.query) filterDesc.push(`query="${filter.query}"`); + if (filter.status) filterDesc.push(`status=${filter.status}`); + const lines: string[] = []; + lines.push(`## Word observations (ground truth) - ${filterDesc.join(", ") || "all"}`); + lines.push(""); + if (observations.length 
=== 0) { + lines.push("_no matching observations._"); + return lines.join("\n"); + } + lines.push( + "_Each observation is a finding from an authored Word fixture. Linked notes carry a verification status: confirmed / refined / contradicted / not_reproducible._", + ); + lines.push(""); + for (const o of observations) { + const fix = o.fixtureName + ? `${o.fixtureName}${o.wordVersion ? ` (${o.wordVersion})` : ""}` + : "(no fixture)"; + lines.push(`### ${fix} - ${o.scenario}`); + lines.push(`Finding: ${o.finding}`); + if (o.beforeXml) lines.push(`\nBefore:\n\`\`\`xml\n${o.beforeXml}\n\`\`\``); + if (o.afterXml) lines.push(`\nAfter:\n\`\`\`xml\n${o.afterXml}\n\`\`\``); + if (o.linkedNotes.length > 0) { + lines.push(""); + lines.push("Linked notes:"); + for (const ln of o.linkedNotes) { + const cite = ln.sectionId ?? `note ${ln.noteId}`; + const note = ln.notes ? ` - ${ln.notes}` : ""; + lines.push(` - [${ln.status}] ${cite}${note}`); + } + } + lines.push(""); + } return lines.join("\n"); } diff --git a/data/sources.json b/data/sources.json index 4503849..10ceba1 100644 --- a/data/sources.json +++ b/data/sources.json @@ -45,6 +45,16 @@ "url": "https://ecma-international.org/wp-content/uploads/ECMA-376-4_5th_edition_december_2016.zip", "license_note": "Published by Ecma International. See the ECMA-376 publications page for the current download and licensing terms before redistribution.", "sha256": "bd25da1109f73762356596918bf5ff8b74a1331642dba5f1c1d1dfc6bed34ecd" + }, + { + "$comment": "MS-OI29500 is ingested from Microsoft Learn's native markdown endpoint (`?accept=text/markdown`) per Phase 0 evaluation. The pinned PDF URL is for citation/provenance; not parsed. 
version = published revision ID; edition = revision date.", + "name": "ms-oi29500", + "kind": "open_spec", + "edition": "2025-02-18", + "version": "23.0", + "url": "https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/1fd4a662-8623-49c0-82f0-18fa91b413b8", + "license_note": "Microsoft Open Specifications Documentation copyright notice permits copies for implementing the technologies described, and distribution of portions in implementations or accompanying documentation. Pinned per-doc PDF: https://officeprotocoldocs-f5hpbjgea6b8gneq.b02.azurefd.net/files/MS-OI29500/%5bMS-OI29500%5d.pdf. Bundle (all MS Office Open Specs PDFs): https://officeprotocoldocs-f5hpbjgea6b8gneq.b02.azurefd.net/files/Zip_Files/MSOFFSTAND.zip - useful for future expansion to MS-DOCX et al.", + "sha256": "2212feb8506cd0399835427d971f14fd0b38257769339076c565343f8704e4ca" } ] } diff --git a/db/migrations/0006_behavior_notes_for_ms_oi29500.sql b/db/migrations/0006_behavior_notes_for_ms_oi29500.sql new file mode 100644 index 0000000..51ab1a5 --- /dev/null +++ b/db/migrations/0006_behavior_notes_for_ms_oi29500.sql @@ -0,0 +1,85 @@ +-- Prepare behavior_notes for MS-OI29500 ingest. +-- +-- Four classes of change, all idempotent: +-- +-- 1. Survive XSD re-ingest. The original FK on symbol_id used ON DELETE +-- CASCADE; a future xsd:ingest run that drops a symbol would also wipe +-- every behavior_note attached to it. Switch to ON DELETE SET NULL. +-- Orphaned notes are reattachable by re-running ms:ingest --resolve-only. +-- +-- 2. Citation columns for imported sources. MS-OI29500 entries live at +-- `/` URLs; we store the GUID as `source_anchor` and the +-- lettered claim label (a, b, c, …) as `claim_label`. `claim_index` is a +-- fallback for pages without lettered groups. `target_ref` records the +-- schema target when symbol_id is NULL (vocab not ingested, ambiguous, +-- no-match). +-- +-- 3. Two-sided claim text. 
MS-OI29500 always frames a claim as "spec says X" +-- + "Word does Y". We keep both sides in `standard_text` / +-- `behavior_text` rather than collapsing into `summary`. `summary` stays +-- as a renderable short form for tool output. +-- +-- 4. Two new claim_type values. `varies_from_spec` is a generic divergence +-- that doesn't fit the existing six verbs; better than mis-classifying +-- as `writes` with low confidence. `does_not_support` covers Word's +-- common "does not support this attribute" pattern more precisely than +-- `ignores`. +-- +-- A partial unique index on (source_id, source_anchor, claim_label or index) +-- is the natural key for upsert. Hand-curated rows (which won't have +-- source_anchor set) are excluded by the WHERE clause. + +-- 1. Cascade behavior on symbol_id. +ALTER TABLE behavior_notes + DROP CONSTRAINT IF EXISTS behavior_notes_symbol_id_fkey; +ALTER TABLE behavior_notes + ADD CONSTRAINT behavior_notes_symbol_id_fkey + FOREIGN KEY (symbol_id) REFERENCES xsd_symbols(id) ON DELETE SET NULL; + +-- 2. Citation + identity columns. +ALTER TABLE behavior_notes + ADD COLUMN IF NOT EXISTS source_anchor TEXT, + ADD COLUMN IF NOT EXISTS claim_label TEXT, + ADD COLUMN IF NOT EXISTS claim_index INT NOT NULL DEFAULT 0, + ADD COLUMN IF NOT EXISTS target_ref TEXT; + +-- 3. Two-sided claim text + parser confidence. +ALTER TABLE behavior_notes + ADD COLUMN IF NOT EXISTS standard_text TEXT, + ADD COLUMN IF NOT EXISTS behavior_text TEXT, + ADD COLUMN IF NOT EXISTS resolution_confidence TEXT; + +ALTER TABLE behavior_notes + DROP CONSTRAINT IF EXISTS behavior_notes_resolution_confidence_check; +ALTER TABLE behavior_notes + ADD CONSTRAINT behavior_notes_resolution_confidence_check + CHECK (resolution_confidence IS NULL + OR resolution_confidence IN ('high', 'medium', 'low')); + +-- 4. Extended claim_type enum. 
+ALTER TABLE behavior_notes + DROP CONSTRAINT IF EXISTS behavior_notes_claim_type_check; +ALTER TABLE behavior_notes + ADD CONSTRAINT behavior_notes_claim_type_check + CHECK (claim_type IN ( + 'ignores', + 'requires_despite_optional', + 'writes', + 'reads_but_does_not_write', + 'repairs', + 'layout_behavior', + 'does_not_support', + 'varies_from_spec' + )); + +-- Natural-key unique index for upsert. claim_label is the preferred +-- discriminator (the a/b/c letter on the source page); claim_index is the +-- fallback when a page has no lettered groups. The partial WHERE clause +-- skips hand-curated rows that don't carry source_anchor. +CREATE UNIQUE INDEX IF NOT EXISTS idx_behavior_notes_natural_key + ON behavior_notes ( + source_id, + source_anchor, + COALESCE(claim_label, claim_index::text) + ) + WHERE source_id IS NOT NULL AND source_anchor IS NOT NULL; diff --git a/db/migrations/0007_fix_behavior_notes_natural_key.sql b/db/migrations/0007_fix_behavior_notes_natural_key.sql new file mode 100644 index 0000000..e03cb6a --- /dev/null +++ b/db/migrations/0007_fix_behavior_notes_natural_key.sql @@ -0,0 +1,20 @@ +-- Fix the natural-key unique index for imported behavior_notes. +-- +-- 0006 created an index keyed on `COALESCE(claim_label, claim_index::text)`, +-- which assumed claim_label discriminated rows. It doesn't: a single claim +-- group on an MS-OI29500 page can have multiple behavior bullets that all +-- share the same letter (e.g. claim "i" with three "Word does X" bullets). +-- The COALESCE collapses them to one key. +-- +-- The right discriminator is `claim_index`, which the ingest builds as +-- `claimIdx * 100 + behaviorIdx` and is unique per page. claim_label stays +-- as an informational column for display. +-- +-- Idempotent: drop the old index, create the new one. Both `IF EXISTS` / +-- `IF NOT EXISTS` guards make re-running safe. 
+ +DROP INDEX IF EXISTS idx_behavior_notes_natural_key; + +CREATE UNIQUE INDEX IF NOT EXISTS idx_behavior_notes_natural_key + ON behavior_notes (source_id, source_anchor, claim_index) + WHERE source_id IS NOT NULL AND source_anchor IS NOT NULL; diff --git a/db/migrations/0008_behavior_notes_source_commit.sql b/db/migrations/0008_behavior_notes_source_commit.sql new file mode 100644 index 0000000..1da99ad --- /dev/null +++ b/db/migrations/0008_behavior_notes_source_commit.sql @@ -0,0 +1,17 @@ +-- Add `source_commit` to behavior_notes for per-row provenance. +-- +-- The MS-OI29500 ingest fetches Microsoft Learn markdown, which is mutable +-- (Microsoft can revise an individual page without bumping the doc-level +-- revision number). The pinned PDF in `reference_sources` covers the doc as +-- a whole, but doesn't tell us which exact commit of which Learn page we +-- parsed. +-- +-- Each Learn page's YAML frontmatter exposes `git_commit_id` for the +-- backing markdown file. Recording that here makes a re-ingest reproducible +-- (same input commits → same output rows) and lets reviewers diff the +-- exact source state we ingested. +-- +-- Hand-curated rows leave it NULL. + +ALTER TABLE behavior_notes + ADD COLUMN IF NOT EXISTS source_commit TEXT; diff --git a/db/migrations/0009_word_observations.sql b/db/migrations/0009_word_observations.sql new file mode 100644 index 0000000..ad3c933 --- /dev/null +++ b/db/migrations/0009_word_observations.sql @@ -0,0 +1,78 @@ +-- Add the verification layer for behavior_notes. +-- +-- behavior_notes (from MS-OI29500) are Microsoft-DOCUMENTED claims about +-- Office behavior. They are not necessarily what Word actually does — we +-- verified during Phase 4 dogfooding that the docs are directionally +-- accurate but glossed over critical edge cases (e.g. on `<w:trHeight>`, MS-OI29500 says "Word requires val != 0" +-- but Word silently strips the entire trHeight on save). 
+-- +-- The three new tables let us record ground-truth observations from +-- authored Word fixtures and tie them back to the documented claims: +-- +-- word_fixtures one row per .docx the Word API generated. +-- Includes sha256, generator script, and +-- Word version so observations are +-- reproducible. +-- +-- word_observations one row per "Word does X with input Y" +-- finding. Stores the relevant XML fragment +-- before and after the operation that +-- triggered the observation. +-- +-- behavior_note_observations join table linking notes to observations +-- with a verification status. A single note +-- can have multiple observations (different +-- Word versions or input shapes); the join +-- row is the unit that carries +-- confirmed / refined / contradicted / +-- not_reproducible. +-- +-- Idempotent. Re-running is a no-op. + +CREATE TABLE IF NOT EXISTS word_fixtures ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL UNIQUE, -- e.g. 'arabic-bold-test' + description TEXT, + sha256 TEXT, -- of the .docx blob + generator_script TEXT, -- PowerShell or 'create_document(...)' call + word_version TEXT, -- e.g. 'Word 16.0' + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS word_observations ( + id SERIAL PRIMARY KEY, + fixture_id INT REFERENCES word_fixtures(id) ON DELETE CASCADE, + scenario TEXT NOT NULL, -- 'authored', 'open-and-save', 'open-and-render' + finding TEXT NOT NULL, -- short prose finding + before_xml TEXT, -- relevant fragment before + after_xml TEXT, -- relevant fragment after + observed_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS behavior_note_observations ( + id SERIAL PRIMARY KEY, + behavior_note_id INT NOT NULL REFERENCES behavior_notes(id) ON DELETE CASCADE, + observation_id INT NOT NULL REFERENCES word_observations(id) ON DELETE CASCADE, + status TEXT NOT NULL CHECK (status IN ( + 'confirmed', -- Word's behavior matches the documented claim. 
+ 'refined', -- Claim is directionally true but the actual + -- behavior is more specific (e.g. doc says + -- "Word requires X" but Word's repair path is + -- to silently strip the directive). + 'contradicted', -- Claim is wrong as written; Word does + -- something different. + 'not_reproducible' -- Could not reproduce the documented behavior + -- in the fixture. + )), + notes TEXT, + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (behavior_note_id, observation_id) +); + +CREATE INDEX IF NOT EXISTS idx_word_observations_fixture + ON word_observations(fixture_id); +CREATE INDEX IF NOT EXISTS idx_behavior_note_observations_note + ON behavior_note_observations(behavior_note_id); +CREATE INDEX IF NOT EXISTS idx_behavior_note_observations_obs + ON behavior_note_observations(observation_id); diff --git a/db/schema.sql b/db/schema.sql index d7b1ef5..fb01a03 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -168,9 +168,14 @@ CREATE TABLE xsd_enums ( order_index INT DEFAULT 0 ); +-- behavior_notes: editorial / imported claims about how an app behaves vs the +-- spec. symbol_id is ON DELETE SET NULL so notes survive XSD re-ingest. +-- source_anchor + claim_label form the natural key for imported rows +-- (currently MS-OI29500); hand-curated rows leave them NULL and use +-- alternative identification (note_key et al. as that path develops). CREATE TABLE behavior_notes ( id SERIAL PRIMARY KEY, - symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE, + symbol_id INT REFERENCES xsd_symbols(id) ON DELETE SET NULL, app TEXT NOT NULL, version_scope TEXT, claim_type TEXT NOT NULL CHECK (claim_type IN ( @@ -179,15 +184,88 @@ CREATE TABLE behavior_notes ( 'writes', 'reads_but_does_not_write', 'repairs', - 'layout_behavior' + 'layout_behavior', + 'does_not_support', + 'varies_from_spec' )), summary TEXT NOT NULL, source_id INT REFERENCES reference_sources(id), section_id TEXT, + -- `confidence` is editorial: how sure are we the claim is TRUE? 
Imported + -- rows from authoritative sources (MS-OI29500) get 'high'; hand-curated + -- rows can hedge. confidence TEXT CHECK (confidence IN ('high', 'medium', 'low')), + -- Imported-source citation: + source_anchor TEXT, -- e.g. MS-OI29500 page GUID + source_commit TEXT, -- per-row provenance: git_commit_id from the source page + claim_label TEXT, -- 'a', 'b', 'c', ... when present on source + claim_index INT NOT NULL DEFAULT 0, + target_ref TEXT, -- fallback citation when symbol_id is NULL + -- Two-sided claim text from the source (kept verbatim alongside `summary`). + standard_text TEXT, + behavior_text TEXT, + -- `resolution_confidence` is mechanical: how sure is the parser+resolver + -- about the EXTRACTION (claim_type classification + symbol attachment)? + -- Distinct from `confidence` above. For imported rows: min of (claim_type + -- classifier confidence, symbol resolver confidence). + resolution_confidence TEXT CHECK (resolution_confidence IS NULL + OR resolution_confidence IN ('high', 'medium', 'low')), created_at TIMESTAMPTZ DEFAULT NOW() ); +-- Natural-key unique index for upserting imported rows. claim_index encodes +-- (claimIdx * 100 + behaviorIdx) at ingest time so it's unique per +-- (source, page) - claim_label is shared across behaviors in a single claim +-- group and would collide here. +CREATE UNIQUE INDEX idx_behavior_notes_natural_key + ON behavior_notes (source_id, source_anchor, claim_index) + WHERE source_id IS NOT NULL AND source_anchor IS NOT NULL; + +-- ---------------------------------------------------------------------------- +-- Verification layer: ground-truth observations from authored Word fixtures. +-- +-- behavior_notes are claims Microsoft has DOCUMENTED. Word's actual behavior +-- can confirm, refine, contradict, or fail to reproduce them. word_fixtures +-- + word_observations + behavior_note_observations capture that ground +-- truth so the MCP can rank verified rows above unverified ones. 
+-- ---------------------------------------------------------------------------- + +CREATE TABLE word_fixtures ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + description TEXT, + sha256 TEXT, + generator_script TEXT, + word_version TEXT, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE TABLE word_observations ( + id SERIAL PRIMARY KEY, + fixture_id INT REFERENCES word_fixtures(id) ON DELETE CASCADE, + scenario TEXT NOT NULL, + finding TEXT NOT NULL, + before_xml TEXT, + after_xml TEXT, + observed_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE TABLE behavior_note_observations ( + id SERIAL PRIMARY KEY, + behavior_note_id INT NOT NULL REFERENCES behavior_notes(id) ON DELETE CASCADE, + observation_id INT NOT NULL REFERENCES word_observations(id) ON DELETE CASCADE, + status TEXT NOT NULL CHECK (status IN ( + 'confirmed', 'refined', 'contradicted', 'not_reproducible' + )), + notes TEXT, + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (behavior_note_id, observation_id) +); + +CREATE INDEX idx_word_observations_fixture ON word_observations(fixture_id); +CREATE INDEX idx_behavior_note_observations_note ON behavior_note_observations(behavior_note_id); +CREATE INDEX idx_behavior_note_observations_obs ON behavior_note_observations(observation_id); + CREATE INDEX idx_xsd_symbols_lookup ON xsd_symbols(vocabulary_id, local_name, kind); CREATE INDEX idx_xsd_symbols_parent ON xsd_symbols(parent_symbol_id); CREATE INDEX idx_xsd_child_edges_parent ON xsd_child_edges(parent_symbol_id); diff --git a/package.json b/package.json index 1d4317b..45f1de6 100644 --- a/package.json +++ b/package.json @@ -21,14 +21,15 @@ "db:shell": "docker compose exec db psql -U postgres -d ecma_spec", "db:migrate": "bun scripts/db-migrate.ts", "sources:sync": "bun scripts/sources-sync.ts", - "pdf:ingest": "bun scripts/ingest-pdf/pipeline.ts", - "pdf:chunk": "bun scripts/ingest-pdf/chunk.ts", - "pdf:embed": "bun scripts/ingest-pdf/embed.ts", - "pdf:upload": "bun scripts/ingest-pdf/upload.ts", + 
"pdf:ingest": "bun scripts/ingest-ecma-376-pdfs/pipeline.ts", + "pdf:chunk": "bun scripts/ingest-ecma-376-pdfs/chunk.ts", + "pdf:embed": "bun scripts/ingest-ecma-376-pdfs/embed.ts", + "pdf:upload": "bun scripts/ingest-ecma-376-pdfs/upload.ts", "pdf:setup": "pip install -r scripts/requirements.txt", - "xsd:fetch": "bun scripts/ingest-xsd/fetch.ts", - "xsd:ingest": "bun scripts/ingest-xsd/ingest.ts", - "test": "export TEST_DATABASE_URL=${TEST_DATABASE_URL:-postgresql://postgres:postgres@localhost:5432/ecma_spec} && bun test tests/db/ && bun test tests/ingest-xsd/ && bun test tests/mcp-server/" + "xsd:fetch": "bun scripts/ingest-ecma-376-xsds/fetch.ts", + "xsd:ingest": "bun scripts/ingest-ecma-376-xsds/ingest.ts", + "ms:ingest": "bun scripts/ingest-ms-oi29500/ingest.ts", + "test": "export TEST_DATABASE_URL=${TEST_DATABASE_URL:-postgresql://postgres:postgres@localhost:5432/ecma_spec} && bun test tests/db/ && bun test tests/ingest-ecma-376-xsds/ && bun test tests/ingest-ms-oi29500/ && bun test tests/mcp-server/" }, "devDependencies": { "@biomejs/biome": "^2.3.13", diff --git a/scripts/ingest-pdf/README.md b/scripts/ingest-ecma-376-pdfs/README.md similarity index 100% rename from scripts/ingest-pdf/README.md rename to scripts/ingest-ecma-376-pdfs/README.md diff --git a/scripts/ingest-pdf/chunk.ts b/scripts/ingest-ecma-376-pdfs/chunk.ts similarity index 92% rename from scripts/ingest-pdf/chunk.ts rename to scripts/ingest-ecma-376-pdfs/chunk.ts index 4fe0c87..6208a81 100644 --- a/scripts/ingest-pdf/chunk.ts +++ b/scripts/ingest-ecma-376-pdfs/chunk.ts @@ -5,10 +5,10 @@ * Respects section boundaries and handles XML examples specially. 
* * Usage: - * bun scripts/ingest-pdf/chunk.ts + * bun scripts/ingest-ecma-376-pdfs/chunk.ts * * Example: - * bun scripts/ingest-pdf/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json + * bun scripts/ingest-ecma-376-pdfs/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json */ interface ExtractedSection { @@ -151,10 +151,12 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest-pdf/chunk.ts "); + console.log("Usage: bun scripts/ingest-ecma-376-pdfs/chunk.ts "); console.log(""); console.log("Example:"); - console.log(" bun scripts/ingest-pdf/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json"); + console.log( + " bun scripts/ingest-ecma-376-pdfs/chunk.ts ./extracted/part1 ./chunks/part1-chunks.json", + ); process.exit(1); } diff --git a/scripts/ingest-pdf/embed.ts b/scripts/ingest-ecma-376-pdfs/embed.ts similarity index 89% rename from scripts/ingest-pdf/embed.ts rename to scripts/ingest-ecma-376-pdfs/embed.ts index 8ab7e53..1d23802 100644 --- a/scripts/ingest-pdf/embed.ts +++ b/scripts/ingest-ecma-376-pdfs/embed.ts @@ -4,14 +4,14 @@ * Generates embeddings for chunks using the configured provider. * * Usage: - * bun scripts/ingest-pdf/embed.ts + * bun scripts/ingest-ecma-376-pdfs/embed.ts * * Environment variables: * EMBEDDING_PROVIDER - openai, google, voyage, or cohere (default: openai) * OPENAI_API_KEY / GOOGLE_API_KEY / etc. 
* * Example: - * EMBEDDING_PROVIDER=openai bun scripts/ingest-pdf/embed.ts ./chunks/part1-chunks.json ./embedded/part1-embedded.json + * EMBEDDING_PROVIDER=openai bun scripts/ingest-ecma-376-pdfs/embed.ts ./chunks/part1-chunks.json ./embedded/part1-embedded.json */ import { @@ -93,7 +93,7 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest-pdf/embed.ts "); + console.log("Usage: bun scripts/ingest-ecma-376-pdfs/embed.ts "); console.log(""); console.log("Environment variables:"); console.log(" EMBEDDING_PROVIDER - openai, google, voyage, or cohere (default: openai)"); @@ -101,7 +101,7 @@ async function main() { console.log(""); console.log("Example:"); console.log( - " EMBEDDING_PROVIDER=openai bun scripts/ingest-pdf/embed.ts ./chunks/part1.json ./embedded/part1.json", + " EMBEDDING_PROVIDER=openai bun scripts/ingest-ecma-376-pdfs/embed.ts ./chunks/part1.json ./embedded/part1.json", ); process.exit(1); } diff --git a/scripts/ingest-pdf/extract.py b/scripts/ingest-ecma-376-pdfs/extract.py similarity index 100% rename from scripts/ingest-pdf/extract.py rename to scripts/ingest-ecma-376-pdfs/extract.py diff --git a/scripts/ingest-pdf/fix-page-numbers.py b/scripts/ingest-ecma-376-pdfs/fix-page-numbers.py similarity index 100% rename from scripts/ingest-pdf/fix-page-numbers.py rename to scripts/ingest-ecma-376-pdfs/fix-page-numbers.py diff --git a/scripts/ingest-pdf/pipeline.ts b/scripts/ingest-ecma-376-pdfs/pipeline.ts similarity index 83% rename from scripts/ingest-pdf/pipeline.ts rename to scripts/ingest-ecma-376-pdfs/pipeline.ts index dacc32a..28f4d89 100644 --- a/scripts/ingest-pdf/pipeline.ts +++ b/scripts/ingest-ecma-376-pdfs/pipeline.ts @@ -4,7 +4,7 @@ * Runs the complete ingestion process: extract -> chunk -> embed -> upload * * Usage: - * bun scripts/ingest-pdf/pipeline.ts + * bun scripts/ingest-ecma-376-pdfs/pipeline.ts * * Environment variables: * DATABASE_URL - PostgreSQL connection 
string @@ -12,7 +12,7 @@ * OPENAI_API_KEY / GOOGLE_API_KEY / etc. * * Example: - * bun scripts/ingest-pdf/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf + * bun scripts/ingest-ecma-376-pdfs/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf */ import { $ } from "bun"; @@ -21,7 +21,7 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest-pdf/pipeline.ts "); + console.log("Usage: bun scripts/ingest-ecma-376-pdfs/pipeline.ts "); console.log(""); console.log("Environment variables:"); console.log(" DATABASE_URL - PostgreSQL connection string"); @@ -29,7 +29,7 @@ async function main() { console.log(" OPENAI_API_KEY / GOOGLE_API_KEY / etc."); console.log(""); console.log("Example:"); - console.log(" bun scripts/ingest-pdf/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf"); + console.log(" bun scripts/ingest-ecma-376-pdfs/pipeline.ts 1 ./pdfs/ECMA-376-Part1.pdf"); process.exit(1); } @@ -92,7 +92,7 @@ async function main() { try { await $`${pythonPath} -c "import pymupdf4llm" 2>/dev/null`; console.log(`Using Python: ${pythonPath}`); - await $`${pythonPath} scripts/ingest-pdf/extract.py ${pdfPath} ${extractedDir}`; + await $`${pythonPath} scripts/ingest-ecma-376-pdfs/extract.py ${pdfPath} ${extractedDir}`; extractSuccess = true; break; } catch { @@ -110,17 +110,17 @@ async function main() { // Step 2: Chunk console.log("\n[2/4] Chunking content..."); console.log("-".repeat(40)); - await $`bun scripts/ingest-pdf/chunk.ts ${extractedDir} ${chunksFile}`; + await $`bun scripts/ingest-ecma-376-pdfs/chunk.ts ${extractedDir} ${chunksFile}`; // Step 3: Embed console.log("\n[3/4] Generating embeddings..."); console.log("-".repeat(40)); - await $`bun scripts/ingest-pdf/embed.ts ${chunksFile} ${embeddedFile}`; + await $`bun scripts/ingest-ecma-376-pdfs/embed.ts ${chunksFile} ${embeddedFile}`; // Step 4: Upload console.log("\n[4/4] Uploading to database..."); console.log("-".repeat(40)); - await $`bun scripts/ingest-pdf/upload.ts 
${partNumber} ${embeddedFile}`; + await $`bun scripts/ingest-ecma-376-pdfs/upload.ts ${partNumber} ${embeddedFile}`; console.log(`\n${"=".repeat(60)}`); console.log("Pipeline complete!"); diff --git a/scripts/ingest-pdf/upload.ts b/scripts/ingest-ecma-376-pdfs/upload.ts similarity index 87% rename from scripts/ingest-pdf/upload.ts rename to scripts/ingest-ecma-376-pdfs/upload.ts index c17cffc..7f9201f 100644 --- a/scripts/ingest-pdf/upload.ts +++ b/scripts/ingest-ecma-376-pdfs/upload.ts @@ -4,13 +4,13 @@ * Uploads embedded chunks to the database. * * Usage: - * bun scripts/ingest-pdf/upload.ts + * bun scripts/ingest-ecma-376-pdfs/upload.ts * * Environment variables: * DATABASE_URL - PostgreSQL connection string * * Example: - * bun scripts/ingest-pdf/upload.ts 1 ./embedded/part1-embedded.json + * bun scripts/ingest-ecma-376-pdfs/upload.ts 1 ./embedded/part1-embedded.json */ import { createDbClient } from "../../packages/shared/src/db/index.ts"; @@ -30,13 +30,13 @@ async function main() { const args = process.argv.slice(2); if (args.length < 2) { - console.log("Usage: bun scripts/ingest-pdf/upload.ts "); + console.log("Usage: bun scripts/ingest-ecma-376-pdfs/upload.ts "); console.log(""); console.log("Environment variables:"); console.log(" DATABASE_URL - PostgreSQL connection string"); console.log(""); console.log("Example:"); - console.log(" bun scripts/ingest-pdf/upload.ts 1 ./embedded/part1-embedded.json"); + console.log(" bun scripts/ingest-ecma-376-pdfs/upload.ts 1 ./embedded/part1-embedded.json"); process.exit(1); } diff --git a/scripts/ingest-xsd/README.md b/scripts/ingest-ecma-376-xsds/README.md similarity index 100% rename from scripts/ingest-xsd/README.md rename to scripts/ingest-ecma-376-xsds/README.md diff --git a/scripts/ingest-xsd/ast.ts b/scripts/ingest-ecma-376-xsds/ast.ts similarity index 100% rename from scripts/ingest-xsd/ast.ts rename to scripts/ingest-ecma-376-xsds/ast.ts diff --git a/scripts/ingest-xsd/fetch.ts 
b/scripts/ingest-ecma-376-xsds/fetch.ts similarity index 100% rename from scripts/ingest-xsd/fetch.ts rename to scripts/ingest-ecma-376-xsds/fetch.ts diff --git a/scripts/ingest-xsd/ingest.ts b/scripts/ingest-ecma-376-xsds/ingest.ts similarity index 100% rename from scripts/ingest-xsd/ingest.ts rename to scripts/ingest-ecma-376-xsds/ingest.ts diff --git a/scripts/ingest-xsd/parse-schema.ts b/scripts/ingest-ecma-376-xsds/parse-schema.ts similarity index 100% rename from scripts/ingest-xsd/parse-schema.ts rename to scripts/ingest-ecma-376-xsds/parse-schema.ts diff --git a/scripts/ingest-xsd/qname.ts b/scripts/ingest-ecma-376-xsds/qname.ts similarity index 100% rename from scripts/ingest-xsd/qname.ts rename to scripts/ingest-ecma-376-xsds/qname.ts diff --git a/scripts/ingest-xsd/types.ts b/scripts/ingest-ecma-376-xsds/types.ts similarity index 100% rename from scripts/ingest-xsd/types.ts rename to scripts/ingest-ecma-376-xsds/types.ts diff --git a/scripts/ingest-xsd/vocabulary.ts b/scripts/ingest-ecma-376-xsds/vocabulary.ts similarity index 98% rename from scripts/ingest-xsd/vocabulary.ts rename to scripts/ingest-ecma-376-xsds/vocabulary.ts index c3846a2..c96dfd4 100644 --- a/scripts/ingest-xsd/vocabulary.ts +++ b/scripts/ingest-ecma-376-xsds/vocabulary.ts @@ -63,7 +63,7 @@ export function vocabularyForNamespace(uri: string): string { const v = NAMESPACE_TO_VOCABULARY[uri]; if (!v) { throw new Error( - `Unknown namespace URI: ${uri}. Add it to NAMESPACE_TO_VOCABULARY in scripts/ingest-xsd/vocabulary.ts.`, + `Unknown namespace URI: ${uri}. Add it to NAMESPACE_TO_VOCABULARY in scripts/ingest-ecma-376-xsds/vocabulary.ts.`, ); } return v; diff --git a/scripts/ingest-ms-oi29500/app-inference.ts b/scripts/ingest-ms-oi29500/app-inference.ts new file mode 100644 index 0000000..a59bf3f --- /dev/null +++ b/scripts/ingest-ms-oi29500/app-inference.ts @@ -0,0 +1,87 @@ +/** + * Map an MS-OI29500 entry's (partNumber, ecmaSection) to the Office app the + * note describes. 
+ * + * MS-OI29500 covers Word, Excel, and PowerPoint behaviors. Tagging every row + * as 'Word' (the previous default) made `app=Word` queries return Excel and + * PowerPoint behavior alongside Word's. Section numbers cleanly disambiguate + * the typical case; legitimately cross-app pages use 'Office'. + */ + +export type Office = "Word" | "Excel" | "PowerPoint" | "Office"; + +/** + * Infer the Office app a behavior note describes. Two-stage: + * + * 1. If `behaviorText` mentions exactly one app name (Word / Excel / + * PowerPoint), trust the text. This catches Part 4 / DrawingML pages + * whose section number is generic but whose behavior text is + * app-specific. + * 2. Otherwise fall back to section-based inference. + * + * `Office` is the bucket for legitimately cross-app or undeterminable rows. + */ +export function inferApp( + partNumber: number | null, + ecmaSection: string | null, + behaviorText?: string, +): Office { + if (behaviorText) { + const w = /\bWord\b/.test(behaviorText); + const e = /\bExcel\b/.test(behaviorText); + const p = /\bPowerPoint\b/.test(behaviorText); + const hits = (w ? 1 : 0) + (e ? 1 : 0) + (p ? 1 : 0); + if (hits === 1) { + if (w) return "Word"; + if (e) return "Excel"; + if (p) return "PowerPoint"; + } + // Two or more apps mentioned (or zero) → fall through to section-based. + } + + if (partNumber === 4) { + // Part 4 is the Transitional Migration spec - VML and legacy DrawingML + // extensions used across multiple apps. Tag generic. + return "Office"; + } + if (!ecmaSection) return "Office"; + const major = parseInt(ecmaSection.match(/^(\d+)/)?.[1] ?? 
"", 10); + switch (major) { + case 11: + return "Word"; // WML overview + case 13: + return "PowerPoint"; // PML overview + case 17: + return "Word"; // WordprocessingML + case 18: + return "Excel"; // SpreadsheetML + case 19: + return "PowerPoint"; // PML elements (Part 1) + case 20: + case 21: + return "Office"; // DrawingML - used by all three apps + case 22: { + // 22.1 = math (Word); 22.2-22.x = shared / extended properties. + const sub = ecmaSection.match(/^22\.(\d+)/); + const minor = sub ? parseInt(sub[1], 10) : 0; + return minor === 1 ? "Word" : "Office"; + } + default: + // Sections 2-10 are conformance / introduction / shared content. + return "Office"; + } +} + +const RANK: Record<"high" | "medium" | "low", number> = { high: 3, medium: 2, low: 1 }; + +/** min by confidence rank (low < medium < high). NULL inputs are skipped. */ +export function minConfidence( + ...values: Array<"high" | "medium" | "low" | null | undefined> +): "high" | "medium" | "low" | null { + let best: "high" | "medium" | "low" | null = null; + for (const v of values) { + if (!v) continue; + if (best === null || RANK[v] < RANK[best]) best = v; + } + return best; +} diff --git a/scripts/ingest-ms-oi29500/claim-type.ts b/scripts/ingest-ms-oi29500/claim-type.ts new file mode 100644 index 0000000..d3bb5b4 --- /dev/null +++ b/scripts/ingest-ms-oi29500/claim-type.ts @@ -0,0 +1,116 @@ +/** + * Map a behavior text snippet to a `behavior_notes.claim_type` enum value. + * + * The patterns reflect Microsoft's MS-OI29500 phrasing conventions. Order + * matters: more specific patterns are checked first. Unmatched text falls + * through to `varies_from_spec` with low confidence - that's better than + * mis-classifying as `writes`, which would pollute the table. + * + * Confidence here is the parser's certainty about the classification, not + * Microsoft's certainty about the claim. It maps to + * `behavior_notes.resolution_confidence`. 
+ */ + +export type ClaimType = + | "ignores" + | "requires_despite_optional" + | "writes" + | "reads_but_does_not_write" + | "repairs" + | "layout_behavior" + | "does_not_support" + | "varies_from_spec"; + +export interface ClaimTypeResult { + claimType: ClaimType; + confidence: "high" | "medium" | "low"; +} + +interface Pattern { + regex: RegExp; + claimType: ClaimType; + confidence: "high" | "medium" | "low"; +} + +// Order matters: more specific patterns first. Patterns are case-insensitive +// and match anywhere in the text (no word boundaries on regex starts so we +// pick up phrases mid-sentence). +const PATTERNS: Pattern[] = [ + // Read/write asymmetry - most specific. + { + regex: /\b(?:reads?|interprets?)\b[^.]*\b(?:does not write|will not write|ignores on write)\b/i, + claimType: "reads_but_does_not_write", + confidence: "high", + }, + + // Explicit ignores. + { + regex: /\b(?:Word|Office|PowerPoint|Excel)\s+ignores\b/i, + claimType: "ignores", + confidence: "high", + }, + + // "does not support" / "does not allow" - the does_not_support enum was + // added precisely because these are common in MS-OI29500 and don't fit + // "ignores" cleanly (Word may also fail / repair / drop). + { regex: /\bdoes not support\b/i, claimType: "does_not_support", confidence: "high" }, + { regex: /\bdoes not allow\b/i, claimType: "does_not_support", confidence: "high" }, + + // Writes / saves - Word emits this even though spec doesn't require it, + // or emits in a non-standard way. + { + regex: /\b(?:Word|Office|PowerPoint|Excel)\s+(?:will\s+)?(?:saves?|writes?|emits?|stores?)\b/i, + claimType: "writes", + confidence: "high", + }, + + // Repairs / treats-invalid-as. + { regex: /\brepairs?\b/i, claimType: "repairs", confidence: "medium" }, + { + regex: /\btreats?\b[^.]*\b(?:as|like)\b[^.]*\b(?:if invalid|when invalid)\b/i, + claimType: "repairs", + confidence: "medium", + }, + + // Layout / rendering / interpretation. 
"Treats X as Y" without an + // invalidity clause typically signals layout/interpretation. + { + regex: /\b(?:Word|Office|Excel|PowerPoint)\s+renders?\b/i, + claimType: "layout_behavior", + confidence: "medium", + }, + { + regex: /\b(?:Word|Office|Excel|PowerPoint)\s+(?:displays?|interprets?)\b/i, + claimType: "layout_behavior", + confidence: "medium", + }, + { regex: /\bfor\s+layout\b/i, claimType: "layout_behavior", confidence: "medium" }, + + // Required-despite-optional. + { + regex: /\b(?:Word|Office|Excel|PowerPoint)\s+requires\b/i, + claimType: "requires_despite_optional", + confidence: "medium", + }, + { + regex: /\btreats?\s+(?:the\s+)?(?:optional\s+)?[^.]*\bas\s+required\b/i, + claimType: "requires_despite_optional", + confidence: "medium", + }, + + // Generic reads. + { + regex: /\b(?:Word|Office|Excel|PowerPoint)\s+reads?\b/i, + claimType: "reads_but_does_not_write", + confidence: "medium", + }, +]; + +export function classifyClaim(behaviorText: string): ClaimTypeResult { + for (const p of PATTERNS) { + if (p.regex.test(behaviorText)) { + return { claimType: p.claimType, confidence: p.confidence }; + } + } + return { claimType: "varies_from_spec", confidence: "low" }; +} diff --git a/scripts/ingest-ms-oi29500/fetch.ts b/scripts/ingest-ms-oi29500/fetch.ts new file mode 100644 index 0000000..fd2eb2d --- /dev/null +++ b/scripts/ingest-ms-oi29500/fetch.ts @@ -0,0 +1,82 @@ +/** + * Fetch the MS-OI29500 toc.json + per-page native markdown into a local + * cache directory. Re-runs are no-ops on cached pages unless `--refresh` is + * passed. Produces a list of `{ guid, tocTitle, contentPath }` for downstream + * parsing/ingest. 
+ */ + +import { existsSync, mkdirSync, readFileSync, statSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; + +export const TOC_URL = + "https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/toc.json"; +export const PAGE_URL = (guid: string) => + `https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/${guid}?accept=text/markdown`; +export const CACHE_DIR = "data/ms-oi29500-cache"; +const FETCH_DELAY_MS = 200; +const STALE_AFTER_DAYS = 7; + +interface TocEntry { + href?: string; + toc_title?: string; + children?: TocEntry[]; + items?: TocEntry[]; +} + +export interface TocPage { + href: string; + tocTitle: string; +} + +export function flattenToc(items: TocEntry[]): TocPage[] { + const out: TocPage[] = []; + const walk = (entries: TocEntry[]) => { + for (const e of entries) { + const href = e.href ?? ""; + const title = e.toc_title ?? ""; + if (href && !href.startsWith("/") && !href.startsWith("http")) { + out.push({ href, tocTitle: title }); + } + if (e.children) walk(e.children); + if (e.items) walk(e.items); + } + }; + walk(items); + return out; +} + +/** Filter to actual implementation note pages (`2.x.x` titles). 
 */ +export function filterImplementationNotes(pages: TocPage[]): TocPage[] { + return pages.filter((p) => /^2\.\d+\.\d+\s/.test(p.tocTitle)); +} + +async function fetchCached(url: string, cachePath: string, refresh: boolean): Promise<string> { + if (!refresh && existsSync(cachePath)) { + const ageMs = Date.now() - statSync(cachePath).mtimeMs; + if (ageMs < STALE_AFTER_DAYS * 86400_000) return readFileSync(cachePath, "utf8"); + } + const res = await fetch(url); + if (!res.ok) throw new Error(`Fetch failed ${res.status}: ${url}`); + const body = await res.text(); + writeFileSync(cachePath, body); + await new Promise((r) => setTimeout(r, FETCH_DELAY_MS)); + return body; +} + +export async function fetchToc(refresh = false): Promise<TocPage[]> { + mkdirSync(CACHE_DIR, { recursive: true }); + const tocPath = join(CACHE_DIR, "toc.json"); + const tocText = await fetchCached(TOC_URL, tocPath, refresh); + const toc = JSON.parse(tocText) as { items: TocEntry[] }; + return flattenToc(toc.items); +} + +export async function fetchPage( + page: TocPage, + opts: { refresh?: boolean; verbose?: boolean } = {}, +): Promise<{ guid: string; tocTitle: string; content: string; cachePath: string }> { + const cachePath = join(CACHE_DIR, `${page.href}.md`); + const content = await fetchCached(PAGE_URL(page.href), cachePath, opts.refresh ?? false); + if (opts.verbose) console.log(` fetched ${page.href} (${content.length} bytes)`); + return { guid: page.href, tocTitle: page.tocTitle, content, cachePath }; +} diff --git a/scripts/ingest-ms-oi29500/ingest.ts b/scripts/ingest-ms-oi29500/ingest.ts new file mode 100644 index 0000000..2032c62 --- /dev/null +++ b/scripts/ingest-ms-oi29500/ingest.ts @@ -0,0 +1,334 @@ +/** + * Ingest MS-OI29500 implementation notes into `behavior_notes`. + * + * Pipeline: + * 1. Fetch toc.json + per-page native markdown (cached locally). + * 2. Parse each page into structured claim groups. + * 3. 
Resolve each entry's (name, ecmaSection, partNumber) to an + * `xsd_symbols` row when possible (transitional profile). + * 4. Classify each behavior bullet's claim_type from its verb pattern. + * 5. Replace all rows for source_id=ms-oi29500, then bulk-insert the new + * ones. Idempotent: same input → same row counts. + * + * Usage: + * DATABASE_URL=... bun scripts/ingest-ms-oi29500/ingest.ts # full corpus + * DATABASE_URL=... bun scripts/ingest-ms-oi29500/ingest.ts --count 100 # first N pages (for testing) + * DATABASE_URL=... bun scripts/ingest-ms-oi29500/ingest.ts --refresh # invalidate page cache + * DATABASE_URL=... bun scripts/ingest-ms-oi29500/ingest.ts --dry-run # parse + resolve, skip DB writes + */ + +import type { Sql } from "postgres"; + +import { createDbClient } from "../../packages/shared/src/db/index.ts"; + +import { inferApp, minConfidence } from "./app-inference.ts"; +import { classifyClaim } from "./claim-type.ts"; +import { fetchPage, fetchToc, filterImplementationNotes, type TocPage } from "./fetch.ts"; +import { entryIdFromTocTitle, type ParsedClaim, parsePage } from "./parse.ts"; +import { loadSymbolMap, resolveSymbol, type SymbolMap } from "./resolve.ts"; + +const SOURCE_NAME = "ms-oi29500"; +// Editorial confidence in the truth of the claim. MS-OI29500 is published by +// Microsoft and authoritative for Office implementation behavior; we set 'high' +// across the board. Hand-curated rows may use 'medium' / 'low' separately. 
+const SOURCE_EDITORIAL_CONFIDENCE = "high" as const; +const BATCH_CHUNK = 500; + +interface CliArgs { + count: number | null; + refresh: boolean; + dryRun: boolean; + verbose: boolean; +} + +interface BehaviorNoteRow { + symbol_id: number | null; + app: string; + version_scope: string | null; + claim_type: string; + summary: string; + source_id: number; + section_id: string | null; + confidence: string | null; + source_anchor: string; + source_commit: string | null; + claim_label: string | null; + claim_index: number; + target_ref: string | null; + standard_text: string; + behavior_text: string; + resolution_confidence: string | null; +} + +interface IngestStats { + pagesTotal: number; + pagesIngestable: number; + pagesSkipped: number; + rowsInserted: number; + resolvedTopLevel: number; + resolvedLocal: number; + unresolvedNoVocab: number; + unresolvedNoMatch: number; + unresolvedAmbiguous: number; +} + +function parseArgs(): CliArgs { + const args: CliArgs = { count: null, refresh: false, dryRun: false, verbose: false }; + const argv = process.argv.slice(2); + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (a === "--count") args.count = parseInt(argv[++i] ?? "", 10); + else if (a === "--refresh") args.refresh = true; + else if (a === "--dry-run") args.dryRun = true; + else if (a === "--verbose") args.verbose = true; + else if (a === "--help" || a === "-h") { + console.log( + "Usage: bun scripts/ingest-ms-oi29500/ingest.ts [--count N] [--refresh] [--dry-run] [--verbose]", + ); + process.exit(0); + } else throw new Error(`Unknown argument: ${a}`); + } + return args; +} + +async function lookupSourceId(sql: Sql, name: string): Promise { + const rows = await sql>` + SELECT id FROM reference_sources WHERE name = ${name} LIMIT 1 + `; + if (rows.length === 0) { + throw new Error( + `reference_sources row not found for '${name}'. 
Run 'bun run sources:sync' first.`, + ); + } + return rows[0].id; +} + +function buildRow(args: { + sourceId: number; + sourceAnchor: string; + sourceCommit: string | null; + sectionId: string | null; + app: string; + claim: ParsedClaim; + claimIndex: number; + behaviorIndex: number; + behaviorText: string; + versionScope: string | null; + resolutionSymbolId: number | null; + resolutionConfidence: "high" | "medium" | "low" | null; + targetRef: string | null; +}): BehaviorNoteRow { + const { claimType, confidence: classifierConfidence } = classifyClaim(args.behaviorText); + const summary = + args.behaviorText.length > 280 ? `${args.behaviorText.slice(0, 277)}...` : args.behaviorText; + + // claim_index uniquely identifies this row within the page (claim_label + // alone collides for multi-bullet groups). claim_label stays as the + // human-readable letter ('a', 'b', ...) for display. + const compositeIndex = args.claimIndex * 100 + args.behaviorIndex; + + // resolution_confidence is the worst-of (claim-type classifier, symbol + // resolver). If either is shaky, the row is shaky. + const resolutionConfidence = minConfidence(classifierConfidence, args.resolutionConfidence); + + return { + symbol_id: args.resolutionSymbolId, + app: args.app, + version_scope: args.versionScope, + claim_type: claimType, + summary, + source_id: args.sourceId, + section_id: args.sectionId, + // Editorial confidence: MS-OI29500 is authoritative. + confidence: SOURCE_EDITORIAL_CONFIDENCE, + source_anchor: args.sourceAnchor, + source_commit: args.sourceCommit, + claim_label: args.claim.label ?? 
null, + claim_index: compositeIndex, + target_ref: args.targetRef, + standard_text: args.claim.standardText, + behavior_text: args.behaviorText, + resolution_confidence: resolutionConfidence, + }; +} + +async function inChunks(items: T[], size: number, fn: (chunk: T[]) => Promise) { + for (let i = 0; i < items.length; i += size) { + await fn(items.slice(i, i + size)); + } +} + +async function ingestRows(sql: Sql, sourceId: number, rows: BehaviorNoteRow[], verbose: boolean) { + await sql.begin(async (tx) => { + const deleted = await tx`DELETE FROM behavior_notes WHERE source_id = ${sourceId}`; + if (verbose) console.log(` cleared ${deleted.count} existing rows for source_id=${sourceId}`); + + await inChunks(rows, BATCH_CHUNK, async (chunk) => { + await tx` + INSERT INTO behavior_notes ${tx( + chunk, + "symbol_id", + "app", + "version_scope", + "claim_type", + "summary", + "source_id", + "section_id", + "confidence", + "source_anchor", + "source_commit", + "claim_label", + "claim_index", + "target_ref", + "standard_text", + "behavior_text", + "resolution_confidence", + )} + `; + if (verbose) console.log(` inserted ${chunk.length} rows`); + }); + }); +} + +async function main() { + const args = parseArgs(); + + const url = process.env.DATABASE_URL; + if (!url) throw new Error("Set DATABASE_URL"); + + const db = createDbClient(url); + + console.log(`Loading source row '${SOURCE_NAME}' and symbol map...`); + const sourceId = await lookupSourceId(db.sql, SOURCE_NAME); + const map: SymbolMap = await loadSymbolMap(db.sql); + console.log(` source_id=${sourceId}; ${map.all.length} symbols loaded.\n`); + + console.log("Fetching toc.json..."); + const allEntries = await fetchToc(args.refresh); + const candidates = filterImplementationNotes(allEntries); + console.log(` ${candidates.length} implementation-note pages in toc.\n`); + + const subset: TocPage[] = args.count !== null ? 
candidates.slice(0, args.count) : candidates; + console.log(`Fetching ${subset.length} page(s) (cached unless --refresh)...`); + + const rows: BehaviorNoteRow[] = []; + const stats: IngestStats = { + pagesTotal: subset.length, + pagesIngestable: 0, + pagesSkipped: 0, + rowsInserted: 0, + resolvedTopLevel: 0, + resolvedLocal: 0, + unresolvedNoVocab: 0, + unresolvedNoMatch: 0, + unresolvedAmbiguous: 0, + }; + + for (let i = 0; i < subset.length; i++) { + const page = subset[i]; + const fetched = await fetchPage(page, { refresh: args.refresh }); + const entryId = entryIdFromTocTitle(page.tocTitle); + const parsed = parsePage(fetched.content, { entryId }); + + if (i % 50 === 0 && i > 0) { + console.log(` progress: ${i}/${subset.length} pages, ${rows.length} rows so far`); + } + + if (!parsed.ingestable) { + stats.pagesSkipped++; + continue; + } + stats.pagesIngestable++; + + const t = parsed.parsedTitle; + const ecmaSection = t?.ecmaSection ?? null; + const partNumber = t?.partNumber ?? null; + const name = t?.name ?? null; + const sourceCommit = parsed.frontmatter.git_commit_id ?? null; + + // Resolve once per page (the symbol context is the same across claims). + // target_ref is always populated, even on resolved rows. Migration 0006 + // changed `behavior_notes.symbol_id` FK to ON DELETE SET NULL, so a + // future xsd:ingest will null out symbol_id on attached rows. Without a + // target_ref to fall back on, those notes become unreachable via + // ooxml_behavior(qname=...) until a full ms:ingest re-runs. Keeping + // target_ref populated restores the qname word-boundary fallback. + let resolutionSymbolId: number | null = null; + let resolutionConfidence: "high" | "medium" | "low" | null = null; + const targetRef = + ecmaSection && name + ? `Section ${ecmaSection}, ${name}` + : `Section ${ecmaSection ?? "?"}, ${name ?? 
"?"}`; + + if (ecmaSection && name) { + const outcome = resolveSymbol(map, name, ecmaSection, partNumber); + if (outcome.resolved) { + resolutionSymbolId = outcome.symbolId; + resolutionConfidence = outcome.confidence; + if (outcome.isLocal) stats.resolvedLocal++; + else stats.resolvedTopLevel++; + } else { + if (outcome.reason === "no-vocabulary") stats.unresolvedNoVocab++; + else if (outcome.reason === "no-match") stats.unresolvedNoMatch++; + else if (outcome.reason === "ambiguous") stats.unresolvedAmbiguous++; + } + } else { + stats.unresolvedNoVocab++; + } + + // Section_id stores both the ECMA section ('17.4.37') and the entry_id + // ('2.1.149') as a compact citation. Tools format these for display. + const sectionId = entryId + ? `${entryId} (Part ${partNumber ?? "?"} §${ecmaSection ?? "?"})` + : ecmaSection; + + parsed.claims.forEach((claim, claimIdx) => { + claim.behaviors.forEach((behavior, behaviorIdx) => { + rows.push( + buildRow({ + sourceId, + sourceAnchor: page.href, + sourceCommit, + sectionId, + app: inferApp(partNumber, ecmaSection, behavior.text), + claim, + claimIndex: claimIdx, + behaviorIndex: behaviorIdx, + behaviorText: behavior.text, + versionScope: behavior.versionScope, + resolutionSymbolId, + resolutionConfidence, + targetRef, + }), + ); + }); + }); + } + + console.log( + `\nParsed: ${stats.pagesIngestable} ingestable, ${stats.pagesSkipped} skipped, ${rows.length} rows assembled.`, + ); + console.log(` resolved (top-level): ${stats.resolvedTopLevel}`); + console.log(` resolved (local): ${stats.resolvedLocal}`); + console.log(` no-vocabulary: ${stats.unresolvedNoVocab}`); + console.log(` no-match: ${stats.unresolvedNoMatch}`); + console.log(` ambiguous: ${stats.unresolvedAmbiguous}`); + + if (args.dryRun) { + console.log("\n--dry-run: skipping DB writes."); + await db.close(); + return; + } + + console.log("\nWriting to behavior_notes..."); + await ingestRows(db.sql, sourceId, rows, args.verbose); + stats.rowsInserted = rows.length; + 
console.log(`Done. Inserted ${stats.rowsInserted} behavior_notes rows.`); + + await db.close(); +} + +main().catch((err) => { + console.error("Ingest failed:", err); + process.exit(1); +}); diff --git a/scripts/ingest-ms-oi29500/parse.ts b/scripts/ingest-ms-oi29500/parse.ts new file mode 100644 index 0000000..4463676 --- /dev/null +++ b/scripts/ingest-ms-oi29500/parse.ts @@ -0,0 +1,346 @@ +/** + * MS-OI29500 markdown parser. + * + * Input: a single Microsoft Learn page fetched via `?accept=text/markdown`, + * plus optionally the toc_title (which carries the entry_id like "2.1.1779" + * that's not present in the H1). + * Output: a structured representation of the page's title, frontmatter, and + * lettered claim groups with their italic "spec says" text and indented + * "Word does" behavior bullets. + * + * The native Microsoft Learn markdown shape is regular: + * + * --- + * + * --- + * + * # [MS-OI29500]: Part 4 Section 14.9.1.1, txbxContent (Rich Text Box ...) | Microsoft Learn + * + * - *For additional notes...* [oMath, §22.1.2.77(f)](...) ... <-- preamble (skip) + * + * a. *The standard states that text box content can be placed inside endnotes...* + * + * - Word does not allow textbox content inside endnotes... + * + * b. *The standard specifies this element as part of the WordprocessingML namespace.* + * + * - Word will save an mce choice for VML content... + * - This note applies to the following products: Office 2013 Client (Strict)... + * + * Version-scope bullets (`This note applies to the following products: ...`) + * are attached to the previous behavior in the same claim group, not emitted + * as their own behavior row. + */ + +export interface MsImplementationPage { + /** MS-internal entry id (e.g. "2.1.1779"), supplied by the caller from the + * toc_title since it's not present in the H1. */ + entryId: string | null; + /** Native frontmatter parsed as a key-value map (string-only values). 
*/ + frontmatter: Record; + /** The H1 of the page, with " | Microsoft Learn" stripped. */ + rawTitle: string | null; + /** Title parsed into structured parts when the canonical shape matches. */ + parsedTitle: ParsedTitle | null; + /** Whether this page is a candidate for behavior_notes ingest. Requires + * parsable Part/Section + at least one claim group + at least one behavior + * row across all claims (a claim group without behaviors yields no + * behavior_notes rows and is treated as skip). */ + ingestable: boolean; + /** Reason for skipping when not ingestable. */ + skipReason: string | null; + /** Lettered claim groups extracted from the body. */ + claims: ParsedClaim[]; +} + +export interface ParsedTitle { + entryId: string | null; + partNumber: number | null; + ecmaSection: string | null; + name: string | null; + description: string | null; +} + +export interface ParsedClaim { + /** "a", "b", ... All observed pages use lettered headers; null is reserved + * for a future single-anonymous-claim path that we have not yet seen in + * the corpus. */ + label: string | null; + /** Standard text from the italic block, normalized (single-line). */ + standardText: string; + /** One row per ` -` bullet. Excludes version-scope marker bullets, which + * attach to the immediately preceding behavior. */ + behaviors: ParsedBehavior[]; +} + +export interface ParsedBehavior { + /** Verbatim behavior text, normalized to single-line. */ + text: string; + /** Set when a "This note applies to the following products: ..." marker + * follows this behavior. 
*/ + versionScope: string | null; +} + +const FRONTMATTER_RE = /^---\n([\s\S]*?)\n---\n/; +const H1_RE = /^#\s+(.+?)\s*$/m; +const TITLE_PARSE_RE = + /^(?:\[MS-OI29500\]:\s*)?(?:Part\s+(\d+)\s+Section\s+([\d.]+),\s+)?(.+?)(?:\s*\|\s*Microsoft Learn)?$/; +const ENTRY_ID_RE = /^(\d+\.\d+(?:\.\d+)*)\s+(.*)$/; +const NAME_DESC_RE = /^(.+?)\s*\(([^)]+)\)\s*$/; + +function normalizeLineEndings(s: string): string { + // Microsoft Learn serves CRLF in markdown bodies. Normalize to LF so the + // rest of the parser can rely on `\n` line splits and `.` regex matching + // (JS regex `.` does not match `\r`). + return s.replace(/\r\n?/g, "\n"); +} + +function parseFrontmatter(content: string): { frontmatter: Record; body: string } { + const normalized = normalizeLineEndings(content); + const m = normalized.match(FRONTMATTER_RE); + if (!m) return { frontmatter: {}, body: normalized }; + const fmText = m[1]; + const body = normalized.slice(m[0].length); + + const fm: Record = {}; + for (const line of fmText.split("\n")) { + const kv = line.match(/^([a-zA-Z_][\w-]*):\s*(.*?)\s*$/); + if (!kv) continue; + const key = kv[1]; + let value = kv[2]; + if ( + (value.startsWith("'") && value.endsWith("'")) || + (value.startsWith('"') && value.endsWith('"')) + ) { + value = value.slice(1, -1); + } + fm[key] = value; + } + return { frontmatter: fm, body }; +} + +function parseTitle(rawTitle: string): ParsedTitle | null { + const m = rawTitle.match(TITLE_PARSE_RE); + if (!m) return null; + + const partNumber = m[1] ? 
parseInt(m[1], 10) : null; + const ecmaSection = m[2] || null; + let nameAndDesc = m[3].trim(); + + let entryId: string | null = null; + const entryM = nameAndDesc.match(ENTRY_ID_RE); + if (entryM) { + entryId = entryM[1]; + nameAndDesc = entryM[2]; + } + + let name: string | null = nameAndDesc; + let description: string | null = null; + const descM = nameAndDesc.match(NAME_DESC_RE); + if (descM) { + name = descM[1].trim(); + description = descM[2].trim(); + } + + return { entryId, partNumber, ecmaSection, name, description }; +} + +function normalize(s: string): string { + return s.replace(/\s+/g, " ").trim(); +} + +const VERSION_SCOPE_PREFIX = "this note applies to the following products:"; + +function isVersionScopeMarker(text: string): boolean { + return text.toLowerCase().startsWith(VERSION_SCOPE_PREFIX); +} + +function extractVersionScope(text: string): string { + return text.slice(VERSION_SCOPE_PREFIX.length).trim(); +} + +/** + * Find lettered claim headers in the body. The shape is `^[a-z]\. \*...\*` - + * the italic spec text may span lines (markdown rewraps), so we accept any + * content up to the next blank line. + */ +function findLetteredHeaders(body: string): Array<{ + label: string; + standardText: string; + startIndex: number; + endIndex: number; +}> { + const lines = body.split("\n"); + const headers: Array<{ + label: string; + standardText: string; + startIndex: number; + endIndex: number; + }> = []; + + // Compute byte offsets per line for slicing later + const offsets: number[] = []; + let off = 0; + for (const line of lines) { + offsets.push(off); + off += line.length + 1; + } + + const headerStart = /^([a-z])\.\s+\*/; + + for (let i = 0; i < lines.length; i++) { + const m = lines[i].match(headerStart); + if (!m) continue; + const label = m[1]; + // Collect the italic block: starts on this line at the `*`, may continue + // across following lines until a `*` closes it. 
Microsoft's converter + // generally keeps the italic on a single paragraph. + const startIndex = offsets[i]; + + // Slice from the `*` position through subsequent non-empty lines. + const firstStarIdx = lines[i].indexOf("*"); + const collected: string[] = [lines[i].slice(firstStarIdx + 1)]; + let j = i; + // Read forward until we find a closing star outside of inline emphasis. + // In practice a single italic span runs to end-of-paragraph; we accumulate + // lines until a blank line. + if (!collected[0].endsWith("*")) { + j = i + 1; + while (j < lines.length && lines[j].trim().length > 0) { + collected.push(lines[j]); + if (lines[j].trimEnd().endsWith("*")) break; + j++; + } + } + const concat = collected.join(" "); + const endStarIdx = concat.lastIndexOf("*"); + if (endStarIdx === -1) continue; // malformed + const standardText = normalize(concat.slice(0, endStarIdx)); + + // Find where this group's behavior block ends: next lettered header or EOF. + let blockEndLine = lines.length; + for (let k = j + 1; k < lines.length; k++) { + if (lines[k].match(headerStart)) { + blockEndLine = k; + break; + } + } + const endIndex = offsets[blockEndLine] ?? body.length; + headers.push({ label, standardText, startIndex, endIndex }); + } + + return headers; +} + +/** + * Extract behavior bullets from a slice of body. Bullets are 4-space-indented + * dash items: `^ - text`. Multi-line bullet content continues with another + * 4+ space indent until a blank line or a non-indented line. + */ +function extractBehaviorBullets(slice: string): ParsedBehavior[] { + const lines = slice.split("\n"); + const bullets: ParsedBehavior[] = []; + let current: string[] | null = null; + + const flush = () => { + if (!current) return; + const joined = normalize(current.join(" ")); + if (joined.length === 0) { + current = null; + return; + } + if (isVersionScopeMarker(joined)) { + // Attach scope to the previous behavior. 
If there is no previous + // behavior (rare: scope-only claim), drop it - there's no row for it + // to belong to. Real corpus has not exhibited this case. + const scope = extractVersionScope(joined); + if (bullets.length > 0) { + bullets[bullets.length - 1].versionScope = scope; + } + } else { + bullets.push({ text: joined, versionScope: null }); + } + current = null; + }; + + for (const line of lines) { + const startsBullet = /^ {4}- (.*)$/.exec(line); + const continuesBullet = /^ {6}\S/.test(line) || /^ {8}\S/.test(line); + if (startsBullet) { + flush(); + current = [startsBullet[1]]; + } else if (current !== null && continuesBullet) { + current.push(line.trim()); + } else if (line.trim() === "") { + flush(); + } else if (current !== null) { + flush(); + } + } + flush(); + return bullets; +} + +/** + * Extract the entry_id (e.g. "2.1.1779") from a toc_title shaped like + * "2.1.1779 Part 4 Section 14.9.1.1, txbxContent (...)". Returns null when + * the toc_title doesn't follow the expected pattern. + */ +export function entryIdFromTocTitle(tocTitle: string | null | undefined): string | null { + if (!tocTitle) return null; + const m = tocTitle.match(/^(\d+\.\d+(?:\.\d+)*)\s/); + return m ? m[1] : null; +} + +export function parsePage( + content: string, + opts?: { entryId?: string | null }, +): MsImplementationPage { + const { frontmatter, body } = parseFrontmatter(content); + const entryId = opts?.entryId ?? null; + + const h1Match = body.match(H1_RE); + const rawTitle = h1Match ? h1Match[1].replace(/\s*\|\s*Microsoft Learn\s*$/, "").trim() : null; + const parsedTitle = rawTitle ? 
parseTitle(rawTitle) : null; + if (parsedTitle && entryId) parsedTitle.entryId = entryId; + + let skipReason: string | null = null; + let ingestable = true; + if (!parsedTitle || parsedTitle.partNumber == null) { + ingestable = false; + skipReason = "title lacks Part/Section"; + } + + const headers = findLetteredHeaders(body); + const claims: ParsedClaim[] = []; + for (const h of headers) { + const slice = body.slice(h.startIndex, h.endIndex); + const behaviors = extractBehaviorBullets(slice); + claims.push({ label: h.label, standardText: h.standardText, behaviors }); + } + + const totalBehaviors = claims.reduce((a, c) => a + c.behaviors.length, 0); + + if (ingestable) { + if (claims.length === 0) { + ingestable = false; + skipReason = "no claim groups detected"; + } else if (totalBehaviors === 0) { + // Pages with claim headers but no usable behavior text (e.g. behaviors + // stored in markdown tables) produce no behavior_notes rows. Skip them + // rather than counting as a successful parse. + ingestable = false; + skipReason = "claims found but no behavior bullets (likely table-based)"; + } + } + + return { + entryId, + frontmatter, + rawTitle, + parsedTitle, + ingestable, + skipReason, + claims, + }; +} diff --git a/scripts/ingest-ms-oi29500/resolve.ts b/scripts/ingest-ms-oi29500/resolve.ts new file mode 100644 index 0000000..4f45fd6 --- /dev/null +++ b/scripts/ingest-ms-oi29500/resolve.ts @@ -0,0 +1,197 @@ +/** + * Phase 2 symbol resolver. + * + * Maps an MS-OI29500 entry's `(name, ecma_section)` to a row in `xsd_symbols` + * for the `transitional` profile. + * + * Conservative: when multiple plausible candidates exist with no clear way to + * pick (ambiguous kind/vocab), the resolver returns `resolved: false` with a + * `targetRef` rather than guessing. Wrong attachment is worse than no + * attachment. 
+ */ + +import type { Sql } from "postgres"; + +export interface SymbolRow { + id: number; + vocabulary_id: string; + local_name: string; + kind: string; + parent_symbol_id: number | null; +} + +export interface SymbolMap { + /** All transitional symbols. Source of truth; lookups derive from this. */ + all: SymbolRow[]; + /** (vocabulary_id, local_name) → rows. */ + byVocabAndName: Map; +} + +export type ResolutionOutcome = + | { + resolved: true; + symbolId: number; + symbolKind: string; + vocabulary: string; + confidence: "high" | "medium"; + /** True when the matched symbol is a local element decl + * (parent_symbol_id is set). The current MCP `ooxml_element` + * lookup filters to top-level only, so local matches are reachable + * only through `ooxml_behavior`. Ingest still attaches the + * behavior note via symbol_id so the dedicated tool can surface + * it; the inline tool will silently skip it. */ + isLocal: boolean; + } + | { + resolved: false; + reason: "no-vocabulary" | "no-match" | "ambiguous"; + targetRef: string; + candidates?: Array<{ id: number; vocabulary: string; kind: string }>; + }; + +export async function loadSymbolMap(sql: Sql): Promise { + const rows = await sql` + SELECT s.id, s.vocabulary_id, s.local_name, s.kind, s.parent_symbol_id + FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + JOIN xsd_profiles p ON p.id = sp.profile_id + WHERE p.name = 'transitional' + `; + const byVocabAndName = new Map(); + for (const r of rows) { + const key = `${r.vocabulary_id}|${r.local_name}`; + if (!byVocabAndName.has(key)) byVocabAndName.set(key, []); + byVocabAndName.get(key)!.push(r); + } + return { all: rows, byVocabAndName }; +} + +/** + * Map an ECMA (partNumber, section) to a candidate vocabulary list. The + * ordering matters: when a name resolves in multiple vocabularies, we pick + * the first hit in this list. 
+ * + * Part 4 (Transitional Migration Features) is VML / legacy and not in our + * ingested XSD set - every Part 4 page maps to no vocabulary, producing a + * target_ref instead of a wrong symbol attachment. + * + * Part 1 sections 13 (PML) and 18 (SML) are also outside the current XSD + * scope; they are documented here returning [] for the same reason. + */ +function inferVocabularies(partNumber: number | null, section: string): string[] { + const m = section.match(/^(\d+)/); + if (!m) return []; + const major = parseInt(m[1], 10); + + // Part 4 = Transitional Migration Features = VML and legacy DrawingML; + // not currently ingested. + if (partNumber === 4) return []; + + switch (major) { + case 13: + return []; // PresentationML - not ingested + case 14: + case 15: + return []; // VML + case 17: + return ["wml-main"]; + case 18: + return []; // SpreadsheetML - not ingested + case 19: + return []; // PresentationML - not ingested + case 20: + return ["dml-main", "dml-pic", "dml-wp", "shared-types"]; + case 21: + return ["dml-chart", "dml-diagram", "dml-chartDrawing", "dml-main"]; + case 22: { + // 22.x is split: 22.1 = math, 22.9 = shared simple types, others + // (22.2 ext-props, 22.3 custom-props, 22.4 bibliography, etc.) are + // not in our ingest scope. + const sub = section.match(/^22\.(\d+)/); + const minor = sub ? parseInt(sub[1], 10) : 0; + if (minor === 1) return ["shared-math"]; + if (minor === 9) return ["shared-types"]; + return []; + } + default: + return []; + } +} + +const KIND_PRIORITY = [ + "element", + "complexType", + "simpleType", + "attributeGroup", + "group", + "attribute", +]; + +function kindRank(kind: string): number { + const idx = KIND_PRIORITY.indexOf(kind); + return idx === -1 ? 
KIND_PRIORITY.length : idx; +} + +export function resolveSymbol( + map: SymbolMap, + name: string, + ecmaSection: string, + partNumber: number | null, +): ResolutionOutcome { + const vocabs = inferVocabularies(partNumber, ecmaSection); + if (vocabs.length === 0) { + return { + resolved: false, + reason: "no-vocabulary", + targetRef: `Section ${ecmaSection}, ${name}`, + }; + } + + const candidates: SymbolRow[] = []; + for (const v of vocabs) { + const rows = map.byVocabAndName.get(`${v}|${name}`) ?? []; + candidates.push(...rows); + } + + if (candidates.length === 0) { + return { + resolved: false, + reason: "no-match", + targetRef: `Section ${ecmaSection}, ${name} (searched: ${vocabs.join(", ")})`, + }; + } + + // Prefer top-level (parent_symbol_id IS NULL) over local element decls. + const topLevel = candidates.filter((c) => c.parent_symbol_id === null); + const pool = topLevel.length > 0 ? topLevel : candidates; + + const sorted = [...pool].sort((a, b) => kindRank(a.kind) - kindRank(b.kind)); + const bestKind = sorted[0].kind; + const bestKindMatches = sorted.filter((c) => c.kind === bestKind); + + if (bestKindMatches.length === 1) { + const winner = bestKindMatches[0]; + // High confidence when only one vocab was tried; medium when we had to + // pick across multiple DrawingML vocabs. + const confidence: "high" | "medium" = vocabs.length === 1 ? 
"high" : "medium"; + return { + resolved: true, + symbolId: winner.id, + symbolKind: winner.kind, + vocabulary: winner.vocabulary_id, + confidence, + isLocal: winner.parent_symbol_id !== null, + }; + } + + return { + resolved: false, + reason: "ambiguous", + targetRef: `Section ${ecmaSection}, ${name}`, + candidates: bestKindMatches.map((c) => ({ + id: c.id, + vocabulary: c.vocabulary_id, + kind: c.kind, + })), + }; +} diff --git a/scripts/seed-word-observations.ts b/scripts/seed-word-observations.ts new file mode 100644 index 0000000..800f2d3 --- /dev/null +++ b/scripts/seed-word-observations.ts @@ -0,0 +1,271 @@ +/** + * Seed the verification layer with the 4 observations recorded during Phase + * 4 dogfooding (the Arabic-bold test, the cols/num test, the trHeight val=0 + * test, and the style-rPr test). Each observation links to the relevant + * MS-OI29500 note(s) by source_anchor. + * + * Re-running is idempotent: fixtures upsert on `name`; observations and + * join rows are skipped when their (fixture, scenario) / (note, observation) + * pair already exists. + * + * Usage: + * DATABASE_URL=... bun scripts/seed-word-observations.ts + */ + +import type { Sql } from "postgres"; +import { createDbClient } from "../packages/shared/src/db/index.ts"; + +interface FixtureSpec { + name: string; + description: string; + sha256: string; + generatorScript: string; + wordVersion: string; +} + +interface ObservationSpec { + fixtureName: string; + scenario: string; + finding: string; + beforeXml: string | null; + afterXml: string | null; + links: Array<{ + sourceAnchor: string; + /** Letter of the specific sub-claim on the page (a, b, c, ...). MS-OI29500 + * pages can have multiple claim groups; without this, a link defaults to + * claim 'a' and may end up tagging the wrong sub-claim. 
*/ + claimLabel: string; + status: "confirmed" | "refined" | "contradicted" | "not_reproducible"; + notes: string | null; + }>; +} + +const FIXTURES: FixtureSpec[] = [ + { + name: "arabic-bold-test", + description: + "One paragraph with bold English and bold Arabic runs side by side. Used to inspect whether Word emits w:b or w:bCs for cs/rtl runs.", + sha256: "5acf5e29f847afe964e805dd5b5138cde4963a22e501e84b787018eb68bb078f", + generatorScript: + 'mcp__word-api__create_document(content=[heading, paragraph with runs: bold English + bold Arabic "النص العربي"])', + wordVersion: "Word 16.0", + }, + { + name: "cols-test", + description: + "Three unequal-width columns after a continuous section break. Used to check Word's `` num attribute behavior.", + sha256: "732f3b7691525d0ff4bd7f80bcb7709447e39ff2d50ca3752a22ba6e9845404b", + generatorScript: + "mcp__word-api__create_document(content=[paragraph, section_break continuous, columns count=3 equal_width=false widths=[2.5, 1.8, 1.2]])", + wordVersion: "Word 16.0", + }, + { + name: "rowheight-val-zero", + description: + 'Hand-authored DOCX with opened and saved by Word, to observe Word\'s repair behavior.', + sha256: "b82336766dd86d25b1d2ba6220f04d511ed903cc017c3d3d3eb2507c0d15632a", + generatorScript: + 'PowerShell: author 1x1 table with Row.Height=24 / HeightRule=2, then patch document.xml to val="0", reopen and SaveAs2.', + wordVersion: "Word 16.0", + }, + { + name: "style-rpr-test", + description: + "Document containing built-in Heading 1, Heading 2, Quote styles plus an inline-formatted paragraph. 
Used to inspect every styles.xml rPr block for disallowed children.", + sha256: "2c6b384a3a1f8ed695db23747a9d78ea26197555657723a3df1fdd1d4d22ff4a", + generatorScript: + "mcp__word-api__create_document(content=[heading L1, heading L2, paragraph style=Quote, paragraph with runs])", + wordVersion: "Word 16.0", + }, +]; + +const OBSERVATIONS: ObservationSpec[] = [ + { + fixtureName: "arabic-bold-test", + scenario: "authored", + finding: + "Word emits on every bold run including the cs/rtl Arabic runs. It does NOT emit . Word reads and renders Arabic bold from this file.", + beforeXml: null, + afterXml: 'النص', + // MS-OI29500 §17.3.2.1, sub-claim a (the only claim on the b/bold page). + links: [ + { + sourceAnchor: "03b9695f-fd69-435d-90e6-b1069aadf291", + claimLabel: "a", + status: "contradicted", + notes: + "Note describes a read rule (w:b applies only to non-cs/non-rtl runs) but Word's actual read+write paths apply w:b to cs/rtl runs too. Implementers should not gate complex-script bold on w:bCs alone.", + }, + ], + }, + { + fixtureName: "cols-test", + scenario: "authored", + finding: + 'Word writes with 3 children, even though the spec says num is ignored when equalWidth=false. Confirms the documented requirement.', + beforeXml: null, + afterXml: + '', + // MS-OI29500 §17.6.4 (cols), sub-claim c: "Word requires that the value + // of the num attribute matches the number of child col elements." + links: [ + { + sourceAnchor: "ef7027d4-e05d-473b-8777-dcc2aee91935", + claimLabel: "c", + status: "confirmed", + notes: null, + }, + ], + }, + { + fixtureName: "rowheight-val-zero", + scenario: "open-and-save", + finding: + 'Word strips the entire element (and the now-empty parent) on save. 
The doc says Word "requires val != 0"; Word\'s actual repair path is to drop the directive entirely rather than reject the file or coerce val to a positive number.', + beforeXml: '<w:trHeight w:val="0" w:hRule="exact"/>', + afterXml: "(no trHeight on the row)", + // MS-OI29500 §17.4.80 (trHeight), sub-claim c: "Word requires that if + // the hRule attribute is set to exact, then the val attribute must not + // be 0." (Sub-claim a covers the hRule-omitted default; sub-claim b + // covers the val datatype.) + links: [ + { + sourceAnchor: "5919e0bd-e6ce-477e-8d66-0e5282f5c506", + claimLabel: "c", + status: "refined", + notes: + "Direction is correct (Word doesn't keep val=0 with hRule=exact) but Word's enforcement is silent removal, not validation failure. SuperDoc parsers should expect the trHeight to disappear during a Word round-trip.", + }, + ], + }, + { + fixtureName: "style-rpr-test", + scenario: "authored", + finding: + "Across 28 w:rPr blocks under w:style in styles.xml, Word emits zero of the disallowed children (cs, highlight, oMath, rPrChange, rStyle, rtl). Confirms the documented restriction.", + beforeXml: null, + afterXml: "(no disallowed children in any of 28 inspected style rPr blocks)", + // MS-OI29500 §17.7.6.2 (rPr children in style definitions), single claim a. 
+ links: [ + { + sourceAnchor: "d0244b61-fd96-45f0-ac84-7380d2b6d663", + claimLabel: "a", + status: "confirmed", + notes: null, + }, + ], + }, +]; + +async function upsertFixture(sql: Sql, f: FixtureSpec): Promise { + const [row] = await sql>` + INSERT INTO word_fixtures (name, description, sha256, generator_script, word_version) + VALUES (${f.name}, ${f.description}, ${f.sha256}, ${f.generatorScript}, ${f.wordVersion}) + ON CONFLICT (name) DO UPDATE SET + description = EXCLUDED.description, + sha256 = EXCLUDED.sha256, + generator_script = EXCLUDED.generator_script, + word_version = EXCLUDED.word_version + RETURNING id + `; + return row.id; +} + +async function findOrInsertObservation( + sql: Sql, + fixtureId: number, + o: ObservationSpec, +): Promise { + const existing = await sql>` + SELECT id FROM word_observations + WHERE fixture_id = ${fixtureId} AND scenario = ${o.scenario} AND finding = ${o.finding} + LIMIT 1 + `; + if (existing.length > 0) return existing[0].id; + const [row] = await sql>` + INSERT INTO word_observations (fixture_id, scenario, finding, before_xml, after_xml) + VALUES (${fixtureId}, ${o.scenario}, ${o.finding}, ${o.beforeXml}, ${o.afterXml}) + RETURNING id + `; + return row.id; +} + +async function findNoteId( + sql: Sql, + sourceAnchor: string, + claimLabel: string, +): Promise { + const rows = await sql>` + SELECT id FROM behavior_notes + WHERE source_anchor = ${sourceAnchor} + AND claim_label = ${claimLabel} + ORDER BY claim_index + LIMIT 1 + `; + return rows.length > 0 ? 
rows[0].id : null; +} + +async function linkNoteObservation( + sql: Sql, + noteId: number, + observationId: number, + status: string, + notes: string | null, +): Promise { + await sql` + INSERT INTO behavior_note_observations (behavior_note_id, observation_id, status, notes) + VALUES (${noteId}, ${observationId}, ${status}, ${notes}) + ON CONFLICT (behavior_note_id, observation_id) DO UPDATE SET + status = EXCLUDED.status, + notes = EXCLUDED.notes + `; +} + +async function main() { + const url = process.env.DATABASE_URL; + if (!url) throw new Error("Set DATABASE_URL"); + const db = createDbClient(url); + + console.log(`Seeding ${FIXTURES.length} fixtures + ${OBSERVATIONS.length} observations...`); + + const fixtureIds = new Map(); + for (const f of FIXTURES) { + const id = await upsertFixture(db.sql, f); + fixtureIds.set(f.name, id); + console.log(` fixture ${f.name} → id=${id}`); + } + + let linksCreated = 0; + let linksSkipped = 0; + for (const o of OBSERVATIONS) { + const fixtureId = fixtureIds.get(o.fixtureName); + if (!fixtureId) throw new Error(`Unknown fixture ${o.fixtureName}`); + const obsId = await findOrInsertObservation(db.sql, fixtureId, o); + console.log(` observation [${o.fixtureName}/${o.scenario}] → id=${obsId}`); + // Remove any prior links for this observation so a re-seed with corrected + // claimLabels doesn't leave stale join rows pointing at the wrong claim. + await db.sql`DELETE FROM behavior_note_observations WHERE observation_id = ${obsId}`; + + for (const link of o.links) { + const noteId = await findNoteId(db.sql, link.sourceAnchor, link.claimLabel); + if (noteId === null) { + console.log( + ` SKIP link: no behavior_note for source_anchor=${link.sourceAnchor} claim=${link.claimLabel}`, + ); + linksSkipped++; + continue; + } + await linkNoteObservation(db.sql, noteId, obsId, link.status, link.notes); + console.log(` link → note=${noteId} (${link.claimLabel}) status=${link.status}`); + linksCreated++; + } + } + + console.log(`\nDone. 
Links created/updated: ${linksCreated}, skipped: ${linksSkipped}.`); + await db.close(); +} + +main().catch((err) => { + console.error("Seed failed:", err); + process.exit(1); +}); diff --git a/tests/ingest-xsd/fixtures/main.xsd b/tests/ingest-ecma-376-xsds/fixtures/main.xsd similarity index 100% rename from tests/ingest-xsd/fixtures/main.xsd rename to tests/ingest-ecma-376-xsds/fixtures/main.xsd diff --git a/tests/ingest-xsd/fixtures/shared.xsd b/tests/ingest-ecma-376-xsds/fixtures/shared.xsd similarity index 100% rename from tests/ingest-xsd/fixtures/shared.xsd rename to tests/ingest-ecma-376-xsds/fixtures/shared.xsd diff --git a/tests/ingest-xsd/ingest.test.ts b/tests/ingest-ecma-376-xsds/ingest.test.ts similarity index 99% rename from tests/ingest-xsd/ingest.test.ts rename to tests/ingest-ecma-376-xsds/ingest.test.ts index 58aacef..336d13a 100644 --- a/tests/ingest-xsd/ingest.test.ts +++ b/tests/ingest-ecma-376-xsds/ingest.test.ts @@ -8,7 +8,7 @@ import { existsSync } from "node:fs"; import { join } from "node:path"; import { afterAll, afterEach, beforeAll, beforeEach, expect, test } from "bun:test"; -import { ingestSchemaSet } from "../../scripts/ingest-xsd/ingest.ts"; +import { ingestSchemaSet } from "../../scripts/ingest-ecma-376-xsds/ingest.ts"; import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; const FIXTURES_DIR = join(import.meta.dir, "fixtures"); @@ -23,6 +23,9 @@ let db: DbClient; const TRUNCATE_SQL = ` TRUNCATE + behavior_note_observations, + word_observations, + word_fixtures, behavior_notes, xsd_enums, xsd_inheritance_edges, diff --git a/tests/ingest-xsd/parse-schema.test.ts b/tests/ingest-ecma-376-xsds/parse-schema.test.ts similarity index 97% rename from tests/ingest-xsd/parse-schema.test.ts rename to tests/ingest-ecma-376-xsds/parse-schema.test.ts index 0e4384d..9c03541 100644 --- a/tests/ingest-xsd/parse-schema.test.ts +++ b/tests/ingest-ecma-376-xsds/parse-schema.test.ts @@ -9,9 +9,9 @@ import { existsSync } 
from "node:fs"; import { join } from "node:path"; import { expect, test } from "bun:test"; -import { parseSchemaSet } from "../../scripts/ingest-xsd/parse-schema.ts"; -import { declarationQNameKey, resolveQNameAttr } from "../../scripts/ingest-xsd/qname.ts"; -import type { Declaration, DeclarationKind } from "../../scripts/ingest-xsd/types.ts"; +import { parseSchemaSet } from "../../scripts/ingest-ecma-376-xsds/parse-schema.ts"; +import { declarationQNameKey, resolveQNameAttr } from "../../scripts/ingest-ecma-376-xsds/qname.ts"; +import type { Declaration, DeclarationKind } from "../../scripts/ingest-ecma-376-xsds/types.ts"; const FIXTURES_DIR = join(import.meta.dir, "fixtures"); const REAL_CACHE_DIR = "./data/xsd-cache/ecma-376-transitional"; diff --git a/tests/ingest-ms-oi29500/app-inference.test.ts b/tests/ingest-ms-oi29500/app-inference.test.ts new file mode 100644 index 0000000..36508b9 --- /dev/null +++ b/tests/ingest-ms-oi29500/app-inference.test.ts @@ -0,0 +1,72 @@ +/** + * Tests for inferApp + minConfidence helpers. 
+ */ + +import { expect, test } from "bun:test"; +import { + inferApp, + minConfidence, +} from "../../scripts/ingest-ms-oi29500/app-inference.ts"; + +test("inferApp: section 17 → Word", () => { + expect(inferApp(1, "17.4.37")).toBe("Word"); +}); + +test("inferApp: section 18 → Excel", () => { + expect(inferApp(1, "18.18.89")).toBe("Excel"); +}); + +test("inferApp: section 13 → PowerPoint", () => { + expect(inferApp(1, "13.3.1")).toBe("PowerPoint"); +}); + +test("inferApp: section 19 (Part 1) → PowerPoint (PML elements)", () => { + expect(inferApp(1, "19.7.48")).toBe("PowerPoint"); +}); + +test("inferApp: section 20 → Office (DrawingML, cross-app)", () => { + expect(inferApp(1, "20.1.4.2.9")).toBe("Office"); +}); + +test("inferApp: section 22.1 → Word (math)", () => { + expect(inferApp(1, "22.1.2.87")).toBe("Word"); +}); + +test("inferApp: section 22.9 → Office (shared types)", () => { + expect(inferApp(1, "22.9.2.14")).toBe("Office"); +}); + +test("inferApp: Part 4 (VML) → Office without text override", () => { + expect(inferApp(4, "14.9.1.1")).toBe("Office"); +}); + +test("inferApp: behavior text override beats section default", () => { + // Part 4 default is Office, but the text mentions only Word. 
+ expect( + inferApp(4, "14.9.1.1", "Word does not allow textbox content inside endnotes."), + ).toBe("Word"); +}); + +test("inferApp: multiple app mentions in text → fall back to section default", () => { + expect( + inferApp(1, "17.4.37", "Word and Excel both interpret this differently."), + ).toBe("Word"); // section 17 default +}); + +test("inferApp: missing inputs → Office", () => { + expect(inferApp(null, null)).toBe("Office"); + expect(inferApp(1, null)).toBe("Office"); +}); + +test("minConfidence: returns the lowest non-null", () => { + expect(minConfidence("high", "medium")).toBe("medium"); + expect(minConfidence("high", "low")).toBe("low"); + expect(minConfidence("medium", "high")).toBe("medium"); +}); + +test("minConfidence: skips nulls", () => { + expect(minConfidence("high", null)).toBe("high"); + expect(minConfidence(null, "low")).toBe("low"); + expect(minConfidence(null, null)).toBeNull(); + expect(minConfidence()).toBeNull(); +}); diff --git a/tests/ingest-ms-oi29500/claim-type.test.ts b/tests/ingest-ms-oi29500/claim-type.test.ts new file mode 100644 index 0000000..5686144 --- /dev/null +++ b/tests/ingest-ms-oi29500/claim-type.test.ts @@ -0,0 +1,82 @@ +/** + * Table-driven verb-classifier tests. Each row maps a representative behavior + * sentence to the (claim_type, confidence) we expect. 
+ */ + +import { expect, test } from "bun:test"; +import { classifyClaim } from "../../scripts/ingest-ms-oi29500/claim-type.ts"; + +interface Case { + text: string; + claimType: + | "ignores" + | "requires_despite_optional" + | "writes" + | "reads_but_does_not_write" + | "repairs" + | "layout_behavior" + | "does_not_support" + | "varies_from_spec"; + confidence: "high" | "medium" | "low"; +} + +const CASES: Case[] = [ + { + text: "Word ignores the moveFromRangeStart element.", + claimType: "ignores", + confidence: "high", + }, + { + text: "Office does not support this attribute.", + claimType: "does_not_support", + confidence: "high", + }, + { + text: "Word does not allow textbox content inside endnotes.", + claimType: "does_not_support", + confidence: "high", + }, + { + text: "Word will save an mce choice for VML content.", + claimType: "writes", + confidence: "high", + }, + { + text: "Word writes an extra w:rPr child even though the spec doesn't require it.", + claimType: "writes", + confidence: "high", + }, + { + text: "Word reads the value but does not write it on save.", + claimType: "reads_but_does_not_write", + confidence: "high", + }, + { + text: "Word renders this attribute as an absolute coordinate.", + claimType: "layout_behavior", + confidence: "medium", + }, + { + text: "Word requires the val attribute despite the spec marking it optional.", + claimType: "requires_despite_optional", + confidence: "medium", + }, + { + text: "Word repairs malformed table cell widths on read.", + claimType: "repairs", + confidence: "medium", + }, + { + text: "Some unrelated prose that doesn't match any verb pattern.", + claimType: "varies_from_spec", + confidence: "low", + }, +]; + +for (const { text, claimType, confidence } of CASES) { + test(`${claimType}/${confidence}: ${text.slice(0, 40)}...`, () => { + const result = classifyClaim(text); + expect(result.claimType).toBe(claimType); + expect(result.confidence).toBe(confidence); + }); +} diff --git 
a/tests/ingest-ms-oi29500/fixtures/cross-spec-fldSimple.md b/tests/ingest-ms-oi29500/fixtures/cross-spec-fldSimple.md new file mode 100644 index 0000000..b18e586 --- /dev/null +++ b/tests/ingest-ms-oi29500/fixtures/cross-spec-fldSimple.md @@ -0,0 +1,133 @@ +--- +layout: Conceptual +title: '[MS-OI29500]: fldSimple (Ruby Simple Field) | Microsoft Learn' +canonicalUrl: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/34477fff-355a-497e-a2bf-4b9c3f80b093 +ms.service: openspecs-office +ROBOTS: INDEX, FOLLOW +uhfHeaderId: MSDocsHeader-OpenSpecs +ms.topic: reference +ms.author: cindyle +protocol_rendering: true +description: This element specifies the presence of a simple field at the current location in the document. The semantics of this field +locale: en-us +author: mrsgit09 +document_id: fc8e7fee-8596-b4b3-9edb-ec18da0b4569 +document_version_independent_id: 13c964f5-4784-34a0-5570-29e08e751eb0 +updated_at: 2024-08-20T18:25:00.0000000Z +original_content_git_url: https://github.com/MicrosoftDocs/open_specs_office/blob/live/documentation/office_standards/MS-OI29500/34477fff-355a-497e-a2bf-4b9c3f80b093.md +gitcommit: https://github.com/MicrosoftDocs/open_specs_office/blob/89097c23c53300d6a8c590b56f93789334453b20/documentation/office_standards/MS-OI29500/34477fff-355a-497e-a2bf-4b9c3f80b093.md +git_commit_id: 89097c23c53300d6a8c590b56f93789334453b20 +site_name: Docs +depot_name: MSDN.open_specs_office +page_type: conceptual +toc_rel: toc.json +feedback_system: None +feedback_product_url: '' +feedback_help_link_type: '' +feedback_help_link_url: '' +word_count: 863 +asset_id: office_standards/ms-oi29500/34477fff-355a-497e-a2bf-4b9c3f80b093 +moniker_range_name: +monikers: [] +item_type: Content +source_path: documentation/office_standards/MS-OI29500/34477fff-355a-497e-a2bf-4b9c3f80b093.md +cmProducts: +- https://authoring-docs-microsoft.poolparty.biz/devrel/540ac133-a371-4dbb-8f94-28d6cc77a70b +spProducts: +- 
https://authoring-docs-microsoft.poolparty.biz/devrel/60bfc045-f127-4841-9d00-ea35495a5800 +platformId: 1a8fcd5a-33a0-f42c-d729-b04bf2afb956 +--- + +# [MS-OI29500]: fldSimple (Ruby Simple Field) | Microsoft Learn + +This element specifies the presence of a simple field at the current location in the document. The semantics of this field are defined via its field codes ("[\[ISO/IEC-29500-1\]](https://go.microsoft.com/fwlink/?LinkId=132464) §17.16.5"). + +[Example: Consider the following WordprocessingML fragment for a simple field: + + - ``` + + + Example Document.docx + + +``` + +The **fldSimple** element defines a *FILENAME* field ("[ISO/IEC-29500-1] §17.16.5.17; FILENAME") using the simple field syntax. The current field result for the field is *Example Document.docx*. end example] + +| Parent Elements | +| --- | +| **customXml** (§[3.1.3.1.1, customXml](d188da3e-b34b-4445-b3a0-118155f75ff3)); **fldSimple** (§3.1.3.1.2, fldSimple); **hyperlink** (§[3.1.3.1.3, hyperlink](cc9e4bbd-134f-4d35-b2c7-95fde8c633a7)); **rt** ("[ISO/IEC-29500-1] §17.3.3.24"); **rubyBase** ("[ISO/IEC-29500-1] §17.3.3.27"); **sdtContent** (§[3.1.3.1.7, sdtContent](a3757e69-1f9b-4322-b17e-dee4d308d29e)) | + +| Child Elements | Subclause | +| --- | --- | +| **acc** (Accent) | "[ISO/IEC-29500-1] §22.1.2.1" | +| **bar** (Bar) | "[ISO/IEC-29500-1] §22.1.2.7" | +| **bookmarkEnd** (Bookmark End) | "[ISO/IEC-29500-1] §17.13.6.1" | +| **bookmarkStart** (Bookmark Start) | "[ISO/IEC-29500-1] §17.13.6.2" | +| **borderBox** (Border-Box Object) | "[ISO/IEC-29500-1] §22.1.2.11" | +| **box** (Box Object) | "[ISO/IEC-29500-1] §22.1.2.13" | +| **commentRangeEnd** (Comment Anchor Range End) | "[ISO/IEC-29500-1] §17.13.4.3" | +| **commentRangeStart** (Comment Anchor Range Start) | "[ISO/IEC-29500-1] §17.13.4.4" | +| **customXml** (Ruby Inline-Level Custom XML Element) | §3.1.3.1.1, customXml | +| **customXmlDelRangeEnd** (Custom XML Markup Deletion End) | "[ISO/IEC-29500-1] §17.13.5.4" | +| 
**customXmlDelRangeStart** (Custom XML Markup Deletion Start) | "[ISO/IEC-29500-1] §17.13.5.5" | +| **customXmlInsRangeEnd** (Custom XML Markup Insertion End) | "[ISO/IEC-29500-1] §17.13.5.6" | +| **customXmlInsRangeStart** (Custom XML Markup Insertion Start) | "[ISO/IEC-29500-1] §17.13.5.7" | +| **customXmlMoveFromRangeEnd** (Custom XML Markup Move Source End) | "[ISO/IEC-29500-1] §17.13.5.8" | +| **customXmlMoveFromRangeStart** (Custom XML Markup Move Source Start) | "[ISO/IEC-29500-1] §17.13.5.9" | +| **customXmlMoveToRangeEnd** (Custom XML Markup Move Destination Location End) | "[ISO/IEC-29500-1] §17.13.5.10" | +| **customXmlMoveToRangeStart** (Custom XML Markup Move Destination Location Start) | "[ISO/IEC-29500-1] §17.13.5.11" | +| **d** (Delimiter Object) | "[ISO/IEC-29500-1] §22.1.2.24" | +| **del** (Deleted Run Content) | "[ISO/IEC-29500-1] §17.13.5.14" | +| **eqArr** (Array Object) | "[ISO/IEC-29500-1] §22.1.2.34" | +| **f** (Fraction Object) | "[ISO/IEC-29500-1] §22.1.2.36" | +| **fldData** (Custom Field Data) | "[\[ISO/IEC-29500-4\]](https://go.microsoft.com/fwlink/?LinkId=150884) §14.9.6" | +| **fldSimple** (Ruby Simple Field) | §3.1.3.1.2, fldSimple | +| **func** (Function Apply Object) | "[ISO/IEC-29500-1] §22.1.2.39" | +| **groupChr** (Group-Character Object) | "[ISO/IEC-29500-1] §22.1.2.41" | +| **hyperlink** (Ruby Hyperlink) | §3.1.3.1.3, hyperlink | +| **ins** (Inserted Run Content) | "[ISO/IEC-29500-1] §17.13.5.18" | +| **limLow** (Lower-Limit Object) | "[ISO/IEC-29500-1] §22.1.2.54" | +| **limUpp** (Upper-Limit Object) | "[ISO/IEC-29500-1] §22.1.2.56" | +| **m** (Matrix Object) | "[ISO/IEC-29500-1] §22.1.2.60" | +| **moveFrom** (Move Source Run Content) | "[ISO/IEC-29500-1] §17.13.5.22" | +| **moveFromRangeEnd** (Move Source Location Container - End) | "[ISO/IEC-29500-1] §17.13.5.23" | +| **moveFromRangeStart** (Move Source Location Container - Start) | "[ISO/IEC-29500-1] §17.13.5.24" | +| **moveTo** (Move Destination Run Content) | 
"[ISO/IEC-29500-1] §17.13.5.25" | +| **moveToRangeEnd** (Move Destination Location Container - End) | "[ISO/IEC-29500-1] §17.13.5.27" | +| **moveToRangeStart** (Move Destination Location Container - Start) | "[ISO/IEC-29500-1] §17.13.5.28" | +| **nary** (n-ary Operator Object) | "[ISO/IEC-29500-1] §22.1.2.70" | +| **oMath** (Office Math) | "[ISO/IEC-29500-1] §22.1.2.77" | +| **oMathPara** (Office Math Paragraph) | "[ISO/IEC-29500-1] §22.1.2.78" | +| **permEnd** (Range Permission End) | "[ISO/IEC-29500-1] §17.13.7.1" | +| **permStart** (Range Permission Start) | "[ISO/IEC-29500-1] §17.13.7.2" | +| **phant** (Phantom Object) | "[ISO/IEC-29500-1] §22.1.2.81" | +| **proofErr** (Proofing Error Anchor) | "[ISO/IEC-29500-1] §17.13.8.1" | +| **r** (Run) | "[ISO/IEC-29500-1] §22.1.2.87" | +| **r** (Text Run) | "[ISO/IEC-29500-1] §17.3.2.25" | +| **rad** (Radical Object) | "[ISO/IEC-29500-1] §22.1.2.88" | +| **sdt** (Ruby Inline-Level Structured Document Tag) | §[3.1.3.1.6, sdt](808b32d8-ad64-4c84-9fb5-85ad68be54b9) | +| **sPre** (Pre-Sub-Superscript Object) | "[ISO/IEC-29500-1] §22.1.2.99" | +| **sSub** (Subscript Object) | "[ISO/IEC-29500-1] §22.1.2.101" | +| **sSubSup** (Sub-Superscript Object) | "[ISO/IEC-29500-1] §22.1.2.103" | +| **sSup** (Superscript Object) | "[ISO/IEC-29500-1] §22.1.2.105" | + +| Attributes | Description | +| --- | --- | +| **dirty** (Field Result Invalidated) | Specifies that this field has been flagged by an application to indicate that its current results are invalid (stale) due to other modifications made to the document, and these contents should be updated before they are displayed if this functionality is supported by the next processing application.

[Rationale: This functionality allows applications with limited subsets of the full functionality of ISO/IEC-29500 Office Open XML File Formats [ISO/IEC-29500-1] to process documents without needing to understand and update all fields based on the semantics for their field codes.

For example, an application can add a new paragraph and flag the table of contents as dirty, without needing to understand anything about how to recalculate that field's content. end rationale]

If this attribute is omitted, then its value shall be assumed to be *false*.

[Example: Consider the following WordprocessingML for a simple field:

- ```


```


The **dirty** attribute value of *true* specifies that the contents of this field are no longer current based on the contents of the document, and should be recalculated whenever an application with this functionality reads the document. end example]

The possible values for this attribute are defined by the ST\_OnOff simple type (§[3.1.3.3.6, ST_OnOff](79cbec69-8430-479d-88c0-56dd44369074)). | +| **fldLock** (Field Should Not be Recalculated) | Specifies that the parent field shall not have its field result recalculated, even if an application attempts to recalculate the results of all fields in the document or a recalculation is explicitly requested.

If this attribute is omitted, then its value shall be assumed to be *false*.

[Example: Consider the following WordprocessingML for a simple field:

- ```


Rex Jaeschke


```


The **fldLock** attribute value of *true* specifies that the contents of this field shall remain *Rex Jaeschke* regardless of the actual result of the current field codes. end example]

The possible values for this attribute are defined by the ST\_OnOff simple type (§3.1.3.3.6, ST\_OnOff). | +| **instr** (Field Codes) | Specifies the field codes for the simple field. The possible field codes are defined in "[ISO/IEC-29500-1] §17.16.5".

[Example: Consider the following WordprocessingML for a simple field:

- ```


Rex Jaeschke


```


The **instr** attribute specifies the field codes for this simple field to be *AUTHOR*. end example]

The possible values for this attribute are defined by the ST\_String simple type (§[3.1.3.3.8, ST_String](ed68652a-6933-4080-ac59-a443946fa732)). | + +The following XML Schema fragment defines the contents of this element: + + - ``` + + + + + + + + + +``` \ No newline at end of file diff --git a/tests/ingest-ms-oi29500/fixtures/multi-claim-r.md b/tests/ingest-ms-oi29500/fixtures/multi-claim-r.md new file mode 100644 index 0000000..e57e2c4 --- /dev/null +++ b/tests/ingest-ms-oi29500/fixtures/multi-claim-r.md @@ -0,0 +1,60 @@ +--- +layout: Conceptual +title: '[MS-OI29500]: Part 1 Section 22.1.2.87, r (Run) | Microsoft Learn' +canonicalUrl: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/a3a6444b-11de-4ac5-81ba-fe03b07f8a45 +ms.service: openspecs-office +ROBOTS: INDEX, FOLLOW +uhfHeaderId: MSDocsHeader-OpenSpecs +ms.topic: reference +ms.author: cindyle +protocol_rendering: true +description: For additional notes that apply to this portion of the standard, please see the notes for oMath, §22.1.2.77(c). a. 
+locale: en-us +author: mrsgit09 +document_id: 5ba26cce-b478-72d6-4cfa-7841408716f7 +document_version_independent_id: edc4b383-482f-cded-97ba-bce52e7493cd +updated_at: 2024-04-16T19:01:00.0000000Z +original_content_git_url: https://github.com/MicrosoftDocs/open_specs_office/blob/live/documentation/office_standards/MS-OI29500/a3a6444b-11de-4ac5-81ba-fe03b07f8a45.md +gitcommit: https://github.com/MicrosoftDocs/open_specs_office/blob/1d95c2713e0344aa1c45f84961cd8691f6e12270/documentation/office_standards/MS-OI29500/a3a6444b-11de-4ac5-81ba-fe03b07f8a45.md +git_commit_id: 1d95c2713e0344aa1c45f84961cd8691f6e12270 +site_name: Docs +depot_name: MSDN.open_specs_office +page_type: conceptual +toc_rel: toc.json +feedback_system: None +feedback_product_url: '' +feedback_help_link_type: '' +feedback_help_link_url: '' +word_count: 171 +asset_id: office_standards/ms-oi29500/a3a6444b-11de-4ac5-81ba-fe03b07f8a45 +moniker_range_name: +monikers: [] +item_type: Content +source_path: documentation/office_standards/MS-OI29500/a3a6444b-11de-4ac5-81ba-fe03b07f8a45.md +cmProducts: [] +platformId: e1c71707-7ed2-febe-2320-404b01d6159a +--- + +# [MS-OI29500]: Part 1 Section 22.1.2.87, r (Run) | Microsoft Learn + +- *For additional notes that apply to this portion of the standard, please see the notes for *[oMath, §22.1.2.77(c)](ab7a0345-712e-4eef-9bcc-80c37e68d9bb)*.* + +a. *The standard allows bold and italic to be set in a math run in the rPr of both Math and WordprocessingML.* + + - If Word reads a math run where bold or italic properties are set in the rPr of WordprocessingML, it ignores those properties during display but then moves them into the Math rPr on save. + +b. *The standard allows br elements inside a math object.* + + - Word does not allow br elements inside a math object. + +c. *The standard allows cr elements as a child of a math run.* + + - Word does not allow the cr element inside a math run. + +d. 
*The standard allows tab elements as a child of a math run.* + + - Word ignores a tab element if it occurs inside a math run. + +e. *The standard does not allow w:ins or w:del elements as a child of a math run.* + + - Word supports the w:ins and w:del elements inside a math run. \ No newline at end of file diff --git a/tests/ingest-ms-oi29500/fixtures/see-also-only-rPr.md b/tests/ingest-ms-oi29500/fixtures/see-also-only-rPr.md new file mode 100644 index 0000000..cd515ec --- /dev/null +++ b/tests/ingest-ms-oi29500/fixtures/see-also-only-rPr.md @@ -0,0 +1,40 @@ +--- +layout: Conceptual +title: '[MS-OI29500]: Part 1 Section 17.5.2.27, rPr (Run Properties For Structured Document Tag Contents) | Microsoft Learn' +canonicalUrl: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/6bd198dd-11e3-45f3-9c35-0fb448216a02 +ms.service: openspecs-office +ROBOTS: INDEX, FOLLOW +uhfHeaderId: MSDocsHeader-OpenSpecs +ms.topic: reference +ms.author: cindyle +protocol_rendering: true +description: For additional notes that apply to this portion of the standard, please see the notes for oMath, §17.3.2.22(a). 
+locale: en-us +author: mrsgit09 +document_id: fde33566-cf6c-aef2-8618-707420499a3b +document_version_independent_id: b7e3b8c7-7f4b-7822-fb98-795c0a97b748 +updated_at: 2024-04-16T19:01:00.0000000Z +original_content_git_url: https://github.com/MicrosoftDocs/open_specs_office/blob/live/documentation/office_standards/MS-OI29500/6bd198dd-11e3-45f3-9c35-0fb448216a02.md +gitcommit: https://github.com/MicrosoftDocs/open_specs_office/blob/1d95c2713e0344aa1c45f84961cd8691f6e12270/documentation/office_standards/MS-OI29500/6bd198dd-11e3-45f3-9c35-0fb448216a02.md +git_commit_id: 1d95c2713e0344aa1c45f84961cd8691f6e12270 +site_name: Docs +depot_name: MSDN.open_specs_office +page_type: conceptual +toc_rel: toc.json +feedback_system: None +feedback_product_url: '' +feedback_help_link_type: '' +feedback_help_link_url: '' +word_count: 18 +asset_id: office_standards/ms-oi29500/6bd198dd-11e3-45f3-9c35-0fb448216a02 +moniker_range_name: +monikers: [] +item_type: Content +source_path: documentation/office_standards/MS-OI29500/6bd198dd-11e3-45f3-9c35-0fb448216a02.md +cmProducts: [] +platformId: 64cdbd57-80d6-701f-fd5c-808002dd828e +--- + +# [MS-OI29500]: Part 1 Section 17.5.2.27, rPr (Run Properties For Structured Document Tag Contents) | Microsoft Learn + +- *For additional notes that apply to this portion of the standard, please see the notes for *[oMath, §17.3.2.22(a)](94590ec4-e4c1-4f7a-b967-abe6bf8658b2)*.* \ No newline at end of file diff --git a/tests/ingest-ms-oi29500/fixtures/single-claim-ST_Visibility.md b/tests/ingest-ms-oi29500/fixtures/single-claim-ST_Visibility.md new file mode 100644 index 0000000..6d8278b --- /dev/null +++ b/tests/ingest-ms-oi29500/fixtures/single-claim-ST_Visibility.md @@ -0,0 +1,45 @@ +--- +layout: Conceptual +title: '[MS-OI29500]: Part 1 Section 18.18.89, ST_Visibility (Visibility Types) | Microsoft Learn' +canonicalUrl: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/049a4e67-6f4c-4025-a869-6fe7dec8f7e5 +ms.service: 
openspecs-office +ROBOTS: INDEX, FOLLOW +uhfHeaderId: MSDocsHeader-OpenSpecs +ms.topic: reference +ms.author: cindyle +protocol_rendering: true +description: a.   The standard states this is about sheet visibility. Office uses this for workbook window visibility. +locale: en-us +author: mrsgit09 +document_id: 430abec1-8172-a5ef-31cf-1d7c729d6957 +document_version_independent_id: f59678e7-697c-8997-45b7-4d1c0a142c0f +updated_at: 2024-04-16T19:01:00.0000000Z +original_content_git_url: https://github.com/MicrosoftDocs/open_specs_office/blob/live/documentation/office_standards/MS-OI29500/049a4e67-6f4c-4025-a869-6fe7dec8f7e5.md +gitcommit: https://github.com/MicrosoftDocs/open_specs_office/blob/1d95c2713e0344aa1c45f84961cd8691f6e12270/documentation/office_standards/MS-OI29500/049a4e67-6f4c-4025-a869-6fe7dec8f7e5.md +git_commit_id: 1d95c2713e0344aa1c45f84961cd8691f6e12270 +site_name: Docs +depot_name: MSDN.open_specs_office +page_type: conceptual +toc_rel: toc.json +feedback_system: None +feedback_product_url: '' +feedback_help_link_type: '' +feedback_help_link_url: '' +word_count: 16 +asset_id: office_standards/ms-oi29500/049a4e67-6f4c-4025-a869-6fe7dec8f7e5 +moniker_range_name: +monikers: [] +item_type: Content +source_path: documentation/office_standards/MS-OI29500/049a4e67-6f4c-4025-a869-6fe7dec8f7e5.md +cmProducts: +- https://authoring-docs-microsoft.poolparty.biz/devrel/bcbcbad5-4208-4783-8035-8481272c98b8 +spProducts: +- https://authoring-docs-microsoft.poolparty.biz/devrel/43b2e5aa-8a6d-4de2-a252-692232e5edc8 +platformId: cfa05f26-b7c0-ebd1-91cd-005f21d96625 +--- + +# [MS-OI29500]: Part 1 Section 18.18.89, ST_Visibility (Visibility Types) | Microsoft Learn + +a. *The standard states this is about sheet visibility.* + + - Office uses this for workbook window visibility. 
\ No newline at end of file diff --git a/tests/ingest-ms-oi29500/fixtures/table-only-hlinkClick.md b/tests/ingest-ms-oi29500/fixtures/table-only-hlinkClick.md new file mode 100644 index 0000000..22f56d9 --- /dev/null +++ b/tests/ingest-ms-oi29500/fixtures/table-only-hlinkClick.md @@ -0,0 +1,62 @@ +--- +layout: Conceptual +title: '[MS-OI29500]: Part 1 Section 21.1.2.3.5, hlinkClick (Click Hyperlink) | Microsoft Learn' +canonicalUrl: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/a65b76db-6abc-4989-8cd1-baa9a3500f6f +ms.service: openspecs-office +ROBOTS: INDEX, FOLLOW +uhfHeaderId: MSDocsHeader-OpenSpecs +ms.topic: reference +ms.author: cindyle +protocol_rendering: true +description: a.   The standard states that the action attribute of the hlinkClick, hlinkHover and hlinkMouseOver elements may use an +locale: en-us +author: mrsgit09 +document_id: 9981ffaf-cc9e-4255-2d8b-e06b99ae1346 +document_version_independent_id: fb9c7683-519d-c8c0-56db-6fe524b62286 +updated_at: 2024-11-12T17:35:00.0000000Z +original_content_git_url: https://github.com/MicrosoftDocs/open_specs_office/blob/live/documentation/office_standards/MS-OI29500/a65b76db-6abc-4989-8cd1-baa9a3500f6f.md +gitcommit: https://github.com/MicrosoftDocs/open_specs_office/blob/e2c20ea0c0daef720bc9a4b27e735040c7411bd1/documentation/office_standards/MS-OI29500/a65b76db-6abc-4989-8cd1-baa9a3500f6f.md +git_commit_id: e2c20ea0c0daef720bc9a4b27e735040c7411bd1 +site_name: Docs +depot_name: MSDN.open_specs_office +page_type: conceptual +toc_rel: toc.json +feedback_system: None +feedback_product_url: '' +feedback_help_link_type: '' +feedback_help_link_url: '' +word_count: 526 +asset_id: office_standards/ms-oi29500/a65b76db-6abc-4989-8cd1-baa9a3500f6f +moniker_range_name: +monikers: [] +item_type: Content +source_path: documentation/office_standards/MS-OI29500/a65b76db-6abc-4989-8cd1-baa9a3500f6f.md +cmProducts: [] +platformId: 9aa821c4-3a71-871a-cb0c-49fb8137c140 +--- + +# [MS-OI29500]: Part 1 
Section 21.1.2.3.5, hlinkClick (Click Hyperlink) | Microsoft Learn + +a. *The standard states that the action attribute of the hlinkClick, hlinkHover and hlinkMouseOver elements may use an unrestricted string.* + +PowerPoint reserves the following values for the action attribute: + +| Value | Description | +| --- | --- | +| ppaction://customshow?id=SHOW\_ID | Specifies that the link shall launch a custom show from the custShowLst element ("[\[ISO/IEC-29500-1\]](https://go.microsoft.com/fwlink/?LinkId=132464) §19.2.1.7; custShowLst (List of Custom Shows)"). The SHOW\_ID variable shall be replaced with the custom show id as specified in the custShow element ("[ISO/IEC-29500-1] §19.2.1.6; custShow (Custom Show)"). | +| ppaction://customshow?id=SHOW\_ID&return=true | Specifies that the link shall launch a custom show from the custShowLst element ("[ISO/IEC-29500-1] §19.2.1.7; custShowLst (List of Custom Shows)"). After the end of the custom show, viewing of this presentation package shall resume. The SHOW\_ID variable shall be replaced with the custom show id as specified in the custShow element ("[ISO/IEC-29500-1] §19.2.1.6; custShow (Custom Show)"). | +| ppaction://hlinkfile | Specifies that the link shall open a file external to this presentation package. The r:id attribute for this element specifies the corresponding relationship containing the reference to the external file. | +| ppaction://hlinkpres?slideindex=SLIDE\_NUM | Specifies that the link shall launch a presentation external to this presentation package. The r:id attribute for this element specifies the corresponding relationship containing the reference to the external file. The SLIDE\_NUM variable shall be replaced with a slide number in the external presentation that the viewing shall be started on. | +| ppaction://hlinkshowjump?jump=endshow | Specifies that the link shall end the presentation. 
| +| ppaction://hlinkshowjump?jump=firstslide | Specifies that the link shall target the viewing of the first slide within this presentation package. | +| ppaction://hlinkshowjump?jump=lastslide | Specifies that the link shall target the viewing of the last slide within this presentation package. | +| ppaction://hlinkshowjump?jump=lastslideviewed | Specifies that the link shall target the viewing of the slide previously visited within this presentation package. | +| ppaction://hlinkshowjump?jump=nextslide | Specifies that the link shall target the viewing of the next slide within this presentation package. | +| ppaction://hlinkshowjump?jump=previousslide | Specifies that the link shall target the viewing of the previous slide within this presentation package. | +| ppaction://hlinkshowjump?jump=SLIDE\_NUM | Specifies that the link shall target the viewing of the slide in within this presentation package specified by SLIDE\_NUM. The SLIDE\_NUM variable shall be replaced with a slide number in this presentation package that will be targeted. | +| ppaction://hlinksldjump | Specifies that the link shall target the viewing of a specific slide within this presentation package. The r:id attribute for this element specifies the corresponding relationship containing the reference to the slide part which shall be viewed. | +| ppaction://macro?name=MACRO\_NAME | Specifies that the link shall run a macro contained within this presentation. The MACRO\_NAME variable shall be replaced with the name of the macro module. | +| ppaction://program | Specifies that the link shall run an application external to this presentation package. The r:id attribute for this element specifies the corresponding relationship containing the reference to the application. | +| ppaction://noaction | This value of the action attribute does not specify any action performed by the link. The link will continue to respect highlight attribute and snd element if present. 
| +| ppaction://media | Specifies that the link shall initiate playback of the media object specified by the parent element. | +| ppaction://ole?verb=OLE\_VERB\_INDEX | Specifies that the link shall execute the verb on an OLE object specified by OLE\_VERB\_INDEX. The OLE\_VERB\_INDEX variable shall be replaced with the verb number of the verb registered by the OLE object that shall be executed. | \ No newline at end of file diff --git a/tests/ingest-ms-oi29500/fixtures/txbxContent.md b/tests/ingest-ms-oi29500/fixtures/txbxContent.md new file mode 100644 index 0000000..d1bf603 --- /dev/null +++ b/tests/ingest-ms-oi29500/fixtures/txbxContent.md @@ -0,0 +1,49 @@ +--- +layout: Conceptual +title: '[MS-OI29500]: Part 4 Section 14.9.1.1, txbxContent (Rich Text Box Content Container) | Microsoft Learn' +canonicalUrl: https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/dc83bfa3-db21-4eb4-b3c0-63b11f2575d3 +ms.service: openspecs-office +ROBOTS: INDEX, FOLLOW +uhfHeaderId: MSDocsHeader-OpenSpecs +ms.topic: reference +ms.author: cindyle +protocol_rendering: true +description: For additional notes that apply to this portion of the standard, please see the notes for oMath, §22.1.2.77(f); +locale: en-us +author: mrsgit09 +document_id: f71f7bb0-2de6-3875-fc7b-516e1cf8545a +document_version_independent_id: 5eab6c79-f29b-bcd4-fd50-f996a6ab48e8 +updated_at: 2024-04-16T19:01:00.0000000Z +original_content_git_url: https://github.com/MicrosoftDocs/open_specs_office/blob/live/documentation/office_standards/MS-OI29500/dc83bfa3-db21-4eb4-b3c0-63b11f2575d3.md +gitcommit: https://github.com/MicrosoftDocs/open_specs_office/blob/1d95c2713e0344aa1c45f84961cd8691f6e12270/documentation/office_standards/MS-OI29500/dc83bfa3-db21-4eb4-b3c0-63b11f2575d3.md +git_commit_id: 1d95c2713e0344aa1c45f84961cd8691f6e12270 +site_name: Docs +depot_name: MSDN.open_specs_office +page_type: conceptual +toc_rel: toc.json +feedback_system: None +feedback_product_url: '' +feedback_help_link_type: 
'' +feedback_help_link_url: '' +word_count: 103 +asset_id: office_standards/ms-oi29500/dc83bfa3-db21-4eb4-b3c0-63b11f2575d3 +moniker_range_name: +monikers: [] +item_type: Content +source_path: documentation/office_standards/MS-OI29500/dc83bfa3-db21-4eb4-b3c0-63b11f2575d3.md +cmProducts: [] +platformId: 1a6da451-763c-397a-6242-3699234fdb5d +--- + +# [MS-OI29500]: Part 4 Section 14.9.1.1, txbxContent (Rich Text Box Content Container) | Microsoft Learn + +- *For additional notes that apply to this portion of the standard, please see the notes for *[oMath, §22.1.2.77(f)](ab7a0345-712e-4eef-9bcc-80c37e68d9bb)*; *[oMathPara, §22.1.2.78(c)](23e0c1c9-4abb-4c75-acc2-7583040e774d)*.* + +a. *The standard states that text box content can be placed inside endnotes, footnotes, comments, or other textboxes.* + + - Word does not allow textbox content inside endnotes, footnotes, comments, or other textboxes. + +b. *The standard specifies this element as part of the WordprocessingML namespace.* + + - Word will save an mce choice for VML content. txbxContent elements written in that choice will be written in with a namespace value of http://schemas.microsoft.com/office/word/2006/wordml. + - This note applies to the following products: Office 2013 Client (Strict), Office 2013 Server (Strict). \ No newline at end of file diff --git a/tests/ingest-ms-oi29500/parse.test.ts b/tests/ingest-ms-oi29500/parse.test.ts new file mode 100644 index 0000000..0c0eae3 --- /dev/null +++ b/tests/ingest-ms-oi29500/parse.test.ts @@ -0,0 +1,82 @@ +/** + * Parser tests against committed MS-OI29500 markdown fixtures. Each fixture + * exercises one shape the parser must handle (or correctly skip). 
+ */ + +import { readFileSync } from "node:fs"; +import { join } from "node:path"; +import { expect, test } from "bun:test"; +import { + entryIdFromTocTitle, + parsePage, +} from "../../scripts/ingest-ms-oi29500/parse.ts"; + +const FIXTURES = join(import.meta.dir, "fixtures"); + +function load(name: string): string { + return readFileSync(join(FIXTURES, `${name}.md`), "utf8"); +} + +test("txbxContent: 2 claim groups, version-scope on b", () => { + const parsed = parsePage(load("txbxContent"), { entryId: "2.1.1779" }); + expect(parsed.ingestable).toBe(true); + expect(parsed.parsedTitle?.partNumber).toBe(4); + expect(parsed.parsedTitle?.ecmaSection).toBe("14.9.1.1"); + expect(parsed.parsedTitle?.name).toBe("txbxContent"); + expect(parsed.entryId).toBe("2.1.1779"); + expect(parsed.claims.length).toBe(2); + expect(parsed.claims[0].label).toBe("a"); + expect(parsed.claims[1].label).toBe("b"); + expect(parsed.claims[0].behaviors.length).toBe(1); + expect(parsed.claims[1].behaviors.length).toBe(1); + expect(parsed.claims[1].behaviors[0].versionScope).toContain("Office 2013"); + expect(parsed.frontmatter.git_commit_id).toBeTruthy(); +}); + +test("multi-claim-r (math run): 5 claim groups a-e, no version scope", () => { + const parsed = parsePage(load("multi-claim-r")); + expect(parsed.ingestable).toBe(true); + expect(parsed.claims.length).toBe(5); + expect(parsed.claims.map((c) => c.label)).toEqual(["a", "b", "c", "d", "e"]); + for (const c of parsed.claims) { + expect(c.behaviors.length).toBeGreaterThanOrEqual(1); + for (const b of c.behaviors) expect(b.versionScope).toBeNull(); + } +}); + +test("see-also-only-rPr: skipped (no claim groups)", () => { + const parsed = parsePage(load("see-also-only-rPr")); + expect(parsed.ingestable).toBe(false); + expect(parsed.skipReason).toContain("no claim groups"); + expect(parsed.parsedTitle?.name).toBe("rPr"); +}); + +test("table-only-hlinkClick: skipped (claim header but no bullets)", () => { + const parsed = 
parsePage(load("table-only-hlinkClick")); + expect(parsed.ingestable).toBe(false); + expect(parsed.skipReason).toContain("no behavior bullets"); + expect(parsed.claims.length).toBeGreaterThan(0); +}); + +test("cross-spec-fldSimple: skipped (no Part/Section in title)", () => { + const parsed = parsePage(load("cross-spec-fldSimple")); + expect(parsed.ingestable).toBe(false); + expect(parsed.skipReason).toContain("Part/Section"); +}); + +test("single-claim-ST_Visibility: 1 claim, 1 behavior", () => { + const parsed = parsePage(load("single-claim-ST_Visibility")); + expect(parsed.ingestable).toBe(true); + expect(parsed.parsedTitle?.name).toBe("ST_Visibility"); + expect(parsed.claims.length).toBe(1); + expect(parsed.claims[0].behaviors.length).toBe(1); +}); + +test("entryIdFromTocTitle parses the leading 2.x.x marker", () => { + expect(entryIdFromTocTitle("2.1.1779 Part 4 Section 14.9.1.1, txbxContent (...)")).toBe( + "2.1.1779", + ); + expect(entryIdFromTocTitle("Conformance Statements")).toBeNull(); + expect(entryIdFromTocTitle(null)).toBeNull(); + expect(entryIdFromTocTitle(undefined)).toBeNull(); +}); diff --git a/tests/ingest-ms-oi29500/resolve.test.ts b/tests/ingest-ms-oi29500/resolve.test.ts new file mode 100644 index 0000000..aca65aa --- /dev/null +++ b/tests/ingest-ms-oi29500/resolve.test.ts @@ -0,0 +1,123 @@ +/** + * Resolver tests against the existing XSD fixtures (a slim WML schema). + * Verifies the conservative resolution path: top-level / local / ambiguous / + * no-match / no-vocabulary outcomes. 
+ */ + +import { join } from "node:path"; +import { afterAll, beforeAll, expect, test } from "bun:test"; +import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; +import { + loadSymbolMap, + resolveSymbol, + type SymbolMap, +} from "../../scripts/ingest-ms-oi29500/resolve.ts"; +import { ingestSchemaSet } from "../../scripts/ingest-ecma-376-xsds/ingest.ts"; +import { getTestDatabaseUrl } from "../test-db.ts"; + +const FIXTURES_DIR = join(import.meta.dir, "..", "ingest-ecma-376-xsds", "fixtures"); + +let db: DbClient; +let map: SymbolMap; + +const TRUNCATE_SQL = ` + TRUNCATE + behavior_note_observations, + word_observations, + word_fixtures, + behavior_notes, + xsd_enums, + xsd_inheritance_edges, + xsd_group_edges, + xsd_attr_edges, + xsd_child_edges, + xsd_compositors, + xsd_symbol_profiles, + xsd_symbols, + xsd_namespaces, + xsd_profiles + RESTART IDENTITY CASCADE +`; + +beforeAll(async () => { + db = createDbClient(getTestDatabaseUrl()); + await db.sql` + INSERT INTO reference_sources (name, kind) + VALUES ('ecma-376-transitional', 'xsd') + ON CONFLICT (name) DO NOTHING + `; + await db.sql.unsafe(TRUNCATE_SQL); + await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + map = await loadSymbolMap(db.sql); +}); + +afterAll(async () => { + await db.sql.unsafe(TRUNCATE_SQL); + await db.close(); +}); + +test("Part 1 Section 17 top-level complexType → resolved (high, top-level)", () => { + const out = resolveSymbol(map, "CT_Para", "17.3.1.22", 1); + expect(out.resolved).toBe(true); + if (out.resolved) { + expect(out.vocabulary).toBe("wml-main"); + expect(out.symbolKind).toBe("complexType"); + expect(out.confidence).toBe("high"); + expect(out.isLocal).toBe(false); + } +}); + +test("Part 1 Section 17 top-level simpleType → resolved", () => { + const out = resolveSymbol(map, "ST_Jc", "17.18.74", 1); + expect(out.resolved).toBe(true); + if 
(out.resolved) { + expect(out.symbolKind).toBe("simpleType"); + expect(out.isLocal).toBe(false); + } +}); + +test("Part 1 Section 17 local element → resolved (isLocal=true)", () => { + // `text` is a local element inside CT_Para in the fixture XSD. + const out = resolveSymbol(map, "text", "17.3.1.10", 1); + expect(out.resolved).toBe(true); + if (out.resolved) { + expect(out.symbolKind).toBe("element"); + expect(out.isLocal).toBe(true); + } +}); + +test("Part 1 Section 17 unknown name → no-match with target_ref", () => { + const out = resolveSymbol(map, "DoesNotExistInFixture", "17.5.5.5", 1); + expect(out.resolved).toBe(false); + if (!out.resolved) { + expect(out.reason).toBe("no-match"); + expect(out.targetRef).toContain("DoesNotExistInFixture"); + } +}); + +test("Part 1 Section 18 (SML) → no-vocabulary (not ingested)", () => { + const out = resolveSymbol(map, "ST_Visibility", "18.18.89", 1); + expect(out.resolved).toBe(false); + if (!out.resolved) { + expect(out.reason).toBe("no-vocabulary"); + } +}); + +test("Part 4 short-circuits to no-vocabulary regardless of section", () => { + const out = resolveSymbol(map, "txbxContent", "14.9.1.1", 4); + expect(out.resolved).toBe(false); + if (!out.resolved) { + expect(out.reason).toBe("no-vocabulary"); + } +}); + +test("Section 11 (overview) → no-vocabulary", () => { + const out = resolveSymbol(map, "WordprocessingML", "11", 1); + expect(out.resolved).toBe(false); +}); diff --git a/tests/mcp-server/ooxml-behavior.test.ts b/tests/mcp-server/ooxml-behavior.test.ts new file mode 100644 index 0000000..b206938 --- /dev/null +++ b/tests/mcp-server/ooxml-behavior.test.ts @@ -0,0 +1,240 @@ +/** + * Tests for the dedicated `ooxml_behavior` tool and the inline behavior-note + * surface on `ooxml_element` / `ooxml_type`. 
+ * + * Setup mirrors ooxml-queries.test.ts: ingest the same fixture XSDs into a + * truncated test DB, then seed a small set of behavior_notes rows with known + * shape so we can assert filtering behavior and citation formatting. + */ + +import { join } from "node:path"; +import { afterAll, beforeAll, expect, test } from "bun:test"; +import { runOoxmlTool } from "../../apps/mcp-server/src/ooxml-tools.ts"; +import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; +import { ingestSchemaSet } from "../../scripts/ingest-ecma-376-xsds/ingest.ts"; +import { getTestDatabaseUrl } from "../test-db.ts"; + +const FIXTURES_DIR = join(import.meta.dir, "..", "ingest-ecma-376-xsds", "fixtures"); +const databaseUrl = getTestDatabaseUrl(); + +let db: DbClient; +let msSourceId: number; + +const TRUNCATE_SQL = ` + TRUNCATE + behavior_note_observations, + word_observations, + word_fixtures, + behavior_notes, + xsd_enums, + xsd_inheritance_edges, + xsd_group_edges, + xsd_attr_edges, + xsd_child_edges, + xsd_compositors, + xsd_symbol_profiles, + xsd_symbols, + xsd_namespaces, + xsd_profiles + RESTART IDENTITY CASCADE +`; + +beforeAll(async () => { + db = createDbClient(databaseUrl); + await db.sql` + INSERT INTO reference_sources (name, kind) + VALUES ('ecma-376-transitional', 'xsd') + ON CONFLICT (name) DO NOTHING + `; + await db.sql` + INSERT INTO reference_sources (name, kind, url) + VALUES ( + 'ms-oi29500', + 'open_spec', + 'https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/1fd4a662-8623-49c0-82f0-18fa91b413b8' + ) + ON CONFLICT (name) DO UPDATE SET url = EXCLUDED.url + `; + const [src] = await db.sql<Array<{ id: number }>>` + SELECT id FROM reference_sources WHERE name = ${"ms-oi29500"} + `; + msSourceId = src.id; + + await db.sql.unsafe(TRUNCATE_SQL); + await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + // Seed behavior_notes.
The fixture has top-level CT_Para and ST_Jc, plus + // a local element `text` inside CT_Para. We insert one note per anchor + // pointing at each, plus a target_ref-only note to test the qname + // word-boundary fallback. + const wmlNs = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + const [paraSym] = await db.sql>` + SELECT s.id FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + JOIN xsd_namespaces ns ON ns.id = sp.namespace_id + WHERE s.local_name = ${"CT_Para"} AND s.kind = ${"complexType"} + AND s.parent_symbol_id IS NULL AND ns.uri = ${wmlNs} + `; + const [textLocal] = await db.sql>` + SELECT s.id FROM xsd_symbols s + WHERE s.local_name = ${"text"} AND s.kind = ${"element"} + AND s.parent_symbol_id IS NOT NULL + `; + + await db.sql` + INSERT INTO behavior_notes ( + symbol_id, app, claim_type, summary, source_id, section_id, + source_anchor, claim_label, claim_index, target_ref, + standard_text, behavior_text, confidence, resolution_confidence + ) + VALUES + (${paraSym.id}, 'Word', 'writes', + 'Word emits CT_Para with extra whitespace.', + ${msSourceId}, '17.3.1.22 (Part 1 §17.3.1.22)', + 'guid-para', 'a', 0, NULL, + 'The standard says CT_Para is paragraph metadata.', + 'Word emits CT_Para with extra whitespace.', 'high', 'high'), + (${textLocal.id}, 'Word', 'does_not_support', + 'Word does not support nested text runs in CT_Para.', + ${msSourceId}, '17.3.1.22 (Part 1 §17.3.1.22)', + 'guid-text-local', 'a', 0, NULL, + 'The standard allows nested text runs.', + 'Word does not support nested text runs in CT_Para.', 'high', 'high'), + (NULL, 'Word', 'varies_from_spec', + 'Word handles tbl differently.', + ${msSourceId}, '17.4.37 (Part 1 §17.4.37)', + 'guid-tbl-unresolved', 'a', 0, + 'Section 17.4.37, tbl', + 'Spec says tbl renders inline.', + 'Word handles tbl differently.', 'high', 'low'), + (NULL, 'Word', 'varies_from_spec', + 'Word handles tblPr differently.', + ${msSourceId}, '17.4.59 (Part 1 §17.4.59)', + 
'guid-tblpr-unresolved', 'a', 0, + 'Section 17.4.59, tblPr', + 'Spec says tblPr is table-level metadata.', + 'Word handles tblPr differently.', 'high', 'low'), + (NULL, 'Excel', 'does_not_support', + 'Excel does not support textBox in this context.', + ${msSourceId}, '18.5.1.5', + 'guid-textbox-unresolved', 'a', 0, + 'Section 18.5.1.5, textBox', + 'Spec says textBox is allowed.', + 'Excel does not support textBox in this context.', 'high', 'low') + `; +}); + +afterAll(async () => { + await db.sql.unsafe(TRUNCATE_SQL); + await db.sql`DELETE FROM reference_sources WHERE name = ${"ms-oi29500"}`; + await db.close(); +}); + +test("ooxml_behavior with no filters returns the missing-filter error", async () => { + const out = await runOoxmlTool("ooxml_implementation_notes", {}, db.sql); + expect(out).toContain("Missing filter"); + expect(out).toContain("section_id"); + // Must NOT include the schema-tool fallback hint that's irrelevant here. + expect(out).not.toContain("known prefix qname"); +}); + +test("ooxml_behavior qname=tbl matches target_ref but excludes textBox / tblPr", async () => { + const out = await runOoxmlTool("ooxml_implementation_notes", { qname: "w:tbl" }, db.sql); + // The 'tbl' unresolved note should match (target_ref = "Section 17.4.37, tbl"). + expect(out).toContain("Word handles tbl differently"); + // But NOT the tblPr or textBox notes - word-boundary regex prevents the + // substring false positive. 
+ expect(out).not.toContain("Word handles tblPr differently"); + expect(out).not.toContain("Excel does not support textBox"); +}); + +test("ooxml_behavior qname=textBox doesn't pull tbl-related notes", async () => { + const out = await runOoxmlTool("ooxml_implementation_notes", { qname: "w:textBox" }, db.sql); + expect(out).not.toContain("Word handles tbl differently"); + expect(out).not.toContain("Word handles tblPr differently"); +}); + +test("ooxml_behavior qname=text picks up the local-symbol note", async () => { + const out = await runOoxmlTool("ooxml_implementation_notes", { qname: "w:text" }, db.sql); + expect(out).toContain("Word does not support nested text runs"); + // Must not also pull the unrelated tbl/textBox/tblPr unresolved notes. + expect(out).not.toContain("Word handles tbl differently"); +}); + +test("ooxml_behavior section_id substring matches", async () => { + const out = await runOoxmlTool("ooxml_implementation_notes", { section_id: "17.3.1.22" }, db.sql); + expect(out).toContain("Word emits CT_Para"); + expect(out).toContain("Word does not support nested text runs"); +}); + +test("ooxml_behavior source_anchor exact match", async () => { + const out = await runOoxmlTool( + "ooxml_implementation_notes", + { source_anchor: "guid-para" }, + db.sql, + ); + expect(out).toContain("Word emits CT_Para"); + expect(out).not.toContain("Word does not support nested text runs"); +}); + +test("ooxml_behavior app filter is exact", async () => { + const excelOnly = await runOoxmlTool("ooxml_implementation_notes", { app: "Excel" }, db.sql); + expect(excelOnly).toContain("Excel does not support textBox"); + expect(excelOnly).not.toContain("Word emits CT_Para"); +}); + +test("ooxml_behavior claim_type filter is exact", async () => { + const out = await runOoxmlTool("ooxml_implementation_notes", { claim_type: "writes" }, db.sql); + expect(out).toContain("Word emits CT_Para"); + expect(out).not.toContain("Word does not support nested text runs"); +}); + 
+test("ooxml_behavior renders a working per-note URL (not the broken landing-guid form)", async () => { + const out = await runOoxmlTool("ooxml_implementation_notes", { source_anchor: "guid-para" }, db.sql); + expect(out).toContain( + "https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/guid-para", + ); + // Must not also stitch the landing-page GUID into the path. + expect(out).not.toContain("1fd4a662-8623-49c0-82f0-18fa91b413b8/guid-para"); +}); + +test("ooxml_type w:ST_Jc inlines a behavior note when one is attached", async () => { + // Add an inline-targeted note for ST_Jc and verify it shows up on ooxml_type. + const wmlNs = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + const [stJc] = await db.sql<Array<{ id: number }>>` + SELECT s.id FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + JOIN xsd_namespaces ns ON ns.id = sp.namespace_id + WHERE s.local_name = ${"ST_Jc"} AND s.kind = ${"simpleType"} + AND s.parent_symbol_id IS NULL AND ns.uri = ${wmlNs} + `; + await db.sql` + INSERT INTO behavior_notes ( + symbol_id, app, claim_type, summary, source_id, section_id, + source_anchor, claim_label, claim_index, standard_text, behavior_text, + confidence, resolution_confidence + ) + VALUES ( + ${stJc.id}, 'Word', 'varies_from_spec', + 'Word renders both differently from the spec.', + ${msSourceId}, '17.18.44', + 'guid-st-jc', 'a', 0, + 'Spec specifies both as a justification value.', + 'Word renders both differently from the spec.', + 'high', 'high' + ) + `; + const out = await runOoxmlTool("ooxml_type", { qname: "w:ST_Jc" }, db.sql); + expect(out).toContain("SimpleType: ST_Jc"); + expect(out).toContain("Documented behavior notes (1"); + expect(out).toContain("Word renders both differently"); + expect(out).toContain( + "https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/guid-st-jc", + ); +}); diff --git a/tests/mcp-server/ooxml-queries.test.ts b/tests/mcp-server/ooxml-queries.test.ts index 93c8678..f7b0318
100644 --- a/tests/mcp-server/ooxml-queries.test.ts +++ b/tests/mcp-server/ooxml-queries.test.ts @@ -6,7 +6,7 @@ import { join } from "node:path"; import { afterAll, beforeAll, expect, test } from "bun:test"; import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; -import { ingestSchemaSet } from "../../scripts/ingest-xsd/ingest.ts"; +import { ingestSchemaSet } from "../../scripts/ingest-ecma-376-xsds/ingest.ts"; import { getAttributes, getChildren, @@ -18,7 +18,7 @@ import { parseQName, } from "../../apps/mcp-server/src/ooxml-queries.ts"; -const FIXTURES_DIR = join(import.meta.dir, "..", "ingest-xsd", "fixtures"); +const FIXTURES_DIR = join(import.meta.dir, "..", "ingest-ecma-376-xsds", "fixtures"); import { getTestDatabaseUrl } from "../test-db.ts"; const databaseUrl = getTestDatabaseUrl(); @@ -27,6 +27,9 @@ let db: DbClient; const TRUNCATE_SQL = ` TRUNCATE + behavior_note_observations, + word_observations, + word_fixtures, behavior_notes, xsd_enums, xsd_inheritance_edges, diff --git a/tests/mcp-server/ooxml-word-behavior.test.ts b/tests/mcp-server/ooxml-word-behavior.test.ts new file mode 100644 index 0000000..11a3fb9 --- /dev/null +++ b/tests/mcp-server/ooxml-word-behavior.test.ts @@ -0,0 +1,253 @@ +/** + * Tests for the dedicated `ooxml_word_behavior` tool and the verification + * badges that surface on ooxml_implementation_notes / ooxml_element / + * ooxml_type when a behavior_note has linked observations. 
+ */ + +import { join } from "node:path"; +import { afterAll, beforeAll, expect, test } from "bun:test"; +import { runOoxmlTool } from "../../apps/mcp-server/src/ooxml-tools.ts"; +import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; +import { ingestSchemaSet } from "../../scripts/ingest-ecma-376-xsds/ingest.ts"; +import { getTestDatabaseUrl } from "../test-db.ts"; + +const FIXTURES_DIR = join(import.meta.dir, "..", "ingest-ecma-376-xsds", "fixtures"); +const databaseUrl = getTestDatabaseUrl(); + +let db: DbClient; +let msSourceId: number; +let confirmedNoteId: number; +let refinedNoteId: number; + +const TRUNCATE_SQL = ` + TRUNCATE + behavior_note_observations, + word_observations, + word_fixtures, + behavior_notes, + xsd_enums, + xsd_inheritance_edges, + xsd_group_edges, + xsd_attr_edges, + xsd_child_edges, + xsd_compositors, + xsd_symbol_profiles, + xsd_symbols, + xsd_namespaces, + xsd_profiles + RESTART IDENTITY CASCADE +`; + +beforeAll(async () => { + db = createDbClient(databaseUrl); + await db.sql` + INSERT INTO reference_sources (name, kind) + VALUES ('ecma-376-transitional', 'xsd') + ON CONFLICT (name) DO NOTHING + `; + await db.sql` + INSERT INTO reference_sources (name, kind, url) + VALUES ( + 'ms-oi29500', + 'open_spec', + 'https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/1fd4a662-8623-49c0-82f0-18fa91b413b8' + ) + ON CONFLICT (name) DO UPDATE SET url = EXCLUDED.url + `; + const [src] = await db.sql<Array<{ id: number }>>` + SELECT id FROM reference_sources WHERE name = ${"ms-oi29500"} + `; + msSourceId = src.id; + + await db.sql.unsafe(TRUNCATE_SQL); + await ingestSchemaSet({ + schemaDir: FIXTURES_DIR, + entrypoints: ["main.xsd"], + profileName: "transitional", + sourceName: "ecma-376-transitional", + db, + }); + + // Two seed notes: one will be confirmed, the other refined.
+ const wmlNs = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + const [paraSym] = await db.sql>` + SELECT s.id FROM xsd_symbols s + JOIN xsd_symbol_profiles sp ON sp.symbol_id = s.id + JOIN xsd_namespaces ns ON ns.id = sp.namespace_id + WHERE s.local_name = ${"CT_Para"} AND s.kind = ${"complexType"} + AND s.parent_symbol_id IS NULL AND ns.uri = ${wmlNs} + `; + const [n1] = await db.sql>` + INSERT INTO behavior_notes ( + symbol_id, app, claim_type, summary, source_id, + source_anchor, claim_label, claim_index, target_ref, + standard_text, behavior_text, confidence, resolution_confidence + ) VALUES ( + ${paraSym.id}, 'Word', 'writes', + 'Word emits CT_Para with extra whitespace.', + ${msSourceId}, + 'guid-confirmed', 'a', 0, 'Section 17, CT_Para', + 'Spec leaves whitespace open.', + 'Word emits CT_Para with extra whitespace.', 'high', 'high' + ) RETURNING id + `; + const [n2] = await db.sql>` + INSERT INTO behavior_notes ( + symbol_id, app, claim_type, summary, source_id, + source_anchor, claim_label, claim_index, target_ref, + standard_text, behavior_text, confidence, resolution_confidence + ) VALUES ( + ${paraSym.id}, 'Word', 'requires_despite_optional', + 'Word requires CT_Para val to be non-zero.', + ${msSourceId}, + 'guid-refined', 'a', 0, 'Section 17, CT_Para', + 'Spec allows val=0.', + 'Word requires val to be non-zero.', 'high', 'high' + ) RETURNING id + `; + confirmedNoteId = n1.id; + refinedNoteId = n2.id; + + // Two fixtures + two observations + two join rows (one confirmed, one refined). 
+ const [fix1] = await db.sql>` + INSERT INTO word_fixtures (name, description, sha256, generator_script, word_version) + VALUES ( + 'whitespace-test', 'CT_Para whitespace fixture', 'abc123', 'create_document(...)', + 'Word 16.0' + ) RETURNING id + `; + const [fix2] = await db.sql>` + INSERT INTO word_fixtures (name, description, sha256, generator_script, word_version) + VALUES ( + 'val-zero-test', 'CT_Para val=0 fixture', 'def456', 'create_document(...)', + 'Word 16.0' + ) RETURNING id + `; + const [o1] = await db.sql>` + INSERT INTO word_observations (fixture_id, scenario, finding, before_xml, after_xml) + VALUES ( + ${fix1.id}, 'authored', + 'Word emits CT_Para with the whitespace the doc claims.', + NULL, '...' + ) RETURNING id + `; + const [o2] = await db.sql>` + INSERT INTO word_observations (fixture_id, scenario, finding, before_xml, after_xml) + VALUES ( + ${fix2.id}, 'open-and-save', + 'Word strips the whole CT_Para val=0 directive on save (rather than rejecting).', + '', NULL + ) RETURNING id + `; + await db.sql` + INSERT INTO behavior_note_observations (behavior_note_id, observation_id, status, notes) + VALUES + (${confirmedNoteId}, ${o1.id}, 'confirmed', NULL), + (${refinedNoteId}, ${o2.id}, 'refined', 'Word does not reject; it silently drops the directive.') + `; +}); + +afterAll(async () => { + await db.sql.unsafe(TRUNCATE_SQL); + await db.sql`DELETE FROM reference_sources WHERE name = ${"ms-oi29500"}`; + await db.close(); +}); + +test("ooxml_word_behavior with no filters returns all observations", async () => { + const out = await runOoxmlTool("ooxml_word_behavior", {}, db.sql); + expect(out).toContain("Word emits CT_Para"); + expect(out).toContain("Word strips the whole CT_Para val=0"); + expect(out).toContain("[confirmed]"); + expect(out).toContain("[refined]"); +}); + +test("ooxml_word_behavior fixture_name filter", async () => { + const out = await runOoxmlTool( + "ooxml_word_behavior", + { fixture_name: "val-zero-test" }, + db.sql, + ); + 
expect(out).toContain("val-zero-test"); + expect(out).toContain("[refined]"); + expect(out).not.toContain("whitespace-test"); +}); + +test("ooxml_word_behavior scenario filter", async () => { + const out = await runOoxmlTool( + "ooxml_word_behavior", + { scenario: "open-and-save" }, + db.sql, + ); + expect(out).toContain("Word strips"); + expect(out).not.toContain("Word emits CT_Para"); +}); + +test("ooxml_word_behavior status filter", async () => { + const out = await runOoxmlTool( + "ooxml_word_behavior", + { status: "refined" }, + db.sql, + ); + expect(out).toContain("[refined]"); + expect(out).not.toContain("[confirmed]"); +}); + +test("ooxml_implementation_notes inlines verification status", async () => { + const out = await runOoxmlTool( + "ooxml_implementation_notes", + { source_anchor: "guid-confirmed" }, + db.sql, + ); + expect(out).toContain("[confirmed]"); + expect(out).toContain("Word emits CT_Para"); + // observation finding should appear too + expect(out).toContain("Word emits CT_Para with the whitespace"); +}); + +test("ooxml_word_behavior status filter respects LIMIT (regression)", async () => { + // Insert 3 newer observations with no linked status. With limit=2 and + // status='confirmed', the buggy implementation would return 0 because the + // limit-applied-pre-status would only see the 3 unstatused ones. + const [fixId] = await db.sql<Array<{ id: number }>>` + INSERT INTO word_fixtures (name, description) VALUES ('limit-test', 'noise') + RETURNING id + `; + for (let i = 0; i < 3; i++) { + await db.sql` + INSERT INTO word_observations (fixture_id, scenario, finding) + VALUES (${fixId.id}, 'noise', ${`noise observation ${i}`}) + `; + } + const out = await runOoxmlTool( + "ooxml_word_behavior", + { status: "confirmed", limit: 2 }, + db.sql, + ); + // Should still find the original confirmed observation despite the noise.
+ expect(out).toContain("[confirmed]"); + expect(out).toContain("Word emits CT_Para"); +}); + +test("ooxml_implementation_notes flags unverified rows", async () => { + // Insert a third note with no observation. + await db.sql` + INSERT INTO behavior_notes ( + app, claim_type, summary, source_id, + source_anchor, claim_label, claim_index, target_ref, + standard_text, behavior_text, confidence + ) VALUES ( + 'Word', 'varies_from_spec', + 'Untested claim.', + ${msSourceId}, + 'guid-untested', 'a', 0, 'Section X, foo', + 'Spec says X.', + 'Word does Y.', 'high' + ) + `; + const out = await runOoxmlTool( + "ooxml_implementation_notes", + { source_anchor: "guid-untested" }, + db.sql, + ); + expect(out).toContain("[unverified]"); +}); diff --git a/tests/mcp-server/tools-list.test.ts b/tests/mcp-server/tools-list.test.ts index ae02a94..012fb2e 100644 --- a/tests/mcp-server/tools-list.test.ts +++ b/tests/mcp-server/tools-list.test.ts @@ -21,6 +21,10 @@ const EXPECTED_TOOL_NAMES = [ "ooxml_attributes", "ooxml_enum", "ooxml_namespace", + // Implementation behavior notes (MS-OI29500) + "ooxml_implementation_notes", + // Word ground-truth observations from authored fixtures + "ooxml_word_behavior", ] as const; interface JsonRpcResponse {