diff --git a/docs/core-parser-replan.md b/docs/core-parser-replan.md new file mode 100644 index 0000000..3f8e223 --- /dev/null +++ b/docs/core-parser-replan.md @@ -0,0 +1,156 @@ +# Core Parser Replan + +Updated: 2026-03-25 + +## Fixed baselines + +- `main`: `/josh/programs/stax-xml` +- `checkpoint`: `/josh/programs/stax-xml-core-parser-checkpoint` +- `feature`: `/josh/programs/stax-xml-core-parser-refactor` + +Checkpoint was frozen from feature commit `97dbf5b` with a Lore commit so cursor-internal comparisons can stay stable while feature continues to move. + +## Operating rules + +- Benchmark and profiling runs use `packages/stax-xml/dist/index.js` or `packages/stax-xml/dist/index.mjs` only. +- `main` and `checkpoint` stay fixed during the comparison window. +- Cursor results are judged against `checkpoint`. +- Public wrapper results are judged against `main`. +- Stress results are recorded as signals and are not ship gates. + +## Benchmark suites + +### Cursor suite + +- `sync-cursor-consume`: synthetic token-dense XML +- `sync-cursor-attr-unused`: synthetic attribute-dense XML without attribute access +- `async-cursor-midsize-4kb` +- `async-cursor-midsize-64kb` + +### Wrapper suite + +- `sync-parser-books`: `books.xml` +- `sync-parser-complex`: `complex.xml` +- `async-parser-midsize-4kb`: `midsize.xml` +- `async-parser-midsize-64kb`: `midsize.xml` +- `async-parser-mixed-256b`: `mixed.xml` + +### Stress suite + +- `async-parser-single-chunk` + +## Gate definitions + +### Cursor gate: checkpoint vs feature + +- `sync-cursor-consume`: at least 10% faster +- `sync-cursor-attr-unused`: at least 25% faster +- `async-cursor-midsize-4kb`: completes and stays within `1.5x` +- `async-cursor-midsize-64kb`: completes and stays within `1.5x` + +### Wrapper gate: main vs feature + +- No representative wrapper case regresses by more than 5% +- At least one representative wrapper case improves by at least 5% +- Representative wrapper cases are `sync-parser-books` and `sync-parser-complex` +- `async-parser-midsize-4kb`: feature run completes without timeout +- `async-parser-midsize-64kb`: feature run completes without timeout +- `async-parser-mixed-256b`: feature run completes without timeout and keeps a stable checksum + +### Stress suite + +- `async-parser-single-chunk` is signal only +- Runaway behavior, timeout behavior, and chunk sensitivity are recorded, not used as ship gates + +## Workflow + +1. Build dist in each worktree. +2. Verify baseline SHA pins and worktree state. +3. Run the cursor suite for `checkpoint` and `feature`. +4. Run the wrapper suite for `main` and `feature`. +5. Run the stress suite for `feature` when needed. +6. Evaluate cursor and wrapper gates from the suite JSON outputs. + +## Commands + +Build: + +```bash +pnpm --dir /josh/programs/stax-xml/packages/stax-xml build +pnpm --dir /josh/programs/stax-xml-core-parser-checkpoint/packages/stax-xml build +pnpm --dir /josh/programs/stax-xml-core-parser-refactor/packages/stax-xml build +``` + +Verify baselines: + +```bash +node /josh/programs/stax-xml-core-parser-refactor/scripts/verify-core-parser-baselines.mjs \ + /josh/programs/stax-xml \ + /josh/programs/stax-xml-core-parser-checkpoint \ + /josh/programs/stax-xml-core-parser-refactor \ + --expect-checkpoint 97dbf5b +``` + +Run benchmark suites: + +```bash +node /josh/programs/stax-xml-core-parser-refactor/scripts/run-benchmark.mjs \ + /josh/programs/stax-xml-core-parser-checkpoint checkpoint cursor checkpoint + +node /josh/programs/stax-xml-core-parser-refactor/scripts/run-benchmark.mjs \ + /josh/programs/stax-xml-core-parser-refactor feature cursor checkpoint + +node /josh/programs/stax-xml-core-parser-refactor/scripts/run-benchmark.mjs \ + /josh/programs/stax-xml main wrapper main + +node /josh/programs/stax-xml-core-parser-refactor/scripts/run-benchmark.mjs \ + /josh/programs/stax-xml-core-parser-refactor feature wrapper main +``` + +Evaluate gates: + +```bash +node /josh/programs/stax-xml-core-parser-refactor/scripts/evaluate-core-parser-gates.mjs \ + cursor \ + /tmp/stax-compare/checkpoint/benchmarks/cursor.json \ + /tmp/stax-compare/feature/benchmarks/cursor.json + +node /josh/programs/stax-xml-core-parser-refactor/scripts/evaluate-core-parser-gates.mjs \ + wrapper \ + /tmp/stax-compare/main/benchmarks/wrapper.json \ + /tmp/stax-compare/feature/benchmarks/wrapper.json +``` + +Default profiling targets: + +```bash +node /josh/programs/stax-xml-core-parser-refactor/scripts/run-node-cpu-prof.mjs \ + /josh/programs/stax-xml-core-parser-checkpoint checkpoint cursor checkpoint sync-cursor-consume \ + /josh/programs/stax-xml-core-parser-refactor/scripts/parser-benchmark-case.mjs \ + -- /josh/programs/stax-xml-core-parser-checkpoint cursor sync-cursor-consume checkpoint + +node /josh/programs/stax-xml-core-parser-refactor/scripts/run-node-cpu-prof.mjs \ + /josh/programs/stax-xml-core-parser-refactor feature cursor checkpoint async-cursor-midsize-64kb \ + /josh/programs/stax-xml-core-parser-refactor/scripts/parser-benchmark-case.mjs \ + -- /josh/programs/stax-xml-core-parser-refactor cursor async-cursor-midsize-64kb checkpoint + +node /josh/programs/stax-xml-core-parser-refactor/scripts/run-node-cpu-prof.mjs \ + /josh/programs/stax-xml main wrapper main sync-parser-books \ + /josh/programs/stax-xml-core-parser-refactor/scripts/parser-benchmark-case.mjs \ + -- /josh/programs/stax-xml wrapper sync-parser-books main + +node /josh/programs/stax-xml-core-parser-refactor/scripts/run-node-cpu-prof.mjs \ + /josh/programs/stax-xml-core-parser-refactor feature wrapper main async-parser-mixed-256b \ + /josh/programs/stax-xml-core-parser-refactor/scripts/parser-benchmark-case.mjs \ + -- /josh/programs/stax-xml-core-parser-refactor wrapper async-parser-mixed-256b main +``` + +## Implementation stages + +- Stage 1: sync scalar state +- Stage 2: sync no-attribute fast path +- Stage 3: sync namespace copy-on-write +- Stage 4: async scanner prototype +- Stage 5: async full integration + +Each stage closes only after the cursor gate and wrapper gate both pass. diff --git a/package.json b/package.json index b13ff51..d34c641 100644 --- a/package.json +++ b/package.json @@ -19,13 +19,17 @@ "docs:dev": "pnpm --filter=stax-xml-docs dev", "docs:build": "pnpm --filter=stax-xml-docs build", "docs:preview": "pnpm --filter=stax-xml-docs preview", - "benchmark": "pnpm --filter=benchmark dev:bench:all" + "benchmark": "pnpm --filter=benchmark dev:bench:all", + "bench:verify-baselines": "node ./scripts/verify-core-parser-baselines.mjs", + "bench:run": "node ./scripts/run-benchmark.mjs", + "bench:evaluate": "node ./scripts/evaluate-core-parser-gates.mjs", + "profile:run": "node ./scripts/run-node-cpu-prof.mjs" }, "devDependencies": { "@types/node": "^24.0.10", "@vitest/coverage-v8": "^3.2.4", "@vitest/ui": "^3.2.4", - "tsdown": "^0.15.5", + "tsdown": "^0.21.4", "tsx": "^4.20.5", "typescript": "^5.9.2", "vitest": "^3.2.4" @@ -34,4 +38,4 @@ "node": ">=20.19.0" }, "packageManager": "pnpm@9.1.2" -} \ No newline at end of file +} diff --git a/packages/stax-xml/package.json b/packages/stax-xml/package.json index c6e5017..2f637e2 100644 --- a/packages/stax-xml/package.json +++ b/packages/stax-xml/package.json @@ -10,7 +10,7 @@ "module": "./dist/index.js", "devDependencies": { "starlight-typedoc": "^0.21.3", - "tsdown": "^0.15.5", + "tsdown": "^0.21.4", "typedoc": "^0.28.13", "typescript": "^5.9.2", "vitest": "^3.2.4" @@ -51,4 +51,4 @@ }, "type": "module", "types": "./dist/index.d.ts" -} \ No newline at end of file +} diff --git a/packages/stax-xml/src/StaxXmlCursor.ts b/packages/stax-xml/src/StaxXmlCursor.ts new file mode 100644 index 0000000..212637f --- /dev/null +++ b/packages/stax-xml/src/StaxXmlCursor.ts @@ -0,0 +1,954 @@ +import { type AttributeInfo, XmlEventType } from './types'; +import { AttributeCollector } from './internal/AttributeCollector'; +import { + cloneNamespaces, + collectAttributesFromSource, + hasNamespaceDeclarationInSource, + resolveElementName, + type QualifiedNameInfo, +} from './internal/XmlCursorParserUtil'; + +type CursorLifecycleState = 'INITIAL' | 'ACTIVE' | 'DONE' | 'FAILED'; +type AsyncInputState = 'BUFFER_READY' | 'NEED_INPUT' | 'STREAM_ENDED'; +type ParseAction = XmlEventType | 'need_input' | 'skip'; + +export interface StaxXmlCursorOptions { + encoding?: string; + addEntities?: { entity: string, value: string }[]; + autoDecodeEntities?: boolean; + maxBufferSize?: number; + enableBufferCompaction?: boolean; + batchSize?: number; + batchTimeout?: number; +} + +export class StaxXmlCursor { + private reader: ReadableStreamDefaultReader | null = null; + private readonly decoder: TextDecoder; + private readonly options: StaxXmlCursorOptions; + private readonly entityDecoder: (text: string) => string; + private readonly attributeCollector: AttributeCollector; + private readonly bmhCache = new Map(); + + private buffer: Uint8Array; + private bufferLength = 0; + private position = 0; + private currentTextBuffer = ''; + private currentStartTagSource = ''; + private readonly elementStack: string[] = []; + private readonly namespaceStack: Map[] = [new Map()]; + private lifecycleState: CursorLifecycleState = 'INITIAL'; + private inputState: AsyncInputState = 'NEED_INPUT'; + private currentType?: XmlEventType; + private currentName?: string; + private currentLocalName?: string; + private currentPrefix?: string; + private currentUri?: string; + private currentText?: string; + private pendingEndName?: string; + private pendingEndLocalName?: string; + private pendingEndPrefix?: string; + private pendingEndUri?: string; + private storedError?: Error; + private busy = false; + + private static readonly ASCII_TABLE = (() => { + const table = new Uint8Array(128); + table[9] = 1; + table[10] = 1; + table[13] = 1; + table[32] = 1; + table[60] = 2; + table[62] = 3; + table[47] = 4; + table[61] = 5; + table[33] = 6; + table[63] = 7; + table[34] = 8; + table[39] = 9; + return table; + })(); + + private static readonly ENTITY_REGEX_CACHE = new Map(); + private static readonly DEFAULT_ENTITY_REGEX = /&(lt|gt|quot|apos|amp);/g; + private static readonly DEFAULT_ENTITY_MAP: Record = { + lt: '<', + gt: '>', + quot: '"', + apos: '\'', + amp: '&', + }; + + constructor(xmlStream: ReadableStream, options: StaxXmlCursorOptions = {}) { + if (!(xmlStream instanceof ReadableStream)) { + throw new Error('xmlStream must be a web standard ReadableStream.'); + } + + this.options = { + encoding: 'utf-8', + autoDecodeEntities: true, + maxBufferSize: 64 * 1024, + enableBufferCompaction: true, + batchSize: 10, + batchTimeout: 10, + ...options, + }; + + this.decoder = new TextDecoder(this.options.encoding, { + fatal: false, + ignoreBOM: true, + }); + this.buffer = new Uint8Array(this.options.maxBufferSize ?? 64 * 1024); + this.entityDecoder = this.compileEntityDecoder(); + this.attributeCollector = new AttributeCollector(this.entityDecoder); + this.attributeCollector.reset(''); + this.reader = xmlStream.getReader(); + } + + hasNext(): boolean { + this.assertNotBusy(); + return this.lifecycleState !== 'DONE' && this.lifecycleState !== 'FAILED'; + } + + async next(): Promise { + this.assertNotBusy(); + if (this.lifecycleState === 'FAILED') { + throw this.storedError; + } + + this.busy = true; + try { + return await this.pullNextToken(); + } catch (error) { + this.markFailed(error as Error); + throw this.storedError; + } finally { + this.busy = false; + } + } + + get eventType(): XmlEventType | undefined { + return this.currentType; + } + + get name(): string | undefined { + return this.currentName; + } + + get localName(): string | undefined { + return this.currentLocalName; + } + + get prefix(): string | undefined { + return this.currentPrefix; + } + + get uri(): string | undefined { + return this.currentUri; + } + + get text(): string | undefined { + return this.currentText; + } + + getText(): string { + if (this.currentType !== XmlEventType.CHARACTERS && this.currentType !== XmlEventType.CDATA) { + throw new Error('Current token does not expose text.'); + } + + return this.currentText!; + } + + getAttributes(): Record { + this.assertStartElementToken(); + return this.attributeCollector.getAttributes(); + } + + getAttributesWithPrefix(): Record { + this.assertStartElementToken(); + return this.attributeCollector.getAttributesWithPrefix(); + } + + getAttributeValue(rawName: string): string | undefined { + this.assertStartElementToken(); + return this.attributeCollector.getAttributeValue(rawName); + } + + private async pullNextToken(): Promise { + this.releaseCurrentStartTagSource(); + + if (this.lifecycleState === 'INITIAL') { + this.lifecycleState = 'ACTIVE'; + this.setCurrentType(XmlEventType.START_DOCUMENT); + return XmlEventType.START_DOCUMENT; + } + + if (this.pendingEndName !== undefined) { + this.setCurrent( + XmlEventType.END_ELEMENT, + this.pendingEndName, + this.pendingEndLocalName, + this.pendingEndPrefix, + this.pendingEndUri + ); + this.clearPendingEnd(); + return XmlEventType.END_ELEMENT; + } + + while (true) { + if (this.position >= this.bufferLength) { + if (this.flushCharacters()) { + return XmlEventType.CHARACTERS; + } + + if (this.inputState !== 'STREAM_ENDED') { + await this.readMore(); + continue; + } + + if (this.elementStack.length > 0) { + throw new Error('Unexpected end of document. Not all elements were closed.'); + } + + this.markDone(); + this.setCurrentType(XmlEventType.END_DOCUMENT); + return XmlEventType.END_DOCUMENT; + } + + const ltPos = this.findSingleByte(60, this.position); + if (ltPos === -1) { + try { + this.currentTextBuffer += this.readBuffer(); + } catch (error) { + if (this.inputState !== 'STREAM_ENDED') { + await this.readMore(); + continue; + } + throw error; + } + + if (this.inputState !== 'STREAM_ENDED') { + await this.readMore(); + } + continue; + } + + if (ltPos > this.position) { + try { + this.currentTextBuffer += this.readBuffer(ltPos - this.position); + } catch (error) { + if (this.inputState !== 'STREAM_ENDED') { + await this.readMore(); + continue; + } + throw error; + } + } + + this.position = ltPos; + if (this.flushCharacters()) { + return XmlEventType.CHARACTERS; + } + + if (this.position + 1 >= this.bufferLength) { + if (this.inputState === 'STREAM_ENDED') { + throw new Error('Unexpected end of document.'); + } + await this.readMore(); + continue; + } + + const nextByte = this.buffer[this.position + 1]; + const charType = this.getXmlCharType(nextByte); + let action: ParseAction; + + if (charType === 4) { + action = this.parseEndTag(); + } else if (charType === 6) { + action = this.parseBangConstruct(); + } else if (charType === 7) { + action = this.parseQuestionConstruct(); + } else { + action = this.parseStartTag(); + } + + if (action === 'need_input') { + await this.readMore(); + continue; + } + if (action === 'skip') { + continue; + } + return action; + } + } + + private parseBangConstruct(): ParseAction { + if (this.matchesPattern(''); + if (endPos === -1) { + if (this.inputState === 'STREAM_ENDED') { + throw new Error('Unclosed comment'); + } + return 'need_input'; + } + + this.position = endPos + 3; + return 'skip'; + } + + private parseDoctype(): ParseAction { + const endPos = this.findDoctypeEnd(this.position + 9); + if (endPos === -1) { + if (this.inputState === 'STREAM_ENDED') { + throw new Error('Unclosed DOCTYPE declaration'); + } + return 'need_input'; + } + + this.position = endPos + 1; + return 'skip'; + } + + private findDoctypeEnd(startPos: number): number { + let position = startPos; + let bracketDepth = 0; + let quoteChar = 0; + let inComment = false; + + while (position < this.bufferLength) { + const currentByte = this.buffer[position]; + + if (inComment) { + if (currentByte === 45) { + const commentTerminator = this.matchesAsciiPattern(position, '-->'); + if (commentTerminator === 'match') { + inComment = false; + position += 3; + continue; + } + if (commentTerminator === 'incomplete') { + return -1; + } + } + position++; + continue; + } + + if (quoteChar !== 0) { + if (currentByte === quoteChar) { + quoteChar = 0; + } + position++; + continue; + } + + if (currentByte === 34 || currentByte === 39) { + quoteChar = currentByte; + position++; + continue; + } + + if (currentByte === 60) { + const commentStart = this.matchesAsciiPattern(position, '', this.pos + 4); + if (commentEnd === -1) { + throw new Error('Unclosed comment'); + } + + this.pos = commentEnd + 3; + return undefined; + } + + if (this.matchesAt('', this.pos); + if (piEnd === -1) { + throw new Error('Unclosed processing instruction'); + } + + this.pos = piEnd + 2; + } + + private compileEntityDecoder(options: StaxXmlCursorSyncOptions): (text: string) => string { + if (options.autoDecodeEntities === false) { + return (text) => text; + } + + if (options.addEntities && options.addEntities.length > 0) { + const entityMap: Record = { ...StaxXmlCursorSync.DEFAULT_ENTITY_MAP }; + const patterns: string[] = ['lt', 'gt', 'quot', 'apos']; + + for (const { entity, value } of options.addEntities) { + if (entity && value) { + const key = entity.startsWith('&') && entity.endsWith(';') + ? entity.slice(1, -1) + : entity; + entityMap[key] = value; + patterns.push(key); + } + } + patterns.push('amp'); + + const cacheKey = patterns.join(','); + let regex = StaxXmlCursorSync.ENTITY_REGEX_CACHE.get(cacheKey); + if (!regex) { + const pattern = patterns + .sort((left, right) => right.length - left.length) + .map((entity) => entity.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')) + .join('|'); + regex = new RegExp(`&(${pattern});`, 'g'); + StaxXmlCursorSync.ENTITY_REGEX_CACHE.set(cacheKey, regex); + } + + return (text: string) => { + if (!text || text.indexOf('&') === -1) { + return text; + } + regex.lastIndex = 0; + return text.replace(regex, (_, entity) => entityMap[entity] || _); + }; + } + + return (text: string) => { + if (!text || text.indexOf('&') === -1) { + return text; + } + StaxXmlCursorSync.DEFAULT_ENTITY_REGEX.lastIndex = 0; + return text.replace( + StaxXmlCursorSync.DEFAULT_ENTITY_REGEX, + (_, entity) => StaxXmlCursorSync.DEFAULT_ENTITY_MAP[entity] || _ + ); + }; + } + + private fail(error: Error): XmlEventType { + this.lifecycleState = 'FAILED'; + this.storedError = error; + this.clearCurrent(); + throw error; + } + + private assertStartElementToken(): void { + if (this.currentType !== XmlEventType.START_ELEMENT) { + throw new Error('Current token does not expose attributes.'); + } + } + + private clearCurrent(): void { + this.currentType = undefined; + this.currentName = undefined; + this.currentLocalName = undefined; + this.currentPrefix = undefined; + this.currentUri = undefined; + this.currentText = undefined; + } + + private clearPendingEnd(): void { + this.pendingEndName = undefined; + this.pendingEndLocalName = undefined; + this.pendingEndPrefix = undefined; + this.pendingEndUri = undefined; + } + + private setCurrent( + type: XmlEventType, + name?: string, + localName?: string, + prefix?: string, + uri?: string, + text?: string + ): void { + this.currentType = type; + this.currentName = name; + this.currentLocalName = localName; + this.currentPrefix = prefix; + this.currentUri = uri; + this.currentText = text; + } + + private findChar(targetCode: number, start = this.pos): number { + const len16 = this.xmlLength - 15; + let i = start; + + for (; i < len16; i += 16) { + if (this.xml.charCodeAt(i) === targetCode) return i; + if (this.xml.charCodeAt(i + 1) === targetCode) return i + 1; + if (this.xml.charCodeAt(i + 2) === targetCode) return i + 2; + if (this.xml.charCodeAt(i + 3) === targetCode) return i + 3; + if (this.xml.charCodeAt(i + 4) === targetCode) return i + 4; + if (this.xml.charCodeAt(i + 5) === targetCode) return i + 5; + if (this.xml.charCodeAt(i + 6) === targetCode) return i + 6; + if (this.xml.charCodeAt(i + 7) === targetCode) return i + 7; + if (this.xml.charCodeAt(i + 8) === targetCode) return i + 8; + if (this.xml.charCodeAt(i + 9) === targetCode) return i + 9; + if (this.xml.charCodeAt(i + 10) === targetCode) return i + 10; + if (this.xml.charCodeAt(i + 11) === targetCode) return i + 11; + if (this.xml.charCodeAt(i + 12) === targetCode) return i + 12; + if (this.xml.charCodeAt(i + 13) === targetCode) return i + 13; + if (this.xml.charCodeAt(i + 14) === targetCode) return i + 14; + if (this.xml.charCodeAt(i + 15) === targetCode) return i + 15; + } + + for (; i < this.xmlLength; i++) { + if (this.xml.charCodeAt(i) === targetCode) { + return i; + } + } + + return -1; + } + + private matchesAt(value: string, pos: number): boolean { + if (pos + value.length > this.xmlLength) { + return false; + } + + for (let i = 0; i < value.length; i++) { + if (this.xml.charCodeAt(pos + i) !== value.charCodeAt(i)) { + return false; + } + } + + return true; + } + + private findTagEnd(start: number): number { + let i = start; + let inQuote = false; + let quoteChar = 0; + + while (i < this.xmlLength) { + const code = this.xml.charCodeAt(i); + if (code === 34 || code === 39) { + if (!inQuote) { + inQuote = true; + quoteChar = code; + } else if (code === quoteChar) { + inQuote = false; + quoteChar = 0; + } + } else if (code === 62 && !inQuote) { + return i; + } + i++; + } + + return -1; + } + + private findSequence(sequence: string, start: number): number { + const maxPos = this.xmlLength - sequence.length; + for (let i = start; i <= maxPos; i++) { + let match = true; + for (let j = 0; j < sequence.length; j++) { + if (this.xml.charCodeAt(i + j) !== sequence.charCodeAt(j)) { + match = false; + break; + } + } + if (match) { + return i; + } + } + return -1; + } + + private findDoctypeEnd(start: number): number { + let position = start; + let bracketDepth = 0; + let quoteChar = 0; + let inComment = false; + + while (position < this.xmlLength) { + const currentChar = this.xml.charCodeAt(position); + + if (inComment) { + if (this.matchesAt('-->', position)) { + inComment = false; + position += 3; + continue; + } + position++; + continue; + } + + if (quoteChar !== 0) { + if (currentChar === quoteChar) { + quoteChar = 0; + } + position++; + continue; + } + + if (currentChar === 34 || currentChar === 39) { + quoteChar = currentChar; + position++; + continue; + } + + if (this.matchesAt(''); - if (endPos === -1) return false; - this.position = endPos + 3; - return true; - } - - /** - * UTF-8 safe CDATA parsing - */ - private _parseCData(): boolean { - const startPos = this.position + 9; // After ''); - if (endPos === -1) return false; - - try { - // Check UTF-8 boundaries - const safeStart = this.findSafeUtf8Boundary(startPos, false); - const safeEnd = this.findSafeUtf8Boundary(endPos, true); - - const cdataContent = this.decoder.decode( - this.buffer.subarray(safeStart, safeEnd), - { stream: false } - ); - - // Inline CDATA creation - maintains V8 hidden class optimization - this._addEvent({ - type: XmlEventType.CDATA, - name: undefined, - localName: undefined, - prefix: undefined, - uri: undefined, - attributes: undefined, - attributesWithPrefix: undefined, - value: cdataContent, - error: undefined - } as UnifiedXmlEvent as CdataEvent); - - this.position = endPos + 3; - return true; - } catch (error) { - if (!this.isStreamEnded) return false; - throw error; - } - } - - private _parseProcessingInstruction(): boolean { - const endPos = this._findPatternBMH('?>'); - if (endPos === -1) return false; - this.position = endPos + 2; - return true; - } - - /** - * UTF-8 safe end tag parsing - */ - private _parseEndTag(): boolean { - const gtPos = this._findSingleByte(62, this.position); // '>' - if (gtPos === -1) return false; - - try { - // Safely decode the entire tag - const tagContent = this.safeDecodeRange(this.position, gtPos + 1); - const closeTagMatch = tagContent.match(/^<\/([a-zA-Z0-9_:.\-\u0080-\uFFFF]+)\s*>$/); - - if (!closeTagMatch) { - this._addError(new Error('Malformed closing tag')); - return true; - } - - const tagName = closeTagMatch[1]; - if (this.elementStack.length === 0 || this.elementStack[this.elementStack.length - 1] !== tagName) { - this._addError(new Error(`Mismatched closing tag: . Expected `)); - return true; - } - - const currentNamespaces = this.namespaceStack.length > 0 ? - this.namespaceStack[this.namespaceStack.length - 1] : new Map(); - const { localName, prefix, uri } = this._parseQualifiedName(tagName, currentNamespaces); - - this.elementStack.pop(); - this.namespaceStack.pop(); - - // Inline END_ELEMENT creation - maintains V8 hidden class optimization - this._addEvent({ - type: XmlEventType.END_ELEMENT, - name: tagName, - localName, - prefix, - uri, - attributes: undefined, - attributesWithPrefix: undefined, - value: undefined, - error: undefined - } as UnifiedXmlEvent as EndElementEvent); - - this.position = gtPos + 1; - return true; - } catch (error) { - if (!this.isStreamEnded) return false; - throw error; - } - } - - /** - * UTF-8 safe start tag parsing (using ASCII table) - */ - private _parseStartTag(): boolean { - const gtPos = this._findSingleByte(62, this.position); // '>' - if (gtPos === -1) return false; - - try { - // Safely decode the entire tag - const tagContent = this.safeDecodeRange(this.position, gtPos + 1); - const tagMatch = tagContent.match(/^<([a-zA-Z0-9_:.\-\u0080-\uFFFF]+)(\s+[^>]*?)?\s*(\/?)>$/); - - if (!tagMatch) { - this._addError(new Error('Malformed start tag')); - return true; - } - - const tagName = tagMatch[1]; - const attributesString = tagMatch[2] || ''; - const isSelfClosing = tagMatch[3] === '/'; - - const currentNamespaces = new Map(); - if (this.namespaceStack.length > 0) { - const parentNamespaces = this.namespaceStack[this.namespaceStack.length - 1]; - for (const [prefix, uri] of parentNamespaces) { - currentNamespaces.set(prefix, uri); - } - } - - const attributes: { [key: string]: string } = {}; - const attributesWithPrefix: { [key: string]: { value: string; prefix?: string; uri?: string } } = {}; - - // Attribute parsing - Unicode character support - const attrRegex = /([a-zA-Z0-9_:.\-\u0080-\uFFFF]+)(?:\s*=\s*"([^"]*)"|\s*=\s*'([^']*)')?/g; - let attrMatch; - - while ((attrMatch = attrRegex.exec(attributesString)) !== null) { - const attrName = attrMatch[1]; - const attrValue = this.entityDecoder(attrMatch[2] || attrMatch[3] || 'true'); - attributes[attrName] = attrValue; - - const attrNamespaceInfo = this._parseQualifiedName(attrName, currentNamespaces, true); - attributesWithPrefix[attrNamespaceInfo.localName] = { - value: attrValue, - prefix: attrNamespaceInfo.prefix, - uri: attrNamespaceInfo.uri - }; - - if (attrName === 'xmlns') { - currentNamespaces.set('', attrValue); - } else if (attrName.startsWith('xmlns:')) { - const prefix = attrName.substring(6); - currentNamespaces.set(prefix, attrValue); - } - } - - const { localName, prefix, uri } = this._parseQualifiedName(tagName, currentNamespaces); - - // Inline START_ELEMENT creation - maintains V8 hidden class optimization - this._addEvent({ - type: XmlEventType.START_ELEMENT, - name: tagName, - localName, - prefix, - uri, - attributes: attributes, - attributesWithPrefix: attributesWithPrefix, - value: undefined, - error: undefined - } as UnifiedXmlEvent as StartElementEvent); - - this.position = gtPos + 1; - - if (!isSelfClosing) { - this.elementStack.push(tagName); - this.namespaceStack.push(currentNamespaces); - } else { - // Inline END_ELEMENT creation for self-closing - maintains V8 hidden class optimization - this._addEvent({ - type: XmlEventType.END_ELEMENT, - name: tagName, - localName, - prefix, - uri, - attributes: undefined, - attributesWithPrefix: undefined, - value: undefined, - error: undefined - } as UnifiedXmlEvent as EndElementEvent); - } - - return true; - } catch (error) { - if (!this.isStreamEnded) return false; - throw error; - } - } - - private _parseQualifiedName( - qname: string, - namespaces: Map, - isAttribute: boolean = false - ): { - localName: string; - prefix?: string; - uri?: string; - } { - const colonIndex = qname.indexOf(':'); - if (colonIndex === -1) { - if (isAttribute) { - return { - localName: qname, - prefix: undefined, - uri: undefined - }; - } else { - const defaultUri = namespaces.get(''); - return { - localName: qname, - prefix: undefined, - uri: defaultUri - }; - } - } else { - const prefix = qname.substring(0, colonIndex); - const localName = qname.substring(colonIndex + 1); - const uri = namespaces.get(prefix); - return { - localName, - prefix, - uri - }; - } - } - public get XmlEventType(): typeof XmlEventType { return XmlEventType; } + + private materializeEvent(tokenType: XmlEventType): AnyXmlEvent { + switch (tokenType) { + case XmlEventType.START_DOCUMENT: + return XmlEventFactory.startDocument() as StartDocumentEvent; + case XmlEventType.END_DOCUMENT: + return XmlEventFactory.endDocument() as EndDocumentEvent; + case XmlEventType.START_ELEMENT: + return XmlEventFactory.startElement( + this.cursor.name!, + this.cursor.localName, + this.cursor.prefix, + this.cursor.uri, + this.cursor.getAttributes(), + this.toLegacyAsyncAttributesWithPrefix() as unknown as StartElementEvent['attributesWithPrefix'] + ) as StartElementEvent; + case XmlEventType.END_ELEMENT: + return XmlEventFactory.endElement( + this.cursor.name!, + this.cursor.localName, + this.cursor.prefix, + this.cursor.uri + ) as EndElementEvent; + case XmlEventType.CHARACTERS: + return XmlEventFactory.characters(this.cursor.getText()); + case XmlEventType.CDATA: + return XmlEventFactory.cdata(this.cursor.getText()); + case XmlEventType.ERROR: + return XmlEventFactory.error(new Error('Cursor should not emit ERROR tokens.')); + default: + throw new Error(`Unsupported token type: ${String(tokenType)}`); + } + } + + private toLegacyAsyncAttributesWithPrefix(): Record { + const attributesWithPrefix = this.cursor.getAttributesWithPrefix(); + return Object.fromEntries( + Object.values(attributesWithPrefix).map((attribute) => [ + attribute.localName, + { + value: attribute.value, + prefix: attribute.prefix, + uri: attribute.uri, + }, + ]) + ); + } } -export default StaxXmlParser; \ No newline at end of file +export default StaxXmlParser; diff --git a/packages/stax-xml/src/StaxXmlParserSync.ts b/packages/stax-xml/src/StaxXmlParserSync.ts index 463a0d4..dc26b3a 100644 --- a/packages/stax-xml/src/StaxXmlParserSync.ts +++ b/packages/stax-xml/src/StaxXmlParserSync.ts @@ -1,14 +1,12 @@ -// StaxXmlParserSync.ts - Optimized version using XmlEventFactory - import { - AnyXmlEvent, - AttributeInfo, - CdataEvent, - CharactersEvent, - EndDocumentEvent, - EndElementEvent, - StartElementEvent, - XmlEventType + type AnyXmlEvent, + type AttributeInfo, + type CdataEvent, + type CharactersEvent, + type EndDocumentEvent, + type EndElementEvent, + type StartElementEvent, + XmlEventType, } from './types'; export interface StaxXmlParserSyncOptions { @@ -19,55 +17,51 @@ export interface StaxXmlParserSyncOptions { export class StaxXmlParserSync implements Iterable, Iterator { private readonly xml: string; private readonly xmlLength: number; - private pos: number = 0; + private pos = 0; private readonly elementStack: string[] = []; private namespaceStack: Map[] = []; private readonly options: StaxXmlParserSyncOptions; private internalIterator?: Generator; - // ===== Static optimization tables and caches ===== - - // ASCII character fast classification table (0-127) private static readonly ASCII_TABLE = (() => { const table = new Uint8Array(128); - // Whitespace characters: 1 - table[9] = 1; // TAB - table[10] = 1; // LF - table[13] = 1; // CR - table[32] = 1; // SPACE - // XML special characters - table[60] = 2; // '<' - table[62] = 3; // '>' - table[47] = 4; // '/' - table[61] = 5; // '=' - table[33] = 6; // '!' - table[63] = 7; // '?' - table[34] = 8; // '"' - table[39] = 9; // "'" + table[9] = 1; + table[10] = 1; + table[13] = 1; + table[32] = 1; + table[60] = 2; + table[62] = 3; + table[47] = 4; + table[61] = 5; + table[33] = 6; + table[63] = 7; + table[34] = 8; + table[39] = 9; return table; })(); - // Multilingual whitespace character Set (fast lookup) private static readonly UNICODE_WHITESPACE = new Set([ - 0x00A0, // Non-breaking space - 0x1680, // Ogham space - 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, // Various spaces - 0x2028, // Line separator - 0x2029, // Paragraph separator - 0x202F, // Narrow no-break space - 0x205F, // Medium mathematical space - 0x3000, // CJK ideographic space - 0xFEFF // Zero-width no-break space + 0x00A0, + 0x1680, + 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, + 0x2028, + 0x2029, + 0x202F, + 0x205F, + 0x3000, + 0xFEFF, ]); - // Entity regex cache private static readonly ENTITY_REGEX_CACHE = new Map(); private static readonly DEFAULT_ENTITY_REGEX = /&(lt|gt|quot|apos|amp);/g; private static readonly DEFAULT_ENTITY_MAP: Record = { - 'lt': '<', 'gt': '>', 'quot': '"', 'apos': "'", 'amp': '&' + lt: '<', + gt: '>', + quot: '"', + apos: '\'', + amp: '&', }; - // Compiled entity decoder (per-instance caching) private readonly entityDecoder: (text: string) => string; constructor(xml: string, options: StaxXmlParserSyncOptions = {}) { @@ -75,189 +69,26 @@ export class StaxXmlParserSync implements Iterable, Iterator()); - - // Pre-compile entity decoder this.entityDecoder = this.compileEntityDecoder(); } - // ===== Helper methods: character classification ===== - - private static isWhitespace(code: number): boolean { - if (code < 128) { - return StaxXmlParserSync.ASCII_TABLE[code] === 1; - } - return code <= 32 || StaxXmlParserSync.UNICODE_WHITESPACE.has(code); - } - - // ===== Helper methods: surrogate pair handling ===== - - private static isHighSurrogate(code: number): boolean { - return code >= 0xD800 && code <= 0xDBFF; - } - - private static isLowSurrogate(code: number): boolean { - return code >= 0xDC00 && code <= 0xDFFF; - } - - // ===== Optimized string processing ===== - - // indexOf replacement - fast character search (16-byte unrolling) - private findChar(targetCode: number, start: number = this.pos): number { - const xml = this.xml; - const len = this.xmlLength; - - // Performance improvement with 16-byte unrolling - const len16 = len - 15; - let i = start; - - // 16-byte unrolling loop - for (; i < len16; i += 16) { - if (xml.charCodeAt(i) === targetCode) return i; - if (xml.charCodeAt(i + 1) === targetCode) return i + 1; - if (xml.charCodeAt(i + 2) === targetCode) return i + 2; - if (xml.charCodeAt(i + 3) === targetCode) return i + 3; - if (xml.charCodeAt(i + 4) === targetCode) return i + 4; - if (xml.charCodeAt(i + 5) === targetCode) return i + 5; - if (xml.charCodeAt(i + 6) === targetCode) return i + 6; - if (xml.charCodeAt(i + 7) === targetCode) return i + 7; - if (xml.charCodeAt(i + 8) === targetCode) return i + 8; - if (xml.charCodeAt(i + 9) === targetCode) return i + 9; - if (xml.charCodeAt(i + 10) === targetCode) return i + 10; - if (xml.charCodeAt(i + 11) === targetCode) return i + 11; - if (xml.charCodeAt(i + 12) === targetCode) return i + 12; - if (xml.charCodeAt(i + 13) === targetCode) return i + 13; - if (xml.charCodeAt(i + 14) === targetCode) return i + 14; - if (xml.charCodeAt(i + 15) === targetCode) return i + 15; - } - - // Handle remaining bytes - for (; i < len; i++) { - if (xml.charCodeAt(i) === targetCode) return i; - } - - return -1; - } - - // Fast string search (startsWith replacement) - private matchesAt(str: string, pos: number): boolean { - const len = str.length; - if (pos + len > this.xmlLength) return false; - - for (let i = 0; i < len; i++) { - if (this.xml.charCodeAt(pos + i) !== str.charCodeAt(i)) { - return false; - } - } - return true; - } - - // Inline trim (avoid substring) - private trimmedSlice(start: number, end: number): string { - const xml = this.xml; - - // Remove leading whitespace - while (start < end && StaxXmlParserSync.isWhitespace(xml.charCodeAt(start))) { - if (StaxXmlParserSync.isHighSurrogate(xml.charCodeAt(start))) { - start += 2; - } else { - start++; - } - } - - // Remove trailing whitespace - while (end > start && StaxXmlParserSync.isWhitespace(xml.charCodeAt(end - 1))) { - // Surrogate pair check (reverse direction) - if (end > start + 1 && - StaxXmlParserSync.isLowSurrogate(xml.charCodeAt(end - 1)) && - StaxXmlParserSync.isHighSurrogate(xml.charCodeAt(end - 2))) { - end -= 2; - } else { - end--; - } - } - - return start < end ? xml.slice(start, end) : ''; + public [Symbol.iterator](): Iterator { + return this; } - // ===== Entity processing optimization ===== - - private compileEntityDecoder(): (text: string) => string { - if (!this.options.autoDecodeEntities) { - return (text) => text; - } - - // If custom entities exist - if (this.options.addEntities && this.options.addEntities.length > 0) { - const entityMap: Record = { ...StaxXmlParserSync.DEFAULT_ENTITY_MAP }; - const patterns: string[] = ['lt', 'gt', 'quot', 'apos']; - - for (const { entity, value } of this.options.addEntities) { - if (entity && value) { - // Extract only entity part from &entity; format - const key = entity.startsWith('&') && entity.endsWith(';') - ? entity.slice(1, -1) - : entity; - entityMap[key] = value; - patterns.push(key); - } - } - patterns.push('amp'); // amp goes last - - // Generate cache key and regex caching - const cacheKey = patterns.join(','); - let regex = StaxXmlParserSync.ENTITY_REGEX_CACHE.get(cacheKey); - - if (!regex) { - const pattern = patterns - .sort((a, b) => b.length - a.length) // Longer patterns first - .map(e => e.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')) - .join('|'); - regex = new RegExp(`&(${pattern});`, 'g'); - StaxXmlParserSync.ENTITY_REGEX_CACHE.set(cacheKey, regex); - } - - return (text: string) => { - if (!text || text.indexOf('&') === -1) return text; - regex!.lastIndex = 0; - return text.replace(regex!, (_, entity) => entityMap[entity] || _); - }; + public next(): IteratorResult { + if (!this.internalIterator) { + this.internalIterator = this.internalGenerator(); } - // Use only default entities - return (text: string) => { - if (!text || text.indexOf('&') === -1) return text; - StaxXmlParserSync.DEFAULT_ENTITY_REGEX.lastIndex = 0; - return text.replace( - StaxXmlParserSync.DEFAULT_ENTITY_REGEX, - (_, entity) => StaxXmlParserSync.DEFAULT_ENTITY_MAP[entity] || _ - ); - }; - } - - // ===== Main parsing logic - using EventFactory ===== - - /** - * Symbol.iterator implementation - returns this instance as iterator - * This ensures for...of and explicit next() calls use the same iterator state - */ - public [Symbol.iterator](): Iterator { - return this; + return this.consumeIterator(this.internalIterator); } - /** - * Internal generator that actually yields AnyXmlEvent - * Important: Return type is same as before - Iterator - * Factory internally creates UnifiedXmlEvent, but - * types are returned as StartElementEvent, EndElementEvent etc. so - * perfectly compatible with AnyXmlEvent union type - */ private *internalGenerator(): Generator { - // Inline startDocument() - maintains V8 hidden class optimization - // All events have same shape with undefined for unused fields yield { type: XmlEventType.START_DOCUMENT, name: undefined, @@ -267,18 +98,16 @@ export class StaxXmlParserSync implements Iterable, Iterator