From 97dbf5b487be958ad0e8a16d53cc18473e5f157c Mon Sep 17 00:00:00 2001 From: Clickin Date: Wed, 25 Mar 2026 10:42:46 +0900 Subject: [PATCH 1/5] Freeze current cursor refactor state for checkpoint-based perf comparisons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The parser split now needs a stable internal baseline before cursor-only and wrapper-facing performance can be judged independently. This commit captures the current refactor state, cursor surface, focused tests, and comparison scripts so a checkpoint worktree can be cut from a known SHA without mixing in later optimization passes. Constraint: Upcoming perf analysis must compare feature work against a fixed internal checkpoint and main separately Rejected: Use the moving feature branch head as the internal baseline | comparisons would drift as optimization work lands Confidence: medium Scope-risk: moderate Reversibility: clean Directive: Treat this SHA as the checkpoint source and avoid mutating the derived checkpoint worktree during perf comparisons Tested: > stax-xml@0.4.0 test /josh/programs/stax-xml-core-parser-refactor/packages/stax-xml > vitest run "test/cursor-sync.test.ts" "test/cursor-async.test.ts" "test/parser-sync.test.ts" "test/parser.test.ts" RUN v3.2.4 /josh/programs/stax-xml-core-parser-refactor/packages/stax-xml ✓ test/cursor-sync.test.ts (3 tests) 9ms ✓ test/parser-sync.test.ts (30 tests) 23ms ✓ test/cursor-async.test.ts (3 tests) 22ms ✓ test/parser.test.ts (13 tests) 41ms Test Files 4 passed (4) Tests 49 passed (49) Start at 10:42:42 Duration 612ms (transform 341ms, setup 0ms, collect 616ms, tests 95ms, environment 1ms, prepare 691ms), > stax-xml@0.4.0 build /josh/programs/stax-xml-core-parser-refactor/packages/stax-xml > tsdown ℹ tsdown v0.15.12 powered by rolldown v1.0.0-beta.45 ℹ Using tsdown config: /josh/programs/stax-xml-core-parser-refactor/packages/stax-xml/tsdown.config.ts ℹ Build start ℹ Cleaning 8 files Not-tested: Full vitest suite, benchmark/profiling runs against the new scripts --- package.json | 6 +- packages/stax-xml/src/StaxXmlCursor.ts | 806 ++++++++++++ packages/stax-xml/src/StaxXmlCursorSync.ts | 517 ++++++++ packages/stax-xml/src/StaxXmlParser.ts | 1141 ++--------------- packages/stax-xml/src/StaxXmlParserSync.ts | 750 +---------- packages/stax-xml/src/index.ts | 3 +- .../src/internal/AttributeCollector.ts | 118 ++ .../src/internal/XmlCursorParserUtil.ts | 244 ++++ packages/stax-xml/test/cursor-async.test.ts | 59 + packages/stax-xml/test/cursor-sync.test.ts | 74 ++ .../stax-xml/test/helpers/parser-trace.ts | 133 ++ packages/stax-xml/test/parser-sync.test.ts | 11 + .../stax-xml/test/parser-trace-oracle.test.ts | 99 ++ scripts/compare-runner-lib.mjs | 118 ++ scripts/parser-benchmark-case.mjs | 175 +++ scripts/run-benchmark.mjs | 89 ++ scripts/run-node-cpu-prof.mjs | 55 + 17 files changed, 2664 insertions(+), 1734 deletions(-) create mode 100644 packages/stax-xml/src/StaxXmlCursor.ts create mode 100644 packages/stax-xml/src/StaxXmlCursorSync.ts create mode 100644 packages/stax-xml/src/internal/AttributeCollector.ts create mode 100644 packages/stax-xml/src/internal/XmlCursorParserUtil.ts create mode 100644 packages/stax-xml/test/cursor-async.test.ts create mode 100644 packages/stax-xml/test/cursor-sync.test.ts create mode 100644 packages/stax-xml/test/helpers/parser-trace.ts create mode 100644 packages/stax-xml/test/parser-trace-oracle.test.ts create mode 100644 scripts/compare-runner-lib.mjs create mode 100644 scripts/parser-benchmark-case.mjs create mode 100644 scripts/run-benchmark.mjs create mode 100644 scripts/run-node-cpu-prof.mjs diff --git a/package.json b/package.json index b13ff51..67996d9 100644 --- a/package.json +++ b/package.json @@ -19,7 +19,9 @@ "docs:dev": "pnpm --filter=stax-xml-docs dev", "docs:build": "pnpm --filter=stax-xml-docs build", "docs:preview": "pnpm --filter=stax-xml-docs preview", - "benchmark": "pnpm --filter=benchmark dev:bench:all" + "benchmark": "pnpm --filter=benchmark dev:bench:all", + "bench:run": "node ./scripts/run-benchmark.mjs", + "profile:run": "node ./scripts/run-node-cpu-prof.mjs" }, "devDependencies": { "@types/node": "^24.0.10", @@ -34,4 +36,4 @@ "node": ">=20.19.0" }, "packageManager": "pnpm@9.1.2" -} \ No newline at end of file +} diff --git a/packages/stax-xml/src/StaxXmlCursor.ts b/packages/stax-xml/src/StaxXmlCursor.ts new file mode 100644 index 0000000..be998c7 --- /dev/null +++ b/packages/stax-xml/src/StaxXmlCursor.ts @@ -0,0 +1,806 @@ +import { type AttributeInfo, XmlEventType } from './types'; +import { AttributeCollector } from './internal/AttributeCollector'; +import { + cloneNamespaces, + collectAttributesFromSource, + resolveElementName, + type QualifiedNameInfo, +} from './internal/XmlCursorParserUtil'; + +type CursorLifecycleState = 'INITIAL' | 'ACTIVE' | 'DONE' | 'FAILED'; +type AsyncInputState = 'BUFFER_READY' | 'NEED_INPUT' | 'STREAM_ENDED'; +type ParseAction = XmlEventType | 'need_input' | 'skip'; + +interface CursorToken { + type: XmlEventType; + name?: string; + localName?: string; + prefix?: string; + uri?: string; + text?: string; +} + +export interface StaxXmlCursorOptions { + encoding?: string; + addEntities?: { entity: string, value: string }[]; + autoDecodeEntities?: boolean; + maxBufferSize?: number; + enableBufferCompaction?: boolean; + batchSize?: number; + batchTimeout?: number; +} + +export class StaxXmlCursor { + private reader: ReadableStreamDefaultReader | null = null; + private readonly decoder: TextDecoder; + private readonly options: StaxXmlCursorOptions; + private readonly entityDecoder: (text: string) => string; + private readonly attributeCollector: AttributeCollector; + private readonly bmhCache = new Map(); + + private buffer: Uint8Array; + private bufferLength = 0; + private position = 0; + private currentTextBuffer = ''; + private currentStartTagSource = ''; + private readonly elementStack: string[] = []; + private readonly namespaceStack: Map[] = [new Map()]; + private lifecycleState: CursorLifecycleState = 'INITIAL'; + private inputState: AsyncInputState = 'NEED_INPUT'; + private currentToken?: CursorToken; + private pendingEndElement?: QualifiedNameInfo; + private storedError?: Error; + private busy = false; + + private static readonly ASCII_TABLE = (() => { + const table = new Uint8Array(128); + table[9] = 1; + table[10] = 1; + table[13] = 1; + table[32] = 1; + table[60] = 2; + table[62] = 3; + table[47] = 4; + table[61] = 5; + table[33] = 6; + table[63] = 7; + table[34] = 8; + table[39] = 9; + return table; + })(); + + private static readonly ENTITY_REGEX_CACHE = new Map(); + private static readonly DEFAULT_ENTITY_REGEX = /&(lt|gt|quot|apos|amp);/g; + private static readonly DEFAULT_ENTITY_MAP: Record = { + lt: '<', + gt: '>', + quot: '"', + apos: '\'', + amp: '&', + }; + + constructor(xmlStream: ReadableStream, options: StaxXmlCursorOptions = {}) { + if (!(xmlStream instanceof ReadableStream)) { + throw new Error('xmlStream must be a web standard ReadableStream.'); + } + + this.options = { + encoding: 'utf-8', + autoDecodeEntities: true, + maxBufferSize: 64 * 1024, + enableBufferCompaction: true, + batchSize: 10, + batchTimeout: 10, + ...options, + }; + + this.decoder = new TextDecoder(this.options.encoding, { + fatal: false, + ignoreBOM: true, + }); + this.buffer = new Uint8Array(this.options.maxBufferSize ?? 64 * 1024); + this.entityDecoder = this.compileEntityDecoder(); + this.attributeCollector = new AttributeCollector(this.entityDecoder); + this.attributeCollector.reset(''); + this.reader = xmlStream.getReader(); + } + + hasNext(): boolean { + this.assertNotBusy(); + return this.lifecycleState !== 'DONE' && this.lifecycleState !== 'FAILED'; + } + + async next(): Promise { + this.assertNotBusy(); + if (this.lifecycleState === 'FAILED') { + throw this.storedError; + } + + this.busy = true; + try { + return await this.pullNextToken(); + } catch (error) { + this.markFailed(error as Error); + throw this.storedError; + } finally { + this.busy = false; + } + } + + get eventType(): XmlEventType | undefined { + return this.currentToken?.type; + } + + get name(): string | undefined { + return this.currentToken?.name; + } + + get localName(): string | undefined { + return this.currentToken?.localName; + } + + get prefix(): string | undefined { + return this.currentToken?.prefix; + } + + get uri(): string | undefined { + return this.currentToken?.uri; + } + + get text(): string | undefined { + return this.currentToken?.text; + } + + getText(): string { + if (this.currentToken?.type !== XmlEventType.CHARACTERS && this.currentToken?.type !== XmlEventType.CDATA) { + throw new Error('Current token does not expose text.'); + } + + return this.currentToken.text; + } + + getAttributes(): Record { + this.assertStartElementToken(); + return this.attributeCollector.getAttributes(); + } + + getAttributesWithPrefix(): Record { + this.assertStartElementToken(); + return this.attributeCollector.getAttributesWithPrefix(); + } + + getAttributeValue(rawName: string): string | undefined { + this.assertStartElementToken(); + return this.attributeCollector.getAttributeValue(rawName); + } + + private async pullNextToken(): Promise { + this.releaseCurrentStartTagSource(); + + if (this.lifecycleState === 'INITIAL') { + this.lifecycleState = 'ACTIVE'; + this.currentToken = { type: XmlEventType.START_DOCUMENT }; + return XmlEventType.START_DOCUMENT; + } + + if (this.pendingEndElement) { + const pending = this.pendingEndElement; + this.pendingEndElement = undefined; + this.currentToken = { + type: XmlEventType.END_ELEMENT, + ...pending, + }; + return XmlEventType.END_ELEMENT; + } + + while (true) { + if (this.position >= this.bufferLength) { + if (this.flushCharacters()) { + return XmlEventType.CHARACTERS; + } + + if (this.inputState !== 'STREAM_ENDED') { + await this.readMore(); + continue; + } + + if (this.elementStack.length > 0) { + throw new Error('Unexpected end of document. Not all elements were closed.'); + } + + this.markDone(); + this.currentToken = { type: XmlEventType.END_DOCUMENT }; + return XmlEventType.END_DOCUMENT; + } + + const ltPos = this.findSingleByte(60, this.position); + if (ltPos === -1) { + try { + this.currentTextBuffer += this.readBuffer(); + } catch (error) { + if (this.inputState !== 'STREAM_ENDED') { + await this.readMore(); + continue; + } + throw error; + } + + if (this.inputState !== 'STREAM_ENDED') { + await this.readMore(); + } + continue; + } + + if (ltPos > this.position) { + try { + this.currentTextBuffer += this.readBuffer(ltPos - this.position); + } catch (error) { + if (this.inputState !== 'STREAM_ENDED') { + await this.readMore(); + continue; + } + throw error; + } + } + + this.position = ltPos; + if (this.flushCharacters()) { + return XmlEventType.CHARACTERS; + } + + if (this.position + 1 >= this.bufferLength) { + if (this.inputState === 'STREAM_ENDED') { + throw new Error('Unexpected end of document.'); + } + await this.readMore(); + continue; + } + + const nextByte = this.buffer[this.position + 1]; + const charType = this.getXmlCharType(nextByte); + let action: ParseAction; + + if (charType === 4) { + action = this.parseEndTag(); + } else if (charType === 6) { + action = this.parseBangConstruct(); + } else if (charType === 7) { + action = this.parseQuestionConstruct(); + } else { + action = this.parseStartTag(); + } + + if (action === 'need_input') { + await this.readMore(); + continue; + } + if (action === 'skip') { + continue; + } + return action; + } + } + + private parseBangConstruct(): ParseAction { + if (this.matchesPattern(''); + if (endPos === -1) { + if (this.inputState === 'STREAM_ENDED') { + throw new Error('Unclosed comment'); + } + return 'need_input'; + } + + this.position = endPos + 3; + return 'skip'; + } + + private parseDoctype(): ParseAction { + const endPos = this.findSingleByte(62, this.position); + if (endPos === -1) { + if (this.inputState === 'STREAM_ENDED') { + throw new Error('Unclosed DOCTYPE declaration'); + } + return 'need_input'; + } + + this.position = endPos + 1; + return 'skip'; + } + + private parseCData(): ParseAction { + const endPos = this.findPatternBMH(']]>'); + if (endPos === -1) { + if (this.inputState === 'STREAM_ENDED') { + throw new Error('Unclosed CDATA section'); + } + return 'need_input'; + } + + const safeStart = this.findSafeUtf8Boundary(this.position + 9, false); + const safeEnd = this.findSafeUtf8Boundary(endPos, true); + this.currentToken = { + type: XmlEventType.CDATA, + text: this.decoder.decode(this.buffer.subarray(safeStart, safeEnd), { stream: false }), + }; + this.position = endPos + 3; + return XmlEventType.CDATA; + } + + private parseProcessingInstruction(): ParseAction { + const endPos = this.findPatternBMH('?>'); + if (endPos === -1) { + if (this.inputState === 'STREAM_ENDED') { + throw new Error('Unclosed processing instruction'); + } + return 'need_input'; + } + + this.position = endPos + 2; + return 'skip'; + } + + private parseEndTag(): ParseAction { + const gtPos = this.findSingleByte(62, this.position); + if (gtPos === -1) { + if (this.inputState === 'STREAM_ENDED') { + throw new Error('Unclosed end tag'); + } + return 'need_input'; + } + + const tagContent = this.safeDecodeRange(this.position, gtPos + 1); + const closeTagMatch = tagContent.match(/^<\/([a-zA-Z0-9_:.\-\u0080-\uFFFF]+)\s*>$/); + if (!closeTagMatch) { + throw new Error('Malformed closing tag'); + } + + const tagName = closeTagMatch[1]; + if (this.elementStack.length === 0) { + throw new Error(`Mismatched closing tag: . Expected `); + } + if (this.elementStack[this.elementStack.length - 1] !== tagName) { + throw new Error(`Mismatched closing tag: . Expected `); + } + + const namespaces = this.namespaceStack[this.namespaceStack.length - 1] ?? new Map(); + this.elementStack.pop(); + this.namespaceStack.pop(); + this.currentToken = { + type: XmlEventType.END_ELEMENT, + ...resolveElementName(tagName, namespaces), + }; + this.position = gtPos + 1; + return XmlEventType.END_ELEMENT; + } + + private parseStartTag(): ParseAction { + const gtPos = this.findSingleByte(62, this.position); + if (gtPos === -1) { + if (this.inputState === 'STREAM_ENDED') { + throw new Error('Unclosed start tag'); + } + return 'need_input'; + } + + const tagContent = this.safeDecodeRange(this.position, gtPos + 1); + const tagMatch = tagContent.match(/^<([a-zA-Z0-9_:.\-\u0080-\uFFFF]+)(\s+[^>]*?)?\s*(\/?)>$/); + if (!tagMatch) { + throw new Error('Malformed start tag'); + } + + const tagName = tagMatch[1]; + const isSelfClosing = tagMatch[3] === '/'; + const namespaces = cloneNamespaces(this.namespaceStack[this.namespaceStack.length - 1]); + this.currentStartTagSource = tagContent; + const nameEnd = 1 + tagName.length; + const actualEnd = tagContent.length - (isSelfClosing ? 2 : 1); + + collectAttributesFromSource( + tagContent, + nameEnd, + actualEnd, + namespaces, + this.attributeCollector, + this.entityDecoder, + StaxXmlCursor.isWhitespaceCode + ); + + const nameInfo = resolveElementName(tagName, namespaces); + this.currentToken = { + type: XmlEventType.START_ELEMENT, + ...nameInfo, + }; + this.position = gtPos + 1; + + if (isSelfClosing) { + this.pendingEndElement = nameInfo; + return XmlEventType.START_ELEMENT; + } + + this.elementStack.push(tagName); + this.namespaceStack.push(namespaces); + return XmlEventType.START_ELEMENT; + } + + private flushCharacters(): boolean { + if (this.currentTextBuffer.length === 0) { + return false; + } + + const decodedText = this.entityDecoder(this.currentTextBuffer); + this.currentTextBuffer = ''; + if (decodedText.trim().length === 0) { + return false; + } + + this.currentToken = { + type: XmlEventType.CHARACTERS, + text: decodedText, + }; + return true; + } + + private releaseCurrentStartTagSource(): void { + if (this.currentToken?.type === XmlEventType.START_ELEMENT && this.currentStartTagSource.length > 0) { + this.currentStartTagSource = ''; + this.attributeCollector.reset(''); + } + } + + private async readMore(): Promise { + if (this.inputState === 'STREAM_ENDED') { + return; + } + + this.compactBufferIfNeeded(); + const { done, value } = await this.reader.read(); + if (done) { + this.inputState = 'STREAM_ENDED'; + this.releaseReader(); + return; + } + + this.appendToBuffer(value); + this.inputState = 'BUFFER_READY'; + } + + private appendToBuffer(newData: Uint8Array): void { + const requiredSize = this.bufferLength + newData.length; + if (requiredSize > this.buffer.length) { + const newSize = Math.max(this.buffer.length * 2, requiredSize); + const newBuffer = new Uint8Array(newSize); + newBuffer.set(this.buffer.subarray(0, this.bufferLength)); + this.buffer = newBuffer; + } + + this.buffer.set(newData, this.bufferLength); + this.bufferLength += newData.length; + } + + private compactBufferIfNeeded(): void { + if (!this.options.enableBufferCompaction) { + return; + } + + const maxSize = this.options.maxBufferSize ?? 64 * 1024; + const shouldCompact = + (this.position > 8192 && this.bufferLength > 16384) || + (this.position > maxSize / 2) || + (this.bufferLength > maxSize && this.position > maxSize / 4); + + if (!shouldCompact || this.position === 0 || this.position >= this.bufferLength) { + return; + } + + const safePos = this.findSafeUtf8Boundary(this.position, true); + const remainingLength = this.bufferLength - safePos; + if (remainingLength < safePos) { + const newBuffer = new Uint8Array(this.buffer.length); + newBuffer.set(this.buffer.subarray(safePos, this.bufferLength)); + this.buffer = newBuffer; + } else { + this.buffer.copyWithin(0, safePos, this.bufferLength); + } + + this.bufferLength = remainingLength; + this.position -= safePos; + if (this.bmhCache.size > 20) { + this.bmhCache.clear(); + } + } + + private readBuffer(length?: number): string { + const originalPos = this.position; + let endPos = length ? Math.min(this.position + length, this.bufferLength) : this.bufferLength; + if (length && endPos < this.bufferLength) { + endPos = this.findSafeUtf8Boundary(endPos, true); + } + + const slice = this.buffer.subarray(this.position, endPos); + try { + const result = this.decoder.decode(slice, { stream: this.inputState !== 'STREAM_ENDED' }); + this.position = endPos; + return result; + } catch (error) { + if (this.inputState !== 'STREAM_ENDED' && endPos === this.bufferLength) { + for (let i = 1; i <= 4 && endPos - i > this.position; i++) { + const testEnd = this.findSafeUtf8Boundary(endPos - i, true); + if (testEnd <= this.position) { + continue; + } + try { + const safeSlice = this.buffer.subarray(this.position, testEnd); + const result = this.decoder.decode(safeSlice, { stream: true }); + this.position = testEnd; + return result; + } catch { + continue; + } + } + } + this.position = originalPos; + throw error; + } + } + + private safeDecodeRange(start: number, end: number): string { + const safeStart = this.findSafeUtf8Boundary(start, false); + const safeEnd = this.findSafeUtf8Boundary(end, true); + if (safeStart >= safeEnd) { + return ''; + } + + return this.decoder.decode(this.buffer.subarray(safeStart, safeEnd), { stream: false }); + } + + private findSafeUtf8Boundary(pos: number, searchBackward: boolean): number { + if (pos <= 0 || pos >= this.bufferLength) { + return pos; + } + + if (searchBackward) { + let safePos = pos; + let backtrack = 0; + while (safePos > 0 && backtrack < 4) { + if (this.isUtf8CharStart(this.buffer[safePos])) { + const sequenceLength = this.getUtf8SequenceLength(this.buffer[safePos]); + if (safePos + sequenceLength > pos) { + return safePos; + } + return pos; + } + safePos--; + backtrack++; + } + return pos; + } + + while (pos < this.bufferLength && !this.isUtf8CharStart(this.buffer[pos])) { + pos++; + } + return pos; + } + + private isUtf8CharStart(byte: number): boolean { + return (byte & 0x80) === 0 || (byte & 0xC0) === 0xC0; + } + + private getUtf8SequenceLength(byte: number): number { + if ((byte & 0x80) === 0) return 1; + if ((byte & 0xE0) === 0xC0) return 2; + if ((byte & 0xF0) === 0xE0) return 3; + if ((byte & 0xF8) === 0xF0) return 4; + return 1; + } + + private buildBMHTable(pattern: Uint8Array): Uint8Array { + const table = new Uint8Array(256); + table.fill(pattern.length); + for (let i = 0; i < pattern.length - 1; i++) { + table[pattern[i]] = pattern.length - 1 - i; + } + return table; + } + + private findPatternBMH(pattern: string, startPos = this.position): number { + const patternBytes = new TextEncoder().encode(pattern); + if (patternBytes.length === 0) { + return -1; + } + if (patternBytes.length === 1) { + return this.findSingleByte(patternBytes[0], startPos); + } + + let skipTable = this.bmhCache.get(pattern); + if (!skipTable) { + skipTable = this.buildBMHTable(patternBytes); + if (this.bmhCache.size > 20) { + this.bmhCache.clear(); + } + this.bmhCache.set(pattern, skipTable); + } + + const bufferEnd = this.bufferLength - patternBytes.length; + let pos = startPos; + while (pos <= bufferEnd) { + let i = patternBytes.length - 1; + while (i >= 0 && this.buffer[pos + i] === patternBytes[i]) { + i--; + } + if (i < 0) { + return pos; + } + pos += skipTable[this.buffer[pos + patternBytes.length - 1]]; + } + + return -1; + } + + private findSingleByte(byte: number, startPos = this.position): number { + const end4 = this.bufferLength - 3; + let i = startPos; + for (; i < end4; i += 4) { + if (this.buffer[i] === byte) return i; + if (this.buffer[i + 1] === byte) return i + 1; + if (this.buffer[i + 2] === byte) return i + 2; + if (this.buffer[i + 3] === byte) return i + 3; + } + for (; i < this.bufferLength; i++) { + if (this.buffer[i] === byte) return i; + } + return -1; + } + + private matchesPattern(pattern: string): boolean { + const patternBytes = new TextEncoder().encode(pattern); + if (this.position + patternBytes.length > this.bufferLength) { + return false; + } + + for (let i = 0; i < patternBytes.length; i++) { + if (this.buffer[this.position + i] !== patternBytes[i]) { + return false; + } + } + return true; + } + + private compileEntityDecoder(): (text: string) => string { + if (this.options.autoDecodeEntities === false) { + return (text) => text; + } + + if (this.options.addEntities && this.options.addEntities.length > 0) { + const entityMap: Record = { ...StaxXmlCursor.DEFAULT_ENTITY_MAP }; + const patterns: string[] = ['lt', 'gt', 'quot', 'apos']; + for (const { entity, value } of this.options.addEntities) { + if (entity && value) { + const key = entity.startsWith('&') && entity.endsWith(';') + ? entity.slice(1, -1) + : entity; + entityMap[key] = value; + patterns.push(key); + } + } + patterns.push('amp'); + + const cacheKey = patterns.join(','); + let regex = StaxXmlCursor.ENTITY_REGEX_CACHE.get(cacheKey); + if (!regex) { + const pattern = patterns + .sort((left, right) => right.length - left.length) + .map((entity) => entity.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')) + .join('|'); + regex = new RegExp(`&(${pattern});`, 'g'); + StaxXmlCursor.ENTITY_REGEX_CACHE.set(cacheKey, regex); + } + + return (text: string) => { + if (!text || text.indexOf('&') === -1) { + return text; + } + regex.lastIndex = 0; + return text.replace(regex, (_, entity) => entityMap[entity] || _); + }; + } + + return (text: string) => { + if (!text || text.indexOf('&') === -1) { + return text; + } + StaxXmlCursor.DEFAULT_ENTITY_REGEX.lastIndex = 0; + return text.replace( + StaxXmlCursor.DEFAULT_ENTITY_REGEX, + (_, entity) => StaxXmlCursor.DEFAULT_ENTITY_MAP[entity] || _ + ); + }; + } + + private getXmlCharType(byte: number): number { + return byte < 128 ? StaxXmlCursor.ASCII_TABLE[byte] : 0; + } + + private assertNotBusy(): void { + if (this.busy) { + throw new Error('Concurrent cursor access is not allowed.'); + } + } + + private assertStartElementToken(): void { + if (this.currentToken?.type !== XmlEventType.START_ELEMENT) { + throw new Error('Current token does not expose attributes.'); + } + } + + private markDone(): void { + this.lifecycleState = 'DONE'; + this.releaseReader(); + } + + private markFailed(error: Error): void { + this.lifecycleState = 'FAILED'; + this.storedError = error; + this.currentToken = undefined; + this.currentTextBuffer = ''; + this.currentStartTagSource = ''; + this.attributeCollector.reset(''); + this.releaseReader(); + } + + private releaseReader(): void { + if (this.reader) { + this.reader.releaseLock(); + this.reader = null; + } + } + + private static isWhitespaceCode(code: number): boolean { + return code < 128 ? StaxXmlCursor.ASCII_TABLE[code] === 1 : code <= 32; + } +} + +export default StaxXmlCursor; diff --git a/packages/stax-xml/src/StaxXmlCursorSync.ts b/packages/stax-xml/src/StaxXmlCursorSync.ts new file mode 100644 index 0000000..3236506 --- /dev/null +++ b/packages/stax-xml/src/StaxXmlCursorSync.ts @@ -0,0 +1,517 @@ +import { type AttributeInfo, XmlEventType } from './types'; +import { AttributeCollector } from './internal/AttributeCollector'; +import { + cloneNamespaces, + collectAttributesFromSource, + resolveElementName, + type QualifiedNameInfo, +} from './internal/XmlCursorParserUtil'; + +export interface StaxXmlCursorSyncOptions { + autoDecodeEntities?: boolean; + addEntities?: { entity: string, value: string }[]; +} + +type CursorLifecycleState = 'INITIAL' | 'ACTIVE' | 'DONE' | 'FAILED'; + +interface CursorToken { + type: XmlEventType; + name?: string; + localName?: string; + prefix?: string; + uri?: string; + text?: string; +} + +export class StaxXmlCursorSync { + private readonly xml: string; + private readonly xmlLength: number; + private pos = 0; + private readonly elementStack: string[] = []; + private readonly namespaceStack: Map[] = [new Map()]; + private lifecycleState: CursorLifecycleState = 'INITIAL'; + private currentToken?: CursorToken; + private pendingEndElement?: QualifiedNameInfo; + private storedError?: Error; + + private static readonly ASCII_TABLE = (() => { + const table = new Uint8Array(128); + table[9] = 1; + table[10] = 1; + table[13] = 1; + table[32] = 1; + table[34] = 8; + table[39] = 9; + return table; + })(); + + private static readonly UNICODE_WHITESPACE = new Set([ + 0x00A0, + 0x1680, + 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, + 0x2028, + 0x2029, + 0x202F, + 0x205F, + 0x3000, + 0xFEFF + ]); + + private static readonly ENTITY_REGEX_CACHE = new Map(); + private static readonly DEFAULT_ENTITY_REGEX = /&(lt|gt|quot|apos|amp);/g; + private static readonly DEFAULT_ENTITY_MAP: Record = { + lt: '<', + gt: '>', + quot: '"', + apos: '\'', + amp: '&', + }; + + private readonly entityDecoder: (text: string) => string; + private readonly attributeCollector: AttributeCollector; + + constructor(xml: string, options: StaxXmlCursorSyncOptions = {}) { + this.xml = xml; + this.xmlLength = xml.length; + this.entityDecoder = this.compileEntityDecoder(options); + this.attributeCollector = new AttributeCollector(this.entityDecoder); + this.attributeCollector.reset(this.xml); + } + + hasNext(): boolean { + return this.lifecycleState !== 'DONE' && this.lifecycleState !== 'FAILED'; + } + + next(): XmlEventType { + if (this.lifecycleState === 'FAILED') { + throw this.storedError; + } + + try { + if (this.lifecycleState === 'INITIAL') { + this.lifecycleState = 'ACTIVE'; + this.currentToken = { type: XmlEventType.START_DOCUMENT }; + return XmlEventType.START_DOCUMENT; + } + + if (this.pendingEndElement) { + const pending = this.pendingEndElement; + this.pendingEndElement = undefined; + this.currentToken = { + type: XmlEventType.END_ELEMENT, + ...pending, + }; + return XmlEventType.END_ELEMENT; + } + + while (true) { + if (this.pos >= this.xmlLength) { + if (this.elementStack.length > 0) { + return this.fail(new Error('Unexpected end of document. Not all elements were closed.')); + } + + this.lifecycleState = 'DONE'; + this.currentToken = { type: XmlEventType.END_DOCUMENT }; + return XmlEventType.END_DOCUMENT; + } + + const ltPos = this.findChar(60, this.pos); + if (ltPos === -1) { + const text = this.trimmedSlice(this.pos, this.xmlLength); + this.pos = this.xmlLength; + if (!text) { + continue; + } + + this.currentToken = { + type: XmlEventType.CHARACTERS, + text: this.entityDecoder(text), + }; + return XmlEventType.CHARACTERS; + } + + if (ltPos > this.pos) { + const text = this.trimmedSlice(this.pos, ltPos); + this.pos = ltPos; + if (!text) { + continue; + } + + this.currentToken = { + type: XmlEventType.CHARACTERS, + text: this.entityDecoder(text), + }; + return XmlEventType.CHARACTERS; + } + + const nextCharCode = this.xml.charCodeAt(this.pos + 1); + if (nextCharCode === 47) { + return this.parseEndTag(); + } + if (nextCharCode === 33) { + const cdataType = this.parseBangConstruct(); + if (cdataType) { + return cdataType; + } + continue; + } + if (nextCharCode === 63) { + this.parseProcessingInstruction(); + continue; + } + + return this.parseStartTag(); + } + } catch (error) { + return this.fail(error as Error); + } + } + + get eventType(): XmlEventType | undefined { + return this.currentToken?.type; + } + + get name(): string | undefined { + return this.currentToken?.name; + } + + get localName(): string | undefined { + return this.currentToken?.localName; + } + + get prefix(): string | undefined { + return this.currentToken?.prefix; + } + + get uri(): string | undefined { + return this.currentToken?.uri; + } + + get text(): string | undefined { + return this.currentToken?.text; + } + + getText(): string { + if (this.currentToken?.type !== XmlEventType.CHARACTERS && this.currentToken?.type !== XmlEventType.CDATA) { + throw new Error('Current token does not expose text.'); + } + + return this.currentToken.text; + } + + getAttributes(): Record { + this.assertStartElementToken(); + return this.attributeCollector.getAttributes(); + } + + getAttributesWithPrefix(): Record { + this.assertStartElementToken(); + return this.attributeCollector.getAttributesWithPrefix(); + } + + getAttributeValue(rawName: string): string | undefined { + this.assertStartElementToken(); + return this.attributeCollector.getAttributeValue(rawName); + } + + private parseStartTag(): XmlEventType { + const tagStart = this.pos + 1; + const tagEnd = this.findTagEnd(tagStart); + if (tagEnd === -1) { + throw new Error('Unclosed start tag'); + } + + let actualEnd = tagEnd; + let isSelfClosing = false; + if (this.xml.charCodeAt(tagEnd - 1) === 47) { + actualEnd = tagEnd - 1; + isSelfClosing = true; + } + + let nameEnd = tagStart; + while (nameEnd < actualEnd) { + const code = this.xml.charCodeAt(nameEnd); + if (code <= 32) { + if (StaxXmlCursorSync.isWhitespace(code)) { + break; + } + } else if (code === 62 || code === 47) { + break; + } + nameEnd++; + } + + const tagName = this.xml.slice(tagStart, nameEnd); + const namespaces = cloneNamespaces(this.namespaceStack[this.namespaceStack.length - 1]); + collectAttributesFromSource( + this.xml, + nameEnd, + actualEnd, + namespaces, + this.attributeCollector, + this.entityDecoder, + StaxXmlCursorSync.isWhitespace + ); + + const nameInfo = resolveElementName(tagName, namespaces); + this.currentToken = { + type: XmlEventType.START_ELEMENT, + ...nameInfo, + }; + + this.pos = tagEnd + 1; + + if (isSelfClosing) { + this.pendingEndElement = nameInfo; + return XmlEventType.START_ELEMENT; + } + + this.elementStack.push(tagName); + this.namespaceStack.push(namespaces); + return XmlEventType.START_ELEMENT; + } + + private parseEndTag(): XmlEventType { + const tagClose = this.findChar(62, this.pos); + if (tagClose === -1) { + throw new Error('Unclosed end tag'); + } + + const fullTagName = this.trimmedSlice(this.pos + 2, tagClose); + if (this.elementStack.length === 0) { + throw new Error(`Mismatched closing tag: . No open elements.`); + } + + const expectedTagName = this.elementStack[this.elementStack.length - 1]; + if (fullTagName !== expectedTagName) { + throw new Error(`Mismatched closing tag: . Expected .`); + } + + this.elementStack.pop(); + const namespaces = this.namespaceStack.pop() ?? new Map(); + this.currentToken = { + type: XmlEventType.END_ELEMENT, + ...resolveElementName(fullTagName, namespaces), + }; + this.pos = tagClose + 1; + return XmlEventType.END_ELEMENT; + } + + private parseBangConstruct(): XmlEventType | undefined { + if (this.matchesAt('', this.pos + 9); + if (cdataEnd === -1) { + throw new Error('Unclosed CDATA section'); + } + + this.currentToken = { + type: XmlEventType.CDATA, + text: this.xml.slice(this.pos + 9, cdataEnd), + }; + this.pos = cdataEnd + 3; + return XmlEventType.CDATA; + } + + if (this.matchesAt('', this.pos + 4); + if (commentEnd === -1) { + throw new Error('Unclosed comment'); + } + + this.pos = commentEnd + 3; + return undefined; + } + + if (this.matchesAt('', this.pos); + if (piEnd === -1) { + throw new Error('Unclosed processing instruction'); + } + + this.pos = piEnd + 2; + } + + private compileEntityDecoder(options: StaxXmlCursorSyncOptions): (text: string) => string { + if (options.autoDecodeEntities === false) { + return (text) => text; + } + + if (options.addEntities && options.addEntities.length > 0) { + const entityMap: Record = { ...StaxXmlCursorSync.DEFAULT_ENTITY_MAP }; + const patterns: string[] = ['lt', 'gt', 'quot', 'apos']; + + for (const { entity, value } of options.addEntities) { + if (entity && value) { + const key = entity.startsWith('&') && entity.endsWith(';') + ? entity.slice(1, -1) + : entity; + entityMap[key] = value; + patterns.push(key); + } + } + patterns.push('amp'); + + const cacheKey = patterns.join(','); + let regex = StaxXmlCursorSync.ENTITY_REGEX_CACHE.get(cacheKey); + if (!regex) { + const pattern = patterns + .sort((left, right) => right.length - left.length) + .map((entity) => entity.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')) + .join('|'); + regex = new RegExp(`&(${pattern});`, 'g'); + StaxXmlCursorSync.ENTITY_REGEX_CACHE.set(cacheKey, regex); + } + + return (text: string) => { + if (!text || text.indexOf('&') === -1) { + return text; + } + regex.lastIndex = 0; + return text.replace(regex, (_, entity) => entityMap[entity] || _); + }; + } + + return (text: string) => { + if (!text || text.indexOf('&') === -1) { + return text; + } + StaxXmlCursorSync.DEFAULT_ENTITY_REGEX.lastIndex = 0; + return text.replace( + StaxXmlCursorSync.DEFAULT_ENTITY_REGEX, + (_, entity) => StaxXmlCursorSync.DEFAULT_ENTITY_MAP[entity] || _ + ); + }; + } + + private fail(error: Error): XmlEventType { + this.lifecycleState = 'FAILED'; + this.storedError = error; + this.currentToken = undefined; + throw error; + } + + private assertStartElementToken(): void { + if (this.currentToken?.type !== XmlEventType.START_ELEMENT) { + throw new Error('Current token does not expose attributes.'); + } + } + + private findChar(targetCode: number, start = this.pos): number { + const len16 = this.xmlLength - 15; + let i = start; + + for (; i < len16; i += 16) { + if (this.xml.charCodeAt(i) === targetCode) return i; + if (this.xml.charCodeAt(i + 1) === targetCode) return i + 1; + if (this.xml.charCodeAt(i + 2) === targetCode) return i + 2; + if (this.xml.charCodeAt(i + 3) === targetCode) return i + 3; + if (this.xml.charCodeAt(i + 4) === targetCode) return i + 4; + if (this.xml.charCodeAt(i + 5) === targetCode) return i + 5; + if (this.xml.charCodeAt(i + 6) === targetCode) return i + 6; + if (this.xml.charCodeAt(i + 7) === targetCode) return i + 7; + if (this.xml.charCodeAt(i + 8) === targetCode) return i + 8; + if (this.xml.charCodeAt(i + 9) === targetCode) return i + 9; + if (this.xml.charCodeAt(i + 10) === targetCode) return i + 10; + if (this.xml.charCodeAt(i + 11) === targetCode) return i + 11; + if (this.xml.charCodeAt(i + 12) === targetCode) return i + 12; + if (this.xml.charCodeAt(i + 13) === targetCode) return i + 13; + if (this.xml.charCodeAt(i + 14) === targetCode) return i + 14; + if (this.xml.charCodeAt(i + 15) === targetCode) return i + 15; + } + + for (; i < this.xmlLength; i++) { + if (this.xml.charCodeAt(i) === targetCode) { + return i; + } + } + + return -1; + } + + private matchesAt(value: string, pos: number): boolean { + if (pos + value.length > this.xmlLength) { + return false; + } + + for (let i = 0; i < value.length; i++) { + if (this.xml.charCodeAt(pos + i) !== value.charCodeAt(i)) { + return false; + } + } + + return true; + } + + private findTagEnd(start: number): number { + let i = start; + let inQuote = false; + let quoteChar = 0; + + while (i < this.xmlLength) { + const code = this.xml.charCodeAt(i); + if (code === 34 || code === 39) { + if (!inQuote) { + inQuote = true; + quoteChar = code; + } else if (code === quoteChar) { + inQuote = false; + quoteChar = 0; + } + } else if (code === 62 && !inQuote) { + return i; + } + i++; + } + + return -1; + } + + private findSequence(sequence: string, start: number): number { + const maxPos = this.xmlLength - sequence.length; + for (let i = start; i <= maxPos; i++) { + let match = true; + for (let j = 0; j < sequence.length; j++) { + if (this.xml.charCodeAt(i + j) !== sequence.charCodeAt(j)) { + match = false; + break; + } + } + if (match) { + return i; + } + } + return -1; + } + + private trimmedSlice(start: number, end: number): string { + while (start < end && StaxXmlCursorSync.isWhitespace(this.xml.charCodeAt(start))) { + start++; + } + while (end > start && StaxXmlCursorSync.isWhitespace(this.xml.charCodeAt(end - 1))) { + end--; + } + return start < end ? this.xml.slice(start, end) : ''; + } + + private static isWhitespace(code: number): boolean { + if (code < 128) { + return StaxXmlCursorSync.ASCII_TABLE[code] === 1; + } + return code <= 32 || StaxXmlCursorSync.UNICODE_WHITESPACE.has(code); + } +} + +export default StaxXmlCursorSync; diff --git a/packages/stax-xml/src/StaxXmlParser.ts b/packages/stax-xml/src/StaxXmlParser.ts index 380b07d..5159d55 100644 --- a/packages/stax-xml/src/StaxXmlParser.ts +++ b/packages/stax-xml/src/StaxXmlParser.ts @@ -1,1094 +1,147 @@ -// StaxXmlParser.ts - UTF-8 safe version with Boyer-Moore-Horspool and Batch API import { - AnyXmlEvent, - CdataEvent, - CharactersEvent, - EndDocumentEvent, - EndElementEvent, - ErrorEvent, - StartElementEvent, - UnifiedXmlEvent, - XmlEventType + type AnyXmlEvent, + type EndDocumentEvent, + type EndElementEvent, + type ErrorEvent, + type StartDocumentEvent, + type StartElementEvent, + XmlEventFactory, + XmlEventType, } from './types'; +import { + StaxXmlCursor, + type StaxXmlCursorOptions as StaxXmlParserOptions, +} from './StaxXmlCursor'; -/** - * Configuration options for the StaxXmlParser - * - * @public - */ -export interface StaxXmlParserOptions { - /** - * Text encoding for the input stream - * @defaultValue 'utf-8' - */ - encoding?: string; - - /** - * Additional custom entities to decode - * @defaultValue [] - */ - addEntities?: { entity: string, value: string }[]; - - /** - * Whether to automatically decode XML entities - * @defaultValue true - */ - autoDecodeEntities?: boolean; - - /** - * Maximum buffer size in bytes - * @defaultValue 65536 - */ - maxBufferSize?: number; - - /** - * Whether to enable buffer compaction for memory efficiency - * @defaultValue true - */ - enableBufferCompaction?: boolean; - - /** - * Number of events to batch together - * @defaultValue 1 - */ - batchSize?: number; - - /** - * Timeout for batch processing in milliseconds - * @defaultValue 0 - */ - batchTimeout?: number; -} +export type { StaxXmlParserOptions }; -/** - * High-performance asynchronous XML parser implementing the StAX (Streaming API for XML) pattern. - * - * This parser provides memory-efficient processing of large XML files through streaming - * with support for pull-based parsing, custom entity handling, and namespace processing. - * - * @remarks - * The parser uses UTF-8 safe processing with Boyer-Moore-Horspool pattern search optimization - * and supports both single-event and batch processing modes for improved performance. - * - * @example - * Basic usage: - * ```typescript - * const xmlContent = 'Hello'; - * const stream = new ReadableStream({ - * start(controller) { - * controller.enqueue(new TextEncoder().encode(xmlContent)); - * controller.close(); - * } - * }); - * - * const parser = new StaxXmlParser(stream); - * for await (const event of parser) { - * console.log(event.type, event); - * } - * ``` - * - * @example - * With custom options: - * ```typescript - * const options = { - * autoDecodeEntities: true, - * maxBufferSize: 128 * 1024, - * addEntities: [{ entity: 'custom', value: 'replacement' }] - * }; - * const parser = new StaxXmlParser(stream, options); - * ``` - * - * @public - */ export class StaxXmlParser implements AsyncIterator { - private reader: ReadableStreamDefaultReader | null = null; - private readonly decoder: TextDecoder; - private buffer: Uint8Array; - private bufferLength: number = 0; - private position: number = 0; - private eventQueue: AnyXmlEvent[] = []; - private resolveNext: ((value: IteratorResult) => void) | null = null; - private error: Error | null = null; - private isStreamEnded: boolean = false; - private parserFinished: boolean = false; - private currentTextBuffer: string = ''; - private elementStack: string[] = []; - private namespaceStack: Map[] = []; + private readonly cursor: StaxXmlCursor; private readonly options: StaxXmlParserOptions; + private done = false; - // ===== Optimization tables and caches ===== - - // ASCII character fast classification table (actually used) - private static readonly ASCII_TABLE = (() => { - const table = new Uint8Array(128); - // Whitespace characters: 1 - table[9] = 1; // TAB - table[10] = 1; // LF - table[13] = 1; // CR - table[32] = 1; // SPACE - // XML special characters: 2-12 - table[60] = 2; // '<' - table[62] = 3; // '>' - table[47] = 4; // '/' - table[61] = 5; // '=' - table[33] = 6; // '!' - table[63] = 7; // '?' - table[34] = 8; // '"' - table[39] = 9; // "'" - table[38] = 10; // '&' - table[91] = 11; // '[' - table[93] = 12; // ']' - return table; - })(); - - // Entity regex cache - private static readonly ENTITY_REGEX_CACHE = new Map(); - private static readonly DEFAULT_ENTITY_REGEX = /&(lt|gt|quot|apos|amp);/g; - private static readonly DEFAULT_ENTITY_MAP: Record = { - 'lt': '<', 'gt': '>', 'quot': '"', 'apos': "'", 'amp': '&' - }; - - // Compiled entity decoder - private readonly entityDecoder: (text: string) => string; - - // Boyer-Moore-Horspool pattern cache - private readonly bmhCache = new Map(); - - // Batch processing state - private batchMetrics = { - avgEventSize: 100, - lastBatchTime: 0, - eventCount: 0 - }; - - /** - * Creates a new StaxXmlParser instance. - * - * @param xmlStream - The ReadableStream containing XML data as Uint8Array chunks - * @param options - Configuration options for the parser - * @throws {Error} When xmlStream is not a valid ReadableStream - * - * @example - * ```typescript - * const xmlData = 'content'; - * const stream = new ReadableStream({ - * start(controller) { - * controller.enqueue(new TextEncoder().encode(xmlData)); - * controller.close(); - * } - * }); - * - * const parser = new StaxXmlParser(stream, { - * autoDecodeEntities: true, - * maxBufferSize: 64 * 1024 - * }); - * ``` - */ constructor(xmlStream: ReadableStream, options: StaxXmlParserOptions = {}) { - if (!(xmlStream instanceof ReadableStream)) { - throw new Error('xmlStream must be a web standard ReadableStream.'); - } - this.options = { - encoding: 'utf-8', - autoDecodeEntities: true, - maxBufferSize: 64 * 1024, - enableBufferCompaction: true, batchSize: 10, batchTimeout: 10, - ...options + ...options, }; - - // TextDecoder optimization settings - this.decoder = new TextDecoder(this.options.encoding, { - fatal: false, // Use replacement character � instead of error - ignoreBOM: true // Ignore BOM - }); - - this.buffer = new Uint8Array(this.options.maxBufferSize || 64 * 1024); - - // Pre-compile entity decoder - this.entityDecoder = this._compileEntityDecoder(); - - this.reader = xmlStream.getReader(); - this._startReading(); - // Inline START_DOCUMENT creation - maintains V8 hidden class optimization - this._addEvent({ - type: XmlEventType.START_DOCUMENT, - name: undefined, - localName: undefined, - prefix: undefined, - uri: undefined, - attributes: undefined, - attributesWithPrefix: undefined, - value: undefined, - error: undefined - } as UnifiedXmlEvent as StartElementEvent); - } - - // ===== ASCII table utility methods ===== - - /** - * Fast XML special character check - */ - private getXmlCharType(byte: number): number { - return byte < 128 ? StaxXmlParser.ASCII_TABLE[byte] : 0; - } - - // ===== UTF-8 safety methods ===== - - /** - * Check if UTF-8 byte is the start of a character - * @param byte The byte to check - * @returns true if it's the start of a character - */ - private isUtf8CharStart(byte: number): boolean { - // ASCII (0xxxxxxx) or multibyte start (11xxxxxx) - // Not a continuation byte (10xxxxxx) - return (byte & 0x80) === 0 || (byte & 0xC0) === 0xC0; - } - - /** - * Calculate UTF-8 sequence length - * @param byte The first byte - * @returns Sequence length (1-4) - */ - private getUtf8SequenceLength(byte: number): number { - if ((byte & 0x80) === 0) return 1; // 0xxxxxxx - if ((byte & 0xE0) === 0xC0) return 2; // 110xxxxx - if ((byte & 0xF0) === 0xE0) return 3; // 1110xxxx - if ((byte & 0xF8) === 0xF0) return 4; // 11110xxx - return 1; // Invalid sequence + this.cursor = new StaxXmlCursor(xmlStream, this.options); } - /** - * Safely adjust position at UTF-8 character boundaries - * @param pos The position to adjust - * @param searchBackward Whether to search backwards - * @returns Safe UTF-8 boundary position - */ - private findSafeUtf8Boundary(pos: number, searchBackward: boolean = true): number { - if (pos <= 0 || pos >= this.bufferLength) return pos; - - if (searchBackward) { - // Search backwards to find character start - let safePos = pos; - let backtrack = 0; - - while (safePos > 0 && backtrack < 4) { - if (this.isUtf8CharStart(this.buffer[safePos])) { - // Check if the sequence starting at this position includes the original pos - const seqLen = this.getUtf8SequenceLength(this.buffer[safePos]); - if (safePos + seqLen > pos) { - // pos is in the middle of this character, return safePos - return safePos; - } else { - // pos is already at a safe boundary - return pos; - } - } - safePos--; - backtrack++; - } - return pos; // Could not find appropriate boundary - } else { - // Search forward to find next character start - while (pos < this.bufferLength && !this.isUtf8CharStart(this.buffer[pos])) { - pos++; - } - return pos; - } - } - - /** - * Safely extract UTF-8 string from buffer - * @param start Starting position - * @param end Ending position - * @returns Decoded string - */ - private safeDecodeRange(start: number, end: number): string { - // Adjust start and end to safe boundaries - const safeStart = this.findSafeUtf8Boundary(start, false); - const safeEnd = this.findSafeUtf8Boundary(end, true); - - if (safeStart >= safeEnd) return ''; - - return this.decoder.decode( - this.buffer.subarray(safeStart, safeEnd), - { stream: false } - ); - } - - // ===== Boyer-Moore-Horspool pattern search implementation ===== - - /** - * Build Boyer-Moore-Horspool bad character table - */ - private _buildBMHTable(pattern: Uint8Array): Uint8Array { - const table = new Uint8Array(256); - const patternLength = pattern.length; - - table.fill(patternLength); - - for (let i = 0; i < patternLength - 1; i++) { - table[pattern[i]] = patternLength - 1 - i; - } - - return table; - } - - /** - * Pattern search using Boyer-Moore-Horspool algorithm - * XML delimiters are all ASCII, so no UTF-8 boundary issues - */ - private _findPatternBMH(pattern: string, startPos?: number): number { - const patternBytes = new TextEncoder().encode(pattern); - const patternLength = patternBytes.length; - - if (patternLength === 0) return -1; - - if (patternLength === 1) { - return this._findSingleByte(patternBytes[0], startPos); - } - - let skipTable = this.bmhCache.get(pattern); - if (!skipTable) { - skipTable = this._buildBMHTable(patternBytes); - if (this.bmhCache.size > 20) { - // Cache size limit - this.bmhCache.clear(); - } - this.bmhCache.set(pattern, skipTable); - } - - const start = startPos || this.position; - const bufferEnd = this.bufferLength - patternLength; - let pos = start; - - while (pos <= bufferEnd) { - let i = patternLength - 1; - while (i >= 0 && this.buffer[pos + i] === patternBytes[i]) { - i--; - } - - if (i < 0) { - return pos; - } - - pos += skipTable[this.buffer[pos + patternLength - 1]]; - } - - return -1; - } - - /** - * Single byte search (optimized) - */ - private _findSingleByte(byte: number, startPos?: number): number { - const start = startPos || this.position; - const buffer = this.buffer; - const end = this.bufferLength; - - const end4 = end - 3; - let i = start; - - for (; i < end4; i += 4) { - if (buffer[i] === byte) return i; - if (buffer[i + 1] === byte) return i + 1; - if (buffer[i + 2] === byte) return i + 2; - if (buffer[i + 3] === byte) return i + 3; - } - - for (; i < end; i++) { - if (buffer[i] === byte) return i; - } - - return -1; - } - - // ===== Entity decoder compilation ===== - - private _compileEntityDecoder(): (text: string) => string { - if (!this.options.autoDecodeEntities) { - return (text) => text; + public async next(): Promise> { + if (this.done) { + return { value: undefined, done: true }; } - if (this.options.addEntities && this.options.addEntities.length > 0) { - const entityMap: Record = { ...StaxXmlParser.DEFAULT_ENTITY_MAP }; - const patterns: string[] = ['lt', 'gt', 'quot', 'apos']; - - for (const { entity, value } of this.options.addEntities) { - if (entity && value) { - const key = entity.startsWith('&') && entity.endsWith(';') - ? entity.slice(1, -1) - : entity; - entityMap[key] = value; - patterns.push(key); - } - } - patterns.push('amp'); - - const cacheKey = patterns.join(','); - let regex = StaxXmlParser.ENTITY_REGEX_CACHE.get(cacheKey); - - if (!regex) { - const pattern = patterns - .sort((a, b) => b.length - a.length) - .map(e => e.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')) - .join('|'); - regex = new RegExp(`&(${pattern});`, 'g'); - StaxXmlParser.ENTITY_REGEX_CACHE.set(cacheKey, regex); + try { + const tokenType = await this.cursor.next(); + const event = this.materializeEvent(tokenType); + if (tokenType === XmlEventType.END_DOCUMENT) { + this.done = true; } - - return (text: string) => { - if (!text || text.indexOf('&') === -1) return text; - regex!.lastIndex = 0; - return text.replace(regex!, (_, entity) => entityMap[entity] || _); + return { + value: event, + done: false, + }; + } catch (error) { + this.done = true; + return { + value: XmlEventFactory.error(error as Error) as ErrorEvent, + done: false, }; } - - return (text: string) => { - if (!text || text.indexOf('&') === -1) return text; - StaxXmlParser.DEFAULT_ENTITY_REGEX.lastIndex = 0; - return text.replace( - StaxXmlParser.DEFAULT_ENTITY_REGEX, - (_, entity) => StaxXmlParser.DEFAULT_ENTITY_MAP[entity] || _ - ); - }; - } - - // ===== Batch processing API ===== - - private _calculateOptimalBatchSize(): number { - const MIN_BATCH = 1; - const MAX_BATCH = this.options.batchSize || 10; - - if (this.bufferLength < 1024) return MIN_BATCH; - if (this.bufferLength > 10240) return MAX_BATCH; - - if (this.eventQueue.length > 0) { - const lastEvent = this.eventQueue[this.eventQueue.length - 1]; - if (lastEvent?.type === XmlEventType.CHARACTERS) { - return MIN_BATCH; - } - } - - if (this.batchMetrics.eventCount > 100) { - const avgSize = this.batchMetrics.avgEventSize; - if (avgSize > 1000) return MIN_BATCH; - if (avgSize < 100) return MAX_BATCH; - } - - return Math.min(MAX_BATCH, Math.max(MIN_BATCH, Math.floor(this.bufferLength / 1024))); } public async nextBatch(size?: number): Promise { const batch: AnyXmlEvent[] = []; - const targetSize = size || this._calculateOptimalBatchSize(); - const startTime = Date.now(); - const timeout = this.options.batchTimeout || 10; + const targetSize = size ?? 1; + const timeout = this.options.batchTimeout ?? 10; + const startedAt = Date.now(); for (let i = 0; i < targetSize; i++) { - if (Date.now() - startTime > timeout) { + if (Date.now() - startedAt > timeout) { break; } const result = await this.next(); - if (result.done) break; - batch.push(result.value); - } - - return batch; - } - - public async *batchedIterator(batchSize?: number): AsyncGenerator { - while (!this.parserFinished || this.eventQueue.length > 0) { - const targetSize = batchSize || this._calculateOptimalBatchSize(); - const batch = await this.nextBatch(targetSize); - if (batch.length === 0) break; - yield batch; - } - } - - // ===== Improved buffer management ===== - - private _compactBufferIfNeeded(): void { - if (!this.options.enableBufferCompaction) return; - - const maxSize = this.options.maxBufferSize || 64 * 1024; - - const shouldCompact = - (this.position > 8192 && this.bufferLength > 16384) || - (this.position > maxSize / 2) || - (this.bufferLength > maxSize && this.position > maxSize / 4); - - if (shouldCompact) { - this._compactBuffer(); - } - } - - private _compactBuffer(): void { - if (this.position > 0 && this.position < this.bufferLength) { - // Check UTF-8 boundaries - const safePos = this.findSafeUtf8Boundary(this.position, true); - - const remainingLength = this.bufferLength - safePos; - - if (remainingLength < safePos) { - const newBuffer = new Uint8Array(this.buffer.length); - newBuffer.set(this.buffer.subarray(safePos, this.bufferLength)); - this.buffer = newBuffer; - } else { - this.buffer.copyWithin(0, safePos, this.bufferLength); - } - - this.bufferLength = remainingLength; - this.position = this.position - safePos; - - if (this.bmhCache.size > 20) { - this.bmhCache.clear(); - } - } - } - - // ===== Main parsing logic ===== - - private async _startReading(): Promise { - try { - while (true) { - const { done, value } = await this.reader!.read(); - - if (done) { - this.isStreamEnded = true; - this._parseBuffer(); - - if (!this.parserFinished && this.elementStack.length > 0) { - this._addError(new Error('Unexpected end of document. Not all elements were closed.')); - } - - if (!this.parserFinished) { - this._flushCharacters(); - // Inline END_DOCUMENT creation - maintains V8 hidden class optimization - this._addEvent({ - type: XmlEventType.END_DOCUMENT, - name: undefined, - localName: undefined, - prefix: undefined, - uri: undefined, - attributes: undefined, - attributesWithPrefix: undefined, - value: undefined, - error: undefined - } as UnifiedXmlEvent as EndDocumentEvent); - this.parserFinished = true; - } - - if (this.resolveNext && this.eventQueue.length === 0) { - this.resolveNext({ value: undefined, done: true }); - this.resolveNext = null; - } - break; - } - - this._appendToBuffer(value); - this._parseBuffer(); - this._compactBufferIfNeeded(); - this._updateBatchMetrics(value.length); - } - } catch (err) { - this._addError(err as Error); - if (this.resolveNext) { - this.resolveNext({ value: undefined, done: true }); - this.resolveNext = null; - } - } - } - - private _updateBatchMetrics(bytesProcessed: number): void { - const eventsDelta = this.eventQueue.length; - if (eventsDelta > 0) { - this.batchMetrics.eventCount += eventsDelta; - this.batchMetrics.avgEventSize = - (this.batchMetrics.avgEventSize * 0.9) + - ((bytesProcessed / eventsDelta) * 0.1); - } - this.batchMetrics.lastBatchTime = Date.now(); - } - - private _parseBuffer(): void { - while (this.position < this.bufferLength && !this.parserFinished) { - const ltPos = this._findSingleByte(60, this.position); // '<' - - if (ltPos === -1) { - if (this.isStreamEnded) { - const remainingText = this._readBuffer(); - this.currentTextBuffer += remainingText; - this._flushCharacters(); - } + if (result.done) { break; } - if (ltPos > this.position) { - try { - const textLength = ltPos - this.position; - const text = this._readBuffer(textLength); - this.currentTextBuffer += text; - } catch (error) { - if (!this.isStreamEnded) break; - throw error; - } - } - - this.position = ltPos; - - // Fast tag type identification using ASCII table - const nextByte = this.buffer[this.position + 1]; - const charType = this.getXmlCharType(nextByte); - - if (charType === 4) { // '/' (47) - this._flushCharacters(); - if (!this._parseEndTag()) break; - } else if (charType === 6) { // '!' (33) - if (this._matchesPattern(''); - if (endPos === -1) return false; - this.position = endPos + 3; - return true; - } - - /** - * UTF-8 safe CDATA parsing - */ - private _parseCData(): boolean { - const startPos = this.position + 9; // After ''); - if (endPos === -1) return false; - - try { - // Check UTF-8 boundaries - const safeStart = this.findSafeUtf8Boundary(startPos, false); - const safeEnd = this.findSafeUtf8Boundary(endPos, true); - - const cdataContent = this.decoder.decode( - this.buffer.subarray(safeStart, safeEnd), - { stream: false } - ); - - // Inline CDATA creation - maintains V8 hidden class optimization - this._addEvent({ - type: XmlEventType.CDATA, - name: undefined, - localName: undefined, - prefix: undefined, - uri: undefined, - attributes: undefined, - attributesWithPrefix: undefined, - value: cdataContent, - error: undefined - } as UnifiedXmlEvent as CdataEvent); - - this.position = endPos + 3; - return true; - } catch (error) { - if (!this.isStreamEnded) return false; - throw error; - } - } - - private _parseProcessingInstruction(): boolean { - const endPos = this._findPatternBMH('?>'); - if (endPos === -1) return false; - this.position = endPos + 2; - return true; - } - - /** - * UTF-8 safe end tag parsing - */ - private _parseEndTag(): boolean { - const gtPos = this._findSingleByte(62, this.position); // '>' - if (gtPos === -1) return false; - - try { - // Safely decode the entire tag - const tagContent = this.safeDecodeRange(this.position, gtPos + 1); - const closeTagMatch = tagContent.match(/^<\/([a-zA-Z0-9_:.\-\u0080-\uFFFF]+)\s*>$/); - - if (!closeTagMatch) { - this._addError(new Error('Malformed closing tag')); - return true; - } - - const tagName = closeTagMatch[1]; - if (this.elementStack.length === 0 || this.elementStack[this.elementStack.length - 1] !== tagName) { - this._addError(new Error(`Mismatched closing tag: . Expected `)); - return true; - } - - const currentNamespaces = this.namespaceStack.length > 0 ? - this.namespaceStack[this.namespaceStack.length - 1] : new Map(); - const { localName, prefix, uri } = this._parseQualifiedName(tagName, currentNamespaces); - - this.elementStack.pop(); - this.namespaceStack.pop(); - - // Inline END_ELEMENT creation - maintains V8 hidden class optimization - this._addEvent({ - type: XmlEventType.END_ELEMENT, - name: tagName, - localName, - prefix, - uri, - attributes: undefined, - attributesWithPrefix: undefined, - value: undefined, - error: undefined - } as UnifiedXmlEvent as EndElementEvent); - - this.position = gtPos + 1; - return true; - } catch (error) { - if (!this.isStreamEnded) return false; - throw error; - } - } - - /** - * UTF-8 safe start tag parsing (using ASCII table) - */ - private _parseStartTag(): boolean { - const gtPos = this._findSingleByte(62, this.position); // '>' - if (gtPos === -1) return false; - - try { - // Safely decode the entire tag - const tagContent = this.safeDecodeRange(this.position, gtPos + 1); - const tagMatch = tagContent.match(/^<([a-zA-Z0-9_:.\-\u0080-\uFFFF]+)(\s+[^>]*?)?\s*(\/?)>$/); - - if (!tagMatch) { - this._addError(new Error('Malformed start tag')); - return true; - } - - const tagName = tagMatch[1]; - const attributesString = tagMatch[2] || ''; - const isSelfClosing = tagMatch[3] === '/'; - - const currentNamespaces = new Map(); - if (this.namespaceStack.length > 0) { - const parentNamespaces = this.namespaceStack[this.namespaceStack.length - 1]; - for (const [prefix, uri] of parentNamespaces) { - currentNamespaces.set(prefix, uri); - } - } - - const attributes: { [key: string]: string } = {}; - const attributesWithPrefix: { [key: string]: { value: string; prefix?: string; uri?: string } } = {}; - - // Attribute parsing - Unicode character support - const attrRegex = /([a-zA-Z0-9_:.\-\u0080-\uFFFF]+)(?:\s*=\s*"([^"]*)"|\s*=\s*'([^']*)')?/g; - let attrMatch; - - while ((attrMatch = attrRegex.exec(attributesString)) !== null) { - const attrName = attrMatch[1]; - const attrValue = this.entityDecoder(attrMatch[2] || attrMatch[3] || 'true'); - attributes[attrName] = attrValue; - - const attrNamespaceInfo = this._parseQualifiedName(attrName, currentNamespaces, true); - attributesWithPrefix[attrNamespaceInfo.localName] = { - value: attrValue, - prefix: attrNamespaceInfo.prefix, - uri: attrNamespaceInfo.uri - }; - - if (attrName === 'xmlns') { - currentNamespaces.set('', attrValue); - } else if (attrName.startsWith('xmlns:')) { - const prefix = attrName.substring(6); - currentNamespaces.set(prefix, attrValue); - } - } - - const { localName, prefix, uri } = this._parseQualifiedName(tagName, currentNamespaces); - - // Inline START_ELEMENT creation - maintains V8 hidden class optimization - this._addEvent({ - type: XmlEventType.START_ELEMENT, - name: tagName, - localName, - prefix, - uri, - attributes: attributes, - attributesWithPrefix: attributesWithPrefix, - value: undefined, - error: undefined - } as UnifiedXmlEvent as StartElementEvent); - - this.position = gtPos + 1; - - if (!isSelfClosing) { - this.elementStack.push(tagName); - this.namespaceStack.push(currentNamespaces); - } else { - // Inline END_ELEMENT creation for self-closing - maintains V8 hidden class optimization - this._addEvent({ - type: XmlEventType.END_ELEMENT, - name: tagName, - localName, - prefix, - uri, - attributes: undefined, - attributesWithPrefix: undefined, - value: undefined, - error: undefined - } as UnifiedXmlEvent as EndElementEvent); - } - - return true; - } catch (error) { - if (!this.isStreamEnded) return false; - throw error; - } - } - - private _parseQualifiedName( - qname: string, - namespaces: Map, - isAttribute: boolean = false - ): { - localName: string; - prefix?: string; - uri?: string; - } { - const colonIndex = qname.indexOf(':'); - if (colonIndex === -1) { - if (isAttribute) { - return { - localName: qname, - prefix: undefined, - uri: undefined - }; - } else { - const defaultUri = namespaces.get(''); - return { - localName: qname, - prefix: undefined, - uri: defaultUri - }; - } - } else { - const prefix = qname.substring(0, colonIndex); - const localName = qname.substring(colonIndex + 1); - const uri = namespaces.get(prefix); - return { - localName, - prefix, - uri - }; - } - } - public get XmlEventType(): typeof XmlEventType { return XmlEventType; } + + private materializeEvent(tokenType: XmlEventType): AnyXmlEvent { + switch (tokenType) { + case XmlEventType.START_DOCUMENT: + return XmlEventFactory.startDocument() as StartDocumentEvent; + case XmlEventType.END_DOCUMENT: + return XmlEventFactory.endDocument() as EndDocumentEvent; + case XmlEventType.START_ELEMENT: + return XmlEventFactory.startElement( + this.cursor.name, + this.cursor.localName, + this.cursor.prefix, + this.cursor.uri, + this.cursor.getAttributes(), + this.toLegacyAsyncAttributesWithPrefix() + ) as StartElementEvent; + case XmlEventType.END_ELEMENT: + return XmlEventFactory.endElement( + this.cursor.name, + this.cursor.localName, + this.cursor.prefix, + this.cursor.uri + ) as EndElementEvent; + case XmlEventType.CHARACTERS: + return XmlEventFactory.characters(this.cursor.getText()); + case XmlEventType.CDATA: + return XmlEventFactory.cdata(this.cursor.getText()); + case XmlEventType.ERROR: + return XmlEventFactory.error(new Error('Cursor should not emit ERROR tokens.')); + default: + throw new Error(`Unsupported token type: ${String(tokenType)}`); + } + } + + private toLegacyAsyncAttributesWithPrefix(): Record { + const attributesWithPrefix = this.cursor.getAttributesWithPrefix(); + return Object.fromEntries( + Object.values(attributesWithPrefix).map((attribute) => [ + attribute.localName, + { + value: attribute.value, + prefix: attribute.prefix, + uri: attribute.uri, + }, + ]) + ); + } } -export default StaxXmlParser; \ No newline at end of file +export default StaxXmlParser; diff --git a/packages/stax-xml/src/StaxXmlParserSync.ts b/packages/stax-xml/src/StaxXmlParserSync.ts index 463a0d4..deb2e84 100644 --- a/packages/stax-xml/src/StaxXmlParserSync.ts +++ b/packages/stax-xml/src/StaxXmlParserSync.ts @@ -1,712 +1,88 @@ -// StaxXmlParserSync.ts - Optimized version using XmlEventFactory - import { - AnyXmlEvent, - AttributeInfo, - CdataEvent, - CharactersEvent, - EndDocumentEvent, - EndElementEvent, - StartElementEvent, - XmlEventType + type AnyXmlEvent, + type EndDocumentEvent, + type EndElementEvent, + type ErrorEvent, + type StartDocumentEvent, + type StartElementEvent, + XmlEventFactory, + XmlEventType, } from './types'; +import { + StaxXmlCursorSync, + type StaxXmlCursorSyncOptions, +} from './StaxXmlCursorSync'; -export interface StaxXmlParserSyncOptions { - autoDecodeEntities?: boolean; - addEntities?: { entity: string, value: string }[]; -} +export type { StaxXmlCursorSyncOptions }; export class StaxXmlParserSync implements Iterable, Iterator { - private readonly xml: string; - private readonly xmlLength: number; - private pos: number = 0; - private readonly elementStack: string[] = []; - private namespaceStack: Map[] = []; - private readonly options: StaxXmlParserSyncOptions; - private internalIterator?: Generator; - - // ===== Static optimization tables and caches ===== - - // ASCII character fast classification table (0-127) - private static readonly ASCII_TABLE = (() => { - const table = new Uint8Array(128); - // Whitespace characters: 1 - table[9] = 1; // TAB - table[10] = 1; // LF - table[13] = 1; // CR - table[32] = 1; // SPACE - // XML special characters - table[60] = 2; // '<' - table[62] = 3; // '>' - table[47] = 4; // '/' - table[61] = 5; // '=' - table[33] = 6; // '!' - table[63] = 7; // '?' - table[34] = 8; // '"' - table[39] = 9; // "'" - return table; - })(); - - // Multilingual whitespace character Set (fast lookup) - private static readonly UNICODE_WHITESPACE = new Set([ - 0x00A0, // Non-breaking space - 0x1680, // Ogham space - 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, // Various spaces - 0x2028, // Line separator - 0x2029, // Paragraph separator - 0x202F, // Narrow no-break space - 0x205F, // Medium mathematical space - 0x3000, // CJK ideographic space - 0xFEFF // Zero-width no-break space - ]); - - // Entity regex cache - private static readonly ENTITY_REGEX_CACHE = new Map(); - private static readonly DEFAULT_ENTITY_REGEX = /&(lt|gt|quot|apos|amp);/g; - private static readonly DEFAULT_ENTITY_MAP: Record = { - 'lt': '<', 'gt': '>', 'quot': '"', 'apos': "'", 'amp': '&' - }; - - // Compiled entity decoder (per-instance caching) - private readonly entityDecoder: (text: string) => string; - - constructor(xml: string, options: StaxXmlParserSyncOptions = {}) { - this.xml = xml; - this.xmlLength = xml.length; - this.options = { - autoDecodeEntities: true, - ...options - }; - - this.namespaceStack.push(new Map()); - - // Pre-compile entity decoder - this.entityDecoder = this.compileEntityDecoder(); - } - - // ===== Helper methods: character classification ===== - - private static isWhitespace(code: number): boolean { - if (code < 128) { - return StaxXmlParserSync.ASCII_TABLE[code] === 1; - } - return code <= 32 || StaxXmlParserSync.UNICODE_WHITESPACE.has(code); - } - - // ===== Helper methods: surrogate pair handling ===== - - private static isHighSurrogate(code: number): boolean { - return code >= 0xD800 && code <= 0xDBFF; - } - - private static isLowSurrogate(code: number): boolean { - return code >= 0xDC00 && code <= 0xDFFF; - } - - // ===== Optimized string processing ===== - - // indexOf replacement - fast character search (16-byte unrolling) - private findChar(targetCode: number, start: number = this.pos): number { - const xml = this.xml; - const len = this.xmlLength; - - // Performance improvement with 16-byte unrolling - const len16 = len - 15; - let i = start; - - // 16-byte unrolling loop - for (; i < len16; i += 16) { - if (xml.charCodeAt(i) === targetCode) return i; - if (xml.charCodeAt(i + 1) === targetCode) return i + 1; - if (xml.charCodeAt(i + 2) === targetCode) return i + 2; - if (xml.charCodeAt(i + 3) === targetCode) return i + 3; - if (xml.charCodeAt(i + 4) === targetCode) return i + 4; - if (xml.charCodeAt(i + 5) === targetCode) return i + 5; - if (xml.charCodeAt(i + 6) === targetCode) return i + 6; - if (xml.charCodeAt(i + 7) === targetCode) return i + 7; - if (xml.charCodeAt(i + 8) === targetCode) return i + 8; - if (xml.charCodeAt(i + 9) === targetCode) return i + 9; - if (xml.charCodeAt(i + 10) === targetCode) return i + 10; - if (xml.charCodeAt(i + 11) === targetCode) return i + 11; - if (xml.charCodeAt(i + 12) === targetCode) return i + 12; - if (xml.charCodeAt(i + 13) === targetCode) return i + 13; - if (xml.charCodeAt(i + 14) === targetCode) return i + 14; - if (xml.charCodeAt(i + 15) === targetCode) return i + 15; - } - - // Handle remaining bytes - for (; i < len; i++) { - if (xml.charCodeAt(i) === targetCode) return i; - } - - return -1; - } - - // Fast string search (startsWith replacement) - private matchesAt(str: string, pos: number): boolean { - const len = str.length; - if (pos + len > this.xmlLength) return false; - - for (let i = 0; i < len; i++) { - if (this.xml.charCodeAt(pos + i) !== str.charCodeAt(i)) { - return false; - } - } - return true; - } - - // Inline trim (avoid substring) - private trimmedSlice(start: number, end: number): string { - const xml = this.xml; - - // Remove leading whitespace - while (start < end && StaxXmlParserSync.isWhitespace(xml.charCodeAt(start))) { - if (StaxXmlParserSync.isHighSurrogate(xml.charCodeAt(start))) { - start += 2; - } else { - start++; - } - } + private readonly cursor: StaxXmlCursorSync; + private done = false; - // Remove trailing whitespace - while (end > start && StaxXmlParserSync.isWhitespace(xml.charCodeAt(end - 1))) { - // Surrogate pair check (reverse direction) - if (end > start + 1 && - StaxXmlParserSync.isLowSurrogate(xml.charCodeAt(end - 1)) && - StaxXmlParserSync.isHighSurrogate(xml.charCodeAt(end - 2))) { - end -= 2; - } else { - end--; - } - } - - return start < end ? xml.slice(start, end) : ''; + constructor(xml: string, options: StaxXmlCursorSyncOptions = {}) { + this.cursor = new StaxXmlCursorSync(xml, options); } - // ===== Entity processing optimization ===== - - private compileEntityDecoder(): (text: string) => string { - if (!this.options.autoDecodeEntities) { - return (text) => text; - } - - // If custom entities exist - if (this.options.addEntities && this.options.addEntities.length > 0) { - const entityMap: Record = { ...StaxXmlParserSync.DEFAULT_ENTITY_MAP }; - const patterns: string[] = ['lt', 'gt', 'quot', 'apos']; - - for (const { entity, value } of this.options.addEntities) { - if (entity && value) { - // Extract only entity part from &entity; format - const key = entity.startsWith('&') && entity.endsWith(';') - ? entity.slice(1, -1) - : entity; - entityMap[key] = value; - patterns.push(key); - } - } - patterns.push('amp'); // amp goes last - - // Generate cache key and regex caching - const cacheKey = patterns.join(','); - let regex = StaxXmlParserSync.ENTITY_REGEX_CACHE.get(cacheKey); - - if (!regex) { - const pattern = patterns - .sort((a, b) => b.length - a.length) // Longer patterns first - .map(e => e.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')) - .join('|'); - regex = new RegExp(`&(${pattern});`, 'g'); - StaxXmlParserSync.ENTITY_REGEX_CACHE.set(cacheKey, regex); - } - - return (text: string) => { - if (!text || text.indexOf('&') === -1) return text; - regex!.lastIndex = 0; - return text.replace(regex!, (_, entity) => entityMap[entity] || _); - }; - } - - // Use only default entities - return (text: string) => { - if (!text || text.indexOf('&') === -1) return text; - StaxXmlParserSync.DEFAULT_ENTITY_REGEX.lastIndex = 0; - return text.replace( - StaxXmlParserSync.DEFAULT_ENTITY_REGEX, - (_, entity) => StaxXmlParserSync.DEFAULT_ENTITY_MAP[entity] || _ - ); - }; - } - - // ===== Main parsing logic - using EventFactory ===== - - /** - * Symbol.iterator implementation - returns this instance as iterator - * This ensures for...of and explicit next() calls use the same iterator state - */ public [Symbol.iterator](): Iterator { return this; } - /** - * Internal generator that actually yields AnyXmlEvent - * Important: Return type is same as before - Iterator - * Factory internally creates UnifiedXmlEvent, but - * types are returned as StartElementEvent, EndElementEvent etc. so - * perfectly compatible with AnyXmlEvent union type - */ - private *internalGenerator(): Generator { - // Inline startDocument() - maintains V8 hidden class optimization - // All events have same shape with undefined for unused fields - yield { - type: XmlEventType.START_DOCUMENT, - name: undefined, - localName: undefined, - prefix: undefined, - uri: undefined, - attributes: undefined, - attributesWithPrefix: undefined, - value: undefined, - error: undefined - } as unknown as StartElementEvent; - - while (this.pos < this.xmlLength) { - const ltPos = this.findChar(60, this.pos); // Find '<' - - if (ltPos === -1) { - // Process remaining text - if (this.pos < this.xmlLength) { - const text = this.trimmedSlice(this.pos, this.xmlLength); - if (text) { - // Inline characters() - maintains V8 hidden class optimization - yield { - type: XmlEventType.CHARACTERS, - name: undefined, - localName: undefined, - prefix: undefined, - uri: undefined, - attributes: undefined, - attributesWithPrefix: undefined, - value: this.entityDecoder(text), - error: undefined - } as unknown as CharactersEvent; - } - } - break; - } - - // Process text before '<' - if (ltPos > this.pos) { - const text = this.trimmedSlice(this.pos, ltPos); - if (text) { - // Inline characters() - maintains V8 hidden class optimization - yield { - type: XmlEventType.CHARACTERS, - name: undefined, - localName: undefined, - prefix: undefined, - uri: undefined, - attributes: undefined, - attributesWithPrefix: undefined, - value: this.entityDecoder(text), - error: undefined - } as unknown as CharactersEvent; - } - } - - this.pos = ltPos; - const nextCharCode = this.xml.charCodeAt(this.pos + 1); - - switch (nextCharCode) { - case 47: // '/' - yield* this.parseEndTag(); - break; - case 33: // '!' - yield* this.parseCdataCommentDoctype(); - break; - case 63: // '?' - yield* this.parseProcessingInstruction(); - break; - default: - yield* this.parseStartTag(); - break; - } - } - - // Inline endDocument() - maintains V8 hidden class optimization - yield { - type: XmlEventType.END_DOCUMENT, - name: undefined, - localName: undefined, - prefix: undefined, - uri: undefined, - attributes: undefined, - attributesWithPrefix: undefined, - value: undefined, - error: undefined - } as unknown as EndDocumentEvent; - } - public next(): IteratorResult { - if (!this.internalIterator) { - this.internalIterator = this.internalGenerator(); - } - return this.internalIterator.next(); - } - - // ===== Tag parsing methods - using EventFactory ===== - - private *parseEndTag(): Generator { - const tagClose = this.findChar(62, this.pos); // '>' - if (tagClose === -1) throw new Error('Unclosed end tag'); - - const fullTagName = this.trimmedSlice(this.pos + 2, tagClose); - - if (this.elementStack.length === 0) { - throw new Error(`Mismatched closing tag: . No open elements.`); - } - - const expectedTagName = this.elementStack[this.elementStack.length - 1]; - if (fullTagName !== expectedTagName) { - throw new Error(`Mismatched closing tag: . Expected .`); - } - - this.elementStack.pop(); - const currentNamespaces = this.namespaceStack.pop(); - - // Inline QName parsing (for end tag) - optimized - const colonIndex = fullTagName.indexOf(':'); - let localName: string, prefix: string | undefined, uri: string | undefined; - - if (colonIndex === -1) { - localName = fullTagName; - prefix = undefined; - uri = currentNamespaces ? currentNamespaces.get('') : undefined; - } else { - prefix = fullTagName.slice(0, colonIndex); - localName = fullTagName.slice(colonIndex + 1); - uri = currentNamespaces ? currentNamespaces.get(prefix) : undefined; - } - - // Inline endElement() - maintains V8 hidden class optimization - yield { - type: XmlEventType.END_ELEMENT, - name: fullTagName, - localName, - prefix, - uri, - attributes: undefined, - attributesWithPrefix: undefined, - value: undefined, - error: undefined - } as unknown as EndElementEvent; - - this.pos = tagClose + 1; - } - - private *parseCdataCommentDoctype(): Generator { - if (this.matchesAt('', this.pos + 9); - if (cdataEnd === -1) throw new Error('Unclosed CDATA section'); - - const cdataContent = this.xml.slice(this.pos + 9, cdataEnd); - // Inline cdata() - maintains V8 hidden class optimization - yield { - type: XmlEventType.CDATA, - name: undefined, - localName: undefined, - prefix: undefined, - uri: undefined, - attributes: undefined, - attributesWithPrefix: undefined, - value: cdataContent, - error: undefined - } as unknown as CdataEvent; - - this.pos = cdataEnd + 3; - } else if (this.matchesAt('', this.pos + 4); - if (commentEnd === -1) throw new Error('Unclosed comment'); - this.pos = commentEnd + 3; - } else if (this.matchesAt('' - if (doctypeEnd === -1) throw new Error('Unclosed DOCTYPE declaration'); - this.pos = doctypeEnd + 1; - } - } - - private *parseProcessingInstruction(): Generator { - const piEnd = this.findSequence('?>', this.pos); - if (piEnd === -1) throw new Error('Unclosed processing instruction'); - this.pos = piEnd + 2; - } - - private *parseStartTag(): Generator { - const tagStart = this.pos + 1; - const tagEnd = this.findTagEnd(tagStart); - if (tagEnd === -1) throw new Error('Unclosed start tag'); - - let isSelfClosing = false; - let actualEnd = tagEnd; - - if (this.xml.charCodeAt(tagEnd - 1) === 47) { // '/' - isSelfClosing = true; - actualEnd = tagEnd - 1; + if (this.done) { + return { value: undefined, done: true }; } - // Separate tag name and attributes (optimized) - let nameEnd = tagStart; - const xml = this.xml; - - // Fast tag name search (find whitespace, '>', '/') - while (nameEnd < actualEnd) { - const code = xml.charCodeAt(nameEnd); - // Fast branching for ASCII range - if (code <= 32) { - if (StaxXmlParserSync.isWhitespace(code)) break; - } else if (code === 62 || code === 47) { // '>' or '/' - break; - } - nameEnd++; - } - - const tagName = xml.slice(tagStart, nameEnd); - - // Create namespace context - const currentNamespaces = new Map(); - if (this.namespaceStack.length > 0) { - const parentNamespaces = this.namespaceStack[this.namespaceStack.length - 1]; - for (const [prefix, uri] of parentNamespaces) { - currentNamespaces.set(prefix, uri); + try { + const tokenType = this.cursor.next(); + const event = this.materializeEvent(tokenType); + if (tokenType === XmlEventType.END_DOCUMENT) { + this.done = true; } - } - - // Attribute parsing (inline optimization) - const { attributes, attributesWithPrefix } = this.parseAttributesFast( - nameEnd, - actualEnd, - currentNamespaces - ); - - // Inline QName parsing (for tag name) - optimized - const colonIndex = tagName.indexOf(':'); - let localName: string, prefix: string | undefined, uri: string | undefined; - - if (colonIndex === -1) { - localName = tagName; - prefix = undefined; - uri = currentNamespaces.get(''); - } else { - prefix = tagName.slice(0, colonIndex); - localName = tagName.slice(colonIndex + 1); - uri = currentNamespaces.get(prefix); - } - - // Inline startElement() - maintains V8 hidden class optimization - yield { - type: XmlEventType.START_ELEMENT, - name: tagName, - localName, - prefix, - uri, - attributes, - attributesWithPrefix, - value: undefined, - error: undefined - } as unknown as StartElementEvent; - - this.elementStack.push(tagName); - - if (!isSelfClosing) { - this.namespaceStack.push(currentNamespaces); - } else { - // Inline endElement() for self-closing tags - maintains V8 hidden class optimization - yield { - type: XmlEventType.END_ELEMENT, - name: tagName, - localName, - prefix, - uri, - attributes: undefined, - attributesWithPrefix: undefined, - value: undefined, - error: undefined - } as unknown as EndElementEvent; - this.elementStack.pop(); - } - - this.pos = tagEnd + 1; - } - - // ===== Attribute parsing (optimized) ===== - - private parseAttributesFast( - start: number, - end: number, - namespaces: Map - ): { - attributes: Record, - attributesWithPrefix: Record - } { - // Fast path: when there are no attributes - if (start >= end) { return { - attributes: {}, - attributesWithPrefix: {} + value: event, + done: false, }; - } - - const attributes: Record = {}; - const attributesWithPrefix: Record = {}; - - let i = start; - const xml = this.xml; - - while (i < end) { - // Skip whitespace (handled inline) - while (i < end && StaxXmlParserSync.isWhitespace(xml.charCodeAt(i))) i++; - if (i >= end) break; - - // Extract attribute name - const nameStart = i; - while (i < end) { - const code = xml.charCodeAt(i); - if (code === 61 || StaxXmlParserSync.isWhitespace(code)) break; // '=' or space - i++; - } - - if (i === nameStart) break; - const attrName = xml.slice(nameStart, i); - - // Find '=' - while (i < end && StaxXmlParserSync.isWhitespace(xml.charCodeAt(i))) i++; - if (i >= end || xml.charCodeAt(i) !== 61) { - // Boolean attribute - parseQualifiedName inline processing - attributes[attrName] = 'true'; - - // Inline QName parsing (for attributes) - const colonIndex = attrName.indexOf(':'); - let localName: string, prefix: string | undefined, uri: string | undefined; - - if (colonIndex === -1) { - localName = attrName; - prefix = undefined; - uri = undefined; - } else { - prefix = attrName.slice(0, colonIndex); - localName = attrName.slice(colonIndex + 1); - uri = namespaces.get(prefix); - } - - attributesWithPrefix[attrName] = { value: 'true', localName, prefix, uri }; - continue; - } - - i++; // Skip '=' - - // Find quotes - while (i < end && StaxXmlParserSync.isWhitespace(xml.charCodeAt(i))) i++; - if (i >= end) break; - - const quote = xml.charCodeAt(i); - if (quote !== 34 && quote !== 39) break; // '"' or "'" - - i++; - const valueStart = i; - - // Find end of value - while (i < end && xml.charCodeAt(i) !== quote) i++; - - const rawValue = xml.slice(valueStart, i); - const attrValue = this.entityDecoder(rawValue); - attributes[attrName] = attrValue; - - // Handle xmlns - if (attrName === 'xmlns') { - namespaces.set('', attrValue); - } else if (attrName.startsWith('xmlns:')) { - namespaces.set(attrName.slice(6), attrValue); - } - - // Inline QName parsing (for attributes) - optimized - const colonIndex = attrName.indexOf(':'); - let localName: string, prefix: string | undefined, uri: string | undefined; - - if (colonIndex === -1) { - localName = attrName; - prefix = undefined; - uri = undefined; - } else { - prefix = attrName.slice(0, colonIndex); - localName = attrName.slice(colonIndex + 1); - uri = namespaces.get(prefix); - } - - // Special handling for xmlns attributes - if (attrName.startsWith('xmlns')) { - if (attrName === 'xmlns') { - localName = 'xmlns'; - prefix = undefined; - } else { - localName = attrName.slice(6); - prefix = 'xmlns'; - } - } - - attributesWithPrefix[attrName] = { - value: attrValue, - localName, - prefix, - uri + } catch (error) { + this.done = true; + return { + value: XmlEventFactory.error(error as Error) as ErrorEvent, + done: false, }; - - i++; // Skip closing quote } - - return { attributes, attributesWithPrefix }; } - // ===== Utility methods ===== - - private findTagEnd(start: number): number { - let i = start; - let inQuote = false; - let quoteChar = 0; - - while (i < this.xmlLength) { - const code = this.xml.charCodeAt(i); - - if (code === 34 || code === 39) { // '"' or "'" - if (!inQuote) { - inQuote = true; - quoteChar = code; - } else if (code === quoteChar) { - inQuote = false; - quoteChar = 0; - } - } else if (code === 62 && !inQuote) { // '>' - return i; - } - i++; + private materializeEvent(tokenType: XmlEventType): AnyXmlEvent { + switch (tokenType) { + case XmlEventType.START_DOCUMENT: + return XmlEventFactory.startDocument() as StartDocumentEvent; + case XmlEventType.END_DOCUMENT: + return XmlEventFactory.endDocument() as EndDocumentEvent; + case XmlEventType.START_ELEMENT: + return XmlEventFactory.startElement( + this.cursor.name, + this.cursor.localName, + this.cursor.prefix, + this.cursor.uri, + this.cursor.getAttributes(), + this.cursor.getAttributesWithPrefix() + ) as StartElementEvent; + case XmlEventType.END_ELEMENT: + return XmlEventFactory.endElement( + this.cursor.name, + this.cursor.localName, + this.cursor.prefix, + this.cursor.uri + ) as EndElementEvent; + case XmlEventType.CHARACTERS: + return XmlEventFactory.characters(this.cursor.getText()); + case XmlEventType.CDATA: + return XmlEventFactory.cdata(this.cursor.getText()); + case XmlEventType.ERROR: + return XmlEventFactory.error(new Error('Cursor should not emit ERROR tokens.')); + default: + throw new Error(`Unsupported token type: ${String(tokenType)}`); } - return -1; - } - - private findSequence(sequence: string, start: number): number { - const seqLen = sequence.length; - const maxPos = this.xmlLength - seqLen; - - for (let i = start; i <= maxPos; i++) { - let match = true; - for (let j = 0; j < seqLen; j++) { - if (this.xml.charCodeAt(i + j) !== sequence.charCodeAt(j)) { - match = false; - break; - } - } - if (match) return i; - } - return -1; } +} -} \ No newline at end of file +export default StaxXmlParserSync; diff --git a/packages/stax-xml/src/index.ts b/packages/stax-xml/src/index.ts index ad5e34f..686960c 100644 --- a/packages/stax-xml/src/index.ts +++ b/packages/stax-xml/src/index.ts @@ -1,5 +1,7 @@ export * from "./StaxXmlParser.js"; export * from "./StaxXmlParserSync.js"; +export * from "./StaxXmlCursor.js"; +export * from "./StaxXmlCursorSync.js"; export * from "./StaxXmlWriter.js"; export * from "./StaxXmlWriterSync.js"; @@ -7,4 +9,3 @@ export { isCdata, isCharacters, isEndDocument, isEndElement, isError, isStartDoc export type { AnyXmlEvent, CdataEvent, CharactersEvent, ErrorEvent, StartElementEvent, WriteElementOptions, XmlAttribute } from "./types.js"; - diff --git a/packages/stax-xml/src/internal/AttributeCollector.ts b/packages/stax-xml/src/internal/AttributeCollector.ts new file mode 100644 index 0000000..8d9ea65 --- /dev/null +++ b/packages/stax-xml/src/internal/AttributeCollector.ts @@ -0,0 +1,118 @@ +import type { AttributeInfo } from '../types'; + +interface AttributeEntry { + rawName: string; + localName: string; + prefix?: string; + uri?: string; + value?: string; + sourceStart?: number; + sourceEnd?: number; +} + +export class AttributeCollector { + private source = ''; + private entries: AttributeEntry[] = []; + private entryIndex = new Map(); + private attributesCache?: Record; + private attributesWithPrefixCache?: Record; + + constructor(private readonly decodeValue: (text: string) => string) {} + + reset(source: string): void { + this.source = source; + this.entries = []; + this.entryIndex.clear(); + this.attributesCache = undefined; + this.attributesWithPrefixCache = undefined; + } + + addDecoded( + rawName: string, + localName: string, + prefix: string | undefined, + uri: string | undefined, + value: string + ): void { + this.addEntry({ + rawName, + localName, + prefix, + uri, + value, + }); + } + + addLazy( + rawName: string, + localName: string, + prefix: string | undefined, + uri: string | undefined, + sourceStart: number, + sourceEnd: number + ): void { + this.addEntry({ + rawName, + localName, + prefix, + uri, + sourceStart, + sourceEnd, + }); + } + + isEmpty(): boolean { + return this.entries.length === 0; + } + + getAttributeValue(rawName: string): string | undefined { + const index = this.entryIndex.get(rawName); + if (index === undefined) { + return undefined; + } + + return this.materializeValue(this.entries[index]); + } + + getAttributes(): Record { + if (!this.attributesCache) { + this.attributesCache = {}; + for (const entry of this.entries) { + this.attributesCache[entry.rawName] = this.materializeValue(entry); + } + } + + return this.attributesCache; + } + + getAttributesWithPrefix(): Record { + if (!this.attributesWithPrefixCache) { + this.attributesWithPrefixCache = {}; + for (const entry of this.entries) { + this.attributesWithPrefixCache[entry.rawName] = { + value: this.materializeValue(entry), + localName: entry.localName, + prefix: entry.prefix, + uri: entry.uri, + }; + } + } + + return this.attributesWithPrefixCache; + } + + private addEntry(entry: AttributeEntry): void { + this.entryIndex.set(entry.rawName, this.entries.length); + this.entries.push(entry); + this.attributesCache = undefined; + this.attributesWithPrefixCache = undefined; + } + + private materializeValue(entry: AttributeEntry): string { + if (entry.value === undefined) { + entry.value = this.decodeValue(this.source.slice(entry.sourceStart, entry.sourceEnd)); + } + + return entry.value; + } +} diff --git a/packages/stax-xml/src/internal/XmlCursorParserUtil.ts b/packages/stax-xml/src/internal/XmlCursorParserUtil.ts new file mode 100644 index 0000000..2d7354c --- /dev/null +++ b/packages/stax-xml/src/internal/XmlCursorParserUtil.ts @@ -0,0 +1,244 @@ +import { AttributeCollector } from './AttributeCollector'; + +export interface QualifiedNameInfo { + name: string; + localName: string; + prefix?: string; + uri?: string; +} + +export interface AttributeNameInfo { + rawName: string; + localName: string; + prefix?: string; + uri?: string; + isNamespaceDeclaration: boolean; +} + +export function cloneNamespaces(parent: Map | undefined): Map { + const namespaces = new Map(); + if (!parent) { + return namespaces; + } + + for (const [prefix, uri] of parent) { + namespaces.set(prefix, uri); + } + + return namespaces; +} + +export function resolveElementName( + qname: string, + namespaces: Map +): QualifiedNameInfo { + const colonIndex = qname.indexOf(':'); + if (colonIndex === -1) { + return { + name: qname, + localName: qname, + uri: namespaces.get(''), + }; + } + + const prefix = qname.slice(0, colonIndex); + const localName = qname.slice(colonIndex + 1); + return { + name: qname, + localName, + prefix, + uri: namespaces.get(prefix), + }; +} + +export function resolveAttributeName( + rawName: string, + namespaces: Map +): AttributeNameInfo { + if (rawName === 'xmlns') { + return { + rawName, + localName: 'xmlns', + isNamespaceDeclaration: true, + }; + } + + if (rawName.startsWith('xmlns:')) { + return { + rawName, + localName: rawName.slice(6), + prefix: 'xmlns', + isNamespaceDeclaration: true, + }; + } + + const colonIndex = rawName.indexOf(':'); + if (colonIndex === -1) { + return { + rawName, + localName: rawName, + isNamespaceDeclaration: false, + }; + } + + const prefix = rawName.slice(0, colonIndex); + const localName = rawName.slice(colonIndex + 1); + return { + rawName, + localName, + prefix, + uri: namespaces.get(prefix), + isNamespaceDeclaration: false, + }; +} + +export function applyNamespaceDeclaration( + attribute: AttributeNameInfo, + value: string, + namespaces: Map +): void { + if (!attribute.isNamespaceDeclaration) { + return; + } + + if (attribute.prefix === 'xmlns') { + namespaces.set(attribute.localName, value); + return; + } + + namespaces.set('', value); +} + +interface PendingAttribute { + rawName: string; + sourceStart?: number; + sourceEnd?: number; + decodedValue?: string; +} + +export function collectAttributesFromSource( + source: string, + start: number, + end: number, + namespaces: Map, + collector: AttributeCollector, + decodeValue: (text: string) => string, + isWhitespace: (code: number) => boolean +): void { + collector.reset(source); + if (start >= end) { + return; + } + + const pendingAttributes: PendingAttribute[] = []; + let i = start; + + while (i < end) { + while (i < end && isWhitespace(source.charCodeAt(i))) { + i++; + } + if (i >= end) { + break; + } + + const nameStart = i; + while (i < end) { + const code = source.charCodeAt(i); + if (code === 61 || isWhitespace(code)) { + break; + } + i++; + } + + if (i === nameStart) { + break; + } + + const rawName = source.slice(nameStart, i); + while (i < end && isWhitespace(source.charCodeAt(i))) { + i++; + } + + if (i >= end || source.charCodeAt(i) !== 61) { + pendingAttributes.push({ + rawName, + decodedValue: 'true', + }); + continue; + } + + i++; + while (i < end && isWhitespace(source.charCodeAt(i))) { + i++; + } + if (i >= end) { + throw new Error(`Unterminated attribute value for ${rawName}`); + } + + const quote = source.charCodeAt(i); + if (quote !== 34 && quote !== 39) { + throw new Error(`Unterminated attribute value for ${rawName}`); + } + + i++; + const valueStart = i; + while (i < end && source.charCodeAt(i) !== quote) { + i++; + } + if (i >= end) { + throw new Error(`Unterminated attribute value for ${rawName}`); + } + + pendingAttributes.push({ + rawName, + sourceStart: valueStart, + sourceEnd: i, + }); + i++; + } + + for (const pending of pendingAttributes) { + const value = pending.decodedValue ?? decodeValue(source.slice(pending.sourceStart, pending.sourceEnd)); + const nameInfo = resolveAttributeName(pending.rawName, namespaces); + + if (!nameInfo.isNamespaceDeclaration) { + continue; + } + + applyNamespaceDeclaration(nameInfo, value, namespaces); + collector.addDecoded( + nameInfo.rawName, + nameInfo.localName, + nameInfo.prefix, + nameInfo.uri, + value + ); + } + + for (const pending of pendingAttributes) { + const nameInfo = resolveAttributeName(pending.rawName, namespaces); + if (nameInfo.isNamespaceDeclaration) { + continue; + } + + if (pending.decodedValue !== undefined) { + collector.addDecoded( + nameInfo.rawName, + nameInfo.localName, + nameInfo.prefix, + nameInfo.uri, + pending.decodedValue + ); + continue; + } + + collector.addLazy( + nameInfo.rawName, + nameInfo.localName, + nameInfo.prefix, + nameInfo.uri, + pending.sourceStart, + pending.sourceEnd + ); + } +} diff --git a/packages/stax-xml/test/cursor-async.test.ts b/packages/stax-xml/test/cursor-async.test.ts new file mode 100644 index 0000000..73caf60 --- /dev/null +++ b/packages/stax-xml/test/cursor-async.test.ts @@ -0,0 +1,59 @@ +import { describe, expect, it } from 'vitest'; +import { StaxXmlCursor } from '../src/StaxXmlCursor'; +import { XmlEventType } from '../src/types'; +import { createChunkedStream, stringToReadableStream } from './helpers/parser-trace'; + +describe('StaxXmlCursor', () => { + it('should expose token state across chunked input', async () => { + const cursor = new StaxXmlCursor( + createChunkedStream('text', 4) + ); + + await expect(cursor.next()).resolves.toBe(XmlEventType.START_DOCUMENT); + + await expect(cursor.next()).resolves.toBe(XmlEventType.START_ELEMENT); + expect(cursor.name).toBe('root'); + expect(cursor.getAttributes()).toEqual({ + 'xmlns:a': 'urn:a', + }); + + await expect(cursor.next()).resolves.toBe(XmlEventType.START_ELEMENT); + expect(cursor.name).toBe('a:item'); + expect(cursor.localName).toBe('item'); + expect(cursor.prefix).toBe('a'); + expect(cursor.uri).toBe('urn:a'); + expect(cursor.getAttributeValue('attr')).toBe('value'); + + await expect(cursor.next()).resolves.toBe(XmlEventType.CHARACTERS); + expect(cursor.getText()).toBe('text'); + + await expect(cursor.next()).resolves.toBe(XmlEventType.END_ELEMENT); + expect(cursor.name).toBe('a:item'); + await expect(cursor.next()).resolves.toBe(XmlEventType.END_ELEMENT); + await expect(cursor.next()).resolves.toBe(XmlEventType.END_DOCUMENT); + expect(cursor.hasNext()).toBe(false); + }); + + it('should enforce the single-consumer contract while a pull is in flight', async () => { + const cursor = new StaxXmlCursor(createChunkedStream('', 1)); + + const pendingNext = cursor.next(); + + expect(() => cursor.hasNext()).toThrow('Concurrent cursor access is not allowed.'); + await expect(cursor.next()).rejects.toThrow('Concurrent cursor access is not allowed.'); + + await expect(pendingNext).resolves.toBe(XmlEventType.START_DOCUMENT); + }); + + it('should fail on malformed XML and rethrow the same error after failure', async () => { + const cursor = new StaxXmlCursor(stringToReadableStream('')); + + await expect(cursor.next()).resolves.toBe(XmlEventType.START_DOCUMENT); + await expect(cursor.next()).resolves.toBe(XmlEventType.START_ELEMENT); + await expect(cursor.next()).resolves.toBe(XmlEventType.START_ELEMENT); + + await expect(cursor.next()).rejects.toThrow('Mismatched closing tag'); + expect(cursor.hasNext()).toBe(false); + await expect(cursor.next()).rejects.toThrow('Mismatched closing tag'); + }); +}); diff --git a/packages/stax-xml/test/cursor-sync.test.ts b/packages/stax-xml/test/cursor-sync.test.ts new file mode 100644 index 0000000..914c4a0 --- /dev/null +++ b/packages/stax-xml/test/cursor-sync.test.ts @@ -0,0 +1,74 @@ +import { describe, expect, it } from 'vitest'; +import { StaxXmlCursorSync } from '../src/StaxXmlCursorSync'; +import { XmlEventType } from '../src/types'; + +describe('StaxXmlCursorSync', () => { + it('should expose token state and lazy attributes for the current START_ELEMENT', () => { + const cursor = new StaxXmlCursorSync('text'); + + expect(cursor.hasNext()).toBe(true); + expect(cursor.next()).toBe(XmlEventType.START_DOCUMENT); + expect(() => cursor.getAttributes()).toThrow('Current token does not expose attributes.'); + + expect(cursor.next()).toBe(XmlEventType.START_ELEMENT); + expect(cursor.name).toBe('root'); + expect(cursor.localName).toBe('root'); + expect(cursor.uri).toBeUndefined(); + expect(cursor.getAttributeValue('a:flag')).toBe('on'); + expect(cursor.getAttributes()).toEqual({ + 'a:flag': 'on', + 'xmlns:a': 'urn:a', + }); + + expect(cursor.next()).toBe(XmlEventType.START_ELEMENT); + expect(cursor.name).toBe('a:item'); + expect(cursor.localName).toBe('item'); + expect(cursor.prefix).toBe('a'); + expect(cursor.uri).toBe('urn:a'); + expect(cursor.getAttributeValue('attr')).toBe('value'); + + expect(cursor.next()).toBe(XmlEventType.CHARACTERS); + expect(cursor.getText()).toBe('text'); + expect(() => cursor.getAttributes()).toThrow('Current token does not expose attributes.'); + + expect(cursor.next()).toBe(XmlEventType.END_ELEMENT); + expect(cursor.name).toBe('a:item'); + expect(cursor.uri).toBe('urn:a'); + + expect(cursor.next()).toBe(XmlEventType.END_ELEMENT); + expect(cursor.name).toBe('root'); + + expect(cursor.next()).toBe(XmlEventType.END_DOCUMENT); + expect(cursor.hasNext()).toBe(false); + }); + + it('should keep self-closing namespace declarations scoped to that element', () => { + const cursor = new StaxXmlCursorSync(''); + + expect(cursor.next()).toBe(XmlEventType.START_DOCUMENT); + expect(cursor.next()).toBe(XmlEventType.START_ELEMENT); + expect(cursor.next()).toBe(XmlEventType.START_ELEMENT); + expect(cursor.name).toBe('item'); + expect(cursor.uri).toBe('urn:item'); + + expect(cursor.next()).toBe(XmlEventType.END_ELEMENT); + expect(cursor.name).toBe('item'); + expect(cursor.uri).toBe('urn:item'); + + expect(cursor.next()).toBe(XmlEventType.START_ELEMENT); + expect(cursor.name).toBe('sibling'); + expect(cursor.uri).toBeUndefined(); + }); + + it('should fail on malformed XML and rethrow the same error after failure', () => { + const cursor = new StaxXmlCursorSync(''); + + expect(cursor.next()).toBe(XmlEventType.START_DOCUMENT); + expect(cursor.next()).toBe(XmlEventType.START_ELEMENT); + expect(cursor.next()).toBe(XmlEventType.START_ELEMENT); + + expect(() => cursor.next()).toThrow('Mismatched closing tag'); + expect(cursor.hasNext()).toBe(false); + expect(() => cursor.next()).toThrow('Mismatched closing tag'); + }); +}); diff --git a/packages/stax-xml/test/helpers/parser-trace.ts b/packages/stax-xml/test/helpers/parser-trace.ts new file mode 100644 index 0000000..3b34579 --- /dev/null +++ b/packages/stax-xml/test/helpers/parser-trace.ts @@ -0,0 +1,133 @@ +import StaxXmlParser from '../../src/StaxXmlParser'; +import { StaxXmlParserSync } from '../../src/StaxXmlParserSync'; +import type { AnyXmlEvent, AttributeInfo } from '../../src/types'; + +export interface NormalizedAttributeRecord { + name: string; + localName: string; + prefix?: string; + uri?: string; + value: string; +} + +export interface NormalizedTraceRecord { + type: string; + name?: string; + localName?: string; + prefix?: string; + uri?: string; + text?: string; + attributes?: Record; + attributesWithPrefix?: NormalizedAttributeRecord[]; +} + +export function stringToReadableStream(str: string): ReadableStream { + const encoder = new TextEncoder(); + const bytes = encoder.encode(str); + + return new ReadableStream({ + start(controller) { + controller.enqueue(bytes); + controller.close(); + } + }); +} + +export function createChunkedStream(str: string, chunkSize: number): ReadableStream { + const encoder = new TextEncoder(); + const bytes = encoder.encode(str); + let offset = 0; + + return new ReadableStream({ + pull(controller) { + if (offset >= bytes.length) { + controller.close(); + return; + } + + const nextOffset = Math.min(offset + chunkSize, bytes.length); + controller.enqueue(bytes.slice(offset, nextOffset)); + offset = nextOffset; + } + }); +} + +export function collectSyncTrace(xml: string): NormalizedTraceRecord[] { + return Array.from(new StaxXmlParserSync(xml)).map(normalizeEvent); +} + +export async function collectAsyncTrace(xml: string, chunkSize?: number): Promise { + const stream = chunkSize + ? createChunkedStream(xml, chunkSize) + : stringToReadableStream(xml); + const parser = new StaxXmlParser(stream); + const trace: NormalizedTraceRecord[] = []; + + for await (const event of parser) { + trace.push(normalizeEvent(event)); + } + + return trace; +} + +export function normalizeEvent(event: AnyXmlEvent): NormalizedTraceRecord { + const base: NormalizedTraceRecord = { + type: event.type, + name: 'name' in event ? event.name : undefined, + localName: 'localName' in event ? event.localName : undefined, + prefix: 'prefix' in event ? event.prefix : undefined, + uri: 'uri' in event ? event.uri : undefined, + text: 'value' in event ? event.value : undefined, + attributes: 'attributes' in event ? sortRecord(event.attributes) : undefined, + attributesWithPrefix: 'attributesWithPrefix' in event + ? normalizeAttributesWithPrefix(event.attributesWithPrefix) + : undefined, + }; + + return stripUndefined(base); +} + +function normalizeAttributesWithPrefix( + value: Record | undefined +): NormalizedAttributeRecord[] | undefined { + if (!value) { + return undefined; + } + + return Object.entries(value) + .map(([key, info]) => normalizeAttributeEntry(key, info)) + .sort((left, right) => left.name.localeCompare(right.name)); +} + +function normalizeAttributeEntry(key: string, info: AttributeInfo): NormalizedAttributeRecord { + const localName = info.localName ?? key; + const name = info.prefix + ? `${info.prefix}:${localName}` + : localName; + + return stripUndefined({ + name, + localName, + prefix: info.prefix, + uri: info.uri, + value: info.value, + }); +} + +function sortRecord( + value: Record | undefined +): Record | undefined { + if (!value) { + return undefined; + } + + return Object.fromEntries( + Object.entries(value).sort(([left], [right]) => left.localeCompare(right)) + ); +} + +function stripUndefined(value: T): T { + return Object.fromEntries( + Object.entries(value as Record).filter(([, entry]) => entry !== undefined) + ) as T; +} diff --git a/packages/stax-xml/test/parser-sync.test.ts b/packages/stax-xml/test/parser-sync.test.ts index 8898ea1..a2e756d 100644 --- a/packages/stax-xml/test/parser-sync.test.ts +++ b/packages/stax-xml/test/parser-sync.test.ts @@ -430,4 +430,15 @@ describe('StaxXmlParserSync', () => { { type: XmlEventType.END_DOCUMENT }, ]); }); + + it('should emit a terminal ERROR event for malformed XML', () => { + const parser = new StaxXmlParserSync(''); + const events = Array.from(parser); + + expect(events.at(-1)).toEqual({ + type: XmlEventType.ERROR, + error: expect.any(Error), + }); + expect((events.at(-1) as { error: Error }).error.message).toContain('Mismatched closing tag'); + }); }); diff --git a/packages/stax-xml/test/parser-trace-oracle.test.ts b/packages/stax-xml/test/parser-trace-oracle.test.ts new file mode 100644 index 0000000..14dc9e5 --- /dev/null +++ b/packages/stax-xml/test/parser-trace-oracle.test.ts @@ -0,0 +1,99 @@ +import { describe, expect, it } from 'vitest'; +import { + collectAsyncTrace, + collectSyncTrace, + type NormalizedTraceRecord, +} from './helpers/parser-trace'; + +const namespaceScopeTrace: NormalizedTraceRecord[] = [ + { type: 'START_DOCUMENT' }, + { + type: 'START_ELEMENT', + name: 'root', + localName: 'root', + attributes: { + 'a:flag': 'on', + 'xmlns:a': 'urn:a', + }, + attributesWithPrefix: [ + { name: 'a:flag', localName: 'flag', prefix: 'a', uri: 'urn:a', value: 'on' }, + { name: 'xmlns:a', localName: 'a', prefix: 'xmlns', value: 'urn:a' }, + ], + }, + { + type: 'START_ELEMENT', + name: 'a:one', + localName: 'one', + prefix: 'a', + uri: 'urn:a', + attributes: { + attr: '1', + }, + attributesWithPrefix: [ + { name: 'attr', localName: 'attr', value: '1' }, + ], + }, + { + type: 'END_ELEMENT', + name: 'a:one', + localName: 'one', + prefix: 'a', + uri: 'urn:a', + }, + { + type: 'START_ELEMENT', + name: 'two', + localName: 'two', + uri: 'urn:two', + attributes: { + xmlns: 'urn:two', + }, + attributesWithPrefix: [ + { name: 'xmlns', localName: 'xmlns', value: 'urn:two' }, + ], + }, + { + type: 'END_ELEMENT', + name: 'two', + localName: 'two', + uri: 'urn:two', + }, + { + type: 'START_ELEMENT', + name: 'three', + localName: 'three', + attributes: {}, + attributesWithPrefix: [], + }, + { + type: 'END_ELEMENT', + name: 'three', + localName: 'three', + }, + { + type: 'END_ELEMENT', + name: 'root', + localName: 'root', + }, + { type: 'END_DOCUMENT' }, +]; + +describe('Parser Trace Oracle', () => { + it('should normalize sync parser traces for namespace scope and self-closing tags', () => { + const xml = ''; + + expect(collectSyncTrace(xml)).toEqual(namespaceScopeTrace); + }); + + it('should normalize async parser traces for namespace scope and self-closing tags', async () => { + const xml = ''; + + await expect(collectAsyncTrace(xml)).resolves.toEqual(namespaceScopeTrace); + }); + + it('should produce the same async normalized trace for chunked input', async () => { + const xml = ''; + + await expect(collectAsyncTrace(xml, 5)).resolves.toEqual(namespaceScopeTrace); + }); +}); diff --git a/scripts/compare-runner-lib.mjs b/scripts/compare-runner-lib.mjs new file mode 100644 index 0000000..8ee8430 --- /dev/null +++ b/scripts/compare-runner-lib.mjs @@ -0,0 +1,118 @@ +import { mkdir, readdir, writeFile } from 'node:fs/promises'; +import { createWriteStream } from 'node:fs'; +import path from 'node:path'; +import process from 'node:process'; +import { spawn } from 'node:child_process'; + +export function usage(message) { + if (message) { + console.error(message); + console.error(''); + } +} + +export function getOutputRoot(label) { + return path.join('/tmp/stax-compare', label); +} + +export async function ensureDir(dirPath) { + await mkdir(dirPath, { recursive: true }); + return dirPath; +} + +export async function getGitMetadata(repoRoot) { + const head = await runSmallCommand('git', ['-C', repoRoot, 'rev-parse', 'HEAD']); + const branch = await runSmallCommand('git', ['-C', repoRoot, 'rev-parse', '--abbrev-ref', 'HEAD']); + + return { + head, + branch, + }; +} + +export async function runLoggedCommand({ command, args, cwd, env, logPath }) { + await ensureDir(path.dirname(logPath)); + + const startedAt = new Date().toISOString(); + const logStream = createWriteStream(logPath, { flags: 'w' }); + logStream.write(`# cwd: ${cwd}\n`); + logStream.write(`# command: ${[command, ...args].join(' ')}\n`); + logStream.write(`# startedAt: ${startedAt}\n\n`); + + const start = process.hrtime.bigint(); + + const result = await new Promise((resolve, reject) => { + const child = spawn(command, args, { + cwd, + env: { + ...process.env, + ...env, + }, + stdio: ['ignore', 'pipe', 'pipe'], + }); + + child.stdout.on('data', (chunk) => { + logStream.write(chunk); + }); + child.stderr.on('data', (chunk) => { + logStream.write(chunk); + }); + child.on('error', reject); + child.on('close', (code, signal) => { + resolve({ code, signal }); + }); + }); + + const durationMs = Number(process.hrtime.bigint() - start) / 1_000_000; + logStream.write(`\n# finishedAt: ${new Date().toISOString()}\n`); + logStream.write(`# durationMs: ${durationMs.toFixed(3)}\n`); + logStream.write(`# exitCode: ${result.code ?? 'null'}\n`); + if (result.signal) { + logStream.write(`# signal: ${result.signal}\n`); + } + await new Promise((resolve) => logStream.end(resolve)); + + return { + startedAt, + durationMs, + exitCode: result.code, + signal: result.signal, + logPath, + }; +} + +export async function writeJson(filePath, value) { + await ensureDir(path.dirname(filePath)); + await writeFile(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8'); +} + +export async function listCpuProfiles(dirPath) { + const entries = await readdir(dirPath, { withFileTypes: true }); + return entries + .filter((entry) => entry.isFile() && entry.name.endsWith('.cpuprofile')) + .map((entry) => path.join(dirPath, entry.name)) + .sort(); +} + +async function runSmallCommand(command, args) { + const output = await new Promise((resolve, reject) => { + const child = spawn(command, args, { + stdio: ['ignore', 'pipe', 'inherit'], + }); + + let stdout = ''; + child.stdout.on('data', (chunk) => { + stdout += chunk.toString(); + }); + child.on('error', reject); + child.on('close', (code) => { + if (code === 0) { + resolve(stdout.trim()); + return; + } + reject(new Error(`${command} ${args.join(' ')} failed with exit code ${code}`)); + }); + }); + + return output; +} diff --git a/scripts/parser-benchmark-case.mjs b/scripts/parser-benchmark-case.mjs new file mode 100644 index 0000000..1049a76 --- /dev/null +++ b/scripts/parser-benchmark-case.mjs @@ -0,0 +1,175 @@ +import { existsSync, readFileSync } from 'node:fs'; +import path from 'node:path'; +import process from 'node:process'; +import { performance } from 'node:perf_hooks'; +import { pathToFileURL } from 'node:url'; + +const [, , repoRootArg, caseNameArg] = process.argv; + +if (!repoRootArg || !caseNameArg) { + console.error('Usage: node scripts/parser-benchmark-case.mjs '); + process.exit(1); +} + +const repoRoot = path.resolve(repoRootArg); +const caseName = caseNameArg; +const distCandidates = [ + path.join(repoRoot, 'packages/stax-xml/dist/index.js'), + path.join(repoRoot, 'packages/stax-xml/dist/index.mjs'), +]; +const sourceEntry = path.join(repoRoot, 'packages/stax-xml/src/index.ts'); +const distEntry = distCandidates.find((candidate) => existsSync(candidate)); + +if (!distEntry && !existsSync(sourceEntry)) { + console.error(`Missing parser entrypoints: ${distCandidates.join(', ')} and ${sourceEntry}`); + process.exit(1); +} + +const entrypoint = distEntry ?? sourceEntry; +const { + StaxXmlCursor, + StaxXmlCursorSync, + StaxXmlParser, + StaxXmlParserSync, + XmlEventType, +} = await import(pathToFileURL(entrypoint).href); + +const midsizeXml = readFileSync(path.join(repoRoot, 'packages/benchmark/assets/midsize.xml'), 'utf8'); +const syncConsumeXml = createSyncConsumeXml(20000); +const attrHeavyXml = createAttributeHeavyXml(2500, 48); + +const cases = { + 'sync-consume': { + iterations: 8, + run: async () => consumeSync(StaxXmlCursorSync, StaxXmlParserSync, XmlEventType, syncConsumeXml), + }, + 'sync-attr-heavy': { + iterations: 10, + run: async () => consumeSync(StaxXmlCursorSync, StaxXmlParserSync, XmlEventType, attrHeavyXml), + }, + 'async-consume-single-chunk': { + iterations: 6, + run: async () => consumeAsync(StaxXmlParser, XmlEventType, midsizeXml, midsizeXml.length), + }, + 'async-consume-64kb-chunk': { + iterations: 6, + run: async () => consumeAsync(StaxXmlParser, XmlEventType, midsizeXml, 64 * 1024), + }, +}; + +const selected = cases[caseName]; +if (!selected) { + console.error(`Unknown case: ${caseName}`); + console.error(`Supported cases: ${Object.keys(cases).join(', ')}`); + process.exit(1); +} + +await selected.run(); + +const timings = []; +let eventsProcessed = 0; +for (let i = 0; i < selected.iterations; i++) { + maybeGc(); + const startedAt = performance.now(); + eventsProcessed = await selected.run(); + timings.push(performance.now() - startedAt); +} + +const summary = { + caseName, + repoRoot, + entrypoint, + iterations: selected.iterations, + eventsProcessed, + minMs: Math.min(...timings), + maxMs: Math.max(...timings), + meanMs: average(timings), + timingsMs: timings, +}; + +console.log(JSON.stringify(summary, null, 2)); + +function maybeGc() { + if (typeof global.gc === 'function') { + global.gc(); + } +} + +function average(values) { + return values.reduce((sum, value) => sum + value, 0) / values.length; +} + +function createAttributeHeavyXml(itemCount, attributeCount) { + const items = []; + for (let index = 0; index < itemCount; index++) { + const attributes = []; + for (let attributeIndex = 0; attributeIndex < attributeCount; attributeIndex++) { + attributes.push(`a${attributeIndex}="value-${index}-${attributeIndex}"`); + } + items.push(``); + } + return `${items.join('')}`; +} + +function createSyncConsumeXml(itemCount) { + const items = []; + for (let index = 0; index < itemCount; index++) { + items.push(`${index}value-${index}`); + } + return `${items.join('')}`; +} + +function consumeSync(CursorSync, ParserSync, EventType, xml) { + let eventsProcessed = 0; + + if (typeof CursorSync === 'function') { + const cursor = new CursorSync(xml); + while (cursor.hasNext()) { + const tokenType = cursor.next(); + eventsProcessed++; + if (tokenType === EventType.ERROR) { + throw new Error('Cursor emitted unexpected ERROR token.'); + } + } + return eventsProcessed; + } + + const parser = new ParserSync(xml); + for (const event of parser) { + eventsProcessed++; + if (event.type === EventType.ERROR) { + throw event.error; + } + } + return eventsProcessed; +} + +async function consumeAsync(ParserAsync, EventType, xml, chunkSize) { + let eventsProcessed = 0; + const parser = new ParserAsync(createChunkedStream(xml, chunkSize)); + for await (const event of parser) { + eventsProcessed++; + if (event.type === EventType.ERROR) { + throw event.error; + } + } + return eventsProcessed; +} + +function createChunkedStream(xml, chunkSize) { + const bytes = new TextEncoder().encode(xml); + let offset = 0; + + return new ReadableStream({ + pull(controller) { + if (offset >= bytes.length) { + controller.close(); + return; + } + + const nextOffset = Math.min(offset + chunkSize, bytes.length); + controller.enqueue(bytes.slice(offset, nextOffset)); + offset = nextOffset; + } + }); +} diff --git a/scripts/run-benchmark.mjs b/scripts/run-benchmark.mjs new file mode 100644 index 0000000..d790479 --- /dev/null +++ b/scripts/run-benchmark.mjs @@ -0,0 +1,89 @@ +import path from 'node:path'; +import process from 'node:process'; +import { + ensureDir, + getGitMetadata, + getOutputRoot, + runLoggedCommand, + usage, + writeJson, +} from './compare-runner-lib.mjs'; + +const [, , repoRootArg, label, suiteArg = 'all'] = process.argv; + +if (!repoRootArg || !label) { + usage('Usage: node scripts/run-benchmark.mjs