From ac7fe3c48642045c38e3b59dd2b0566cf770b54f Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 3 Sep 2025 01:01:35 +0000 Subject: [PATCH 1/3] chore(release): 0.16.0 [skip ci] # [0.16.0](https://github.com/Harbour-Enterprises/SuperDoc/compare/v0.15.18...v0.16.0) (2025-09-03) ### Bug Fixes * additional fixes to list indent/outdent, split list, toggle list, types and more tests ([02e6cd9](https://github.com/Harbour-Enterprises/SuperDoc/commit/02e6cd971b672adc7a27ee6f4c3e491ea6582927)) * backspaceNextToList, toggleList and tests ([8b33258](https://github.com/Harbour-Enterprises/SuperDoc/commit/8b33258aa9a09cd566191083de2095377f532de5)) * closing dropdown after clicking again ([#835](https://github.com/Harbour-Enterprises/SuperDoc/issues/835)) ([88ff88d](https://github.com/Harbour-Enterprises/SuperDoc/commit/88ff88d06568716d78be4fcdc311cbba0e6ba3fd)) * definition possibly missing name key, add jsdoc ([bb714f1](https://github.com/Harbour-Enterprises/SuperDoc/commit/bb714f14635239301ed6931bb06259b299b11fa8)) * images are missing for the document in edit mode ([#831](https://github.com/Harbour-Enterprises/SuperDoc/issues/831)) ([a9af47e](https://github.com/Harbour-Enterprises/SuperDoc/commit/a9af47ed4def516900b14460218e476374c69a80)) * include package lock on tests folder ([#845](https://github.com/Harbour-Enterprises/SuperDoc/issues/845)) ([1409d02](https://github.com/Harbour-Enterprises/SuperDoc/commit/1409d02ce457db963a5696ec78be30a3f349ffca)) * insertContentAt fails if new line characters (\n) inserted ([dd60d91](https://github.com/Harbour-Enterprises/SuperDoc/commit/dd60d91711e63741e2d6ca2ced02251f2a4e0465)) * install http server ([#846](https://github.com/Harbour-Enterprises/SuperDoc/issues/846)) ([1a6e684](https://github.com/Harbour-Enterprises/SuperDoc/commit/1a6e684f809ac96e00e370bb324f0317ec6917ef)) * **internal:** remove pdfjs from build ([#843](https://github.com/Harbour-Enterprises/SuperDoc/issues/843)) ([021b2c1](https://github.com/Harbour-Enterprises/SuperDoc/commit/021b2c123052215ba8f52ee103034ebaaa72e1e4)) * remove footer line length breaking deployments ([04766cd](https://github.com/Harbour-Enterprises/SuperDoc/commit/04766cdb1f085419730212b70eacf4072ef6eeeb)) * toggle list ([770998a](https://github.com/Harbour-Enterprises/SuperDoc/commit/770998a9e9b5097d1efa031dc12e6bf12920fa8b)) * update condition checks for screenshot updates in CI workflow ([e17fdf0](https://github.com/Harbour-Enterprises/SuperDoc/commit/e17fdf0b939e8caef65f60207611a71343e4cfde)) ### Features * enable dispatching example apps tests ([#844](https://github.com/Harbour-Enterprises/SuperDoc/issues/844)) ([8b2bc73](https://github.com/Harbour-Enterprises/SuperDoc/commit/8b2bc73bb909c2ce93a93e6266f18c17af0b46e2)) * filter out ooxml tags cli to highest priority namespaces ([23b1efa](https://github.com/Harbour-Enterprises/SuperDoc/commit/23b1efabc63f999f1b297ac046e8c178ff345e49)) --- packages/superdoc/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/superdoc/package.json b/packages/superdoc/package.json index faa535428..8d0221b72 100644 --- a/packages/superdoc/package.json +++ b/packages/superdoc/package.json @@ -1,7 +1,7 @@ { "name": "@harbour-enterprises/superdoc", "type": "module", - "version": "0.16.0-next.6", + "version": "0.16.0", "license": "AGPL-3.0", "readme": "../../README.md", "files": [ From 9bc488d40430b61240d05bacc541cae51ea84ebb Mon Sep 17 00:00:00 2001 From: Nick Bernal Date: Tue, 2 Sep 2025 21:39:48 -0700 Subject: [PATCH 2/3] fix: imports encoded in utf-16 break DocxZipper --- packages/super-editor/src/core/DocxZipper.js | 48 +++--- .../super-editor/src/core/DocxZipper.test.js | 60 ++++++++ .../super-editor/src/core/encoding-helpers.js | 80 ++++++++++ .../src/core/encoding-helpers.test.js | 142 ++++++++++++++++++ 4 files changed, 304 insertions(+), 26 deletions(-) create mode 100644 packages/super-editor/src/core/encoding-helpers.js create mode 100644 packages/super-editor/src/core/encoding-helpers.test.js diff --git a/packages/super-editor/src/core/DocxZipper.js b/packages/super-editor/src/core/DocxZipper.js index 4ba50a94c..e50b0c3c5 100644 --- a/packages/super-editor/src/core/DocxZipper.js +++ b/packages/super-editor/src/core/DocxZipper.js @@ -1,6 +1,7 @@ import xmljs from 'xml-js'; import JSZip from 'jszip'; import { getContentTypesFromXml } from './super-converter/helpers.js'; +import { ensureXmlString, isXmlLike } from './encoding-helpers.js'; /** * Class to handle unzipping and zipping of docx files @@ -37,42 +38,37 @@ class DocxZipper { const extractedFiles = await this.unzip(file); const files = Object.entries(extractedFiles.files); - const mediaObjects = {}; - const validTypes = ['xml', 'rels']; - for (const file of files) { - const [, zipEntry] = file; - - if (validTypes.some((validType) => zipEntry.name.endsWith(validType))) { - const content = await zipEntry.async('string'); - this.files.push({ - name: zipEntry.name, - content, - }); + for (const [, zipEntry] of files) { + const name = zipEntry.name; + + if (isXmlLike(name)) { + // Read raw bytes and decode (handles UTF-8 & UTF-16) + const u8 = await zipEntry.async('uint8array'); + const content = ensureXmlString(u8); + this.files.push({ name, content }); } else if ( - (zipEntry.name.startsWith('word/media') && zipEntry.name !== 'word/media/') || - (zipEntry.name.startsWith('media') && zipEntry.name !== 'media/') + (name.startsWith('word/media') && name !== 'word/media/') || + (name.startsWith('media') && name !== 'media/') ) { - // If we are in node, we need to convert the buffer to base64 + // Media files if (isNode) { const buffer = await zipEntry.async('nodebuffer'); const fileBase64 = buffer.toString('base64'); - this.mediaFiles[zipEntry.name] = fileBase64; - } - - // If we are in the browser, we can use the base64 directly - else { + this.mediaFiles[name] = fileBase64; + } else { const blob = await zipEntry.async('blob'); - const extension = this.getFileExtension(zipEntry.name); + const extension = this.getFileExtension(name); const fileBase64 = await zipEntry.async('base64'); - this.mediaFiles[zipEntry.name] = `data:image/${extension};base64,${fileBase64}`; + this.mediaFiles[name] = `data:image/${extension};base64,${fileBase64}`; - const file = new File([blob], zipEntry.name, { type: blob.type }); - const imageUrl = URL.createObjectURL(file); - this.media[zipEntry.name] = imageUrl; + const fileObj = new File([blob], name, { type: blob.type }); + const imageUrl = URL.createObjectURL(fileObj); + this.media[name] = imageUrl; } - } else if (zipEntry.name.startsWith('word/fonts') && zipEntry.name !== 'word/fonts/') { + } else if (name.startsWith('word/fonts') && name !== 'word/fonts/') { + // Font files const uint8array = await zipEntry.async('uint8array'); - this.fonts[zipEntry.name] = uint8array; + this.fonts[name] = uint8array; } } diff --git a/packages/super-editor/src/core/DocxZipper.test.js b/packages/super-editor/src/core/DocxZipper.test.js index 654e34066..28755f280 100644 --- a/packages/super-editor/src/core/DocxZipper.test.js +++ b/packages/super-editor/src/core/DocxZipper.test.js @@ -2,6 +2,7 @@ import path from 'path'; import fs from 'fs'; import { describe, it, expect, beforeEach } from 'vitest'; import DocxZipper from './DocxZipper'; +import JSZip from 'jszip'; async function readFileAsBuffer(filePath) { const resolvedPath = path.resolve(__dirname, filePath); @@ -48,3 +49,62 @@ describe('DocxZipper - file extraction', () => { expect(documentXml).toBeTruthy(); }); }); + +// Helper to build a UTF-16LE Buffer with BOM +function utf16leWithBOM(str) { + const bom = Buffer.from([0xff, 0xfe]); + const body = Buffer.from(str, 'utf16le'); + return Buffer.concat([bom, body]); +} + +describe('DocxZipper - UTF-16 XML handling', () => { + let zipper; + beforeEach(() => { + zipper = new DocxZipper(); + }); + + it('decodes a UTF-16LE customXml part correctly (was failing before fix)', async () => { + const zip = new JSZip(); + + // Minimal [Content_Types].xml to look like a docx + const contentTypes = ` + + + + + `; + zip.file('[Content_Types].xml', contentTypes); + + // A basic UTF-8 document.xml so there's at least one normal XML entry + const documentXml = ` + + Hello + `; + zip.file('word/document.xml', documentXml); + + // The problematic UTF-16LE customXml item + const customXmlUtf16 = ` + + TELEKOM!4176814.1 + A675398 + GUDRUN.JORDAN@TELEKOM.DE + 2023-07-06T15:09:00.0000000+02:00 + TELEKOM +`; + zip.file('customXml/item2.xml', utf16leWithBOM(customXmlUtf16)); + + // Generate the zip as a Node buffer and feed it to the zipper + const buf = await zip.generateAsync({ type: 'nodebuffer' }); + const files = await zipper.getDocxData(buf /* isNode not needed for XML */); + + // Find the customXml item + const item2 = files.find((f) => f.name === 'customXml/item2.xml'); + expect(item2).toBeTruthy(); + + // ✅ With the fix, content is a clean JS string: + expect(item2.content).toContain(' /\.xml$|\.rels$/i.test(name); + +/** + * Hex dump for optional debugging + * @param {Uint8Array|ArrayBuffer} bytes + * @param {number} n + * @returns {string} Hex dump + */ +export function hex(bytes, n = 32) { + const u8 = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes); + return Array.from(u8.slice(0, n)) + .map((b) => b.toString(16).padStart(2, '0')) + .join(' '); +} + +/** + * Try to detect encoding by BOM / null density + * @param {Uint8Array} u8 + * @returns {string} Detected encoding + */ +export function sniffEncoding(u8) { + if (u8.length >= 2) { + const b0 = u8[0], + b1 = u8[1]; + if (b0 === 0xff && b1 === 0xfe) return 'utf-16le'; + if (b0 === 0xfe && b1 === 0xff) return 'utf-16be'; + } + // Heuristic: lots of NULs near the start → likely UTF-16 + let nul = 0; + for (let i = 0; i < Math.min(64, u8.length); i++) if (u8[i] === 0) nul++; + if (nul > 16) return 'utf-16le'; + return 'utf-8'; +} + +/** + * Remove leading BOM from already-decoded JS string + * @param {string} str + * @returns {string} Cleaned string without BOM + */ +export function stripBOM(str) { + return str && str.charCodeAt(0) === 0xfeff ? str.slice(1) : str; +} + +/** + * Decode XML/RELS content to a clean JS string. + * Accepts: string | Uint8Array | ArrayBuffer + * @param {string|Uint8Array|ArrayBuffer} content + * @returns {string} Clean XML string + */ +export function ensureXmlString(content) { + if (typeof content === 'string') return stripBOM(content); + + // Accept: Buffer, Uint8Array, DataView, any TypedArray, or ArrayBuffer + let u8 = null; + + if (content && typeof content === 'object') { + if (content instanceof Uint8Array) { + u8 = content; + } else if (typeof Buffer !== 'undefined' && Buffer.isBuffer && Buffer.isBuffer(content)) { + // Node Buffer + u8 = new Uint8Array(content.buffer, content.byteOffset, content.byteLength); + } else if (ArrayBuffer.isView && ArrayBuffer.isView(content)) { + // Any ArrayBufferView: DataView or other TypedArray + u8 = new Uint8Array(content.buffer, content.byteOffset, content.byteLength); + } else if (content.constructor && (content instanceof ArrayBuffer || content.constructor.name === 'ArrayBuffer')) { + u8 = new Uint8Array(content); + } + } + + if (!u8) throw new Error('Unsupported content type for XML'); + + const enc = sniffEncoding(u8); + let xml = new TextDecoder(enc).decode(u8); + return stripBOM(xml); +} diff --git a/packages/super-editor/src/core/encoding-helpers.test.js b/packages/super-editor/src/core/encoding-helpers.test.js new file mode 100644 index 000000000..7a0a154bf --- /dev/null +++ b/packages/super-editor/src/core/encoding-helpers.test.js @@ -0,0 +1,142 @@ +import { describe, it, expect } from 'vitest'; +import { isXmlLike, hex, sniffEncoding, stripBOM, ensureXmlString } from './encoding-helpers.js'; + +function utf16leWithBOM(str) { + const bom = Buffer.from([0xff, 0xfe]); + const body = Buffer.from(str, 'utf16le'); + return Buffer.concat([bom, body]); +} + +function utf16beWithBOM(str) { + const le = Buffer.from(str, 'utf16le'); + const swapped = Buffer.alloc(le.length); + for (let i = 0; i < le.length; i += 2) { + swapped[i] = le[i + 1]; + swapped[i + 1] = le[i]; + } + const bom = Buffer.from([0xfe, 0xff]); + return Buffer.concat([bom, swapped]); +} + +function noBOMUtf16leBytes(str) { + // UTF-16LE WITHOUT a BOM (to trigger the NUL-heuristic) + return Buffer.from(str, 'utf16le'); +} + +describe('isXmlLike', () => { + it('matches .xml and .rels', () => { + expect(isXmlLike('word/document.xml')).toBe(true); + expect(isXmlLike('word/_rels/document.xml.rels')).toBe(true); + expect(isXmlLike('docProps/core.xml')).toBe(true); + }); + it('rejects non-xml', () => { + expect(isXmlLike('word/media/image1.png')).toBe(false); + expect(isXmlLike('customXml/item1.xml.bin')).toBe(false); + expect(isXmlLike('word/fonts/font1.odttf')).toBe(false); + }); +}); + +describe('hex', () => { + it('renders hex dump of first N bytes', () => { + const u8 = new Uint8Array([0xff, 0xfe, 0x3c, 0x00, 0x3f, 0x00]); + expect(hex(u8, 6)).toBe('ff fe 3c 00 3f 00'); + }); +}); + +describe('sniffEncoding', () => { + it('detects UTF-16LE by BOM', () => { + const u8 = utf16leWithBOM(''); + expect(sniffEncoding(u8)).toBe('utf-16le'); + }); + it('detects UTF-16BE by BOM', () => { + const u8 = utf16beWithBOM(''); + expect(sniffEncoding(u8)).toBe('utf-16be'); + }); + it('defaults to utf-8 for plain ASCII/UTF-8', () => { + const u8 = new TextEncoder().encode(''); + expect(sniffEncoding(u8)).toBe('utf-8'); + }); + it('heuristically detects UTF-16 (no BOM) via NUL density', () => { + const u8 = noBOMUtf16leBytes(''); + // Our heuristic returns 'utf-16le' for lots of NULs + expect(sniffEncoding(u8)).toBe('utf-16le'); + }); +}); + +describe('stripBOM', () => { + it('removes U+FEFF if present', () => { + const s = '\uFEFF'; + expect(stripBOM(s)).toBe(''); + }); + it('no-ops when no BOM present', () => { + const s = ''; + expect(stripBOM(s)).toBe(s); + }); +}); + +describe('ensureXmlString', () => { + it('returns same string when given a plain XML string', () => { + const s = ''; + expect(ensureXmlString(s)).toBe(s); + }); + + it('strips leading BOM from a decoded string', () => { + const s = '\uFEFF'; + expect(ensureXmlString(s)).toBe(''); + }); + + it('decodes UTF-8 bytes', () => { + const u8 = new TextEncoder().encode('héllo'); + const out = ensureXmlString(u8); + expect(out).toContain(' { + const u8 = utf16leWithBOM('v'); + const out = ensureXmlString(u8); + expect(out.toLowerCase()).toContain('encoding="utf-16"'); + expect(out).toContain(''); + expect(out).not.toMatch(/\u0000/); + }); + + it('decodes UTF-16BE with BOM bytes', () => { + const u8 = utf16beWithBOM('v'); + const out = ensureXmlString(u8); + expect(out.toLowerCase()).toContain('encoding="utf-16"'); + expect(out).toContain(''); + expect(out).not.toMatch(/\u0000/); + }); + + it('decodes UTF-16 (no BOM) via heuristic', () => { + const u8 = noBOMUtf16leBytes('NOBOM'); + const out = ensureXmlString(u8); + expect(out).toContain(''); + expect(out).toContain('NOBOM'); + expect(out).not.toMatch(/\u0000/); + }); + + it('accepts ArrayBuffer input', () => { + const u8 = new TextEncoder().encode(''); + const out = ensureXmlString(u8.buffer); + expect(out).toContain(''); + }); + + it('throws on unsupported content types', () => { + expect(() => ensureXmlString(12345)).toThrow(/Unsupported content type/); + }); + + it('decodes from Node Buffer (utf-8)', () => { + const buf = Buffer.from('', 'utf8'); + const out = ensureXmlString(buf); + expect(out).toContain(''); + }); +}); + +describe('ensureXmlString cross-env', () => { + it('decodes from Node Buffer (utf-8)', () => { + const buf = Buffer.from('', 'utf8'); + const out = ensureXmlString(buf); + expect(out).toContain(''); + }); +}); From 6d09115f2bea86dc11d84a7e637d7ef897119116 Mon Sep 17 00:00:00 2001 From: Nick Bernal Date: Tue, 2 Sep 2025 21:50:17 -0700 Subject: [PATCH 3/3] fix: imports encoded in utf-16 break DocxZipper