diff --git a/packages/super-editor/src/core/DocxZipper.js b/packages/super-editor/src/core/DocxZipper.js
index 4ba50a94c..e50b0c3c5 100644
--- a/packages/super-editor/src/core/DocxZipper.js
+++ b/packages/super-editor/src/core/DocxZipper.js
@@ -1,6 +1,7 @@
import xmljs from 'xml-js';
import JSZip from 'jszip';
import { getContentTypesFromXml } from './super-converter/helpers.js';
+import { ensureXmlString, isXmlLike } from './encoding-helpers.js';
/**
* Class to handle unzipping and zipping of docx files
@@ -37,42 +38,37 @@ class DocxZipper {
const extractedFiles = await this.unzip(file);
const files = Object.entries(extractedFiles.files);
- const mediaObjects = {};
- const validTypes = ['xml', 'rels'];
- for (const file of files) {
- const [, zipEntry] = file;
-
- if (validTypes.some((validType) => zipEntry.name.endsWith(validType))) {
- const content = await zipEntry.async('string');
- this.files.push({
- name: zipEntry.name,
- content,
- });
+ for (const [, zipEntry] of files) {
+ const name = zipEntry.name;
+
+ if (isXmlLike(name)) {
+ // Read raw bytes and decode (handles UTF-8 & UTF-16)
+ const u8 = await zipEntry.async('uint8array');
+ const content = ensureXmlString(u8);
+ this.files.push({ name, content });
} else if (
- (zipEntry.name.startsWith('word/media') && zipEntry.name !== 'word/media/') ||
- (zipEntry.name.startsWith('media') && zipEntry.name !== 'media/')
+ (name.startsWith('word/media') && name !== 'word/media/') ||
+ (name.startsWith('media') && name !== 'media/')
) {
- // If we are in node, we need to convert the buffer to base64
+ // Media files
if (isNode) {
const buffer = await zipEntry.async('nodebuffer');
const fileBase64 = buffer.toString('base64');
- this.mediaFiles[zipEntry.name] = fileBase64;
- }
-
- // If we are in the browser, we can use the base64 directly
- else {
+ this.mediaFiles[name] = fileBase64;
+ } else {
const blob = await zipEntry.async('blob');
- const extension = this.getFileExtension(zipEntry.name);
+ const extension = this.getFileExtension(name);
const fileBase64 = await zipEntry.async('base64');
- this.mediaFiles[zipEntry.name] = `data:image/${extension};base64,${fileBase64}`;
+ this.mediaFiles[name] = `data:image/${extension};base64,${fileBase64}`;
- const file = new File([blob], zipEntry.name, { type: blob.type });
- const imageUrl = URL.createObjectURL(file);
- this.media[zipEntry.name] = imageUrl;
+ const fileObj = new File([blob], name, { type: blob.type });
+ const imageUrl = URL.createObjectURL(fileObj);
+ this.media[name] = imageUrl;
}
- } else if (zipEntry.name.startsWith('word/fonts') && zipEntry.name !== 'word/fonts/') {
+ } else if (name.startsWith('word/fonts') && name !== 'word/fonts/') {
+ // Font files
const uint8array = await zipEntry.async('uint8array');
- this.fonts[zipEntry.name] = uint8array;
+ this.fonts[name] = uint8array;
}
}
diff --git a/packages/super-editor/src/core/DocxZipper.test.js b/packages/super-editor/src/core/DocxZipper.test.js
index 654e34066..28755f280 100644
--- a/packages/super-editor/src/core/DocxZipper.test.js
+++ b/packages/super-editor/src/core/DocxZipper.test.js
@@ -2,6 +2,7 @@ import path from 'path';
import fs from 'fs';
import { describe, it, expect, beforeEach } from 'vitest';
import DocxZipper from './DocxZipper';
+import JSZip from 'jszip';
async function readFileAsBuffer(filePath) {
const resolvedPath = path.resolve(__dirname, filePath);
@@ -48,3 +49,62 @@ describe('DocxZipper - file extraction', () => {
expect(documentXml).toBeTruthy();
});
});
+
+// Helper to build a UTF-16LE Buffer with BOM (0xFF 0xFE), mimicking how some
+// producers (e.g. Word) emit customXml parts.
+function utf16leWithBOM(str) {
+ const bom = Buffer.from([0xff, 0xfe]);
+ const body = Buffer.from(str, 'utf16le');
+ return Buffer.concat([bom, body]);
+}
+
+describe('DocxZipper - UTF-16 XML handling', () => {
+ let zipper;
+ beforeEach(() => {
+ zipper = new DocxZipper();
+ });
+
+ it('decodes a UTF-16LE customXml part correctly (was failing before fix)', async () => {
+ const zip = new JSZip();
+
+ // Minimal [Content_Types].xml to look like a docx
+ const contentTypes = `
+
+
+
+
+ `;
+ zip.file('[Content_Types].xml', contentTypes);
+
+ // A basic UTF-8 document.xml so there's at least one normal XML entry
+ const documentXml = `
+
+ Hello
+ `;
+ zip.file('word/document.xml', documentXml);
+
+ // The problematic UTF-16LE customXml item
+ const customXmlUtf16 = `
+
+ TELEKOM!4176814.1
+ A675398
+ GUDRUN.JORDAN@TELEKOM.DE
+ 2023-07-06T15:09:00.0000000+02:00
+ TELEKOM
+`;
+ zip.file('customXml/item2.xml', utf16leWithBOM(customXmlUtf16));
+
+ // Generate the zip as a Node buffer and feed it to the zipper
+ const buf = await zip.generateAsync({ type: 'nodebuffer' });
+ const files = await zipper.getDocxData(buf /* isNode not needed for XML */);
+
+ // Find the customXml item
+ const item2 = files.find((f) => f.name === 'customXml/item2.xml');
+ expect(item2).toBeTruthy();
+
+ // ✅ With the fix, content is a clean JS string:
+ expect(item2.content).toContain(' /\.xml$|\.rels$/i.test(name);
+
+/**
+ * Hex dump for optional debugging.
+ * Renders the first `n` bytes as space-separated two-digit lowercase hex pairs.
+ * @param {Uint8Array|ArrayBuffer} bytes - Raw bytes (a Node Buffer also works:
+ *   it is a Uint8Array subclass, so the instanceof branch catches it)
+ * @param {number} n - Maximum number of bytes to render (default 32)
+ * @returns {string} Hex dump, e.g. "ff fe 3c 00"
+ */
+export function hex(bytes, n = 32) {
+ const u8 = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
+ return Array.from(u8.slice(0, n))
+ .map((b) => b.toString(16).padStart(2, '0'))
+ .join(' ');
+}
+
+/**
+ * Try to detect encoding by BOM / null density.
+ * The returned labels are valid TextDecoder encoding names.
+ * @param {Uint8Array} u8 - Raw bytes of the part to inspect
+ * @returns {string} Detected encoding: 'utf-16le', 'utf-16be', or 'utf-8'
+ */
+export function sniffEncoding(u8) {
+ // A byte-order mark, when present, is authoritative.
+ if (u8.length >= 2) {
+ const b0 = u8[0],
+ b1 = u8[1];
+ if (b0 === 0xff && b1 === 0xfe) return 'utf-16le';
+ if (b0 === 0xfe && b1 === 0xff) return 'utf-16be';
+ }
+ // Heuristic: lots of NULs near the start → likely UTF-16
+ // (ASCII-range text in UTF-16 carries a 0x00 in every other byte, so more
+ // than 16 NULs within the first 64 bytes strongly suggests BOM-less UTF-16).
+ // NOTE(review): this fallback always assumes little-endian; a BOM-less
+ // UTF-16BE part (NULs at even offsets) would be misdecoded. If that input
+ // can occur, check the parity of the NUL offsets to pick the endianness —
+ // TODO confirm whether BOM-less UTF-16BE docx parts exist in practice.
+ let nul = 0;
+ for (let i = 0; i < Math.min(64, u8.length); i++) if (u8[i] === 0) nul++;
+ if (nul > 16) return 'utf-16le';
+ return 'utf-8';
+}
+
+/**
+ * Remove a leading BOM (U+FEFF) from an already-decoded JS string.
+ * Empty strings and other falsy inputs are returned unchanged — the
+ * truthiness guard short-circuits before charCodeAt is evaluated.
+ * @param {string} str - Decoded text, possibly starting with a BOM
+ * @returns {string} Cleaned string without BOM
+ */
+export function stripBOM(str) {
+ return str && str.charCodeAt(0) === 0xfeff ? str.slice(1) : str;
+}
+
+/**
+ * Decode XML/RELS content to a clean JS string.
+ * Accepts: string | Uint8Array | ArrayBuffer (Node Buffer, DataView, and any
+ * other TypedArray are also accepted — they are all ArrayBuffer views).
+ * The encoding is chosen via sniffEncoding (BOM first, then NUL-density
+ * heuristic), and a leading BOM is stripped from the decoded result.
+ * @param {string|Uint8Array|ArrayBuffer} content - Raw part content
+ * @returns {string} Clean XML string without a leading BOM
+ * @throws {Error} When content is not a string, bytes view, or ArrayBuffer
+ */
+export function ensureXmlString(content) {
+ // Already decoded — just drop a stray BOM character.
+ if (typeof content === 'string') return stripBOM(content);
+
+ // Accept: Buffer, Uint8Array, DataView, any TypedArray, or ArrayBuffer
+ let u8 = null;
+
+ if (content && typeof content === 'object') {
+ if (content instanceof Uint8Array) {
+ u8 = content;
+ } else if (typeof Buffer !== 'undefined' && Buffer.isBuffer && Buffer.isBuffer(content)) {
+ // Node Buffer
+ // Wrap the underlying ArrayBuffer without copying; byteOffset/byteLength
+ // must be honoured because small Buffers share pooled memory.
+ u8 = new Uint8Array(content.buffer, content.byteOffset, content.byteLength);
+ } else if (ArrayBuffer.isView && ArrayBuffer.isView(content)) {
+ // Any ArrayBufferView: DataView or other TypedArray
+ u8 = new Uint8Array(content.buffer, content.byteOffset, content.byteLength);
+ } else if (content.constructor && (content instanceof ArrayBuffer || content.constructor.name === 'ArrayBuffer')) {
+ // instanceof can fail for ArrayBuffers created in another realm
+ // (iframe / worker / vm), hence the extra constructor-name check.
+ u8 = new Uint8Array(content);
+ }
+ }
+
+ if (!u8) throw new Error('Unsupported content type for XML');
+
+ const enc = sniffEncoding(u8);
+ let xml = new TextDecoder(enc).decode(u8);
+ return stripBOM(xml);
+}
diff --git a/packages/super-editor/src/core/encoding-helpers.test.js b/packages/super-editor/src/core/encoding-helpers.test.js
new file mode 100644
index 000000000..7a0a154bf
--- /dev/null
+++ b/packages/super-editor/src/core/encoding-helpers.test.js
@@ -0,0 +1,142 @@
+import { describe, it, expect } from 'vitest';
+import { isXmlLike, hex, sniffEncoding, stripBOM, ensureXmlString } from './encoding-helpers.js';
+
+// Build a UTF-16LE Buffer prefixed with the little-endian BOM (0xFF 0xFE).
+function utf16leWithBOM(str) {
+ const bom = Buffer.from([0xff, 0xfe]);
+ const body = Buffer.from(str, 'utf16le');
+ return Buffer.concat([bom, body]);
+}
+
+// Build a UTF-16BE Buffer prefixed with the big-endian BOM (0xFE 0xFF).
+// Node has no 'utf16be' encoding, so encode as LE and swap each byte pair.
+function utf16beWithBOM(str) {
+ const le = Buffer.from(str, 'utf16le');
+ const swapped = Buffer.alloc(le.length);
+ for (let i = 0; i < le.length; i += 2) {
+ swapped[i] = le[i + 1];
+ swapped[i + 1] = le[i];
+ }
+ const bom = Buffer.from([0xfe, 0xff]);
+ return Buffer.concat([bom, swapped]);
+}
+
+// Raw UTF-16LE bytes with no BOM, to exercise sniffEncoding's fallback path.
+function noBOMUtf16leBytes(str) {
+ // UTF-16LE WITHOUT a BOM (to trigger the NUL-heuristic)
+ return Buffer.from(str, 'utf16le');
+}
+
+describe('isXmlLike', () => {
+ it('matches .xml and .rels', () => {
+ expect(isXmlLike('word/document.xml')).toBe(true);
+ expect(isXmlLike('word/_rels/document.xml.rels')).toBe(true);
+ expect(isXmlLike('docProps/core.xml')).toBe(true);
+ });
+ it('rejects non-xml', () => {
+ expect(isXmlLike('word/media/image1.png')).toBe(false);
+ expect(isXmlLike('customXml/item1.xml.bin')).toBe(false);
+ expect(isXmlLike('word/fonts/font1.odttf')).toBe(false);
+ });
+});
+
+describe('hex', () => {
+ it('renders hex dump of first N bytes', () => {
+ const u8 = new Uint8Array([0xff, 0xfe, 0x3c, 0x00, 0x3f, 0x00]);
+ expect(hex(u8, 6)).toBe('ff fe 3c 00 3f 00');
+ });
+});
+
+describe('sniffEncoding', () => {
+ // NOTE(review): the string literals below appear to have lost their
+ // XML content (angle-bracket text stripped somewhere in the pipeline) —
+ // as written they are empty strings. Restore real XML snippets; verify.
+ it('detects UTF-16LE by BOM', () => {
+ const u8 = utf16leWithBOM('');
+ expect(sniffEncoding(u8)).toBe('utf-16le');
+ });
+ it('detects UTF-16BE by BOM', () => {
+ const u8 = utf16beWithBOM('');
+ expect(sniffEncoding(u8)).toBe('utf-16be');
+ });
+ it('defaults to utf-8 for plain ASCII/UTF-8', () => {
+ const u8 = new TextEncoder().encode('');
+ expect(sniffEncoding(u8)).toBe('utf-8');
+ });
+ it('heuristically detects UTF-16 (no BOM) via NUL density', () => {
+ // NOTE(review): with an empty payload there are zero NUL bytes, so
+ // sniffEncoding returns 'utf-8' and this assertion fails — the sample
+ // needs at least 17 ASCII chars to exceed 16 NULs within the first
+ // 64 bytes. Confirm and restore the original XML sample.
+ const u8 = noBOMUtf16leBytes('');
+ // Our heuristic returns 'utf-16le' for lots of NULs
+ expect(sniffEncoding(u8)).toBe('utf-16le');
+ });
+});
+
+describe('stripBOM', () => {
+ it('removes U+FEFF if present', () => {
+ const s = '\uFEFF';
+ expect(stripBOM(s)).toBe('');
+ });
+ it('no-ops when no BOM present', () => {
+ const s = '';
+ expect(stripBOM(s)).toBe(s);
+ });
+});
+
+describe('ensureXmlString', () => {
+ it('returns same string when given a plain XML string', () => {
+ const s = '';
+ expect(ensureXmlString(s)).toBe(s);
+ });
+
+ it('strips leading BOM from a decoded string', () => {
+ const s = '\uFEFF';
+ expect(ensureXmlString(s)).toBe('');
+ });
+
+ it('decodes UTF-8 bytes', () => {
+ const u8 = new TextEncoder().encode('héllo');
+ const out = ensureXmlString(u8);
+ expect(out).toContain(' {
+ const u8 = utf16leWithBOM('v');
+ const out = ensureXmlString(u8);
+ expect(out.toLowerCase()).toContain('encoding="utf-16"');
+ expect(out).toContain('');
+ expect(out).not.toMatch(/\u0000/);
+ });
+
+ it('decodes UTF-16BE with BOM bytes', () => {
+ const u8 = utf16beWithBOM('v');
+ const out = ensureXmlString(u8);
+ expect(out.toLowerCase()).toContain('encoding="utf-16"');
+ expect(out).toContain('');
+ expect(out).not.toMatch(/\u0000/);
+ });
+
+ it('decodes UTF-16 (no BOM) via heuristic', () => {
+ const u8 = noBOMUtf16leBytes('NOBOM');
+ const out = ensureXmlString(u8);
+ expect(out).toContain('');
+ expect(out).toContain('NOBOM');
+ expect(out).not.toMatch(/\u0000/);
+ });
+
+ it('accepts ArrayBuffer input', () => {
+ const u8 = new TextEncoder().encode('');
+ const out = ensureXmlString(u8.buffer);
+ expect(out).toContain('');
+ });
+
+ it('throws on unsupported content types', () => {
+ expect(() => ensureXmlString(12345)).toThrow(/Unsupported content type/);
+ });
+
+ it('decodes from Node Buffer (utf-8)', () => {
+ const buf = Buffer.from('', 'utf8');
+ const out = ensureXmlString(buf);
+ expect(out).toContain('');
+ });
+});
+
+// NOTE(review): this case is an exact duplicate of the "decodes from Node
+// Buffer (utf-8)" test already inside the ensureXmlString describe block —
+// consider keeping only one copy. Its string literals also appear stripped:
+// toContain('') is vacuously true, so the test asserts nothing meaningful.
+describe('ensureXmlString cross-env', () => {
+ it('decodes from Node Buffer (utf-8)', () => {
+ const buf = Buffer.from('', 'utf8');
+ const out = ensureXmlString(buf);
+ expect(out).toContain('');
+ });
+});