Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 22 additions & 26 deletions packages/super-editor/src/core/DocxZipper.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import xmljs from 'xml-js';
import JSZip from 'jszip';
import { getContentTypesFromXml } from './super-converter/helpers.js';
import { ensureXmlString, isXmlLike } from './encoding-helpers.js';

/**
* Class to handle unzipping and zipping of docx files
Expand Down Expand Up @@ -37,42 +38,37 @@ class DocxZipper {
const extractedFiles = await this.unzip(file);
const files = Object.entries(extractedFiles.files);

const mediaObjects = {};
const validTypes = ['xml', 'rels'];
for (const file of files) {
const [, zipEntry] = file;

if (validTypes.some((validType) => zipEntry.name.endsWith(validType))) {
const content = await zipEntry.async('string');
this.files.push({
name: zipEntry.name,
content,
});
for (const [, zipEntry] of files) {
const name = zipEntry.name;

if (isXmlLike(name)) {
// Read raw bytes and decode (handles UTF-8 & UTF-16)
const u8 = await zipEntry.async('uint8array');
const content = ensureXmlString(u8);
this.files.push({ name, content });
} else if (
(zipEntry.name.startsWith('word/media') && zipEntry.name !== 'word/media/') ||
(zipEntry.name.startsWith('media') && zipEntry.name !== 'media/')
(name.startsWith('word/media') && name !== 'word/media/') ||
(name.startsWith('media') && name !== 'media/')
) {
// If we are in node, we need to convert the buffer to base64
// Media files
if (isNode) {
const buffer = await zipEntry.async('nodebuffer');
const fileBase64 = buffer.toString('base64');
this.mediaFiles[zipEntry.name] = fileBase64;
}

// If we are in the browser, we can use the base64 directly
else {
this.mediaFiles[name] = fileBase64;
} else {
const blob = await zipEntry.async('blob');
const extension = this.getFileExtension(zipEntry.name);
const extension = this.getFileExtension(name);
const fileBase64 = await zipEntry.async('base64');
this.mediaFiles[zipEntry.name] = `data:image/${extension};base64,${fileBase64}`;
this.mediaFiles[name] = `data:image/${extension};base64,${fileBase64}`;

const file = new File([blob], zipEntry.name, { type: blob.type });
const imageUrl = URL.createObjectURL(file);
this.media[zipEntry.name] = imageUrl;
const fileObj = new File([blob], name, { type: blob.type });
const imageUrl = URL.createObjectURL(fileObj);
this.media[name] = imageUrl;
}
} else if (zipEntry.name.startsWith('word/fonts') && zipEntry.name !== 'word/fonts/') {
} else if (name.startsWith('word/fonts') && name !== 'word/fonts/') {
// Font files
const uint8array = await zipEntry.async('uint8array');
this.fonts[zipEntry.name] = uint8array;
this.fonts[name] = uint8array;
}
}

Expand Down
60 changes: 60 additions & 0 deletions packages/super-editor/src/core/DocxZipper.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import path from 'path';
import fs from 'fs';
import { describe, it, expect, beforeEach } from 'vitest';
import DocxZipper from './DocxZipper';
import JSZip from 'jszip';

async function readFileAsBuffer(filePath) {
const resolvedPath = path.resolve(__dirname, filePath);
Expand Down Expand Up @@ -48,3 +49,62 @@ describe('DocxZipper - file extraction', () => {
expect(documentXml).toBeTruthy();
});
});

// Helper to build a UTF-16LE Buffer with BOM
function utf16leWithBOM(str) {
  const body = Buffer.from(str, 'utf16le');
  const out = Buffer.alloc(body.length + 2);
  out[0] = 0xff;
  out[1] = 0xfe;
  body.copy(out, 2);
  return out;
}

// Regression test: customXml parts saved as UTF-16 (e.g. by document
// management systems like iManage) used to be decoded as if they were
// UTF-8, yielding NUL-riddled garbage that broke XML parsing downstream.
describe('DocxZipper - UTF-16 XML handling', () => {
  let zipper;
  beforeEach(() => {
    zipper = new DocxZipper();
  });

  it('decodes a UTF-16LE customXml part correctly (was failing before fix)', async () => {
    // Build a minimal in-memory docx instead of shipping a binary fixture.
    const zip = new JSZip();

    // Minimal [Content_Types].xml to look like a docx
    const contentTypes = `<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>`;
    zip.file('[Content_Types].xml', contentTypes);

    // A basic UTF-8 document.xml so there's at least one normal XML entry
    const documentXml = `<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>Hello</w:t></w:r></w:p></w:body>
</w:document>`;
    zip.file('word/document.xml', documentXml);

    // The problematic UTF-16LE customXml item
    const customXmlUtf16 = `<?xml version="1.0" encoding="utf-16"?>
<properties xmlns="http://www.imanage.com/work/xmlschema">
<documentid>TELEKOM!4176814.1</documentid>
<senderid>A675398</senderid>
<senderemail>GUDRUN.JORDAN@TELEKOM.DE</senderemail>
<lastmodified>2023-07-06T15:09:00.0000000+02:00</lastmodified>
<database>TELEKOM</database>
</properties>`;
    zip.file('customXml/item2.xml', utf16leWithBOM(customXmlUtf16));

    // Generate the zip as a Node buffer and feed it to the zipper
    const buf = await zip.generateAsync({ type: 'nodebuffer' });
    const files = await zipper.getDocxData(buf /* isNode not needed for XML */);

    // Find the customXml item
    const item2 = files.find((f) => f.name === 'customXml/item2.xml');
    expect(item2).toBeTruthy();

    // ✅ With the fix, content is a clean JS string:
    expect(item2.content).toContain('<?xml'); // prolog present
    expect(item2.content).toContain('<properties'); // real tag (no NULs interleaved)
    expect(item2.content).not.toMatch(/\u0000/); // no embedded NULs
    expect(item2.content.toLowerCase()).toContain('encoding="utf-16"');
  });
});
80 changes: 80 additions & 0 deletions packages/super-editor/src/core/encoding-helpers.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/**
 * Test whether a zip entry name refers to an XML-ish part (.xml or .rels).
 * @param {string} name - Zip entry name, e.g. 'word/document.xml'
 * @returns {boolean} True if the name has a .xml or .rels extension
 */
export const isXmlLike = (name) => /\.(?:xml|rels)$/i.test(name);

/**
 * Render the first `n` bytes of a buffer as a space-separated hex dump.
 * Intended for ad-hoc debugging of encoding issues.
 * @param {Uint8Array|ArrayBuffer} bytes - Raw bytes to dump
 * @param {number} [n=32] - Maximum number of bytes to include
 * @returns {string} Hex dump, e.g. 'ff fe 3c 00'
 */
export function hex(bytes, n = 32) {
  const view = bytes instanceof Uint8Array ? bytes : new Uint8Array(bytes);
  const limit = Math.min(n, view.length);
  const parts = [];
  for (let i = 0; i < limit; i += 1) {
    parts.push(view[i].toString(16).padStart(2, '0'));
  }
  return parts.join(' ');
}

/**
 * Detect the text encoding of raw XML bytes.
 *
 * Order of checks:
 *   1. Explicit BOM (FF FE → UTF-16LE, FE FF → UTF-16BE).
 *   2. NUL-byte heuristic for BOM-less UTF-16: ASCII-heavy XML encoded as
 *      UTF-16 has a NUL in every other byte. The parity of the NUL offsets
 *      distinguishes byte order: '<' is 3C 00 in LE (NULs at odd offsets)
 *      but 00 3C in BE (NULs at even offsets).
 *   3. Otherwise assume UTF-8 (which also covers plain ASCII).
 *
 * @param {Uint8Array} u8 - Raw bytes of the document
 * @returns {string} One of 'utf-16le', 'utf-16be', 'utf-8'
 */
export function sniffEncoding(u8) {
  if (u8.length >= 2) {
    const b0 = u8[0],
      b1 = u8[1];
    if (b0 === 0xff && b1 === 0xfe) return 'utf-16le';
    if (b0 === 0xfe && b1 === 0xff) return 'utf-16be';
  }
  // Heuristic: lots of NULs near the start → likely UTF-16.
  // Count NULs per byte-offset parity so BOM-less BE is not misread as LE
  // (the old version returned 'utf-16le' unconditionally, which decoded
  // big-endian input to garbage).
  let evenNul = 0;
  let oddNul = 0;
  const limit = Math.min(64, u8.length);
  for (let i = 0; i < limit; i++) {
    if (u8[i] === 0) {
      if (i % 2 === 0) evenNul++;
      else oddNul++;
    }
  }
  if (evenNul + oddNul > 16) return evenNul > oddNul ? 'utf-16be' : 'utf-16le';
  return 'utf-8';
}

/**
 * Drop a leading U+FEFF byte-order mark from an already-decoded string.
 * Falsy inputs (null / undefined / '') are returned unchanged.
 * @param {string} str - Decoded text, possibly starting with a BOM
 * @returns {string} The string without a leading BOM
 */
export function stripBOM(str) {
  if (!str) return str;
  return str.charCodeAt(0) === 0xfeff ? str.substring(1) : str;
}

/**
 * Decode XML/RELS content to a clean JS string, handling UTF-8 and UTF-16
 * (with or without BOM) transparently.
 *
 * Accepts: string | Uint8Array (incl. Node Buffer) | any ArrayBufferView | ArrayBuffer
 * @param {string|Uint8Array|ArrayBuffer} content - Raw part content
 * @returns {string} Clean XML string without a leading BOM
 * @throws {Error} If the content is not a string or a recognized byte container
 */
export function ensureXmlString(content) {
  // Already decoded — just make sure no BOM survived.
  if (typeof content === 'string') return stripBOM(content);

  let u8 = null;

  if (content && typeof content === 'object') {
    if (content instanceof Uint8Array) {
      // Also covers Node Buffer, which is a Uint8Array subclass, so no
      // separate Buffer.isBuffer branch is needed.
      u8 = content;
    } else if (ArrayBuffer.isView(content)) {
      // Any other ArrayBufferView: DataView or a non-byte TypedArray.
      u8 = new Uint8Array(content.buffer, content.byteOffset, content.byteLength);
    } else if (content instanceof ArrayBuffer || (content.constructor && content.constructor.name === 'ArrayBuffer')) {
      // Plain ArrayBuffer. The instanceof check is no longer gated behind
      // `content.constructor`; the name check still catches ArrayBuffers
      // created in another realm (e.g. an iframe or a worker).
      u8 = new Uint8Array(content);
    }
  }

  if (!u8) throw new Error('Unsupported content type for XML');

  const enc = sniffEncoding(u8);
  const xml = new TextDecoder(enc).decode(u8);
  return stripBOM(xml);
}
142 changes: 142 additions & 0 deletions packages/super-editor/src/core/encoding-helpers.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import { describe, it, expect } from 'vitest';
import { isXmlLike, hex, sniffEncoding, stripBOM, ensureXmlString } from './encoding-helpers.js';

// Build a Buffer holding `str` encoded as UTF-16LE, prefixed with the FF FE BOM.
function utf16leWithBOM(str) {
  return Buffer.concat([Buffer.of(0xff, 0xfe), Buffer.from(str, 'utf16le')]);
}

// Build a Buffer holding `str` encoded as UTF-16BE, prefixed with the FE FF BOM.
// Node has no native 'utf16be' encoder, so encode as LE on a private copy and
// swap each 16-bit unit's bytes in place.
function utf16beWithBOM(str) {
  const swapped = Buffer.from(Buffer.from(str, 'utf16le')).swap16();
  return Buffer.concat([Buffer.of(0xfe, 0xff), swapped]);
}

// UTF-16LE bytes WITHOUT a BOM (exercises the NUL-density heuristic).
const noBOMUtf16leBytes = (str) => Buffer.from(str, 'utf16le');

// Unit tests for the .xml / .rels filename matcher.
describe('isXmlLike', () => {
  it('matches .xml and .rels', () => {
    expect(isXmlLike('word/document.xml')).toBe(true);
    expect(isXmlLike('word/_rels/document.xml.rels')).toBe(true);
    expect(isXmlLike('docProps/core.xml')).toBe(true);
  });
  it('rejects non-xml', () => {
    // The suffix must be terminal: 'item1.xml.bin' must NOT match.
    expect(isXmlLike('word/media/image1.png')).toBe(false);
    expect(isXmlLike('customXml/item1.xml.bin')).toBe(false);
    expect(isXmlLike('word/fonts/font1.odttf')).toBe(false);
  });
});

// Sanity check of the hex-dump helper used for encoding debugging.
describe('hex', () => {
  it('renders hex dump of first N bytes', () => {
    // FF FE is a UTF-16LE BOM followed by '<?' in UTF-16LE.
    const u8 = new Uint8Array([0xff, 0xfe, 0x3c, 0x00, 0x3f, 0x00]);
    expect(hex(u8, 6)).toBe('ff fe 3c 00 3f 00');
  });
});

// Covers BOM-based detection, the UTF-8 default, and the BOM-less
// NUL-density heuristic.
describe('sniffEncoding', () => {
  it('detects UTF-16LE by BOM', () => {
    const u8 = utf16leWithBOM('<?xml version="1.0"?>');
    expect(sniffEncoding(u8)).toBe('utf-16le');
  });
  it('detects UTF-16BE by BOM', () => {
    const u8 = utf16beWithBOM('<?xml version="1.0"?>');
    expect(sniffEncoding(u8)).toBe('utf-16be');
  });
  it('defaults to utf-8 for plain ASCII/UTF-8', () => {
    const u8 = new TextEncoder().encode('<?xml version="1.0"?><a/>');
    expect(sniffEncoding(u8)).toBe('utf-8');
  });
  it('heuristically detects UTF-16 (no BOM) via NUL density', () => {
    const u8 = noBOMUtf16leBytes('<?xml version="1.0"?><root/>');
    // Our heuristic returns 'utf-16le' for lots of NULs
    expect(sniffEncoding(u8)).toBe('utf-16le');
  });
});

// BOM stripping on already-decoded strings.
describe('stripBOM', () => {
  it('removes U+FEFF if present', () => {
    const s = '\uFEFF<?xml?><r/>';
    expect(stripBOM(s)).toBe('<?xml?><r/>');
  });
  it('no-ops when no BOM present', () => {
    const s = '<?xml?><r/>';
    expect(stripBOM(s)).toBe(s);
  });
});

// End-to-end decoding tests for ensureXmlString across input types
// (string, Uint8Array, ArrayBuffer, Node Buffer) and encodings.
describe('ensureXmlString', () => {
  it('returns same string when given a plain XML string', () => {
    const s = '<?xml version="1.0"?><r/>';
    expect(ensureXmlString(s)).toBe(s);
  });

  it('strips leading BOM from a decoded string', () => {
    const s = '\uFEFF<?xml version="1.0"?><r/>';
    expect(ensureXmlString(s)).toBe('<?xml version="1.0"?><r/>');
  });

  it('decodes UTF-8 bytes', () => {
    // Non-ASCII character verifies real UTF-8 decoding, not just ASCII.
    const u8 = new TextEncoder().encode('<?xml version="1.0"?><root>héllo</root>');
    const out = ensureXmlString(u8);
    expect(out).toContain('<?xml');
    expect(out).toContain('héllo');
  });

  it('decodes UTF-16LE with BOM bytes', () => {
    const u8 = utf16leWithBOM('<?xml version="1.0" encoding="utf-16"?><props><k>v</k></props>');
    const out = ensureXmlString(u8);
    expect(out.toLowerCase()).toContain('encoding="utf-16"');
    expect(out).toContain('<props>');
    // A mis-decoded UTF-16 payload would contain interleaved NULs.
    expect(out).not.toMatch(/\u0000/);
  });

  it('decodes UTF-16BE with BOM bytes', () => {
    const u8 = utf16beWithBOM('<?xml version="1.0" encoding="utf-16"?><props><k>v</k></props>');
    const out = ensureXmlString(u8);
    expect(out.toLowerCase()).toContain('encoding="utf-16"');
    expect(out).toContain('<props>');
    expect(out).not.toMatch(/\u0000/);
  });

  it('decodes UTF-16 (no BOM) via heuristic', () => {
    const u8 = noBOMUtf16leBytes('<?xml version="1.0"?><root>NOBOM</root>');
    const out = ensureXmlString(u8);
    expect(out).toContain('<root>');
    expect(out).toContain('NOBOM');
    expect(out).not.toMatch(/\u0000/);
  });

  it('accepts ArrayBuffer input', () => {
    const u8 = new TextEncoder().encode('<?xml version="1.0"?><r/>');
    const out = ensureXmlString(u8.buffer);
    expect(out).toContain('<r/>');
  });

  it('throws on unsupported content types', () => {
    expect(() => ensureXmlString(12345)).toThrow(/Unsupported content type/);
  });

  it('decodes from Node Buffer (utf-8)', () => {
    const buf = Buffer.from('<?xml version="1.0"?><root/>', 'utf8');
    const out = ensureXmlString(buf);
    expect(out).toContain('<root/>');
  });
});

// NOTE(review): this case duplicates the 'decodes from Node Buffer (utf-8)'
// test already present inside the main ensureXmlString suite — consider
// removing one of the two, or making this one exercise a genuinely
// different environment (e.g. a browser-style Uint8Array without Buffer).
describe('ensureXmlString cross-env', () => {
  it('decodes from Node Buffer (utf-8)', () => {
    const buf = Buffer.from('<?xml version="1.0"?><root/>', 'utf8');
    const out = ensureXmlString(buf);
    expect(out).toContain('<root/>');
  });
});
2 changes: 1 addition & 1 deletion packages/superdoc/package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "@harbour-enterprises/superdoc",
"type": "module",
"version": "0.16.0-next.6",
"version": "0.16.0",
"license": "AGPL-3.0",
"readme": "../../README.md",
"files": [
Expand Down
Loading