Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 44 additions & 2 deletions lib/utf8.js
Original file line number Diff line number Diff line change
Expand Up @@ -255,21 +255,63 @@ Utf8DecodeWorker.prototype.flush = function () {
exports.Utf8DecodeWorker = Utf8DecodeWorker;

/**
 * A worker to encode string chunks into utf8 encoded binary chunks.
 *
 * Besides the state set up by GenericWorker, an instance keeps a single
 * `leftOver` slot that is `null` when nothing is pending.
 * @constructor
 */
function Utf8EncodeWorker() {
GenericWorker.call(this, "utf-8 encode");
// Holds a trailing high surrogate (one UTF-16 code unit) cut off from the
// end of the previous chunk, or null when nothing is pending. This covers
// the case where the upstream worker splits a surrogate pair across chunk
// boundaries (substring() operates on UTF-16 code units, not code points).
this.leftOver = null;
}
utils.inherits(Utf8EncodeWorker, GenericWorker);

/**
 * Encode one string chunk to UTF-8 and push the result downstream.
 *
 * A surrogate pair can be split across two chunks. To keep the pair
 * together, a high surrogate found at the very end of the data is held back
 * in `this.leftOver` and prepended to the next chunk instead of being
 * encoded on its own.
 *
 * @param {Object} chunk the chunk to process. `chunk.data` is read as a
 * string and `chunk.meta` is forwarded unchanged.
 * @see GenericWorker.processChunk
 */
Utf8EncodeWorker.prototype.processChunk = function (chunk) {
    var data = chunk.data;

    // Prepend any high surrogate held back from the previous chunk.
    if (this.leftOver !== null) {
        data = this.leftOver + data;
        this.leftOver = null;
    }

    // If the data ends with a high surrogate (0xD800-0xDBFF), hold it back
    // so it can be encoded together with the low surrogate that follows.
    if (data.length > 0) {
        var lastCharCode = data.charCodeAt(data.length - 1);
        // High surrogates have the bit pattern 110110xxxxxxxxxx.
        if ((lastCharCode & 0xfc00) === 0xd800) {
            this.leftOver = data.charAt(data.length - 1);
            data = data.substring(0, data.length - 1);
        }
    }

    // Encode exactly once, and only the surrogate-safe string: encoding the
    // raw chunk.data as well would be wasted work on a possibly split pair.
    this.push({
        data : exports.utf8encode(data),
        meta : chunk.meta
    });
};

/**
 * Emit whatever is still pending once the input is exhausted.
 *
 * A high surrogate held back by processChunk that never received its low
 * surrogate is malformed input. It is handed to utf8encode() as-is (best
 * effort), so the exact bytes produced depend on that encoder. The result
 * is pushed with an empty meta object and the pending slot is cleared.
 *
 * @see GenericWorker.flush
 */
Utf8EncodeWorker.prototype.flush = function () {
    var pending = this.leftOver;

    // Nothing was held back: there is nothing to emit.
    if (pending === null) {
        return;
    }

    this.push({
        data : exports.utf8encode(pending),
        meta : {}
    });
    this.leftOver = null;
};
exports.Utf8EncodeWorker = Utf8EncodeWorker;
47 changes: 47 additions & 0 deletions test/asserts/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,50 @@ JSZipTestUtils.testZipFile("Zip text file and UTF-8, Pile Of Poo test", "ref/pil
done();
})["catch"](JSZipTestUtils.assertNoError);
});

QUnit.test("Astral characters at chunk boundary should not produce CESU-8", function(assert) {
    // A surrogate pair split across two chunks must come out as a single
    // 4-byte UTF-8 sequence, not as two 3-byte CESU-8 sequences (one per
    // surrogate half).

    // Chunk size the test is built around.
    // NOTE(review): presumed to match JSZip's internal chunk size; confirm.
    var BLOCK_SIZE = 16 * 1024;

    /**
     * Report whether the bytes contain an "ED A0..BF" lead pair, i.e. the
     * start of a UTF-8-encoded surrogate code unit (a CESU-8 marker).
     */
    function containsEncodedSurrogate(bytes) {
        var limit = bytes.length - 2;
        for (var pos = 0; pos < limit; pos++) {
            var second = bytes[pos + 1];
            if (bytes[pos] === 0xED && second >= 0xA0 && second <= 0xBF) {
                return true;
            }
        }
        return false;
    }

    // BLOCK_SIZE - 1 filler characters, so that the high surrogate of the
    // emoji is the last code unit of the first BLOCK_SIZE-sized slice.
    var padding = new Array(BLOCK_SIZE).join("x");
    // U+1F600 (😀) is the surrogate pair \uD83D\uDE00 in JavaScript.
    var emoji = "\uD83D\uDE00";
    var testStr = padding + emoji + "end";

    var zip = new JSZip();
    zip.file("test.txt", testStr);

    return zip.generateAsync({type:"uint8array"})
        .then(function(zipData) {
            return JSZip.loadAsync(zipData);
        })
        .then(function(loadedZip) {
            return loadedZip.file("test.txt").async("uint8array");
        })
        .then(function(bytes) {
            var foundCesu8 = containsEncodedSurrogate(bytes);
            assert.ok(!foundCesu8, "No CESU-8 sequences should be present");

            // Second pass: regenerate the archive and check the text itself.
            return JSZip.loadAsync(zip.generateAsync({type:"uint8array"}));
        })
        .then(function(reloadedZip) {
            return reloadedZip.file("test.txt").async("string");
        })
        .then(function(content) {
            assert.equal(content, testStr, "Content should round-trip correctly");
        });
});