diff --git a/README.md b/README.md index d18a341..b71d50d 100644 --- a/README.md +++ b/README.md @@ -189,6 +189,8 @@ Returns a function `encode(string)` that encodes a string to bytes. In `'fatal'` mode (default), will throw on non well-formed strings or any codepoints which could not be encoded in the target encoding. +In `'replacement'` mode, all unmapped codepoints and unpaired surrogates will be replaced with `U+003F` (codepoint for '?'). + ##### `latin1toString(arr)` Decode `iso-8859-1` bytes to a string. diff --git a/single-byte.js b/single-byte.js index 5462587..7af224a 100644 --- a/single-byte.js +++ b/single-byte.js @@ -61,38 +61,59 @@ export function createSinglebyteDecoder(encoding, loose = false) { const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex -function encode(s, m) { +function encode(s, m, loose) { const len = s.length const x = new Uint8Array(len) let i = nativeEncoder ? 0 : encodeAsciiPrefix(x, s) - for (const len3 = len - 3; i < len3; i += 4) { + if (!m || m.length < 256) return null // perf + const len3 = len - 3 + while (i < len3) { const x0 = s.charCodeAt(i), x1 = s.charCodeAt(i + 1), x2 = s.charCodeAt(i + 2), x3 = s.charCodeAt(i + 3) // prettier-ignore const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore - if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) return null + if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) break x[i] = c0 x[i + 1] = c1 x[i + 2] = c2 x[i + 3] = c3 + i += 4 } for (; i < len; i++) { const x0 = s.charCodeAt(i) const c0 = m[x0] - if (!c0 && x0) return null + if (!c0 && x0) break x[i] = c0 } - return x + if (i === len) return x + if (!loose) return null + let j = i + while (i < len) { + const x0 = s.charCodeAt(i++) + if (x0 >= 0xd8_00 && x0 < 0xdc_00) { + if (i < len) { + const x1 = s.charCodeAt(i) + if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++ + } + x[j++] = 63 // '?' + } else { + const c0 = m[x0] + x[j++] = !c0 && x0 ? 63 : c0 + } + + } + + return j === len ? 
x : x.subarray(0, j) } // fromBase64+btoa path is faster on everything where fromBase64 is fast const useLatin1btoa = Uint8Array.fromBase64 && btoa && !skipWeb export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) { - // TODO: replacement, truncate (replacement will need varying length) - if (mode !== 'fatal') throw new Error('Unsupported mode') + const loose = mode === 'replacement' + if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode') const m = encodeMap(encoding) // asserts const isLatin1 = encoding === 'iso-8859-1' @@ -106,24 +127,21 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) { if (useLatin1btoa && s.length >= 1024 && s.length < 1e8) { try { return Uint8Array.fromBase64(btoa(s)) // fails on non-latin1 - } catch { - throw new TypeError(E_STRICT) - } + } catch {} + } else if (!NON_LATIN.test(s)) { + return encodeLatin1(s) } - if (NON_LATIN.test(s)) throw new TypeError(E_STRICT) - return encodeLatin1(s) - } - - // Instead of an ASCII regex check, encode optimistically - this is faster - // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path - if (nativeEncoder && !NON_LATIN.test(s)) { + if (!loose) throw new TypeError(E_STRICT) + } else if (nativeEncoder && !NON_LATIN.test(s)) { + // Instead of an ASCII regex check, encode optimistically - this is faster + // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path try { return encodeAscii(s, E_STRICT) } catch {} } - const res = encode(s, m) + const res = encode(s, m, loose) if (!res) throw new TypeError(E_STRICT) return res } diff --git a/single-byte.node.js b/single-byte.node.js index 2f68c9c..5a4f009 100644 --- a/single-byte.node.js +++ b/single-byte.node.js @@ -61,54 +61,79 @@ export function createSinglebyteDecoder(encoding, loose = false) { const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex -function 
encode(s, m) { +function encode(s, m, loose) { const len = s.length let i = 0 const b = Buffer.from(s, 'utf-16le') // aligned if (!isLE) b.swap16() const x = new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2) - for (const len3 = len - 3; i < len3; i += 4) { + if (!m || m.length < 256) return null // perf + const len3 = len - 3 + while (i < len3) { const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore - if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) return null // prettier-ignore + if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) break x[i] = c0 x[i + 1] = c1 x[i + 2] = c2 x[i + 3] = c3 + i += 4 } + const mlen = m.length for (; i < len; i++) { const x0 = x[i] + if (x0 >= mlen) break const c0 = m[x0] - if (!c0 && x0) return null + if (!c0 && x0) break x[i] = c0 } - return new Uint8Array(x) + if (i === len) return new Uint8Array(x) + if (!loose) return null + let j = i + while (i < len) { + const x0 = x[i++] + if (x0 >= 0xd8_00 && x0 < 0xdc_00) { + if (i < len) { + const x1 = x[i] + if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++ + } + x[j++] = 63 // '?' + } else if (x0 >= mlen) { + x[j++] = 63 // '?' + } else { + const c0 = m[x0] + x[j++] = !c0 && x0 ? 63 : c0 + } + } + + return new Uint8Array(j === len ? 
x : x.subarray(0, j)) } export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) { - // TODO: replacement, truncate (replacement will need varying length) - if (mode !== 'fatal') throw new Error('Unsupported mode') + const loose = mode === 'replacement' + if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode') const m = encodeMap(encoding) // asserts const isLatin1 = encoding === 'iso-8859-1' return (s) => { if (typeof s !== 'string') throw new TypeError(E_STRING) if (isLatin1) { - if (NON_LATIN.test(s)) throw new TypeError(E_STRICT) - const b = Buffer.from(s, 'latin1') - return new Uint8Array(b.buffer, b.byteOffset, b.byteLength) - } - - // Instead of an ASCII regex check, encode optimistically - this is faster - // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path - if (!NON_LATIN.test(s)) { + if (!NON_LATIN.test(s)) { + const b = Buffer.from(s, 'latin1') + return new Uint8Array(b.buffer, b.byteOffset, b.byteLength) + } + + if (!loose) throw new TypeError(E_STRICT) + } else if (!NON_LATIN.test(s)) { + // Instead of an ASCII regex check, encode optimistically - this is faster + // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength) } - const res = encode(s, m) + const res = encode(s, m, loose) if (!res) throw new TypeError(E_STRICT) return res } diff --git a/tests/single-byte.test.js b/tests/single-byte.test.js index 75c09ea..59b29fe 100644 --- a/tests/single-byte.test.js +++ b/tests/single-byte.test.js @@ -15,7 +15,9 @@ describe('single-byte encodings are supersets of ascii', () => { for (const encoding of encodings) { test(encoding, (t) => { const decoder = createSinglebyteDecoder(encoding) + const decoderLoose = createSinglebyteDecoder(encoding, true) 
const encoder = createSinglebyteEncoder(encoding) + const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' }) for (let i = 0; i < 128; i++) { let str try { @@ -27,7 +29,9 @@ describe('single-byte encodings are supersets of ascii', () => { t.assert.strictEqual(str.length, 1, i) t.assert.strictEqual(str.codePointAt(0), i, i) + t.assert.strictEqual(decoderLoose(Uint8Array.of(i)), str, i) t.assert.deepStrictEqual(encoder(str), Uint8Array.of(i)) + t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(i)) } }) } @@ -84,6 +88,7 @@ describe('single-byte encodings index: Unicode', () => { const decoder = createSinglebyteDecoder(encoding) const decoderLoose = createSinglebyteDecoder(encoding, true) const encoder = createSinglebyteEncoder(encoding) + const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' }) const text = readFileSync( join(import.meta.dirname, 'encoding/fixtures/unicode/', fileName), 'utf8' @@ -145,6 +150,7 @@ describe('single-byte encodings index: Unicode', () => { t.assert.strictEqual(str, decoderLoose(Uint8Array.of(byte))) t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte)) + t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte)) } } }) @@ -158,6 +164,7 @@ describe('single-byte encodings index: WHATWG', () => { const decoder = createSinglebyteDecoder(encoding) const decoderLoose = createSinglebyteDecoder(encoding, true) const encoder = createSinglebyteEncoder(encoding) + const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' }) const text = readFileSync( join(import.meta.dirname, 'encoding/fixtures/single-byte', `index-${encoding}.txt`), 'utf8' @@ -199,6 +206,7 @@ describe('single-byte encodings index: WHATWG', () => { t.assert.strictEqual(str, decoderLoose(Uint8Array.of(byte))) t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte)) + t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte)) } else { t.assert.throws(() => decoder(Uint8Array.of(byte))) try { 
@@ -230,6 +238,7 @@ describe('single-byte encodings index: WHATWG non-normative indexes.json', () => const decoder = createSinglebyteDecoder(encoding) const decoderLoose = createSinglebyteDecoder(encoding, true) const encoder = createSinglebyteEncoder(encoding) + const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' }) t.assert.strictEqual(data.length, 128) for (let i = 0; i < data.length; i++) { @@ -244,6 +253,7 @@ describe('single-byte encodings index: WHATWG non-normative indexes.json', () => t.assert.strictEqual(decoder(Uint8Array.of(byte)), str) t.assert.strictEqual(decoderLoose(Uint8Array.of(byte)), str) t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte)) + t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte)) } else { t.assert.throws(() => decoder(Uint8Array.of(byte))) t.assert.strictEqual(decoderLoose(Uint8Array.of(byte)), '\uFFFD') @@ -268,13 +278,16 @@ describe('x-user-defined', () => { test('encode', (t) => { const encoder = createSinglebyteEncoder(encoding) + const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' }) for (let byte = 0; byte < 256; byte++) { const str = String.fromCodePoint(byte >= 0x80 ? 
0xf7_80 + byte - 0x80 : byte) t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte), byte) + t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte), byte) } for (let i = 128; i < 512; i++) { t.assert.throws(() => encoder(String.fromCodePoint(i)), /Input is not well-formed/) + t.assert.deepStrictEqual(encoderLoose(String.fromCodePoint(i)), Uint8Array.of(0x3f), i) } }) }) @@ -284,10 +297,15 @@ describe('codes above 0x7F are non-ASCII', () => { for (const encoding of ['iso-8859-2', 'iso-8859-16']) { test(encoding, (t) => { const encoder = createSinglebyteEncoder(encoding) + const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' }) t.assert.deepStrictEqual(encoder('\x80'), new Uint8Array(1).fill(0x80)) t.assert.deepStrictEqual(encoder('\x80'.repeat(4)), new Uint8Array(4).fill(0x80)) t.assert.deepStrictEqual(encoder('\x80'.repeat(8)), new Uint8Array(8).fill(0x80)) t.assert.deepStrictEqual(encoder('\x80'.repeat(16)), new Uint8Array(16).fill(0x80)) + t.assert.deepStrictEqual(encoderLoose('\x80'), new Uint8Array(1).fill(0x80)) + t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(4)), new Uint8Array(4).fill(0x80)) + t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(8)), new Uint8Array(8).fill(0x80)) + t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(16)), new Uint8Array(16).fill(0x80)) }) } @@ -295,10 +313,15 @@ describe('codes above 0x7F are non-ASCII', () => { for (const encoding of ['windows-1250', 'windows-1252', 'x-user-defined']) { test(encoding, (t) => { const encoder = createSinglebyteEncoder(encoding) + const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' }) t.assert.throws(() => encoder('\x80')) t.assert.throws(() => encoder('\x80'.repeat(4))) t.assert.throws(() => encoder('\x80'.repeat(8))) t.assert.throws(() => encoder('\x80'.repeat(16))) + t.assert.deepStrictEqual(encoderLoose('\x80'), new Uint8Array(1).fill(0x3f)) + t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(4)), new 
Uint8Array(4).fill(0x3f)) + t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(8)), new Uint8Array(8).fill(0x3f)) + t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(16)), new Uint8Array(16).fill(0x3f)) }) } })