diff --git a/README.md b/README.md index 6d562c9..c2e1e0f 100644 --- a/README.md +++ b/README.md @@ -12,28 +12,30 @@ const sniffedEncoding = htmlEncodingSniffer(htmlBytes); The passed bytes are given as a `Uint8Array`; the Node.js `Buffer` subclass of `Uint8Array` will also work, as shown above. -The returned value will be an [encoding label](https://encoding.spec.whatwg.org/#names-and-labels), and in particular, the label which is a lowercased version of the encoding's name. You might then combine this with the [`@exodus/bytes`](https://github.com/ExodusOSS/bytes/) package to decode the result: +The returned value will be a canonical [encoding name](https://encoding.spec.whatwg.org/#names-and-labels) (not a label). You might then combine this with the [`@exodus/bytes`](https://github.com/ExodusOSS/bytes/) package to decode the result: ```js const { TextDecoder } = require("@exodus/bytes"); -const htmlString = (new TextEncoder(sniffedEncoding)).decode(htmlBytes); +const htmlString = (new TextDecoder(sniffedEncoding)).decode(htmlBytes); ``` ## Options -You can pass two potential options to `htmlEncodingSniffer`: +You can pass the following options to `htmlEncodingSniffer`: ```js const sniffedEncoding = htmlEncodingSniffer(htmlBytes, { + xml, transportLayerEncodingLabel, - defaultEncoding + defaultEncoding, }); ``` -These represent two possible inputs into the [encoding sniffing algorithm](https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm): +The `xml` option is a boolean, defaulting to `false`. If set to `true`, then we bypass the [HTML encoding sniffing algorithm](https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm) and compute the encoding based on the presence of a BOM, or the other options provided. (In the future, we may perform sniffing of the `<?xml?>` declaration, but for now that is not implemented.) 
-- `transportLayerEncodingLabel` is an encoding label that is obtained from the "transport layer" (probably a HTTP `Content-Type` header), which overrides everything but a BOM. -- `defaultEncoding` is the ultimate fallback encoding used if no valid encoding is supplied by the transport layer, and no encoding is sniffed from the bytes. It defaults to `"windows-1252"`, as recommended by the algorithm's table of suggested defaults for "All other locales" (including the `en` locale). +The `transportLayerEncodingLabel` is an encoding label that is obtained from the "transport layer" (probably a HTTP `Content-Type` header), which overrides everything but a BOM. + +The `defaultEncoding` is the ultimate fallback encoding used if no valid encoding is supplied by the transport layer, and no encoding is sniffed from the bytes. For HTML, it defaults to `"windows-1252"`, as recommended by the algorithm's table of suggested defaults for "All other locales" (including the `en` locale). For XML, it defaults to `"UTF-8"`. ## Credits diff --git a/lib/html-encoding-sniffer.js b/lib/html-encoding-sniffer.js index b6a97c9..08a92d3 100644 --- a/lib/html-encoding-sniffer.js +++ b/lib/html-encoding-sniffer.js @@ -1,15 +1,19 @@ "use strict"; -const { getBOMEncoding, normalizeEncoding: labelToName } = require("@exodus/bytes/encoding-lite.js"); +const { getBOMEncoding, labelToName } = require("@exodus/bytes/encoding-lite.js"); // https://html.spec.whatwg.org/#encoding-sniffing-algorithm -module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => { - let encoding = getBOMEncoding(uint8Array); +module.exports = (uint8Array, { xml = false, transportLayerEncodingLabel, defaultEncoding } = {}) => { + if (defaultEncoding === undefined) { + defaultEncoding = xml ? 
"UTF-8" : "windows-1252"; + } + + let encoding = labelToName(getBOMEncoding(uint8Array)); if (encoding === null && transportLayerEncodingLabel !== undefined) { encoding = labelToName(transportLayerEncodingLabel); } - if (encoding === null) { + if (encoding === null && !xml) { encoding = prescanMetaCharset(uint8Array); } @@ -86,8 +90,8 @@ function prescanMetaCharset(uint8Array) { continue; } - if (charset === "utf-16le" || charset === "utf-16be") { - charset = "utf-8"; + if (charset === "UTF-16LE" || charset === "UTF-16BE") { + charset = "UTF-8"; } if (charset === "x-user-defined") { charset = "windows-1252"; diff --git a/package-lock.json b/package-lock.json index efc8996..84b20b7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "5.0.0", "license": "MIT", "dependencies": { - "@exodus/bytes": "^1.2.0" + "@exodus/bytes": "^1.6.0" }, "devDependencies": { "@domenic/eslint-config": "^4.0.1", @@ -188,9 +188,9 @@ } }, "node_modules/@exodus/bytes": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/@exodus/bytes/-/bytes-1.2.0.tgz", - "integrity": "sha512-csqCicCB213ZYHbPZa4ljMQEH2GH2au7M2Q4feh3cGst7nYNilT5lfKF2LSqQEhaPsBX8XzlJeM2q/4hTl1ueA==", + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/@exodus/bytes/-/bytes-1.6.0.tgz", + "integrity": "sha512-y32mI9627q5LR/L8fLc4YyDRJQOi+jK0D9okzLilAdiU3F9we3zC7Y7CFrR/8vAvUyv7FgBAYcNHtvbmhKCFcw==", "license": "MIT", "engines": { "node": "^20.19.0 || ^22.12.0 || >=24.0.0" diff --git a/package.json b/package.json index 4dab3de..91e4481 100644 --- a/package.json +++ b/package.json @@ -21,7 +21,7 @@ "lint": "eslint" }, "dependencies": { - "@exodus/bytes": "^1.2.0" + "@exodus/bytes": "^1.6.0" }, "devDependencies": { "@domenic/eslint-config": "^4.0.1", diff --git a/test/fixtures/bom/utf-16be.html b/test/fixtures/bom/UTF-16BE.html similarity index 100% rename from test/fixtures/bom/utf-16be.html rename to test/fixtures/bom/UTF-16BE.html diff --git 
a/test/fixtures/bom/utf-16le.html b/test/fixtures/bom/UTF-16LE.html similarity index 100% rename from test/fixtures/bom/utf-16le.html rename to test/fixtures/bom/UTF-16LE.html diff --git a/test/fixtures/bom/utf-8.html b/test/fixtures/bom/UTF-8.html similarity index 100% rename from test/fixtures/bom/utf-8.html rename to test/fixtures/bom/UTF-8.html diff --git a/test/fixtures/normal/charset-bracket_utf-8.html b/test/fixtures/normal/charset-bracket_UTF-8.html similarity index 100% rename from test/fixtures/normal/charset-bracket_utf-8.html rename to test/fixtures/normal/charset-bracket_UTF-8.html diff --git a/test/fixtures/normal/charset-short-comment_iso-8859-2.html b/test/fixtures/normal/charset-short-comment_ISO-8859-2.html similarity index 100% rename from test/fixtures/normal/charset-short-comment_iso-8859-2.html rename to test/fixtures/normal/charset-short-comment_ISO-8859-2.html diff --git a/test/fixtures/normal/charset_koi8-r.html b/test/fixtures/normal/charset_KOI8-R.html similarity index 100% rename from test/fixtures/normal/charset_koi8-r.html rename to test/fixtures/normal/charset_KOI8-R.html diff --git a/test/fixtures/normal/http-equiv-no-quotes_iso-8859-5.html b/test/fixtures/normal/http-equiv-no-quotes_ISO-8859-5.html similarity index 100% rename from test/fixtures/normal/http-equiv-no-quotes_iso-8859-5.html rename to test/fixtures/normal/http-equiv-no-quotes_ISO-8859-5.html diff --git a/test/fixtures/normal/http-equiv-second-charset_iso-8859-2.html b/test/fixtures/normal/http-equiv-second-charset_ISO-8859-2.html similarity index 100% rename from test/fixtures/normal/http-equiv-second-charset_iso-8859-2.html rename to test/fixtures/normal/http-equiv-second-charset_ISO-8859-2.html diff --git a/test/fixtures/normal/http-equiv-trailing-space_iso-8859-2.html b/test/fixtures/normal/http-equiv-trailing-space_ISO-8859-2.html similarity index 100% rename from test/fixtures/normal/http-equiv-trailing-space_iso-8859-2.html rename to 
test/fixtures/normal/http-equiv-trailing-space_ISO-8859-2.html diff --git a/test/tests.js b/test/tests.js index dadd8b7..4903e9a 100644 --- a/test/tests.js +++ b/test/tests.js @@ -25,7 +25,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/bom"))) { it(`should sniff as ${desiredEncoding}, given overriding options`, () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1252", - defaultEncoding: "utf-16le" + defaultEncoding: "UTF-16LE" }); assert.strictEqual(sniffedEncoding, desiredEncoding); @@ -47,7 +47,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/normal"))) { it("should sniff as the transport layer encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1251", - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); assert.strictEqual(sniffedEncoding, "windows-1251"); @@ -56,7 +56,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/normal"))) { it(`should sniff as ${desiredEncoding}, given only a default encoding`, () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); assert.strictEqual(sniffedEncoding, desiredEncoding); @@ -78,7 +78,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/no-result")) it("should sniff as the transport layer encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1251", - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); assert.strictEqual(sniffedEncoding, "windows-1251"); @@ -87,10 +87,10 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/no-result")) it("should sniff as the default encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); - 
assert.strictEqual(sniffedEncoding, "iso-8859-16"); + assert.strictEqual(sniffedEncoding, "ISO-8859-16"); }); }); } @@ -102,13 +102,13 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) { it("should sniff as UTF-8, given no options", () => { const sniffedEncoding = htmlEncodingSniffer(buffer); - assert.strictEqual(sniffedEncoding, "utf-8"); + assert.strictEqual(sniffedEncoding, "UTF-8"); }); it("should sniff as the transport layer encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1251", - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); assert.strictEqual(sniffedEncoding, "windows-1251"); @@ -117,10 +117,111 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) { it("should sniff as UTF-8, given only a default encoding", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); - assert.strictEqual(sniffedEncoding, "utf-8"); + assert.strictEqual(sniffedEncoding, "UTF-8"); }); }); } + +describe("xml: true", () => { + describe("BOM detection", () => { + for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/bom"))) { + const buffer = read(`fixtures/bom/${file}`); + const desiredEncoding = path.basename(file, ".html"); + + it(`should sniff ${file} as ${desiredEncoding}`, () => { + const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true }); + + assert.strictEqual(sniffedEncoding, desiredEncoding); + }); + + it(`should sniff ${file} as ${desiredEncoding}, given overriding options`, () => { + const sniffedEncoding = htmlEncodingSniffer(buffer, { + xml: true, + transportLayerEncodingLabel: "windows-1252", + defaultEncoding: "ISO-8859-1" + }); + + assert.strictEqual(sniffedEncoding, desiredEncoding); + }); + } + }); + + describe("UTF-32 BOMs (not recognized, should fall back to default)", () => { + it("should ignore UTF-32BE BOM and 
return UTF-8", () => { + // UTF-32BE BOM: 00 00 FE FF + const buffer = new Uint8Array([0x00, 0x00, 0xFE, 0xFF, 0x3C, 0x3F, 0x78, 0x6D, 0x6C]); + const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true }); + + assert.strictEqual(sniffedEncoding, "UTF-8"); + }); + + it("should detect UTF-32LE BOM as UTF-16LE (since FF FE prefix matches)", () => { + // UTF-32LE BOM: FF FE 00 00 — but FF FE is also UTF-16LE BOM + const buffer = new Uint8Array([0xFF, 0xFE, 0x00, 0x00, 0x3C, 0x3F, 0x78, 0x6D, 0x6C]); + const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true }); + + assert.strictEqual(sniffedEncoding, "UTF-16LE"); + }); + + it("should ignore UTF-32BE BOM and use transport layer encoding if provided", () => { + const buffer = new Uint8Array([0x00, 0x00, 0xFE, 0xFF, 0x3C, 0x3F, 0x78, 0x6D, 0x6C]); + const sniffedEncoding = htmlEncodingSniffer(buffer, { + xml: true, + transportLayerEncodingLabel: "KOI8-R" + }); + + assert.strictEqual(sniffedEncoding, "KOI8-R"); + }); + }); + + describe("meta charset ignored", () => { + it("should ignore meta charset and return UTF-8 default", () => { + const buffer = read("fixtures/normal/charset_KOI8-R.html"); + const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true }); + + assert.strictEqual(sniffedEncoding, "UTF-8"); + }); + + it("should ignore meta charset but use transport layer encoding", () => { + const buffer = read("fixtures/normal/charset_KOI8-R.html"); + const sniffedEncoding = htmlEncodingSniffer(buffer, { + xml: true, + transportLayerEncodingLabel: "ISO-8859-2" + }); + + assert.strictEqual(sniffedEncoding, "ISO-8859-2"); + }); + + it("should ignore meta charset but use custom default encoding", () => { + const buffer = read("fixtures/normal/charset_KOI8-R.html"); + const sniffedEncoding = htmlEncodingSniffer(buffer, { + xml: true, + defaultEncoding: "windows-1252" + }); + + assert.strictEqual(sniffedEncoding, "windows-1252"); + }); + }); + + describe("default encoding", () => { + it("should default to 
UTF-8 for XML", () => { + const buffer = read("fixtures/no-result/no-indicators_windows-1252.html"); + const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true }); + + assert.strictEqual(sniffedEncoding, "UTF-8"); + }); + + it("should allow overriding the default encoding", () => { + const buffer = read("fixtures/no-result/no-indicators_windows-1252.html"); + const sniffedEncoding = htmlEncodingSniffer(buffer, { + xml: true, + defaultEncoding: "ISO-8859-1" + }); + + assert.strictEqual(sniffedEncoding, "ISO-8859-1"); + }); + }); +});