From be363f18706da10edd25fcd55b6cf288edff2623 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Fri, 26 Dec 2025 11:13:54 +0900 Subject: [PATCH 1/2] Revert back to encoding names as output Now that our dependency package has labelToName() exported, this is easy enough to do. Additionally, in jsdom we always immediately wrap the result in labelToName() anyway, so this will make it easier to consume. --- README.md | 4 ++-- lib/html-encoding-sniffer.js | 8 +++---- package-lock.json | 8 +++---- package.json | 2 +- .../bom/{utf-16be.html => UTF-16BE.html} | Bin .../bom/{utf-16le.html => UTF-16LE.html} | Bin test/fixtures/bom/{utf-8.html => UTF-8.html} | 0 ..._utf-8.html => charset-bracket_UTF-8.html} | 0 ... => charset-short-comment_ISO-8859-2.html} | 0 ...harset_koi8-r.html => charset_KOI8-R.html} | 0 ...l => http-equiv-no-quotes_ISO-8859-5.html} | 0 ...http-equiv-second-charset_ISO-8859-2.html} | 0 ...http-equiv-trailing-space_ISO-8859-2.html} | 0 test/tests.js | 20 +++++++++--------- 14 files changed, 21 insertions(+), 21 deletions(-) rename test/fixtures/bom/{utf-16be.html => UTF-16BE.html} (100%) rename test/fixtures/bom/{utf-16le.html => UTF-16LE.html} (100%) rename test/fixtures/bom/{utf-8.html => UTF-8.html} (100%) rename test/fixtures/normal/{charset-bracket_utf-8.html => charset-bracket_UTF-8.html} (100%) rename test/fixtures/normal/{charset-short-comment_iso-8859-2.html => charset-short-comment_ISO-8859-2.html} (100%) rename test/fixtures/normal/{charset_koi8-r.html => charset_KOI8-R.html} (100%) rename test/fixtures/normal/{http-equiv-no-quotes_iso-8859-5.html => http-equiv-no-quotes_ISO-8859-5.html} (100%) rename test/fixtures/normal/{http-equiv-second-charset_iso-8859-2.html => http-equiv-second-charset_ISO-8859-2.html} (100%) rename test/fixtures/normal/{http-equiv-trailing-space_iso-8859-2.html => http-equiv-trailing-space_ISO-8859-2.html} (100%) diff --git a/README.md b/README.md index 6d562c9..8c04724 100644 --- a/README.md +++ b/README.md @@ 
-12,11 +12,11 @@ const sniffedEncoding = htmlEncodingSniffer(htmlBytes); The passed bytes are given as a `Uint8Array`; the Node.js `Buffer` subclass of `Uint8Array` will also work, as shown above. -The returned value will be an [encoding label](https://encoding.spec.whatwg.org/#names-and-labels), and in particular, the label which is a lowercased version of the encoding's name. You might then combine this with the [`@exodus/bytes`](https://github.com/ExodusOSS/bytes/) package to decode the result: +The returned value will be a canonical [encoding name](https://encoding.spec.whatwg.org/#names-and-labels) (not a label). You might then combine this with the [`@exodus/bytes`](https://github.com/ExodusOSS/bytes/) package to decode the result: ```js const { TextDecoder } = require("@exodus/bytes"); -const htmlString = (new TextEncoder(sniffedEncoding)).decode(htmlBytes); +const htmlString = (new TextDecoder(sniffedEncoding)).decode(htmlBytes); ``` ## Options diff --git a/lib/html-encoding-sniffer.js b/lib/html-encoding-sniffer.js index b6a97c9..579a020 100644 --- a/lib/html-encoding-sniffer.js +++ b/lib/html-encoding-sniffer.js @@ -1,9 +1,9 @@ "use strict"; -const { getBOMEncoding, normalizeEncoding: labelToName } = require("@exodus/bytes/encoding-lite.js"); +const { getBOMEncoding, labelToName } = require("@exodus/bytes/encoding-lite.js"); // https://html.spec.whatwg.org/#encoding-sniffing-algorithm module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => { - let encoding = getBOMEncoding(uint8Array); + let encoding = labelToName(getBOMEncoding(uint8Array)); if (encoding === null && transportLayerEncodingLabel !== undefined) { encoding = labelToName(transportLayerEncodingLabel); @@ -86,8 +86,8 @@ function prescanMetaCharset(uint8Array) { continue; } - if (charset === "utf-16le" || charset === "utf-16be") { - charset = "utf-8"; + if (charset === "UTF-16LE" || charset === "UTF-16BE") { + charset = "UTF-8"; } if (charset === 
"x-user-defined") { charset = "windows-1252"; diff --git a/package-lock.json b/package-lock.json index efc8996..84b20b7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "5.0.0", "license": "MIT", "dependencies": { - "@exodus/bytes": "^1.2.0" + "@exodus/bytes": "^1.6.0" }, "devDependencies": { "@domenic/eslint-config": "^4.0.1", @@ -188,9 +188,9 @@ } }, "node_modules/@exodus/bytes": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/@exodus/bytes/-/bytes-1.2.0.tgz", - "integrity": "sha512-csqCicCB213ZYHbPZa4ljMQEH2GH2au7M2Q4feh3cGst7nYNilT5lfKF2LSqQEhaPsBX8XzlJeM2q/4hTl1ueA==", + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/@exodus/bytes/-/bytes-1.6.0.tgz", + "integrity": "sha512-y32mI9627q5LR/L8fLc4YyDRJQOi+jK0D9okzLilAdiU3F9we3zC7Y7CFrR/8vAvUyv7FgBAYcNHtvbmhKCFcw==", "license": "MIT", "engines": { "node": "^20.19.0 || ^22.12.0 || >=24.0.0" diff --git a/package.json b/package.json index 4dab3de..91e4481 100644 --- a/package.json +++ b/package.json @@ -21,7 +21,7 @@ "lint": "eslint" }, "dependencies": { - "@exodus/bytes": "^1.2.0" + "@exodus/bytes": "^1.6.0" }, "devDependencies": { "@domenic/eslint-config": "^4.0.1", diff --git a/test/fixtures/bom/utf-16be.html b/test/fixtures/bom/UTF-16BE.html similarity index 100% rename from test/fixtures/bom/utf-16be.html rename to test/fixtures/bom/UTF-16BE.html diff --git a/test/fixtures/bom/utf-16le.html b/test/fixtures/bom/UTF-16LE.html similarity index 100% rename from test/fixtures/bom/utf-16le.html rename to test/fixtures/bom/UTF-16LE.html diff --git a/test/fixtures/bom/utf-8.html b/test/fixtures/bom/UTF-8.html similarity index 100% rename from test/fixtures/bom/utf-8.html rename to test/fixtures/bom/UTF-8.html diff --git a/test/fixtures/normal/charset-bracket_utf-8.html b/test/fixtures/normal/charset-bracket_UTF-8.html similarity index 100% rename from test/fixtures/normal/charset-bracket_utf-8.html rename to test/fixtures/normal/charset-bracket_UTF-8.html 
diff --git a/test/fixtures/normal/charset-short-comment_iso-8859-2.html b/test/fixtures/normal/charset-short-comment_ISO-8859-2.html similarity index 100% rename from test/fixtures/normal/charset-short-comment_iso-8859-2.html rename to test/fixtures/normal/charset-short-comment_ISO-8859-2.html diff --git a/test/fixtures/normal/charset_koi8-r.html b/test/fixtures/normal/charset_KOI8-R.html similarity index 100% rename from test/fixtures/normal/charset_koi8-r.html rename to test/fixtures/normal/charset_KOI8-R.html diff --git a/test/fixtures/normal/http-equiv-no-quotes_iso-8859-5.html b/test/fixtures/normal/http-equiv-no-quotes_ISO-8859-5.html similarity index 100% rename from test/fixtures/normal/http-equiv-no-quotes_iso-8859-5.html rename to test/fixtures/normal/http-equiv-no-quotes_ISO-8859-5.html diff --git a/test/fixtures/normal/http-equiv-second-charset_iso-8859-2.html b/test/fixtures/normal/http-equiv-second-charset_ISO-8859-2.html similarity index 100% rename from test/fixtures/normal/http-equiv-second-charset_iso-8859-2.html rename to test/fixtures/normal/http-equiv-second-charset_ISO-8859-2.html diff --git a/test/fixtures/normal/http-equiv-trailing-space_iso-8859-2.html b/test/fixtures/normal/http-equiv-trailing-space_ISO-8859-2.html similarity index 100% rename from test/fixtures/normal/http-equiv-trailing-space_iso-8859-2.html rename to test/fixtures/normal/http-equiv-trailing-space_ISO-8859-2.html diff --git a/test/tests.js b/test/tests.js index dadd8b7..74ec2e0 100644 --- a/test/tests.js +++ b/test/tests.js @@ -25,7 +25,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/bom"))) { it(`should sniff as ${desiredEncoding}, given overriding options`, () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1252", - defaultEncoding: "utf-16le" + defaultEncoding: "UTF-16LE" }); assert.strictEqual(sniffedEncoding, desiredEncoding); @@ -47,7 +47,7 @@ for (const file of 
fs.readdirSync(path.resolve(__dirname, "fixtures/normal"))) { it("should sniff as the transport layer encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1251", - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); assert.strictEqual(sniffedEncoding, "windows-1251"); @@ -56,7 +56,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/normal"))) { it(`should sniff as ${desiredEncoding}, given only a default encoding`, () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); assert.strictEqual(sniffedEncoding, desiredEncoding); @@ -78,7 +78,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/no-result")) it("should sniff as the transport layer encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1251", - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); assert.strictEqual(sniffedEncoding, "windows-1251"); @@ -87,10 +87,10 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/no-result")) it("should sniff as the default encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); - assert.strictEqual(sniffedEncoding, "iso-8859-16"); + assert.strictEqual(sniffedEncoding, "ISO-8859-16"); }); }); } @@ -102,13 +102,13 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) { it("should sniff as UTF-8, given no options", () => { const sniffedEncoding = htmlEncodingSniffer(buffer); - assert.strictEqual(sniffedEncoding, "utf-8"); + assert.strictEqual(sniffedEncoding, "UTF-8"); }); it("should sniff as the transport layer encoding, given that", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { transportLayerEncodingLabel: "windows-1251", - 
defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); assert.strictEqual(sniffedEncoding, "windows-1251"); @@ -117,10 +117,10 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) { it("should sniff as UTF-8, given only a default encoding", () => { const sniffedEncoding = htmlEncodingSniffer(buffer, { - defaultEncoding: "iso-8859-16" + defaultEncoding: "ISO-8859-16" }); - assert.strictEqual(sniffedEncoding, "utf-8"); + assert.strictEqual(sniffedEncoding, "UTF-8"); }); }); } From 3a0023c24f33b734dee2c067bd99d7eb0a862139 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Fri, 26 Dec 2025 11:23:23 +0900 Subject: [PATCH 2/2] Add xml option Although it's possible to do more extensive sniffing, and we might do so in the future, this is enough to pass the relevant web platform tests we're currently working on in jsdom. --- README.md | 12 +++-- lib/html-encoding-sniffer.js | 8 ++- test/tests.js | 101 +++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8c04724..c2e1e0f 100644 --- a/README.md +++ b/README.md @@ -21,19 +21,21 @@ const htmlString = (new TextDecoder(sniffedEncoding)).decode(htmlBytes); ## Options -You can pass two potential options to `htmlEncodingSniffer`: +You can pass the following options to `htmlEncodingSniffer`: ```js const sniffedEncoding = htmlEncodingSniffer(htmlBytes, { + xml, transportLayerEncodingLabel, - defaultEncoding + defaultEncoding, }); ``` -These represent two possible inputs into the [encoding sniffing algorithm](https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm): +The `xml` option is a boolean, defaulting to `false`. If set to `true`, then we bypass the [HTML encoding sniffing algorithm](https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm) and compute the encoding based on the presence of a BOM, or the other options provided. 
(In the future, we may perform sniffing of the `<?xml ... ?>` declaration, but for now that is not implemented.) -- `transportLayerEncodingLabel` is an encoding label that is obtained from the "transport layer" (probably a HTTP `Content-Type` header), which overrides everything but a BOM. -- `defaultEncoding` is the ultimate fallback encoding used if no valid encoding is supplied by the transport layer, and no encoding is sniffed from the bytes. It defaults to `"windows-1252"`, as recommended by the algorithm's table of suggested defaults for "All other locales" (including the `en` locale). +The `transportLayerEncodingLabel` is an encoding label that is obtained from the "transport layer" (probably a HTTP `Content-Type` header), which overrides everything but a BOM. + +The `defaultEncoding` is the ultimate fallback encoding used if no valid encoding is supplied by the transport layer, and no encoding is sniffed from the bytes. For HTML, it defaults to `"windows-1252"`, as recommended by the algorithm's table of suggested defaults for "All other locales" (including the `en` locale). For XML, it defaults to `"UTF-8"`. ## Credits diff --git a/lib/html-encoding-sniffer.js b/lib/html-encoding-sniffer.js index 579a020..08a92d3 100644 --- a/lib/html-encoding-sniffer.js +++ b/lib/html-encoding-sniffer.js @@ -2,14 +2,18 @@ const { getBOMEncoding, labelToName } = require("@exodus/bytes/encoding-lite.js"); // https://html.spec.whatwg.org/#encoding-sniffing-algorithm -module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => { +module.exports = (uint8Array, { xml = false, transportLayerEncodingLabel, defaultEncoding } = {}) => { + if (defaultEncoding === undefined) { + defaultEncoding = xml ? 
"UTF-8" : "windows-1252"; + } + let encoding = labelToName(getBOMEncoding(uint8Array)); if (encoding === null && transportLayerEncodingLabel !== undefined) { encoding = labelToName(transportLayerEncodingLabel); } - if (encoding === null) { + if (encoding === null && !xml) { encoding = prescanMetaCharset(uint8Array); } diff --git a/test/tests.js b/test/tests.js index 74ec2e0..4903e9a 100644 --- a/test/tests.js +++ b/test/tests.js @@ -124,3 +124,104 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) { }); }); } + +describe("xml: true", () => { + describe("BOM detection", () => { + for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/bom"))) { + const buffer = read(`fixtures/bom/${file}`); + const desiredEncoding = path.basename(file, ".html"); + + it(`should sniff ${file} as ${desiredEncoding}`, () => { + const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true }); + + assert.strictEqual(sniffedEncoding, desiredEncoding); + }); + + it(`should sniff ${file} as ${desiredEncoding}, given overriding options`, () => { + const sniffedEncoding = htmlEncodingSniffer(buffer, { + xml: true, + transportLayerEncodingLabel: "windows-1252", + defaultEncoding: "ISO-8859-1" + }); + + assert.strictEqual(sniffedEncoding, desiredEncoding); + }); + } + }); + + describe("UTF-32 BOMs (not recognized, should fall back to default)", () => { + it("should ignore UTF-32BE BOM and return UTF-8", () => { + // UTF-32BE BOM: 00 00 FE FF + const buffer = new Uint8Array([0x00, 0x00, 0xFE, 0xFF, 0x3C, 0x3F, 0x78, 0x6D, 0x6C]); + const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true }); + + assert.strictEqual(sniffedEncoding, "UTF-8"); + }); + + it("should detect UTF-32LE BOM as UTF-16LE (since FF FE prefix matches)", () => { + // UTF-32LE BOM: FF FE 00 00 — but FF FE is also UTF-16LE BOM + const buffer = new Uint8Array([0xFF, 0xFE, 0x00, 0x00, 0x3C, 0x3F, 0x78, 0x6D, 0x6C]); + const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true 
}); + + assert.strictEqual(sniffedEncoding, "UTF-16LE"); + }); + + it("should ignore UTF-32BE BOM and use transport layer encoding if provided", () => { + const buffer = new Uint8Array([0x00, 0x00, 0xFE, 0xFF, 0x3C, 0x3F, 0x78, 0x6D, 0x6C]); + const sniffedEncoding = htmlEncodingSniffer(buffer, { + xml: true, + transportLayerEncodingLabel: "KOI8-R" + }); + + assert.strictEqual(sniffedEncoding, "KOI8-R"); + }); + }); + + describe("meta charset ignored", () => { + it("should ignore meta charset and return UTF-8 default", () => { + const buffer = read("fixtures/normal/charset_KOI8-R.html"); + const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true }); + + assert.strictEqual(sniffedEncoding, "UTF-8"); + }); + + it("should ignore meta charset but use transport layer encoding", () => { + const buffer = read("fixtures/normal/charset_KOI8-R.html"); + const sniffedEncoding = htmlEncodingSniffer(buffer, { + xml: true, + transportLayerEncodingLabel: "ISO-8859-2" + }); + + assert.strictEqual(sniffedEncoding, "ISO-8859-2"); + }); + + it("should ignore meta charset but use custom default encoding", () => { + const buffer = read("fixtures/normal/charset_KOI8-R.html"); + const sniffedEncoding = htmlEncodingSniffer(buffer, { + xml: true, + defaultEncoding: "windows-1252" + }); + + assert.strictEqual(sniffedEncoding, "windows-1252"); + }); + }); + + describe("default encoding", () => { + it("should default to UTF-8 for XML", () => { + const buffer = read("fixtures/no-result/no-indicators_windows-1252.html"); + const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true }); + + assert.strictEqual(sniffedEncoding, "UTF-8"); + }); + + it("should allow overriding the default encoding", () => { + const buffer = read("fixtures/no-result/no-indicators_windows-1252.html"); + const sniffedEncoding = htmlEncodingSniffer(buffer, { + xml: true, + defaultEncoding: "ISO-8859-1" + }); + + assert.strictEqual(sniffedEncoding, "ISO-8859-1"); + }); + }); +});