Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,30 @@ const sniffedEncoding = htmlEncodingSniffer(htmlBytes);

The passed bytes are given as a `Uint8Array`; the Node.js `Buffer` subclass of `Uint8Array` will also work, as shown above.

The returned value will be an [encoding label](https://encoding.spec.whatwg.org/#names-and-labels), and in particular, the label which is a lowercased version of the encoding's name. You might then combine this with the [`@exodus/bytes`](https://github.com/ExodusOSS/bytes/) package to decode the result:
The returned value will be a canonical [encoding name](https://encoding.spec.whatwg.org/#names-and-labels) (not a label). You might then combine this with the [`@exodus/bytes`](https://github.com/ExodusOSS/bytes/) package to decode the result:

```js
const { TextDecoder } = require("@exodus/bytes");
const htmlString = (new TextEncoder(sniffedEncoding)).decode(htmlBytes);
const htmlString = (new TextDecoder(sniffedEncoding)).decode(htmlBytes);
```

## Options

You can pass two potential options to `htmlEncodingSniffer`:
You can pass the following options to `htmlEncodingSniffer`:

```js
const sniffedEncoding = htmlEncodingSniffer(htmlBytes, {
xml,
transportLayerEncodingLabel,
defaultEncoding
defaultEncoding,
});
```

These represent two possible inputs into the [encoding sniffing algorithm](https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm):
The `xml` option is a boolean, defaulting to `false`. If set to `true`, then we bypass the [HTML encoding sniffing algorithm](https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm) and compute the encoding based on the presence of a BOM, or the other options provided. (In the future, we may perform sniffing of the `<?xml?>` declaration, but for now that is not implemented.)

- `transportLayerEncodingLabel` is an encoding label that is obtained from the "transport layer" (probably a HTTP `Content-Type` header), which overrides everything but a BOM.
- `defaultEncoding` is the ultimate fallback encoding used if no valid encoding is supplied by the transport layer, and no encoding is sniffed from the bytes. It defaults to `"windows-1252"`, as recommended by the algorithm's table of suggested defaults for "All other locales" (including the `en` locale).
The `transportLayerEncodingLabel` is an encoding label that is obtained from the "transport layer" (probably an HTTP `Content-Type` header), which overrides everything but a BOM.

The `defaultEncoding` is the ultimate fallback encoding used if no valid encoding is supplied by the transport layer, and no encoding is sniffed from the bytes. For HTML, it defaults to `"windows-1252"`, as recommended by the algorithm's table of suggested defaults for "All other locales" (including the `en` locale). For XML, it defaults to `"UTF-8"`.

## Credits

Expand Down
16 changes: 10 additions & 6 deletions lib/html-encoding-sniffer.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
"use strict";
const { getBOMEncoding, normalizeEncoding: labelToName } = require("@exodus/bytes/encoding-lite.js");
const { getBOMEncoding, labelToName } = require("@exodus/bytes/encoding-lite.js");

// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => {
let encoding = getBOMEncoding(uint8Array);
module.exports = (uint8Array, { xml = false, transportLayerEncodingLabel, defaultEncoding } = {}) => {
if (defaultEncoding === undefined) {
defaultEncoding = xml ? "UTF-8" : "windows-1252";
}

let encoding = labelToName(getBOMEncoding(uint8Array));

if (encoding === null && transportLayerEncodingLabel !== undefined) {
encoding = labelToName(transportLayerEncodingLabel);
}

if (encoding === null) {
if (encoding === null && !xml) {
encoding = prescanMetaCharset(uint8Array);
}

Expand Down Expand Up @@ -86,8 +90,8 @@ function prescanMetaCharset(uint8Array) {
continue;
}

if (charset === "utf-16le" || charset === "utf-16be") {
charset = "utf-8";
if (charset === "UTF-16LE" || charset === "UTF-16BE") {
charset = "UTF-8";
}
if (charset === "x-user-defined") {
charset = "windows-1252";
Expand Down
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"lint": "eslint"
},
"dependencies": {
"@exodus/bytes": "^1.2.0"
"@exodus/bytes": "^1.6.0"
},
"devDependencies": {
"@domenic/eslint-config": "^4.0.1",
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
121 changes: 111 additions & 10 deletions test/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/bom"))) {
it(`should sniff as ${desiredEncoding}, given overriding options`, () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
transportLayerEncodingLabel: "windows-1252",
defaultEncoding: "utf-16le"
defaultEncoding: "UTF-16LE"
});

assert.strictEqual(sniffedEncoding, desiredEncoding);
Expand All @@ -47,7 +47,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/normal"))) {
it("should sniff as the transport layer encoding, given that", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
transportLayerEncodingLabel: "windows-1251",
defaultEncoding: "iso-8859-16"
defaultEncoding: "ISO-8859-16"
});

assert.strictEqual(sniffedEncoding, "windows-1251");
Expand All @@ -56,7 +56,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/normal"))) {

it(`should sniff as ${desiredEncoding}, given only a default encoding`, () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
defaultEncoding: "iso-8859-16"
defaultEncoding: "ISO-8859-16"
});

assert.strictEqual(sniffedEncoding, desiredEncoding);
Expand All @@ -78,7 +78,7 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/no-result"))
it("should sniff as the transport layer encoding, given that", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
transportLayerEncodingLabel: "windows-1251",
defaultEncoding: "iso-8859-16"
defaultEncoding: "ISO-8859-16"
});

assert.strictEqual(sniffedEncoding, "windows-1251");
Expand All @@ -87,10 +87,10 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/no-result"))

it("should sniff as the default encoding, given that", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
defaultEncoding: "iso-8859-16"
defaultEncoding: "ISO-8859-16"
});

assert.strictEqual(sniffedEncoding, "iso-8859-16");
assert.strictEqual(sniffedEncoding, "ISO-8859-16");
});
});
}
Expand All @@ -102,13 +102,13 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) {
it("should sniff as UTF-8, given no options", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer);

assert.strictEqual(sniffedEncoding, "utf-8");
assert.strictEqual(sniffedEncoding, "UTF-8");
});

it("should sniff as the transport layer encoding, given that", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
transportLayerEncodingLabel: "windows-1251",
defaultEncoding: "iso-8859-16"
defaultEncoding: "ISO-8859-16"
});

assert.strictEqual(sniffedEncoding, "windows-1251");
Expand All @@ -117,10 +117,111 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) {

it("should sniff as UTF-8, given only a default encoding", () => {
const sniffedEncoding = htmlEncodingSniffer(buffer, {
defaultEncoding: "iso-8859-16"
defaultEncoding: "ISO-8859-16"
});

assert.strictEqual(sniffedEncoding, "utf-8");
assert.strictEqual(sniffedEncoding, "UTF-8");
});
});
}

// Exercises the `xml: true` option: a BOM still wins, a transport-layer label comes next,
// <meta charset> declarations are not consulted, and the fallback is UTF-8 unless overridden.
describe("xml: true", () => {
  // Every call in this suite sets `xml: true`; any further options are layered after it.
  const sniffAsXML = (bytes, extraOptions = {}) => htmlEncodingSniffer(bytes, { xml: true, ...extraOptions });

  describe("BOM detection", () => {
    const bomFixtureDir = path.resolve(__dirname, "fixtures/bom");

    for (const file of fs.readdirSync(bomFixtureDir)) {
      const bytes = read(`fixtures/bom/${file}`);
      // The fixture's file name (minus ".html") is the encoding its BOM should produce.
      const desiredEncoding = path.basename(file, ".html");

      it(`should sniff ${file} as ${desiredEncoding}`, () => {
        assert.strictEqual(sniffAsXML(bytes), desiredEncoding);
      });

      it(`should sniff ${file} as ${desiredEncoding}, given overriding options`, () => {
        const overrides = {
          transportLayerEncodingLabel: "windows-1252",
          defaultEncoding: "ISO-8859-1"
        };

        assert.strictEqual(sniffAsXML(bytes, overrides), desiredEncoding);
      });
    }
  });

  describe("UTF-32 BOMs (not recognized, should fall back to default)", () => {
    // ASCII bytes for "<?xml", appended after each BOM below.
    const xmlDeclarationStart = [0x3C, 0x3F, 0x78, 0x6D, 0x6C];

    // UTF-32BE BOM: 00 00 FE FF
    const makeUTF32BEBytes = () => new Uint8Array([0x00, 0x00, 0xFE, 0xFF, ...xmlDeclarationStart]);

    // UTF-32LE BOM: FF FE 00 00 — but FF FE is also UTF-16LE BOM
    const makeUTF32LEBytes = () => new Uint8Array([0xFF, 0xFE, 0x00, 0x00, ...xmlDeclarationStart]);

    it("should ignore UTF-32BE BOM and return UTF-8", () => {
      assert.strictEqual(sniffAsXML(makeUTF32BEBytes()), "UTF-8");
    });

    it("should detect UTF-32LE BOM as UTF-16LE (since FF FE prefix matches)", () => {
      assert.strictEqual(sniffAsXML(makeUTF32LEBytes()), "UTF-16LE");
    });

    it("should ignore UTF-32BE BOM and use transport layer encoding if provided", () => {
      const result = sniffAsXML(makeUTF32BEBytes(), { transportLayerEncodingLabel: "KOI8-R" });

      assert.strictEqual(result, "KOI8-R");
    });
  });

  describe("meta charset ignored", () => {
    // Fixture is loaded inside each test so a read failure is reported against that test.
    const readMetaCharsetFixture = () => read("fixtures/normal/charset_KOI8-R.html");

    it("should ignore meta charset and return UTF-8 default", () => {
      assert.strictEqual(sniffAsXML(readMetaCharsetFixture()), "UTF-8");
    });

    it("should ignore meta charset but use transport layer encoding", () => {
      const result = sniffAsXML(readMetaCharsetFixture(), { transportLayerEncodingLabel: "ISO-8859-2" });

      assert.strictEqual(result, "ISO-8859-2");
    });

    it("should ignore meta charset but use custom default encoding", () => {
      const result = sniffAsXML(readMetaCharsetFixture(), { defaultEncoding: "windows-1252" });

      assert.strictEqual(result, "windows-1252");
    });
  });

  describe("default encoding", () => {
    // Fixture is loaded inside each test so a read failure is reported against that test.
    const readNoIndicatorFixture = () => read("fixtures/no-result/no-indicators_windows-1252.html");

    it("should default to UTF-8 for XML", () => {
      assert.strictEqual(sniffAsXML(readNoIndicatorFixture()), "UTF-8");
    });

    it("should allow overriding the default encoding", () => {
      const result = sniffAsXML(readNoIndicatorFixture(), { defaultEncoding: "ISO-8859-1" });

      assert.strictEqual(result, "ISO-8859-1");
    });
  });
});