Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 138 additions & 17 deletions lib/escape_html.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,146 @@
// Matches any character that must be escaped. A `&` only counts when it does NOT start
// something that already looks like an entity (`&#123;`, `&#x1F;`, `&name;`), which is
// what prevents double escaping.
const escapeTestNoEncode = /[<>"'`/=]|&(?!(#\d{1,7}|#[Xx][a-fA-F0-9]{1,6}|\w+);)/;
// Global twin of the test pattern, used for replace-all. The non-global pattern above is
// kept separate so that `.test()` never carries `lastIndex` state between calls.
const escapeReplaceNoEncode = new RegExp(escapeTestNoEncode.source, 'g');
// Character -> entity table. Typed as a string-keyed record so that indexing it with an
// arbitrary matched character type-checks under `strict` (no implicit `any`).
const escapeReplacements: Record<string, string> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  '\'': '&#39;',
  '`': '&#96;',
  '/': '&#x2F;',
  '=': '&#x3D;'
};
/**
 * Replacer callback for `String.prototype.replace`.
 *
 * @param ch - A single character matched by `escapeReplaceNoEncode`.
 * @returns The HTML entity that replaces `ch`.
 */
const getEscapeReplacement = (ch: string): string => escapeReplacements[ch];
/**
* fast-escape-html - MIT License - Made by SukkaW <https://skk.moe>
* The fastest known HTML escape function.
* https://github.com/SukkaW/fast-escape-html
* https://github.com/SukkaW/fast-escape-html/blob/1bb80ac857f0645b321761cbd0dc0d0098240407/src/unescape.ts
*
* This is also modified by SukkaW for use w/ Hexo. Hexo needs to escape more characters (=, /, `)
* to work with template languages (nunjucks/pug/mustache), and also needs to avoid double escaping
* HTML entities. After modification, this function is of course slower than `fast-escape-html`, but
* is still faster than `lodash.escape` and `escape-goat` (which escape even fewer symbols and do
* not avoid double escaping).
*/
// Cheap pre-check: finds the first character that might need escaping, so strings
// without any special character are returned untouched.
const reHtmlEntity = /[&<>"'`/=]/;

// Which alphabet is allowed between `&` and `;` for the reference being scanned.
type ReferenceKind = 'named' | 'decimal' | 'hex';

/** Tells whether `code` is a valid body character for the given kind of character reference. */
function isReferenceBodyChar(code: number, kind: ReferenceKind): boolean {
  const isDigit = code >= 48 && code <= 57; // 0-9
  if (kind === 'decimal') return isDigit;
  if (kind === 'hex') {
    return isDigit
      || (code >= 97 && code <= 102) // a-f
      || (code >= 65 && code <= 70); // A-F
  }
  return isDigit
    || (code >= 97 && code <= 122) // a-z
    || (code >= 65 && code <= 90); // A-Z
}

/**
 * Looks ahead from an `&` to decide whether it starts an already-escaped entity
 * (`&name;`, `&#123;` or `&#x1F;`).
 *
 * @param str - The string being escaped.
 * @param ampIndex - Index of the `&` inside `str`.
 * @returns The index of the terminating `;`, or `-1` when the `&` does not start an entity.
 */
function findEntityEnd(str: string, ampIndex: number): number {
  const len = str.length;
  let cursor = ampIndex + 1;
  let code = str.charCodeAt(cursor);
  let kind: ReferenceKind = 'named';

  if (code === 35) { // "#": numeric character reference
    kind = 'decimal';
    code = str.charCodeAt(++cursor);
    if (code === 120 || code === 88) { // "x" or "X": hexadecimal form
      kind = 'hex';
      code = str.charCodeAt(++cursor);
    }
  }

  const bodyStart = cursor;
  while (cursor < len && isReferenceBodyChar(code, kind)) {
    // Past the end of the string `charCodeAt` yields NaN, which matches nothing below.
    code = str.charCodeAt(++cursor);
  }

  // A reference needs at least one body character and must be closed by ";".
  return cursor > bodyStart && code === 59 ? cursor : -1;
}

/**
 * Escapes the characters `& < > " ' \` / =` into HTML entities.
 *
 * An `&` that already starts an entity (`&name;`, `&#123;`, `&#x1F;`) is left alone, so
 * running the function on previously escaped text does not double escape it.
 *
 * @param str - The text to escape.
 * @returns The escaped text; the input itself when nothing needs escaping.
 * @throws TypeError when `str` is not a string.
 */
function escapeHTML(str: string): string {
  if (typeof str !== 'string') throw new TypeError('str must be a string!');

  const match = reHtmlEntity.exec(str);

  if (match === null) { // faster than !match since no type conversion
    return str;
  }

  const len = str.length;
  let escape = '';
  let html = '';
  // Start of the pending run of characters that has not been copied into `html` yet.
  let lastIndex = 0;

  // Iterate from the first candidate; everything before it is copied verbatim later on.
  for (let index = match.index; index < len; index++) {
    switch (str.charCodeAt(index)) {
      case 60: // <
        escape = '&lt;';
        break;
      case 62: // >
        escape = '&gt;';
        break;
      case 34: // "
        escape = '&quot;';
        break;
      case 39: // '
        escape = '&#39;';
        break;
      case 38: { // &
        const entityEnd = findEntityEnd(str, index);
        if (entityEnd !== -1) {
          // Already an entity: jump to its ";" and leave it in the pending verbatim run.
          index = entityEnd;
          continue;
        }
        escape = '&amp;';
        break;
      }
      case 96: // `
        escape = '&#96;';
        break;
      case 47: // /
        escape = '&#x2F;';
        break;
      case 61: // =
        escape = '&#x3D;';
        break;
      default:
        continue;
    }

    if (lastIndex !== index) {
      html += str.slice(lastIndex, index);
    }
    html += escape;

    lastIndex = index + 1;
  }

  // Flush whatever follows the last escaped character.
  if (lastIndex !== len) {
    html += str.slice(lastIndex);
  }

  return html;
}

export = escapeHTML;
40 changes: 28 additions & 12 deletions lib/unescape_html.ts
Original file line number Diff line number Diff line change
@@ -1,20 +1,36 @@
/**
 * fast-escape-html - MIT License - Made by SukkaW <https://skk.moe>
 * The fastest known HTML unescape function.
 * https://github.com/SukkaW/fast-escape-html
 * https://github.com/SukkaW/fast-escape-html/blob/1bb80ac857f0645b321761cbd0dc0d0098240407/src/unescape.ts
 *
 * This is also modified by SukkaW for use w/ Hexo. Hexo needs to unescape more characters, but I managed
 * to adopt a few techniques from `fast-escape-html` to make this function faster than before
 */

// Specifically uses `Object.create(null)` to make lookup faster (no prototype chain lookup)
const htmlEntityMap: Record<string, string> = Object.create(null);

// Common HTML entities are placed first for faster lookup
htmlEntityMap['&lt;'] = '<';
htmlEntityMap['&gt;'] = '>';
htmlEntityMap['&quot;'] = '"';
htmlEntityMap['&#39;'] = '\'';
htmlEntityMap['&#x3D;'] = '=';
htmlEntityMap['&#x2F;'] = '/';
htmlEntityMap['&amp;'] = '&';
htmlEntityMap['&#96;'] = '`';
// Alternative spellings matched by the regexp below. Every alternative of the regexp MUST
// have an entry here: a replacer that returns `undefined` makes `String.prototype.replace`
// insert the literal text "undefined".
htmlEntityMap['&#60;'] = '<';
htmlEntityMap['&#62;'] = '>';
htmlEntityMap['&#34;'] = '"';
htmlEntityMap['&apos;'] = '\'';
htmlEntityMap['&#38;'] = '&';

// This is specifically hand-crafted regexp to match common HTML entities first (for early return)
const reHtmlEntityGlobal = /&(?:[gl]t|quot|#39|#x(?:3D|2F)|amp|#6[02]|#34|apos|#38|#96);/g;

// Hoist function to maximize the function cache
const replacer = (match: string): string => htmlEntityMap[match];

/**
 * Converts the HTML entities for `& < > " ' \` / =` back into their characters.
 * Entities outside that set (for example `&nbsp;`) are left untouched.
 *
 * @param str - The text to unescape.
 * @returns The unescaped text.
 * @throws TypeError when `str` is not a string.
 */
const unescapeHTML = (str: string): string => {
  if (typeof str !== 'string') throw new TypeError('str must be a string!');

  return str.replace(reHtmlEntityGlobal, replacer);
};

export = unescapeHTML;
8 changes: 6 additions & 2 deletions test/escape_html.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,22 @@ chai.should();

describe('escapeHTML', () => {
  // Tag delimiters, quotes, backticks, "/" and "=" are all turned into entities.
  it('default', () => {
    const doubleQuoted = escapeHTML('<p class="foo">Hello "world".</p>');
    doubleQuoted.should.eql('&lt;p class&#x3D;&quot;foo&quot;&gt;Hello &quot;world&quot;.&lt;&#x2F;p&gt;');

    const backticked = escapeHTML('<p class="foo">Hello `world`.</p>');
    backticked.should.eql('&lt;p class&#x3D;&quot;foo&quot;&gt;Hello &#96;world&#96;.&lt;&#x2F;p&gt;');
  });

  // Calling the function without an argument must raise the type error.
  it('str must be a string', () => {
    escapeHTML.should.throw('str must be a string!');
  });

  // Entities already present in the input stay as they are; everything else is escaped.
  it('avoid double escape', () => {
    const partiallyEscaped = escapeHTML('&lt;foo>bar</foo&gt;');
    partiallyEscaped.should.eql('&lt;foo&gt;bar&lt;&#x2F;foo&gt;');

    const withStrayAmpersand = escapeHTML('&lt;foo>bar</foo&gt;&#x2F;|&6>');
    withStrayAmpersand.should.eql('&lt;foo&gt;bar&lt;&#x2F;foo&gt;&#x2F;|&amp;6&gt;');
  });

  // Named entities placed back to back are returned unchanged.
  it('avoid double escape https://github.com/hexojs/hexo/issues/4946', () => {
    const namedEntities = '&emsp;&nbsp;&ensp;';
    escapeHTML(namedEntities).should.eql(namedEntities);
  });

  // An "&" followed by text without a closing ";" is not an entity and gets escaped.
  it('proper escape', () => {
    const unterminated = escapeHTML('&0');
    unterminated.should.eql('&amp;0');
  });
});
Loading