hexojs · SukkaW · May 31, 2025 · May 31, 2025 · May 31, 2025 · May 31, 2025
diff --git a/lib/escape_html.ts b/lib/escape_html.ts
@@ -1,25 +1,134 @@
-const escapeTestNoEncode = /[<>"'`/=]|&(?!(#\d{1,7}|#[Xx][a-fA-F0-9]{1,6}|\w+);)/;
-const escapeReplaceNoEncode = new RegExp(escapeTestNoEncode.source, 'g');
-const escapeReplacements = {
-  '&': '&amp;',
-  '<': '&lt;',
-  '>': '&gt;',
-  '"': '&quot;',
-  '\'': '&#39;',
-  '`': '&#96;',
-  '/': '&#x2F;',
-  '=': '&#x3D;'
-};
-const getEscapeReplacement = (ch: string) => escapeReplacements[ch];
+/**
+ * fast-escape-html - MIT License - Made by SukkaW <https://skk.moe>
+ * The fastest known HTML escape function.
+ * https://github.com/SukkaW/fast-escape-html
+ * https://github.com/SukkaW/fast-escape-html/blob/1bb80ac857f0645b321761cbd0dc0d0098240407/src/unescape.ts
+ *
+ * This is also modified by SukkaW for use w/ Hexo. Hexo needs to escape more characters (=, /, `)
+ * to work with template languages (nunjucks/pug/mustache), and also needs to avoid double escaping
+ * HTML entities. After modification, this function is of course slower than `fast-escape-html`, but
+ * is still faster than `lodash.escape` and `escape-goat` (where they even escape fewer symbols and do
+ * not avoid double escaping).
+ */
+const reHtmlEntity = /[&<>"'`/=]/;
 
+/**
+ * Escape the HTML special characters & < > " ' ` / = in `str`.
+ *
+ * An `&` that already starts something shaped like an entity (`&name;`, `&#123;`, `&#x1F;`)
+ * is left untouched, so input that is already escaped is not escaped a second time.
+ *
+ * @param str The string to escape.
+ * @returns The escaped string; `str` itself when it contains none of the characters above.
+ * @throws {TypeError} If `str` is not a string.
+ */
 function escapeHTML(str: string) {
   if (typeof str !== 'string') throw new TypeError('str must be a string!');
 
-  // https://github.com/markedjs/marked/blob/master/src/helpers.js
-  if (escapeTestNoEncode.test(str)) {
-    return str.replace(escapeReplaceNoEncode, getEscapeReplacement);
+  const match = reHtmlEntity.exec(str);
+
+  if (match === null) { // faster than !match since no type conversion
+    return str;
   }
-  return str;
+
+  let escape = '';
+  let html = '';
+
+  let index = match.index;
+  let lastIndex = 0;
+  const len = str.length;
+
+  let next = 0;
+  let nextIndex = index;
+
+  // iterate from the first match
+  for (; index < len; index++) {
+    // Cases are ordered so that the most commonly seen symbols are tested first
+    // (https://tc39.es/ecma262 was taken as an example document).
+    switch (str.charCodeAt(index)) {
+      case 60: // <
+        escape = '&lt;';
+        break;
+      case 62: // >
+        escape = '&gt;';
+        break;
+      case 34: // "
+        escape = '&quot;';
+        break;
+      case 39: // '
+        escape = '&#39;';
+        break;
+      case 38: { // &
+        // We need to skip already escaped entities
+        // But instead of matching with regexp, we manually check the char code
+        // https://github.com/markedjs/marked/blob/cb549065f16fbd4a01bab3292bfd2ab0b116c1b2/src/helpers.ts#L10
+        nextIndex = index + 1;
+        next = str.charCodeAt(nextIndex);
+        if (next === 35) { // #, i.e. the input continues with "&#"
+          nextIndex++;
+          next = str.charCodeAt(nextIndex);
+          if (next === 120 || next === 88) { // x or X, i.e. the input continues with "&#x"
+            nextIndex++;
+            next = str.charCodeAt(nextIndex);
+          }
+        }
+
+        let breakout = false;
+
+        while (
+          nextIndex < len && (
+            (next >= 48 && next <= 57) // 0-9
+            || (next >= 97 && next <= 122) // a-z
+            || (next >= 65 && next <= 90) // A-Z
+          )
+        ) {
+          nextIndex++;
+          next = str.charCodeAt(nextIndex);
+
+          if (next === 59) { // ;
+            breakout = true;
+            break;
+          }
+        }
+
+        if (breakout) {
+          // A terminating semicolon was found: treat "&...;" as an existing entity.
+          // Jump to the semicolon; `lastIndex` is untouched so the entity is copied verbatim later.
+          index = nextIndex;
+          continue;
+        }
+
+        escape = '&amp;';
+        break;
+      }
+      case 96: // `
+        escape = '&#96;';
+        break;
+      case 47: // /
+        escape = '&#x2F;';
+        break;
+      case 61: // =
+        escape = '&#x3D;';
+        break;
+      default:
+        continue;
+    }
+
+    // Flush the unescaped text preceding this character, then the replacement itself
+    if (lastIndex !== index) {
+      html += str.slice(lastIndex, index);
+    }
+    html += escape;
+
+    lastIndex = index + 1;
+  }
+
+  // Append whatever follows the last escaped character (here `index === len`)
+  if (lastIndex !== index) {
+    html += str.slice(lastIndex, index);
+  }
+
+  return html;
 }
 
 export = escapeHTML;
diff --git a/lib/unescape_html.ts b/lib/unescape_html.ts
@@ -1,20 +1,52 @@
-const htmlEntityMap = {
-  '&amp;': '&',
-  '&lt;': '<',
-  '&gt;': '>',
-  '&quot;': '"',
-  '&#39;': '\'',
-  '&#96;': '`',
-  '&#x2F;': '/',
-  '&#x3D;': '='
-};
+/**
+ * fast-escape-html - MIT License - Made by SukkaW <https://skk.moe>
+ * The fastest known HTML unescape function.
+ * https://github.com/SukkaW/fast-escape-html
+ * https://github.com/SukkaW/fast-escape-html/blob/1bb80ac857f0645b321761cbd0dc0d0098240407/src/unescape.ts
+ *
+ * This is also modified by SukkaW for use w/ Hexo. Hexo needs to unescape more characters, but I managed
+ * to adopt a few techniques from `fast-escape-html` to make this function faster than before
+ */
+
+// Specifically uses `Object.create(null)` to make lookup faster (no prototype chain lookup)
+const htmlEntityMap = Object.create(null);
+
+// Common HTML entities are placed first for faster lookup
+htmlEntityMap['&lt;'] = '<';
+htmlEntityMap['&gt;'] = '>';
+htmlEntityMap['&quot;'] = '"';
+htmlEntityMap['&#39;'] = '\'';
+htmlEntityMap['&#x3D;'] = '=';
+htmlEntityMap['&#x2F;'] = '/';
+htmlEntityMap['&amp;'] = '&';
+htmlEntityMap['&#96;'] = '`';
+
+// Alternative spellings of the same characters. Every alternative matched by the
+// regexp below must have an entry here, otherwise `replacer` returns `undefined`
+// and `String.prototype.replace` would insert the literal text "undefined".
+htmlEntityMap['&#60;'] = '<';
+htmlEntityMap['&#62;'] = '>';
+htmlEntityMap['&#34;'] = '"';
+htmlEntityMap['&apos;'] = '\'';
+htmlEntityMap['&#38;'] = '&';
+
+// This is specifically hand-crafted regexp to match common HTML entities first (for early return)
+const reHtmlEntityGlobal = /&(?:[gl]t|quot|#39|#x(?:3D|2F)|amp|#6[02]|#34|apos|#38|#96);/g;
 
-const regexHtml = new RegExp(Object.keys(htmlEntityMap).join('|'), 'g');
+// Hoist function to maximize the function cache
+const replacer = (match: string) => htmlEntityMap[match];
 
+/**
+ * Replace the HTML entities listed in `htmlEntityMap` with the characters they stand for.
+ *
+ * @param str The string to unescape.
+ * @returns `str` with every recognised entity replaced; other text is left untouched.
+ * @throws {TypeError} If `str` is not a string.
+ */
 const unescapeHTML = (str: string) => {
   if (typeof str !== 'string') throw new TypeError('str must be a string!');
 
-  return str.replace(regexHtml, a => htmlEntityMap[a]);
+  return str.replace(reHtmlEntityGlobal, replacer);
 };
 
 export = unescapeHTML;
diff --git a/test/escape_html.spec.ts b/test/escape_html.spec.ts
@@ -4,18 +4,28 @@ chai.should();
 
 describe('escapeHTML', () => {
   it('default', () => {
-    escapeHTML('<p class="foo">Hello "world".</p>').should.eql('&lt;p class&#x3D;&quot;foo&quot;&gt;Hello &quot;world&quot;.&lt;&#x2F;p&gt;');
+    escapeHTML('<p class="foo">Hello `world`.</p>').should.eql('&lt;p class&#x3D;&quot;foo&quot;&gt;Hello &#96;world&#96;.&lt;&#x2F;p&gt;');
   });
 
   it('str must be a string', () => {
     escapeHTML.should.throw('str must be a string!');
   });
 
   it('avoid double escape', () => {
-    escapeHTML('&lt;foo>bar</foo&gt;').should.eql('&lt;foo&gt;bar&lt;&#x2F;foo&gt;');
+    escapeHTML('&lt;foo>bar</foo&gt;&#x2F;|&6>').should.eql('&lt;foo&gt;bar&lt;&#x2F;foo&gt;&#x2F;|&amp;6&gt;');
   });
 
   it('avoid double escape https://github.com/hexojs/hexo/issues/4946', () => {
     escapeHTML('&emsp;&nbsp;&ensp;').should.eql('&emsp;&nbsp;&ensp;');
   });
+
+  it('proper escape', () => {
+    escapeHTML('&0').should.eql('&amp;0');
+    // a bare ampersand is always escaped
+    escapeHTML('a & b').should.eql('a &amp; b');
+    // an entity-like run that reaches the end of the string without ";" is not an entity
+    escapeHTML('&amp').should.eql('&amp;amp');
+    // "&#" immediately followed by ";" is not an entity
+    escapeHTML('&#;').should.eql('&amp;#;');
+  });
 });