From 1d7e86c180ad71dad3de4f3506964bef1fd13ba7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Gruszczy=C5=84ski?= <maciek.gruszka@gmail.com>
Date: Fri, 17 Apr 2026 11:39:52 +0200
Subject: [PATCH 1/2] chore: optionally keep original title headers

---
 Readability.js                           | 20 ++++--
 index.d.ts                               |  6 ++
 test/test-keep-original-title-headers.js | 83 ++++++++++++++++++++++++
 test/test-readability.js                 |  8 +++
 4 files changed, 111 insertions(+), 6 deletions(-)
 create mode 100644 test/test-keep-original-title-headers.js
diff --git a/Readability.js b/Readability.js
index 5cff4540..7cf310ef 100644
--- a/Readability.js
+++ b/Readability.js
@@ -64,6 +64,12 @@ function Readability(doc, options) {
   this._disableJSONLD = !!options.disableJSONLD;
   this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
   this._linkDensityModifier = options.linkDensityModifier || 0;
+  /**
+   * If true, keep the first in-article H1/H2 that duplicates the article title
+   * and leave H1 tags in the extracted content. Defaults to false (strip the
+   * duplicate title header and normalize remaining H1 elements to H2).
+   */
+  this._keepOriginalTitleHeaders = !!options.keepOriginalTitleHeaders;
 
   // Start with all flags set
   this._flags =
@@ -835,11 +841,13 @@ Readability.prototype = {
     this._cleanConditionally(articleContent, "ul");
     this._cleanConditionally(articleContent, "div");
 
-    // replace H1 with H2 as H1 should be only title that is displayed separately
-    this._replaceNodeTags(
-      this._getAllNodesWithTag(articleContent, ["h1"]),
-      "h2"
-    );
+    if (!this._keepOriginalTitleHeaders) {
+      // replace H1 with H2 as H1 should be only title that is displayed separately
+      this._replaceNodeTags(
+        this._getAllNodesWithTag(articleContent, ["h1"]),
+        "h2"
+      );
+    }
 
     // Remove extra paragraphs
     this._removeNodes(
@@ -1064,7 +1072,7 @@ Readability.prototype = {
       var elementsToScore = [];
       var node = this._doc.documentElement;
 
-      let shouldRemoveTitleHeader = true;
+      let shouldRemoveTitleHeader = !this._keepOriginalTitleHeaders;
 
       while (node) {
         if (node.tagName === "HTML") {
diff --git a/index.d.ts b/index.d.ts
index 7ad8dd58..11685104 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -75,6 +75,12 @@ export interface ReadabilityOptions<T = string> {
    * Defaults to 1.
    */
   linkDensityModifier?: number;
+  /**
+   * If `true`, the first in-article heading that closely matches the article
+   * title is kept, and H1 tags in the extracted content are not rewritten to H2.
+   * Defaults to `false`.
+   */
+  keepOriginalTitleHeaders?: boolean;
 }
 
 export class Readability<T = string> {
diff --git a/test/test-keep-original-title-headers.js b/test/test-keep-original-title-headers.js
new file mode 100644
index 00000000..2e53b666
--- /dev/null
+++ b/test/test-keep-original-title-headers.js
@@ -0,0 +1,83 @@
+/* eslint-env node, mocha */
+
+var JSDOM = require("jsdom").JSDOM;
+var chai = require("chai");
+var expect = chai.expect;
+
+var Readability = require("../index").Readability;
+
+function articleHtml(titleText, headingTag, headingText) {
+  var long =
+    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " +
+    "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " +
+    "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " +
+    "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " +
+    "in voluptate velit esse cillum dolore eu fugiat nulla pariatur.";
+  return (
+    "<!DOCTYPE html><html><head><title>" +
+    titleText +
+    "</title></head><body><article>" +
+    "<" +
+    headingTag +
+    ">" +
+    headingText +
+    "</" +
+    headingTag +
+    "><p>" +
+    long +
+    "</p><p>" +
+    long +
+    "</p></article></body></html>"
+  );
+}
+
+describe("keepOriginalTitleHeaders option", function () {
+  this.timeout(30000);
+
+  it("when false, removes the first heading that duplicates the title and rewrites other H1 to H2", function () {
+    var titleText = "Readability Title Headers Option Test 7f3a";
+    var source = articleHtml(titleText, "h1", titleText);
+    var doc = new JSDOM(source, { url: "http://example.com/article" }).window
+      .document;
+    var result = new Readability(doc).parse();
+    expect(result.content).to.not.include("<h1>");
+    expect(result.content).to.not.include("<h2>" + titleText);
+    expect(result.title).to.eql(titleText);
+  });
+
+  it("when true, keeps the duplicate title header as H1 and does not rewrite it to H2", function () {
+    var titleText = "Readability Title Headers Option Test 7f3b";
+    var source = articleHtml(titleText, "h1", titleText);
+    var doc = new JSDOM(source, { url: "http://example.com/article" }).window
+      .document;
+    var result = new Readability(doc, {
+      keepOriginalTitleHeaders: true,
+    }).parse();
+    expect(result.content).to.include("<h1>" + titleText + "</h1>");
+    expect(result.title).to.eql(titleText);
+  });
+
+  it("when false, rewrites a non-title H1 in the article body to H2", function () {
+    var titleText = "Readability Title Headers Option Test 7f3c";
+    var bodyHeading = "Distinct In Article Heading 9z2q";
+    var source = articleHtml(titleText, "h1", bodyHeading);
+    var doc = new JSDOM(source, { url: "http://example.com/article" }).window
+      .document;
+    var result = new Readability(doc).parse();
+    expect(result.content).to.include("<h2>" + bodyHeading + "</h2>");
+    expect(result.content).to.not.include("<h1>" + bodyHeading);
+  });
+
+  it("when true, leaves a non-title H1 in the article body as H1", function () {
+    var titleText = "Readability Title Headers Option Test 7f3d";
+    var bodyHeading = "Distinct In Article Heading 9z2r";
+    var source = articleHtml(titleText, "h1", bodyHeading);
+    var doc = new JSDOM(source, { url: "http://example.com/article" }).window
+      .document;
+    var result = new Readability(doc, {
+      keepOriginalTitleHeaders: true,
+    }).parse();
+    expect(result.content).to.include("<h1>" + bodyHeading + "</h1>");
+    expect(result.content).to.not.include("<h2>" + bodyHeading);
+  });
+});
diff --git a/test/test-readability.js b/test/test-readability.js
index ebd4e618..6842fdaa 100644
--- a/test/test-readability.js
+++ b/test/test-readability.js
@@ -273,6 +273,14 @@ describe("Readability API", function () {
       );
     });
 
+    it("should accept a keepOriginalTitleHeaders option", function () {
+      expect(new Readability(doc)._keepOriginalTitleHeaders).eql(false);
+      expect(
+        new Readability(doc, { keepOriginalTitleHeaders: true })
+          ._keepOriginalTitleHeaders
+      ).eql(true);
+    });
+
     it("should accept a allowedVideoRegex option or default it", function () {
       expect(new Readability(doc)._allowedVideoRegex).eql(
         Readability.prototype.REGEXPS.videos

From 657a64aa8a4c79e0fffaf58c8cc6309eb6da5777 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Gruszczy=C5=84ski?= <maciek.gruszka@gmail.com>
Date: Wed, 29 Apr 2026 10:17:59 +0200
Subject: [PATCH 2/2] fix: preserve all h1s before the article subtree

---
 Readability.js                           | 100 ++++++++++++++++++++++
 index.d.ts                               |   5 +-
 test/test-keep-original-title-headers.js | 102 +++++++++++++++++++++++
 3 files changed, 206 insertions(+), 1 deletion(-)

diff --git a/Readability.js b/Readability.js
index 7cf310ef..96ac3b02 100644
--- a/Readability.js
+++ b/Readability.js
@@ -68,6 +68,11 @@ function Readability(doc, options) {
    * If true, keep the first in-article H1/H2 that duplicates the article title
    * and leave H1 tags in the extracted content. Defaults to false (strip the
    * duplicate title header and normalize remaining H1 elements to H2).
+   * When true, also prepend clones of document `h1` nodes that lie outside the
+   * extracted subtree and precede the grabbed content in document order (for example
+   * hero headings); snapshots are taken before `_grabArticle` because extraction
+   * mutates the DOM. Those clones are inserted before `_postProcessContent` so they
+   * receive URI fixes and class cleanup.
    */
   this._keepOriginalTitleHeaders = !!options.keepOriginalTitleHeaders;
 
@@ -2717,6 +2722,76 @@ Readability.prototype = {
     return this._textSimilarity(this._articleTitle, heading) > 0.75;
   },
 
+  /**
+   * Record each element's depth-first preorder position in `this._elementPreorderIndex`
+   * so document order can still be compared after `_grabArticle` has mutated the tree.
+   *
+   * @param Element root subtree root; ignored when missing or not an element node.
+   * @param {{ i: number }} counterHolder shared counter; `i` is the next index to assign.
+   */
+  _documentPreorderWalk(root, counterHolder) {
+    if (!root || root.nodeType !== this.ELEMENT_NODE) {
+      return;
+    }
+    this._elementPreorderIndex.set(root, counterHolder.i++);
+    var child = root.firstElementChild;
+    while (child) {
+      this._documentPreorderWalk(child, counterHolder);
+      child = child.nextElementSibling;
+    }
+  },
+
+  /**
+   * Prepend clones of `h1` elements snapshotted before extraction (hero headings, etc.)
+   * to `articleContent`, in snapshot order; clones are used because `_grabArticle` may
+   * remove or move the originals. A snapshot is skipped when its original is inside
+   * `articleContent`, fails `_isProbablyVisible`, or lacks a preorder index strictly below
+   * the smallest index recorded among `articleContent` descendants (all skipped if none).
+   *
+   * @param Element articleContent root returned by `_grabArticle`; modified in place.
+   * @param Array<{original: Element, clone: Element, preorder?: number}> snapshots from before grab.
+   */
+  _prependExternalH1HeadingsBeforePostProcess(articleContent, snapshots) {
+    if (!snapshots || !snapshots.length) {
+      return;
+    }
+
+    var minPreorderInGrabbed = Infinity;
+    var descendants = articleContent.querySelectorAll("*");
+    for (var j = 0; j < descendants.length; j++) {
+      var grabbedPo = this._elementPreorderIndex.get(descendants[j]);
+      if (grabbedPo !== undefined) {
+        minPreorderInGrabbed = Math.min(minPreorderInGrabbed, grabbedPo);
+      }
+    }
+
+    var fragment = this._doc.createDocumentFragment();
+
+    for (var i = 0; i < snapshots.length; i++) {
+      var entry = snapshots[i];
+      if (articleContent.contains(entry.original)) {
+        continue;
+      }
+      if (!this._isProbablyVisible(entry.original)) {
+        continue;
+      }
+      if (
+        entry.preorder === undefined ||
+        minPreorderInGrabbed === Infinity ||
+        entry.preorder >= minPreorderInGrabbed
+      ) {
+        continue;
+      }
+      fragment.appendChild(entry.clone);
+    }
+
+    if (!fragment.childNodes.length) {
+      return;
+    }
+
+    articleContent.insertBefore(fragment, articleContent.firstChild);
+  },
+
   _flagIsActive(flag) {
     return (this._flags & flag) > 0;
   },
@@ -2778,6 +2853,24 @@ Readability.prototype = {
     this._metadata = metadata;
     this._articleTitle = metadata.title;
 
+    var prefgrabH1Snapshots = null;
+    if (this._keepOriginalTitleHeaders) {
+      this._elementPreorderIndex = new WeakMap();
+      var preorderCounter = { i: 0 };
+      this._documentPreorderWalk(this._doc.documentElement, preorderCounter);
+
+      prefgrabH1Snapshots = Array.from(
+        this._doc.getElementsByTagName("h1"),
+        function (h) {
+          return {
+            original: h,
+            clone: h.cloneNode(true),
+            preorder: this._elementPreorderIndex.get(h),
+          };
+        }.bind(this)
+      );
+    }
+
     var articleContent = this._grabArticle();
     if (!articleContent) {
       return null;
@@ -2785,6 +2878,13 @@ Readability.prototype = {
 
     this.log("Grabbed: " + articleContent.innerHTML);
 
+    if (prefgrabH1Snapshots) {
+      this._prependExternalH1HeadingsBeforePostProcess(
+        articleContent,
+        prefgrabH1Snapshots
+      );
+    }
+
     this._postProcessContent(articleContent);
 
     // If we haven't found an excerpt in the article's metadata, use the article's
diff --git a/index.d.ts b/index.d.ts
index 11685104..61af8845 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -78,7 +78,10 @@ export interface ReadabilityOptions<T = string> {
   /**
    * If `true`, the first in-article heading that closely matches the article
    * title is kept, and H1 tags in the extracted content are not rewritten to H2.
-   * Defaults to `false`.
+   * When `true`, also prepends clones of document `h1` elements that lie outside the
+   * extracted subtree **and** precede the grabbed content in document order (for example
+   * hero titles). The clones are taken before extraction and inserted before
+   * post-processing, so relative URL fixes etc. still apply. Defaults to `false`.
    */
   keepOriginalTitleHeaders?: boolean;
 }
diff --git a/test/test-keep-original-title-headers.js b/test/test-keep-original-title-headers.js
index 2e53b666..b063f762 100644
--- a/test/test-keep-original-title-headers.js
+++ b/test/test-keep-original-title-headers.js
@@ -80,4 +80,106 @@ describe("keepOriginalTitleHeaders option", function () {
     expect(result.content).to.include("<h1>" + bodyHeading + "</h1>");
     expect(result.content).to.not.include("<h2>" + bodyHeading);
   });
+
+  it("when true, prepends clones of document-level H1 outside the extracted subtree (before post-processing)", function () {
+    var titleText = "Readability External Hero H1 Title Option Test 9x4m";
+    var long =
+      "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " +
+      "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " +
+      "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " +
+      "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " +
+      "in voluptate velit esse cillum dolore eu fugiat nulla pariatur.";
+    var source =
+      "<!DOCTYPE html><html><head><title>" +
+      titleText +
+      "</title></head><body><main>" +
+      '<section class="hero"><h1 class="hero-title">' +
+      titleText +
+      "</h1></section>" +
+      "<article><p>" +
+      long +
+      "</p><p>" +
+      long +
+      "</p></article>" +
+      "</main></body></html>";
+
+    var doc = new JSDOM(source, { url: "http://example.com/article" }).window
+      .document;
+    var result = new Readability(doc, {
+      keepOriginalTitleHeaders: true,
+    }).parse();
+
+    expect(result.content).to.include("<h1>" + titleText + "</h1>");
+    expect(result.content.indexOf("<h1>" + titleText)).to.be.lessThan(
+      result.content.indexOf('id="readability-page-1"')
+    );
+    expect(result.title).to.eql(titleText);
+  });
+
+  it("when true, does not prepend H1 that appear after grabbed content in document order", function () {
+    var titleText =
+      "Readability Article Title After Hero Ignore Later H1 Test 9x5p";
+    var sidebarHeading = "Sidebar Or Footer H1 Must Not Prepend 9x5q";
+    var long =
+      "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " +
+      "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " +
+      "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " +
+      "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " +
+      "in voluptate velit esse cillum dolore eu fugiat nulla pariatur.";
+    var source =
+      "<!DOCTYPE html><html><head><title>" +
+      titleText +
+      "</title></head><body><main>" +
+      "<article><p>" +
+      long +
+      "</p><p>" +
+      long +
+      "</p></article>" +
+      "<aside><h1>" +
+      sidebarHeading +
+      "</h1><p>" +
+      long +
+      "</p></aside>" +
+      "</main></body></html>";
+
+    var doc = new JSDOM(source, { url: "http://example.com/article" }).window
+      .document;
+    var result = new Readability(doc, {
+      keepOriginalTitleHeaders: true,
+    }).parse();
+
+    expect(result.content).to.not.include(sidebarHeading);
+    expect(result.title).to.eql(titleText);
+  });
+
+  it("when false, does not prepend hero H1 from outside the extracted subtree", function () {
+    var titleText =
+      "Readability External Hero H1 Absent When Option False Test 9x4n";
+    var long =
+      "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " +
+      "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " +
+      "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " +
+      "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " +
+      "in voluptate velit esse cillum dolore eu fugiat nulla pariatur.";
+    var source =
+      "<!DOCTYPE html><html><head><title>" +
+      titleText +
+      "</title></head><body><main>" +
+      '<section class="hero"><h1 class="hero-title">' +
+      titleText +
+      "</h1></section>" +
+      "<article><p>" +
+      long +
+      "</p><p>" +
+      long +
+      "</p></article>" +
+      "</main></body></html>";
+
+    var doc = new JSDOM(source, { url: "http://example.com/article" }).window
+      .document;
+    var result = new Readability(doc).parse();
+
+    expect(result.content).to.not.include("<h1");
+    expect(result.title).to.eql(titleText);
+  });
 });