From 1d7e86c180ad71dad3de4f3506964bef1fd13ba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Gruszczy=C5=84ski?= Date: Fri, 17 Apr 2026 11:39:52 +0200 Subject: [PATCH 1/2] chore: optionally keep original title headers --- Readability.js | 20 ++++-- index.d.ts | 6 ++ test/test-keep-original-title-headers.js | 83 ++++++++++++++++++++++++ test/test-readability.js | 8 +++ 4 files changed, 111 insertions(+), 6 deletions(-) create mode 100644 test/test-keep-original-title-headers.js diff --git a/Readability.js b/Readability.js index 5cff4540..7cf310ef 100644 --- a/Readability.js +++ b/Readability.js @@ -64,6 +64,12 @@ function Readability(doc, options) { this._disableJSONLD = !!options.disableJSONLD; this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos; this._linkDensityModifier = options.linkDensityModifier || 0; + /** + * If true, keep the first in-article H1/H2 that duplicates the article title + * and leave H1 tags in the extracted content. Defaults to false (strip the + * duplicate title header and normalize remaining H1 elements to H2). + */ + this._keepOriginalTitleHeaders = !!options.keepOriginalTitleHeaders; // Start with all flags set this._flags = @@ -835,11 +841,13 @@ Readability.prototype = { this._cleanConditionally(articleContent, "ul"); this._cleanConditionally(articleContent, "div"); - // replace H1 with H2 as H1 should be only title that is displayed separately - this._replaceNodeTags( - this._getAllNodesWithTag(articleContent, ["h1"]), - "h2" - ); + if (!this._keepOriginalTitleHeaders) { + // replace H1 with H2 as H1 should be only title that is displayed separately + this._replaceNodeTags( + this._getAllNodesWithTag(articleContent, ["h1"]), + "h2" + ); + } // Remove extra paragraphs this._removeNodes( @@ -1064,7 +1072,7 @@ Readability.prototype = { var elementsToScore = []; var node = this._doc.documentElement; - let shouldRemoveTitleHeader = true; + let shouldRemoveTitleHeader = !this._keepOriginalTitleHeaders; while (node) { if (node.tagName === "HTML") { diff --git a/index.d.ts b/index.d.ts index 7ad8dd58..11685104 100644 --- a/index.d.ts +++ b/index.d.ts @@ -75,6 +75,12 @@ export interface ReadabilityOptions { * Defaults to 1. */ linkDensityModifier?: number; + /** + * If `true`, the first in-article heading that closely matches the article + * title is kept, and H1 tags in the extracted content are not rewritten to H2. + * Defaults to `false`. + */ + keepOriginalTitleHeaders?: boolean; } export class Readability { diff --git a/test/test-keep-original-title-headers.js b/test/test-keep-original-title-headers.js new file mode 100644 index 00000000..2e53b666 --- /dev/null +++ b/test/test-keep-original-title-headers.js @@ -0,0 +1,83 @@ +/* eslint-env node, mocha */ + +var JSDOM = require("jsdom").JSDOM; +var chai = require("chai"); +var expect = chai.expect; + +var Readability = require("../index").Readability; + +function articleHtml(titleText, headingTag, headingText) { + var long = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " + + "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " + + "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " + + "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + + "in voluptate velit esse cillum dolore eu fugiat nulla pariatur."; + return ( + "" + + titleText + + "
" + + "<" + + headingTag + + ">" + + headingText + + "

" + + long + + "

" + + long + + "

" + ); +} + +describe("keepOriginalTitleHeaders option", function () { + this.timeout(30000); + + it("when false, removes the first heading that duplicates the title and rewrites other H1 to H2", function () { + var titleText = "Readability Title Headers Option Test 7f3a"; + var source = articleHtml(titleText, "h1", titleText); + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc).parse(); + expect(result.content).to.not.include("

"); + expect(result.content).to.not.include("

" + titleText); + expect(result.title).to.eql(titleText); + }); + + it("when true, keeps the duplicate title header as H1 and does not rewrite it to H2", function () { + var titleText = "Readability Title Headers Option Test 7f3b"; + var source = articleHtml(titleText, "h1", titleText); + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc, { + keepOriginalTitleHeaders: true, + }).parse(); + expect(result.content).to.include("

" + titleText + "

"); + expect(result.title).to.eql(titleText); + }); + + it("when false, rewrites a non-title H1 in the article body to H2", function () { + var titleText = "Readability Title Headers Option Test 7f3c"; + var bodyHeading = "Distinct In Article Heading 9z2q"; + var source = articleHtml(titleText, "h1", bodyHeading); + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc).parse(); + expect(result.content).to.include("

" + bodyHeading + "

"); + expect(result.content).to.not.include("

" + bodyHeading); + }); + + it("when true, leaves a non-title H1 in the article body as H1", function () { + var titleText = "Readability Title Headers Option Test 7f3d"; + var bodyHeading = "Distinct In Article Heading 9z2r"; + var source = articleHtml(titleText, "h1", bodyHeading); + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc, { + keepOriginalTitleHeaders: true, + }).parse(); + expect(result.content).to.include("

" + bodyHeading + "

"); + expect(result.content).to.not.include("

" + bodyHeading); + }); +}); diff --git a/test/test-readability.js b/test/test-readability.js index ebd4e618..6842fdaa 100644 --- a/test/test-readability.js +++ b/test/test-readability.js @@ -273,6 +273,14 @@ describe("Readability API", function () { ); }); + it("should accept a keepOriginalTitleHeaders option", function () { + expect(new Readability(doc)._keepOriginalTitleHeaders).eql(false); + expect( + new Readability(doc, { keepOriginalTitleHeaders: true }) + ._keepOriginalTitleHeaders + ).eql(true); + }); + it("should accept a allowedVideoRegex option or default it", function () { expect(new Readability(doc)._allowedVideoRegex).eql( Readability.prototype.REGEXPS.videos From 657a64aa8a4c79e0fffaf58c8cc6309eb6da5777 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Gruszczy=C5=84ski?= Date: Wed, 29 Apr 2026 10:17:59 +0200 Subject: [PATCH 2/2] fix: preserve all h1s before the article subtree --- Readability.js | 100 ++++++++++++++++++++++ index.d.ts | 5 +- test/test-keep-original-title-headers.js | 102 +++++++++++++++++++++++ 3 files changed, 206 insertions(+), 1 deletion(-) diff --git a/Readability.js b/Readability.js index 7cf310ef..96ac3b02 100644 --- a/Readability.js +++ b/Readability.js @@ -68,6 +68,11 @@ function Readability(doc, options) { * If true, keep the first in-article H1/H2 that duplicates the article title * and leave H1 tags in the extracted content. Defaults to false (strip the * duplicate title header and normalize remaining H1 elements to H2). + * When true, also prepend clones of document `h1` nodes that lie outside the + * extracted subtree and precede the grabbed content in document order (for example + * hero headings); snapshots are taken before `_grabArticle` because extraction + * mutates the DOM. Those clones are inserted before `_postProcessContent` so they + * receive URI fixes and class cleanup. */ this._keepOriginalTitleHeaders = !!options.keepOriginalTitleHeaders; @@ -2717,6 +2722,76 @@ Readability.prototype = { return this._textSimilarity(this._articleTitle, heading) > 0.75; }, + /** + * Assign stable preorder indices (depth-first, element-only) so we can compare what + * appeared before extracted content while `_grabArticle` still sees the original tree. + * + * @param Element root + * @param {{ i: number }} counterHolder mutable `{ i }` counter. + */ + _documentPreorderWalk(root, counterHolder) { + if (!root || root.nodeType !== this.ELEMENT_NODE) { + return; + } + this._elementPreorderIndex.set(root, counterHolder.i++); + var child = root.firstElementChild; + while (child) { + this._documentPreorderWalk(child, counterHolder); + child = child.nextElementSibling; + } + }, + + /** + * Prepend `h1` clones that existed elsewhere on the page before extraction (hero, + * etc.), in document order. Snapshots pair each original node with its clone because + * `_grabArticle` may remove or move originals. Only headings whose preorder index is + * strictly before the earliest preorder among nodes inside `articleContent` are kept + * ("before grabbed content"). + * + * @param Element articleContent root returned by `_grabArticle`. + * @param Array<{original: Element, clone: Element, preorder?: number}> snapshots from before grab. + */ + _prependExternalH1HeadingsBeforePostProcess(articleContent, snapshots) { + if (!snapshots || !snapshots.length) { + return; + } + + var minPreorderInGrabbed = Infinity; + var descendants = articleContent.querySelectorAll("*"); + for (var j = 0; j < descendants.length; j++) { + var grabbedPo = this._elementPreorderIndex.get(descendants[j]); + if (grabbedPo !== undefined) { + minPreorderInGrabbed = Math.min(minPreorderInGrabbed, grabbedPo); + } + } + + var fragment = this._doc.createDocumentFragment(); + + for (var i = 0; i < snapshots.length; i++) { + var entry = snapshots[i]; + if (articleContent.contains(entry.original)) { + continue; + } + if (!this._isProbablyVisible(entry.original)) { + continue; + } + if ( + entry.preorder === undefined || + minPreorderInGrabbed === Infinity || + entry.preorder >= minPreorderInGrabbed + ) { + continue; + } + fragment.appendChild(entry.clone); + } + + if (!fragment.childNodes.length) { + return; + } + + articleContent.insertBefore(fragment, articleContent.firstChild); + }, + _flagIsActive(flag) { return (this._flags & flag) > 0; }, @@ -2778,6 +2853,24 @@ Readability.prototype = { this._metadata = metadata; this._articleTitle = metadata.title; + var prefgrabH1Snapshots = null; + if (this._keepOriginalTitleHeaders) { + this._elementPreorderIndex = new WeakMap(); + var preorderCounter = { i: 0 }; + this._documentPreorderWalk(this._doc.documentElement, preorderCounter); + + prefgrabH1Snapshots = Array.from( + this._doc.getElementsByTagName("h1"), + function (h) { + return { + original: h, + clone: h.cloneNode(true), + preorder: this._elementPreorderIndex.get(h), + }; + }.bind(this) + ); + } + var articleContent = this._grabArticle(); if (!articleContent) { return null; @@ -2785,6 +2878,13 @@ Readability.prototype = { this.log("Grabbed: " + articleContent.innerHTML); + if (prefgrabH1Snapshots) { + this._prependExternalH1HeadingsBeforePostProcess( + articleContent, + prefgrabH1Snapshots + ); + } + this._postProcessContent(articleContent); // If we haven't found an excerpt in the article's metadata, use the article's diff --git a/index.d.ts b/index.d.ts index 11685104..61af8845 100644 --- a/index.d.ts +++ b/index.d.ts @@ -78,7 +78,10 @@ export interface ReadabilityOptions { /** * If `true`, the first in-article heading that closely matches the article * title is kept, and H1 tags in the extracted content are not rewritten to H2. - * Defaults to `false`. + * When `true`, also prepends clones of those `h1` elements that lie outside the + * extracted subtree **and** precede the grabbed content in document order (for example + * hero titles), captured before extraction so they still run through post-processing + * (relative URL fixes, etc.). Defaults to `false`. */ keepOriginalTitleHeaders?: boolean; } diff --git a/test/test-keep-original-title-headers.js b/test/test-keep-original-title-headers.js index 2e53b666..b063f762 100644 --- a/test/test-keep-original-title-headers.js +++ b/test/test-keep-original-title-headers.js @@ -80,4 +80,106 @@ describe("keepOriginalTitleHeaders option", function () { expect(result.content).to.include("

" + bodyHeading + "

"); expect(result.content).to.not.include("

" + bodyHeading); }); + + it("when true, prepends clones of document-level H1 outside the extracted subtree (before post-processing)", function () { + var titleText = "Readability External Hero H1 Title Option Test 9x4m"; + var long = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " + + "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " + + "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " + + "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + + "in voluptate velit esse cillum dolore eu fugiat nulla pariatur."; + var source = + "" + + titleText + + "
" + + '

' + + titleText + + "

" + + "

" + + long + + "

" + + long + + "

" + + "
"; + + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc, { + keepOriginalTitleHeaders: true, + }).parse(); + + expect(result.content).to.include("

" + titleText + "

"); + expect(result.content.indexOf("

" + titleText)).to.be.lessThan( + result.content.indexOf('id="readability-page-1"') + ); + expect(result.title).to.eql(titleText); + }); + + it("when true, does not prepend H1 that appear after grabbed content in document order", function () { + var titleText = + "Readability Article Title After Hero Ignore Later H1 Test 9x5p"; + var sidebarHeading = "Sidebar Or Footer H1 Must Not Prepend 9x5q"; + var long = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " + + "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " + + "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " + + "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + + "in voluptate velit esse cillum dolore eu fugiat nulla pariatur."; + var source = + "" + + titleText + + "
" + + "

" + + long + + "

" + + long + + "

" + + "" + + "
"; + + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc, { + keepOriginalTitleHeaders: true, + }).parse(); + + expect(result.content).to.not.include(sidebarHeading); + expect(result.title).to.eql(titleText); + }); + + it("when false, does not prepend hero H1 from outside the extracted subtree", function () { + var titleText = + "Readability External Hero H1 Absent When Option False Test 9x4n"; + var long = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " + + "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " + + "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " + + "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + + "in voluptate velit esse cillum dolore eu fugiat nulla pariatur."; + var source = + "" + + titleText + + "
" + + '

' + + titleText + + "

" + + "

" + + long + + "

" + + long + + "

" + + "
"; + + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc).parse(); + + expect(result.content).to.not.include("