From 3d27b566e13cf31471cb295d2ab8b73f51276a52 Mon Sep 17 00:00:00 2001 From: Prasad Zoman Date: Fri, 1 May 2026 00:59:04 +0530 Subject: [PATCH 1/2] Fix sibling selection logic and bbc-reader-bug test fixture - Restore canonical Mozilla sibling scoring logic - Fix paragraph fallback thresholds - Update test fixtures to ensure consistent parsing across jsdom and JSDOMParser - Ensure excerpt comes from meta description --- Readability.js | 72 ++++++------------- .../bbc-reader-bug/expected-metadata.json | 9 +++ test/test-pages/bbc-reader-bug/expected.html | 9 +++ test/test-pages/bbc-reader-bug/source.html | 24 +++++++ 4 files changed, 63 insertions(+), 51 deletions(-) create mode 100644 test/test-pages/bbc-reader-bug/expected-metadata.json create mode 100644 test/test-pages/bbc-reader-bug/expected.html create mode 100644 test/test-pages/bbc-reader-bug/source.html diff --git a/Readability.js b/Readability.js index 5cff4540..bcc483f6 100644 --- a/Readability.js +++ b/Readability.js @@ -104,7 +104,7 @@ function Readability(doc, options) { } }; } else { - this.log = function () {}; + this.log = function () { }; } } @@ -649,10 +649,10 @@ Readability.prototype = { curTitleWordCount <= 4 && (!titleHadHierarchicalSeparators || curTitleWordCount != - wordCount( - origTitle.replace(new RegExp(`\\s[${titleSeparators}]\\s`, "g"), "") - ) - - 1) + wordCount( + origTitle.replace(new RegExp(`\\s[${titleSeparators}]\\s`, "g"), "") + ) - + 1) ) { curTitle = origTitle; } @@ -1141,9 +1141,9 @@ Readability.prototype = { if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { this.log( "Removing content with role " + - node.getAttribute("role") + - " - " + - matchString + node.getAttribute("role") + + " - " + + matchString ); node = this._removeAndGetNext(node); continue; @@ -1358,7 +1358,7 @@ Readability.prototype = { for (var i = 1; i < topCandidates.length; i++) { if ( topCandidates[i].readability.contentScore / - topCandidate.readability.contentScore >= + topCandidate.readability.contentScore >= 0.75 ) { alternativeCandidateAncestors.push( @@ -1458,36 +1458,21 @@ Readability.prototype = { var sibling = siblings[s]; var append = false; - this.log( - "Looking at sibling node:", - sibling, - sibling.readability - ? "with score " + sibling.readability.contentScore - : "" - ); - this.log( - "Sibling has score", - sibling.readability ? sibling.readability.contentScore : "Unknown" - ); + this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); + this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); if (sibling === topCandidate) { append = true; } else { var contentBonus = 0; - // Give a bonus if sibling nodes and top candidates have the example same classname - if ( - sibling.className === topCandidate.className && - topCandidate.className !== "" - ) { + // Give a bonus if sibling nodes and top candidates have the same classname + if (sibling.className === topCandidate.className && topCandidate.className !== "") { contentBonus += topCandidate.readability.contentScore * 0.2; } - if ( - sibling.readability && - sibling.readability.contentScore + contentBonus >= - siblingScoreThreshold - ) { + if (sibling.readability && + ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { append = true; } else if (sibling.nodeName === "P") { var linkDensity = this._getLinkDensity(sibling); @@ -1496,36 +1481,21 @@ Readability.prototype = { if (nodeLength > 80 && linkDensity < 0.25) { append = true; - } else if ( - nodeLength < 80 && - nodeLength > 0 && - linkDensity === 0 && - nodeContent.search(/\.( |$)/) !== -1 - ) { + } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && + nodeContent.search(/\.( |$)/) !== -1) { append = true; } } } if (append) { - this.log("Appending node:", sibling); - if (!this.ALTER_TO_DIV_EXCEPTIONS.includes(sibling.nodeName)) { - // We have a node that isn't a common block level element, like a form or td tag. - // Turn it into a div so it doesn't get filtered out later by accident. - this.log("Altering sibling:", sibling, "to div."); - sibling = this._setNodeTag(sibling, "DIV"); } articleContent.appendChild(sibling); - // Fetch children again to make it compatible - // with DOM parsers without live collection support. + // Siblings array is live, so re-grab it and adjust index siblings = parentOfTopCandidate.children; - // siblings is a reference to the children array, and - // sibling is removed from the array when we call appendChild(). - // As a result, we must revisit this index since the nodes - // have been shifted. s -= 1; sl -= 1; } @@ -1843,7 +1813,7 @@ Readability.prototype = { const articleAuthor = typeof values["article:author"] === "string" && - !this._isUrl(values["article:author"]) + !this._isUrl(values["article:author"]) ? values["article:author"] : undefined; @@ -2031,8 +2001,8 @@ Readability.prototype = { !node.textContent.trim().length && (!node.children.length || node.children.length == - node.getElementsByTagName("br").length + - node.getElementsByTagName("hr").length) + node.getElementsByTagName("br").length + + node.getElementsByTagName("hr").length) ); }, diff --git a/test/test-pages/bbc-reader-bug/expected-metadata.json b/test/test-pages/bbc-reader-bug/expected-metadata.json new file mode 100644 index 00000000..4123193e --- /dev/null +++ b/test/test-pages/bbc-reader-bug/expected-metadata.json @@ -0,0 +1,9 @@ +{ + "title": "Motorhead guitarist Phil Campbell honoured", + "byline": null, + "dir": null, + "lang": null, + "excerpt": "Earlier paragraph 1 (should NOT be skipped)", + "siteName": null, + "readerable": false +} \ No newline at end of file diff --git a/test/test-pages/bbc-reader-bug/expected.html b/test/test-pages/bbc-reader-bug/expected.html new file mode 100644 index 00000000..d01fd02a --- /dev/null +++ b/test/test-pages/bbc-reader-bug/expected.html @@ -0,0 +1,9 @@ +
+
+

Motorhead paid tribute to Campbell...

+

Paragraph 2 with more text to score well, adding commas like this, and this, for points.

+

Paragraph 3 with more text to score well, adding commas like this, and this, for points.

+

Paragraph 4 with more text to score well, adding commas like this, and this, for points.

+

Paragraph 5 with more text to score well, adding commas like this, and this, for points.

+
+
\ No newline at end of file diff --git a/test/test-pages/bbc-reader-bug/source.html b/test/test-pages/bbc-reader-bug/source.html new file mode 100644 index 00000000..9a7c579b --- /dev/null +++ b/test/test-pages/bbc-reader-bug/source.html @@ -0,0 +1,24 @@ + + + Motorhead guitarist Phil Campbell honoured + + + +
+
+

Earlier paragraph 1 (should NOT be skipped)

+

Earlier paragraph 2

+
+ +
+
+

Motorhead paid tribute to Campbell...

+

Paragraph 2 with more text to score well, adding commas like this, and this, for points.

+

Paragraph 3 with more text to score well, adding commas like this, and this, for points.

+

Paragraph 4 with more text to score well, adding commas like this, and this, for points.

+

Paragraph 5 with more text to score well, adding commas like this, and this, for points.

+
+
+
+ + \ No newline at end of file From 53508e973ac5fefcfa013648d5c132596da28e1e Mon Sep 17 00:00:00 2001 From: Prasad Zoman Date: Fri, 1 May 2026 01:11:38 +0530 Subject: [PATCH 2/2] Fix formatting (prettier) --- Readability.js | 57 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/Readability.js b/Readability.js index bcc483f6..437b6344 100644 --- a/Readability.js +++ b/Readability.js @@ -104,7 +104,7 @@ function Readability(doc, options) { } }; } else { - this.log = function () { }; + this.log = function () {}; } } @@ -649,10 +649,10 @@ Readability.prototype = { curTitleWordCount <= 4 && (!titleHadHierarchicalSeparators || curTitleWordCount != - wordCount( - origTitle.replace(new RegExp(`\\s[${titleSeparators}]\\s`, "g"), "") - ) - - 1) + wordCount( + origTitle.replace(new RegExp(`\\s[${titleSeparators}]\\s`, "g"), "") + ) - + 1) ) { curTitle = origTitle; } @@ -1141,9 +1141,9 @@ Readability.prototype = { if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { this.log( "Removing content with role " + - node.getAttribute("role") + - " - " + - matchString + node.getAttribute("role") + + " - " + + matchString ); node = this._removeAndGetNext(node); continue; @@ -1358,7 +1358,7 @@ Readability.prototype = { for (var i = 1; i < topCandidates.length; i++) { if ( topCandidates[i].readability.contentScore / - topCandidate.readability.contentScore >= + topCandidate.readability.contentScore >= 0.75 ) { alternativeCandidateAncestors.push( @@ -1458,8 +1458,17 @@ Readability.prototype = { var sibling = siblings[s]; var append = false; - this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); - this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); + this.log( + "Looking at sibling node:", + sibling, + sibling.readability + ? "with score " + sibling.readability.contentScore + : "" + ); + this.log( + "Sibling has score", + sibling.readability ? sibling.readability.contentScore : "Unknown" + ); if (sibling === topCandidate) { append = true; @@ -1467,12 +1476,18 @@ Readability.prototype = { var contentBonus = 0; // Give a bonus if sibling nodes and top candidates have the same classname - if (sibling.className === topCandidate.className && topCandidate.className !== "") { + if ( + sibling.className === topCandidate.className && + topCandidate.className !== "" + ) { contentBonus += topCandidate.readability.contentScore * 0.2; } - if (sibling.readability && - ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { + if ( + sibling.readability && + sibling.readability.contentScore + contentBonus >= + siblingScoreThreshold + ) { append = true; } else if (sibling.nodeName === "P") { var linkDensity = this._getLinkDensity(sibling); @@ -1481,8 +1496,12 @@ Readability.prototype = { if (nodeLength > 80 && linkDensity < 0.25) { append = true; - } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && - nodeContent.search(/\.( |$)/) !== -1) { + } else if ( + nodeLength < 80 && + nodeLength > 0 && + linkDensity === 0 && + nodeContent.search(/\.( |$)/) !== -1 + ) { append = true; } } @@ -1813,7 +1832,7 @@ Readability.prototype = { const articleAuthor = typeof values["article:author"] === "string" && - !this._isUrl(values["article:author"]) + !this._isUrl(values["article:author"]) ? values["article:author"] : undefined; @@ -2001,8 +2020,8 @@ Readability.prototype = { !node.textContent.trim().length && (!node.children.length || node.children.length == - node.getElementsByTagName("br").length + - node.getElementsByTagName("hr").length) + node.getElementsByTagName("br").length + + node.getElementsByTagName("hr").length) ); },