-
Notifications
You must be signed in to change notification settings - Fork 707
Fix #986: Preserve Wikipedia <h2> headings #987
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -677,6 +677,15 @@ Readability.prototype = { | |
| } | ||
|
|
||
| this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); | ||
|
|
||
| // Fix for issue #986: Remove Wikipedia edit section links that cause headings | ||
| // to be improperly removed. These spans contain "edit" links and add negative | ||
| // weight to otherwise good headings. | ||
| this._forEachNode(this._getAllNodesWithTag(doc, ["span"]), function (span) { | ||
| if (span.className && span.className.includes("mw-editsection")) { | ||
| span.remove(); | ||
| } | ||
| }); | ||
| }, | ||
|
|
||
| /** | ||
|
|
@@ -2540,18 +2549,18 @@ Readability.prototype = { | |
| "iframe", | ||
| ]); | ||
|
|
||
| for (var i = 0; i < embeds.length; i++) { | ||
| for (var k = 0; k < embeds.length; k++) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change seems unrelated, can you revert it? |
||
| // If this embed has attribute that matches video regex, don't delete it. | ||
| for (var j = 0; j < embeds[i].attributes.length; j++) { | ||
| if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) { | ||
| for (var j = 0; j < embeds[k].attributes.length; j++) { | ||
| if (this._allowedVideoRegex.test(embeds[k].attributes[j].value)) { | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| // For embed with <object> tag, check inner HTML as well. | ||
| if ( | ||
| embeds[i].tagName === "object" && | ||
| this._allowedVideoRegex.test(embeds[i].innerHTML) | ||
| embeds[k].tagName === "object" && | ||
| this._allowedVideoRegex.test(embeds[k].innerHTML) | ||
| ) { | ||
| return false; | ||
| } | ||
|
|
@@ -2684,10 +2693,101 @@ Readability.prototype = { | |
| **/ | ||
| _cleanHeaders(e) { | ||
| let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]); | ||
| const isPunctuationOnly = function (text) { | ||
| return !!text && /^[\s\[\]\(\){}#*¶·•:;|/\\\-.,!?'"`~_+=<>]+$/.test(text); | ||
| }; | ||
| const getNodeText = function (node) { | ||
| if (!node) { | ||
| return ""; | ||
| } | ||
| return node.textContent ? node.textContent.trim() : ""; | ||
| }; | ||
| const isLowValueNode = function (node, hasOtherMeaningfulText) { | ||
| let text = getNodeText.call(this, node); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see us using |
||
| if (!text) { | ||
| return true; | ||
| } | ||
| if (isPunctuationOnly(text)) { | ||
| return true; | ||
| } | ||
| return hasOtherMeaningfulText && text.length <= 1; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a negation missing here? |
||
| }; | ||
| const collectTextNodes = function (node, texts) { | ||
| if (!node) { | ||
| return; | ||
| } | ||
| if (node.nodeType === this.TEXT_NODE) { | ||
| let text = getNodeText.call(this, node); | ||
| if (text) { | ||
| texts.push(text); | ||
| } | ||
| return; | ||
| } | ||
| if (node.nodeType !== this.ELEMENT_NODE || !node.childNodes) { | ||
| return; | ||
| } | ||
| for (let i = 0; i < node.childNodes.length; i++) { | ||
| collectTextNodes.call(this, node.childNodes[i], texts); | ||
| } | ||
| }; | ||
| const headerHasLongMeaningfulText = function (heading) { | ||
| let texts = []; | ||
| collectTextNodes.call(this, heading, texts); | ||
| for (let i = 0; i < texts.length; i++) { | ||
| if (!isPunctuationOnly(texts[i]) && texts[i].length > 1) { | ||
| return true; | ||
| } | ||
| } | ||
| return false; | ||
| }; | ||
| const getMeaningfulHeaderText = function (heading) { | ||
| let hasOtherMeaningfulText = headerHasLongMeaningfulText.call(this, heading); | ||
| let meaningfulParts = []; | ||
|
|
||
| for (let i = 0; i < heading.childNodes.length; i++) { | ||
| let child = heading.childNodes[i]; | ||
| if (isLowValueNode.call(this, child, hasOtherMeaningfulText)) { | ||
| continue; | ||
| } | ||
|
|
||
| let childTexts = []; | ||
| collectTextNodes.call(this, child, childTexts); | ||
| for (let j = 0; j < childTexts.length; j++) { | ||
| let text = childTexts[j]; | ||
| if (!isLowValueNode.call(this, { textContent: text }, hasOtherMeaningfulText)) { | ||
| meaningfulParts.push(text); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return meaningfulParts.join(" ").replace(this.REGEXPS.normalize, " ").trim(); | ||
| }; | ||
| const hasSemanticAnchorChild = function (heading) { | ||
| if (!heading || !heading.childNodes) { | ||
| return false; | ||
| } | ||
| for (let i = 0; i < heading.childNodes.length; i++) { | ||
| let child = heading.childNodes[i]; | ||
| if (child.nodeType !== this.ELEMENT_NODE) { | ||
| continue; | ||
| } | ||
| if ((child.id || child.getAttribute("name")) && getNodeText.call(this, child)) { | ||
| return true; | ||
| } | ||
| } | ||
| return false; | ||
| }; | ||
| this._removeNodes(headingNodes, function (node) { | ||
| let shouldRemove = this._getClassWeight(node) < 0; | ||
| let meaningfulText = getMeaningfulHeaderText.call(this, node); | ||
| let shouldPreserveWithMeaningfulText = | ||
| meaningfulText.length > 0 && hasSemanticAnchorChild.call(this, node); | ||
| let shouldRemove = | ||
| this._getClassWeight(node) < 0 && !shouldPreserveWithMeaningfulText; | ||
| if (shouldRemove) { | ||
| this.log("Removing header with low class weight:", node); | ||
| this.log( | ||
| "Removing header with low class weight and no meaningful text:", | ||
| node | ||
| ); | ||
|
Comment on lines
2780
to
+2790
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm very confused how these changes help / are supposed to work. AFAICT all they do is add more logic about keeping / removing the whole header, when the issue at stake is the "edit" links inside the wikipedia headers. I think if this set of changes is "working" it's probably due to the expensive |
||
| } | ||
| return shouldRemove; | ||
| }); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, at this point I have 2 concerns:
_cleanHeadersto account for this? That already looks at h1/h2 headers, and is presumably where this is being removed - is only updating that method insufficient?