-
Notifications
You must be signed in to change notification settings - Fork 32
feat: refactor dom manipulation #355
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| <?php | ||
| $buffer = '<div></div><script src="other.js"></script><script src="tracking.js"></script>'; | ||
| $wrapped_buffer = '<dummy>' . $buffer . '</dummy>'; | ||
|
|
||
| $dom = new DOMDocument(); | ||
| $encoded_buffer = mb_encode_numericentity( $wrapped_buffer, array( 0x80, 0x10FFFF, 0, 0x1FFFFF ), 'UTF-8' ); | ||
| $dom->loadHTML( $encoded_buffer, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD ); | ||
|
|
||
| $dummy = $dom->getElementsByTagName('dummy')->item(0); | ||
|
|
||
| echo "Child Nodes: " . $dummy->childNodes->length . "\n"; | ||
| foreach ($dummy->childNodes as $node) { | ||
| echo "Node: " . $node->nodeName . "\n"; | ||
| echo "Content: " . $dom->saveHTML($node) . "\n"; | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,8 @@ | |
| use Exception; | ||
| use InvalidArgumentException; | ||
| use RuntimeException; | ||
| use DOMDocument; | ||
| use DOMElement; | ||
|
|
||
| /** | ||
| * @param string $type | ||
|
|
@@ -98,97 +100,80 @@ | |
| * @version 2.0.4 | ||
| * @since 1.2.0 | ||
| */ | ||
| function cookiebot_addons_manipulate_script( $buffer, $keywords ) { | ||
|
Check warning on line 103 in src/lib/helper.php
|
||
| /** | ||
| * normalize potential self-closing script tags | ||
| */ | ||
| if ( empty( $buffer ) ) { | ||
| return $buffer; | ||
| } | ||
|
|
||
| // Use DOMDocument to safely parse and modify the script tag | ||
| $dom = new DOMDocument(); | ||
|
|
||
| // Suppress errors for partial HTML | ||
|
Comment on lines
+109
to
+111
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [CRITICAL_BUG] The new logic depends on the DOMDocument extension (instantiating new DOMDocument()). If ext-dom is not available in the target environment this will fatal error. Add a graceful fallback (either keep the original regex-based implementation as a fallback or check class_exists('DOMDocument')/extension_loaded('dom') and return $buffer or run the regex approach). Ensure the code never triggers a fatal error in environments without DOM. if (!class_exists('DOMDocument') || !extension_loaded('dom')) {
// Fallback to regex-based implementation
// (Insert the old regex logic here, or simply return $buffer)
return $buffer;
} |
||
| $libxml_previous_state = libxml_use_internal_errors( true ); | ||
|
|
||
| // Wrap buffer in a custom tag to ensure correct parsing of fragments (e.g. multiple siblings at root) | ||
| // This prevents DOMDocument from trying to fix structure by nesting siblings | ||
| $wrapped_buffer = '<cookiebot-wrapper>' . $buffer . '</cookiebot-wrapper>'; | ||
|
|
||
| $normalized_buffer = preg_replace( '/(<script(.*?)\/>)/is', '<script$2></script>', $buffer ); | ||
| // Load HTML with UTF-8 encoding hack | ||
| // The mb_convert_encoding is to ensure we don't have encoding issues | ||
| // We use LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD to avoid adding <html><body> wrappers automatically | ||
| // Replacement for deprecated mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8') | ||
| $encoded_buffer = mb_encode_numericentity( $wrapped_buffer, array( 0x80, 0x10FFFF, 0, 0x1FFFFF ), 'UTF-8' ); | ||
| $dom->loadHTML( $encoded_buffer, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD ); | ||
|
|
||
| if ( $normalized_buffer !== null ) { | ||
| $buffer = $normalized_buffer; | ||
| libxml_use_internal_errors( $libxml_previous_state ); | ||
|
|
||
|
Comment on lines
+112
to
+126
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [PERFORMANCE_OPTIMIZATION] You call libxml_use_internal_errors(true) and then restore the previous state after loadHTML but don't clear libxml errors. Call libxml_clear_errors() after loadHTML (before restoring the previous state) to avoid accumulating libxml errors in long-running processes or tests. $dom->loadHTML( $encoded_buffer, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );
libxml_clear_errors();
libxml_use_internal_errors( $libxml_previous_state ); |
||
| $scripts = $dom->getElementsByTagName( 'script' ); | ||
| $modified = false; | ||
|
|
||
| // Convert DOMNodeList to array to avoid modification issues during iteration | ||
| $script_nodes = array(); | ||
| foreach ( $scripts as $script ) { | ||
| $script_nodes[] = $script; | ||
| } | ||
|
|
||
| /** | ||
| * Pattern to get all scripts | ||
| * | ||
| * @version 2.0.4 | ||
| * @since 1.2.0 | ||
| */ | ||
| $pattern = '/(<script[^>]*+>)(.*?)(<\/script>)/is'; | ||
| foreach ( $script_nodes as $script ) { | ||
| /** @var DOMElement $script */ | ||
| // Get the full script HTML to check for keywords | ||
| // We check the outer HTML to include attributes in the search, matching the regex behavior | ||
| $full_script_content = $dom->saveHTML( $script ); | ||
|
|
||
| /** | ||
| * Get all scripts and add cookieconsent if it does match with the criterion | ||
| */ | ||
| $updated_scripts = preg_replace_callback( | ||
| $pattern, | ||
| function ( $matches ) use ( $keywords ) { | ||
| $script = $matches[0]; // the full script html | ||
| $script_tag_open = $matches[1]; // only the script open tag with all attributes | ||
| $script_tag_inner = $matches[2]; // only the script's innerText | ||
| $script_tag_close = $matches[3]; // only the script closing tag | ||
|
|
||
| /** | ||
| * Check if the script contains the keywords, checks keywords one by one | ||
| * | ||
| * If one match, then the rest of the keywords will be skipped. | ||
| */ | ||
| foreach ( $keywords as $needle => $cookie_type ) { | ||
| /** | ||
| * The script contains the needle | ||
| */ | ||
| if ( strpos( $script, $needle ) !== false ) { | ||
| /** | ||
| * replace all single quotes with double quotes in the open tag | ||
| * remove previously set data-cookieconsent attribute | ||
| * remove type attribute | ||
| */ | ||
| $script_tag_open = str_replace( '\'', '"', $script_tag_open ); | ||
| $script_tag_open = preg_replace( '/\sdata-cookieconsent=\"[^"]*+\"/', '', $script_tag_open ); | ||
| $script_tag_open = preg_replace( '/\stype=\"[^"]*+\"/', '', $script_tag_open ); | ||
|
|
||
| /** | ||
| * set the type attribute to text/plain to prevent javascript execution | ||
| * add data-cookieconsent attribute | ||
| */ | ||
| $cookie_types = cookiebot_addons_output_cookie_types( $cookie_type ); | ||
|
|
||
| $script_tag_open = str_replace( | ||
| '<script', | ||
| sprintf( '<script type="text/plain" data-cookieconsent="%s"', $cookie_types ), | ||
| $script_tag_open | ||
| ); | ||
|
|
||
| /** | ||
| * reconstruct the script and break the foreach loop | ||
| */ | ||
| $script = $script_tag_open . $script_tag_inner . $script_tag_close; | ||
| } | ||
| } | ||
| foreach ( $keywords as $needle => $cookie_type ) { | ||
| if ( strpos( $full_script_content, $needle ) !== false ) { | ||
| // Match found | ||
|
|
||
| /** | ||
| * return the reconstructed script | ||
| */ | ||
| return $script; | ||
| }, | ||
| $buffer | ||
| ); | ||
| // Remove existing attributes | ||
| $script->removeAttribute( 'type' ); | ||
| $script->removeAttribute( 'data-cookieconsent' ); | ||
|
|
||
| /** | ||
| * Fallback when the regex fails to work due to PCRE_ERROR_JIT_STACKLIMIT | ||
| * | ||
| * @version 2.0.4 | ||
| * @since 2.0.4 | ||
| */ | ||
| if ( $updated_scripts === null ) { | ||
| $updated_scripts = $buffer; | ||
| // Add new attributes | ||
| $script->setAttribute( 'type', 'text/plain' ); | ||
| $script->setAttribute( 'data-cookieconsent', cookiebot_addons_output_cookie_types( $cookie_type ) ); | ||
|
|
||
| if ( get_option( 'cookiebot_regex_stacklimit' ) === false ) { | ||
| update_option( 'cookiebot_regex_stacklimit', 1 ); | ||
| $modified = true; | ||
| // Break inner loop (keywords) as we found a match | ||
| break; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if ( $modified ) { | ||
| // Save HTML | ||
| // We extract children of our wrapper | ||
| $wrapper = $dom->getElementsByTagName( 'cookiebot-wrapper' )->item( 0 ); | ||
| if ( $wrapper ) { | ||
| $output = ''; | ||
| foreach ( $wrapper->childNodes as $node ) { | ||
| $output .= $dom->saveHTML( $node ); | ||
| } | ||
| return $output; | ||
|
Comment on lines
+157
to
+170
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [VALIDATION] Behavior change: the old implementation updated option 'cookiebot_regex_stacklimit' on a regex fallback. The new DOM-based path removed that fallback and no longer writes that option. If other code relies on that option being set to detect regex failures, adapt or preserve that behavior in a DOM fallback branch to maintain compatibility. // In the fallback branch (regex failure), preserve the option update
if ($updated_scripts === null) {
$updated_scripts = $buffer;
if (get_option('cookiebot_regex_stacklimit') === false) {
update_option('cookiebot_regex_stacklimit', 1);
}
} |
||
| } | ||
| // Fallback if wrapper is missing (should not happen) | ||
| return $dom->saveHTML(); | ||
| } | ||
|
|
||
| return $updated_scripts; | ||
| return $buffer; | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,9 @@ | |
|
|
||
| namespace cybot\cookiebot\lib\script_loader_tag; | ||
|
|
||
| use DOMDocument; | ||
| use DOMElement; | ||
|
|
||
| class Script_Loader_Tag implements Script_Loader_Tag_Interface { | ||
|
|
||
|
|
||
|
|
@@ -73,28 +76,52 @@ | |
| * | ||
| * @since 1.2.0 | ||
| */ | ||
| public function cookiebot_add_consent_attribute_to_tag( $tag, $handle, $src ) { | ||
|
Check warning on line 79 in src/lib/script_loader_tag/Script_Loader_Tag.php
|
||
| // Check if the handle is in our list of tags to modify | ||
| if ( array_key_exists( $handle, $this->tags ) && ! empty( $this->tags[ $handle ] ) ) { | ||
| // If we have a match, we completely replace the tag with our own constructed one | ||
| // This is safer than parsing for this specific case as we know exactly what we want | ||
| //phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedScript | ||
| return '<script src="' . $src . '" type="text/plain" data-cookieconsent="' . implode( ',', $this->tags[ $handle ] ) . '"></script>'; | ||
| return '<script src="' . esc_url( $src ) . '" type="text/plain" data-cookieconsent="' . esc_attr( implode( ',', $this->tags[ $handle ] ) ) . '"></script>'; | ||
| } | ||
|
|
||
| // Check if the script should be ignored | ||
| if ( $this->check_ignore_script( $src ) ) { | ||
| return preg_replace_callback( | ||
| '/<script\s*(?<atts>[^>]*)>/', | ||
| function ( $tag ) use ( $handle ) { | ||
| // Prevent modification of the script tags inside the other script tag by validating the ID of the | ||
| // script and checking if we already set the consent status for the script. This will fix the issue | ||
| // on Gutenberg editor pages. | ||
| if ( ! self::validate_attributes_for_consent_ignore( $handle, $tag['atts'] ) ) { | ||
| return $tag[0]; | ||
| } | ||
|
|
||
| //phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedScript | ||
| return str_replace( '<script ', '<script data-cookieconsent="ignore" ', $tag[0] ); | ||
| }, | ||
| $tag | ||
| ); | ||
| // Use DOMDocument to safely parse and modify the script tag | ||
| $dom = new DOMDocument(); | ||
|
|
||
| // Suppress errors for partial HTML | ||
| $libxml_previous_state = libxml_use_internal_errors( true ); | ||
|
|
||
| // Load HTML with UTF-8 encoding hack | ||
| // The mb_convert_encoding is to ensure we don't have encoding issues | ||
| // Replacement for deprecated mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8') | ||
| $encoded_tag = mb_encode_numericentity( $tag, array( 0x80, 0x10FFFF, 0, 0x1FFFFF ), 'UTF-8' ); | ||
| $dom->loadHTML( $encoded_tag, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD ); | ||
|
|
||
|
Comment on lines
+90
to
+101
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [CRITICAL_BUG] This method now uses DOMDocument (new DOMDocument() at lines 96-100). As with helper.php, you must guard against missing ext-dom (class DOMDocument not found) or add a fallback. Without the DOM extension this will fatal error. Either check class_exists('DOMDocument') and return original $tag or fall back to the previous preg_replace_callback approach. if ( ! class_exists( 'DOMDocument' ) ) {
// Fallback to previous regex approach or return original $tag
return preg_replace_callback(
'/<script\s*(?<atts>[^>]*)>/',
function ( $tag ) use ( $handle ) {
if ( ! self::validate_attributes_for_consent_ignore( $handle, $tag['atts'] ) ) {
return $tag[0];
}
return str_replace( '<script ', '<script data-cookieconsent="ignore" ', $tag[0] );
},
$tag
);
}
// ... continue with DOMDocument logic |
||
| libxml_use_internal_errors( $libxml_previous_state ); | ||
|
|
||
| $scripts = $dom->getElementsByTagName( 'script' ); | ||
|
|
||
| if ( $scripts->length > 0 ) { | ||
| /** @var DOMElement $script */ | ||
| $script = $scripts->item( 0 ); | ||
|
|
||
| // Validate attributes to ensure we are targeting the right script | ||
| // This mimics the logic in validate_attributes_for_consent_ignore | ||
| $id = $script->getAttribute( 'id' ); | ||
|
|
||
| // If validation fails, return original tag | ||
| if ( ! $this->validate_attributes_for_consent_ignore_dom( $handle, $id, $script ) ) { | ||
| return $tag; | ||
| } | ||
|
|
||
| // Add the ignore attribute | ||
| $script->setAttribute( 'data-cookieconsent', 'ignore' ); | ||
|
|
||
| // Save HTML | ||
| return $dom->saveHTML( $script ); | ||
|
Comment on lines
+119
to
+123
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggestion: After the DOM validation, inject the |
||
| } | ||
| } | ||
|
|
||
| return $tag; | ||
|
|
@@ -183,20 +210,49 @@ | |
| * Check if the script tag attributes are valid for the injection of the consent ignore attribute. | ||
| * | ||
| * @param string $script_handle Handle of the enqueued script. Required for identification of the scripts. | ||
| * @param string $tag_attributes List of the attributes for the tag. | ||
| * @param string $id ID attribute of the script tag. | ||
| * @param DOMElement $script The script element. | ||
| * | ||
| * @return bool True if the attributes are valid for the injection of the consent ignore attribute. | ||
| */ | ||
| private static function validate_attributes_for_consent_ignore( $script_handle, $tag_attributes ) { | ||
| $quoted_handle = preg_quote( $script_handle, '/' ); | ||
|
|
||
| private function validate_attributes_for_consent_ignore_dom( $script_handle, $id, $script ) { | ||
|
Check warning on line 218 in src/lib/script_loader_tag/Script_Loader_Tag.php
|
||
| // Exclude any scripts not related to currently processed script handle. Only script itself and inline block | ||
| // before/after are supported. | ||
| if ( ! preg_match( "/id=['\"]$quoted_handle(?:-js(-(after|before))?)?['\"]/", $tag_attributes ) ) { | ||
|
|
||
| // Construct expected ID pattern logic | ||
| // Original regex: "/id=['\"]$quoted_handle(?:-js(-(after|before))?)?['\"]/" | ||
|
|
||
| // Check if ID starts with handle | ||
| if ( strpos( $id, $script_handle ) !== 0 ) { | ||
| return false; | ||
| } | ||
|
|
||
| // Check if it's an exact match or has valid suffixes | ||
| $valid_suffixes = [ | ||
| '-js', | ||
| '-js-after', | ||
| '-js-before' | ||
| ]; | ||
|
|
||
| $is_valid_id = $id === $script_handle; | ||
| if ( ! $is_valid_id ) { | ||
| foreach ( $valid_suffixes as $suffix ) { | ||
| if ( $id === $script_handle . $suffix ) { | ||
| $is_valid_id = true; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if ( ! $is_valid_id ) { | ||
| return false; | ||
| } | ||
|
|
||
| // Exclude any scripts already containing the consent ignore attribute. | ||
| return is_bool( strpos( $tag_attributes, 'data-cookieconsent=' ) ); | ||
| if ( $script->hasAttribute( 'data-cookieconsent' ) ) { | ||
| return false; | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[NITPICK] This file appears to be a local debugging script left in the commit. Remove debug_dom.php from the PR (or move it to a non-shipped dev-only location). Leaving debug utilities in the repository can accidentally expose information and increases maintenance noise.