Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions debug_dom.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?php
$buffer = '<div></div><script src="other.js"></script><script src="tracking.js"></script>';
$wrapped_buffer = '<dummy>' . $buffer . '</dummy>';

$dom = new DOMDocument();
$encoded_buffer = mb_encode_numericentity( $wrapped_buffer, array( 0x80, 0x10FFFF, 0, 0x1FFFFF ), 'UTF-8' );
$dom->loadHTML( $encoded_buffer, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );

$dummy = $dom->getElementsByTagName('dummy')->item(0);

echo "Child Nodes: " . $dummy->childNodes->length . "\n";
foreach ($dummy->childNodes as $node) {
echo "Node: " . $node->nodeName . "\n";
echo "Content: " . $dom->saveHTML($node) . "\n";
}
Comment on lines +1 to +15
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[NITPICK] This file appears to be a local debugging script left in the commit. Remove debug_dom.php from the PR (or move it to a non-shipped dev-only location). Leaving debug utilities in the repository can accidentally expose information and increases maintenance noise.

143 changes: 64 additions & 79 deletions src/lib/helper.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
use Exception;
use InvalidArgumentException;
use RuntimeException;
use DOMDocument;
use DOMElement;

/**
* @param string $type
Expand Down Expand Up @@ -98,97 +100,80 @@
* @version 2.0.4
* @since 1.2.0
*/
function cookiebot_addons_manipulate_script( $buffer, $keywords ) {

Check warning on line 103 in src/lib/helper.php

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

This function has 4 returns, which is more than the 3 allowed.

See more on https://sonarcloud.io/project/issues?id=CybotAS_CookiebotWP&issues=AZq7Cnah9o_KVu3PpvTD&open=AZq7Cnah9o_KVu3PpvTD&pullRequest=355
/**
* normalize potential self-closing script tags
*/
if ( empty( $buffer ) ) {
return $buffer;
}

// Use DOMDocument to safely parse and modify the script tag
$dom = new DOMDocument();

// Suppress errors for partial HTML
Comment on lines +109 to +111
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[CRITICAL_BUG] The new logic depends on the DOMDocument extension (instantiating new DOMDocument()). If ext-dom is not available in the target environment this will fatal error. Add a graceful fallback (either keep the original regex-based implementation as a fallback or check class_exists('DOMDocument')/extension_loaded('dom') and return $buffer or run the regex approach). Ensure the code never triggers a fatal error in environments without DOM.

if (!class_exists('DOMDocument') || !extension_loaded('dom')) {
    // Fallback to regex-based implementation
    // (Insert the old regex logic here, or simply return $buffer)
    return $buffer;
}

$libxml_previous_state = libxml_use_internal_errors( true );

// Wrap buffer in a custom tag to ensure correct parsing of fragments (e.g. multiple siblings at root)
// This prevents DOMDocument from trying to fix structure by nesting siblings
$wrapped_buffer = '<cookiebot-wrapper>' . $buffer . '</cookiebot-wrapper>';

$normalized_buffer = preg_replace( '/(<script(.*?)\/>)/is', '<script$2></script>', $buffer );
// Load HTML with UTF-8 encoding hack
// The mb_convert_encoding is to ensure we don't have encoding issues
// We use LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD to avoid adding <html><body> wrappers automatically
// Replacement for deprecated mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8')
$encoded_buffer = mb_encode_numericentity( $wrapped_buffer, array( 0x80, 0x10FFFF, 0, 0x1FFFFF ), 'UTF-8' );
$dom->loadHTML( $encoded_buffer, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );

if ( $normalized_buffer !== null ) {
$buffer = $normalized_buffer;
libxml_use_internal_errors( $libxml_previous_state );

Comment on lines +112 to +126
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[PERFORMANCE_OPTIMIZATION] You call libxml_use_internal_errors(true) and then restore the previous state after loadHTML but don't clear libxml errors. Call libxml_clear_errors() after loadHTML (before restoring the previous state) to avoid accumulating libxml errors in long-running processes or tests.

$dom->loadHTML( $encoded_buffer, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );
libxml_clear_errors();
libxml_use_internal_errors( $libxml_previous_state );

$scripts = $dom->getElementsByTagName( 'script' );
$modified = false;

// Convert DOMNodeList to array to avoid modification issues during iteration
$script_nodes = array();
foreach ( $scripts as $script ) {
$script_nodes[] = $script;
}

/**
* Pattern to get all scripts
*
* @version 2.0.4
* @since 1.2.0
*/
$pattern = '/(<script[^>]*+>)(.*?)(<\/script>)/is';
foreach ( $script_nodes as $script ) {
/** @var DOMElement $script */
// Get the full script HTML to check for keywords
// We check the outer HTML to include attributes in the search, matching the regex behavior
$full_script_content = $dom->saveHTML( $script );

/**
* Get all scripts and add cookieconsent if it does match with the criterion
*/
$updated_scripts = preg_replace_callback(
$pattern,
function ( $matches ) use ( $keywords ) {
$script = $matches[0]; // the full script html
$script_tag_open = $matches[1]; // only the script open tag with all attributes
$script_tag_inner = $matches[2]; // only the script's innerText
$script_tag_close = $matches[3]; // only the script closing tag

/**
* Check if the script contains the keywords, checks keywords one by one
*
* If one match, then the rest of the keywords will be skipped.
*/
foreach ( $keywords as $needle => $cookie_type ) {
/**
* The script contains the needle
*/
if ( strpos( $script, $needle ) !== false ) {
/**
* replace all single quotes with double quotes in the open tag
* remove previously set data-cookieconsent attribute
* remove type attribute
*/
$script_tag_open = str_replace( '\'', '"', $script_tag_open );
$script_tag_open = preg_replace( '/\sdata-cookieconsent=\"[^"]*+\"/', '', $script_tag_open );
$script_tag_open = preg_replace( '/\stype=\"[^"]*+\"/', '', $script_tag_open );

/**
* set the type attribute to text/plain to prevent javascript execution
* add data-cookieconsent attribute
*/
$cookie_types = cookiebot_addons_output_cookie_types( $cookie_type );

$script_tag_open = str_replace(
'<script',
sprintf( '<script type="text/plain" data-cookieconsent="%s"', $cookie_types ),
$script_tag_open
);

/**
* reconstruct the script and break the foreach loop
*/
$script = $script_tag_open . $script_tag_inner . $script_tag_close;
}
}
foreach ( $keywords as $needle => $cookie_type ) {
if ( strpos( $full_script_content, $needle ) !== false ) {
// Match found

/**
* return the reconstructed script
*/
return $script;
},
$buffer
);
// Remove existing attributes
$script->removeAttribute( 'type' );
$script->removeAttribute( 'data-cookieconsent' );

/**
* Fallback when the regex fails to work due to PCRE_ERROR_JIT_STACKLIMIT
*
* @version 2.0.4
* @since 2.0.4
*/
if ( $updated_scripts === null ) {
$updated_scripts = $buffer;
// Add new attributes
$script->setAttribute( 'type', 'text/plain' );
$script->setAttribute( 'data-cookieconsent', cookiebot_addons_output_cookie_types( $cookie_type ) );

if ( get_option( 'cookiebot_regex_stacklimit' ) === false ) {
update_option( 'cookiebot_regex_stacklimit', 1 );
$modified = true;
// Break inner loop (keywords) as we found a match
break;
}
}
}

if ( $modified ) {
// Save HTML
// We extract children of our wrapper
$wrapper = $dom->getElementsByTagName( 'cookiebot-wrapper' )->item( 0 );
if ( $wrapper ) {
$output = '';
foreach ( $wrapper->childNodes as $node ) {
$output .= $dom->saveHTML( $node );
}
return $output;
Comment on lines +157 to +170
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[VALIDATION] Behavior change: the old implementation updated option 'cookiebot_regex_stacklimit' on a regex fallback. The new DOM-based path removed that fallback and no longer writes that option. If other code relies on that option being set to detect regex failures, adapt or preserve that behavior in a DOM fallback branch to maintain compatibility.

// In the fallback branch (regex failure), preserve the option update
if ($updated_scripts === null) {
    $updated_scripts = $buffer;
    if (get_option('cookiebot_regex_stacklimit') === false) {
        update_option('cookiebot_regex_stacklimit', 1);
    }
}

}
// Fallback if wrapper is missing (should not happen)
return $dom->saveHTML();
}

return $updated_scripts;
return $buffer;
}

/**
Expand Down
100 changes: 78 additions & 22 deletions src/lib/script_loader_tag/Script_Loader_Tag.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

namespace cybot\cookiebot\lib\script_loader_tag;

use DOMDocument;
use DOMElement;

class Script_Loader_Tag implements Script_Loader_Tag_Interface {


Expand Down Expand Up @@ -73,28 +76,52 @@
*
* @since 1.2.0
*/
public function cookiebot_add_consent_attribute_to_tag( $tag, $handle, $src ) {

Check warning on line 79 in src/lib/script_loader_tag/Script_Loader_Tag.php

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

This method has 4 returns, which is more than the 3 allowed.

See more on https://sonarcloud.io/project/issues?id=CybotAS_CookiebotWP&issues=AZq7CnWq9o_KVu3PpvTB&open=AZq7CnWq9o_KVu3PpvTB&pullRequest=355
// Check if the handle is in our list of tags to modify
if ( array_key_exists( $handle, $this->tags ) && ! empty( $this->tags[ $handle ] ) ) {
// If we have a match, we completely replace the tag with our own constructed one
// This is safer than parsing for this specific case as we know exactly what we want
//phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedScript
return '<script src="' . $src . '" type="text/plain" data-cookieconsent="' . implode( ',', $this->tags[ $handle ] ) . '"></script>';
return '<script src="' . esc_url( $src ) . '" type="text/plain" data-cookieconsent="' . esc_attr( implode( ',', $this->tags[ $handle ] ) ) . '"></script>';
}

// Check if the script should be ignored
if ( $this->check_ignore_script( $src ) ) {
return preg_replace_callback(
'/<script\s*(?<atts>[^>]*)>/',
function ( $tag ) use ( $handle ) {
// Prevent modification of the script tags inside the other script tag by validating the ID of the
// script and checking if we already set the consent status for the script. This will fix the issue
// on Gutenberg editor pages.
if ( ! self::validate_attributes_for_consent_ignore( $handle, $tag['atts'] ) ) {
return $tag[0];
}

//phpcs:ignore WordPress.WP.EnqueuedResources.NonEnqueuedScript
return str_replace( '<script ', '<script data-cookieconsent="ignore" ', $tag[0] );
},
$tag
);
// Use DOMDocument to safely parse and modify the script tag
$dom = new DOMDocument();

// Suppress errors for partial HTML
$libxml_previous_state = libxml_use_internal_errors( true );

// Load HTML with UTF-8 encoding hack
// The mb_convert_encoding is to ensure we don't have encoding issues
// Replacement for deprecated mb_convert_encoding(..., 'HTML-ENTITIES', 'UTF-8')
$encoded_tag = mb_encode_numericentity( $tag, array( 0x80, 0x10FFFF, 0, 0x1FFFFF ), 'UTF-8' );
$dom->loadHTML( $encoded_tag, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );

Comment on lines +90 to +101
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[CRITICAL_BUG] This method now uses DOMDocument (new DOMDocument() at lines 96-100). As with helper.php, you must guard against missing ext-dom (class DOMDocument not found) or add a fallback. Without the DOM extension this will fatal error. Either check class_exists('DOMDocument') and return original $tag or fall back to the previous preg_replace_callback approach.

if ( ! class_exists( 'DOMDocument' ) ) {
    // Fallback to previous regex approach or return original $tag
    return preg_replace_callback(
        '/<script\s*(?<atts>[^>]*)>/',
        function ( $tag ) use ( $handle ) {
            if ( ! self::validate_attributes_for_consent_ignore( $handle, $tag['atts'] ) ) {
                return $tag[0];
            }
            return str_replace( '<script ', '<script data-cookieconsent="ignore" ', $tag[0] );
        },
        $tag
    );
}
// ... continue with DOMDocument logic

libxml_use_internal_errors( $libxml_previous_state );

$scripts = $dom->getElementsByTagName( 'script' );

if ( $scripts->length > 0 ) {
/** @var DOMElement $script */
$script = $scripts->item( 0 );

// Validate attributes to ensure we are targeting the right script
// This mimics the logic in validate_attributes_for_consent_ignore
$id = $script->getAttribute( 'id' );

// If validation fails, return original tag
if ( ! $this->validate_attributes_for_consent_ignore_dom( $handle, $id, $script ) ) {
return $tag;
}

// Add the ignore attribute
$script->setAttribute( 'data-cookieconsent', 'ignore' );

// Save HTML
return $dom->saveHTML( $script );
Comment on lines +119 to +123
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: After the DOM validation, inject the data-cookieconsent attribute into the original $tag string instead of returning $dom->saveHTML() so the markup is not mutated by DOMDocument serialization. [possible issue]

}
}

return $tag;
Expand Down Expand Up @@ -183,20 +210,49 @@
* Check if the script tag attributes are valid for the injection of the consent ignore attribute.
*
* @param string $script_handle Handle of the enqueued script. Required for identification of the scripts.
* @param string $tag_attributes List of the attributes for the tag.
* @param string $id ID attribute of the script tag.
* @param DOMElement $script The script element.
*
* @return bool True if the attributes are valid for the injection of the consent ignore attribute.
*/
private static function validate_attributes_for_consent_ignore( $script_handle, $tag_attributes ) {
$quoted_handle = preg_quote( $script_handle, '/' );

private function validate_attributes_for_consent_ignore_dom( $script_handle, $id, $script ) {

Check warning on line 218 in src/lib/script_loader_tag/Script_Loader_Tag.php

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

This method has 4 returns, which is more than the 3 allowed.

See more on https://sonarcloud.io/project/issues?id=CybotAS_CookiebotWP&issues=AZq7CnWq9o_KVu3PpvTC&open=AZq7CnWq9o_KVu3PpvTC&pullRequest=355
// Exclude any scripts not related to currently processed script handle. Only script itself and inline block
// before/after are supported.
if ( ! preg_match( "/id=['\"]$quoted_handle(?:-js(-(after|before))?)?['\"]/", $tag_attributes ) ) {

// Construct expected ID pattern logic
// Original regex: "/id=['\"]$quoted_handle(?:-js(-(after|before))?)?['\"]/"

// Check if ID starts with handle
if ( strpos( $id, $script_handle ) !== 0 ) {
return false;
}

// Check if it's an exact match or has valid suffixes
$valid_suffixes = [
'-js',
'-js-after',
'-js-before'
];

$is_valid_id = $id === $script_handle;
if ( ! $is_valid_id ) {
foreach ( $valid_suffixes as $suffix ) {
if ( $id === $script_handle . $suffix ) {
$is_valid_id = true;
break;
}
}
}

if ( ! $is_valid_id ) {
return false;
}

// Exclude any scripts already containing the consent ignore attribute.
return is_bool( strpos( $tag_attributes, 'data-cookieconsent=' ) );
if ( $script->hasAttribute( 'data-cookieconsent' ) ) {
return false;
}

return true;
}
}
Loading