From 37e70f6a355f1ac2abbc8d7c8a362620a1de5c58 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 28 May 2026 22:04:17 +0000 Subject: [PATCH 1/2] fix: parser selector generation correctness and XPath injection - Use NodeId identity in get_nth_of_type instead of comparing outer_html, so identical sibling HTML no longer collapses every position to 1 - Escape attribute values when embedding into XPath: single quotes by default, double quotes when the value contains single quotes, and concat() when it contains both. Adversarial id values can no longer inject XPath operators - Expose Selector::node_id() for the identity check Closes #5 Closes #14 https://claude.ai/code/session_012RmdaovmNWZVAim4XxCWwn --- src/parser/selector.rs | 7 ++++ src/parser/selector_generation.rs | 57 ++++++++++++++++++++++++++--- tests/parser_selector_generation.rs | 44 ++++++++++++++++++++++ 3 files changed, 103 insertions(+), 5 deletions(-) diff --git a/src/parser/selector.rs b/src/parser/selector.rs index 53da318..98fe1ed 100644 --- a/src/parser/selector.rs +++ b/src/parser/selector.rs @@ -68,6 +68,13 @@ impl Selector { /// Return the tag name of this element. /// Returns "html" for the document root, "#text" for text nodes. + /// Stable identity of the underlying DOM node. Useful for comparing two + /// `Selector`s that reference the same node without resorting to HTML + /// string equality (which is ambiguous for identical siblings). + pub fn node_id(&self) -> NodeId { + self.node_id + } + pub fn tag(&self) -> &str { let node = self.node_ref(); match node.value() { diff --git a/src/parser/selector_generation.rs b/src/parser/selector_generation.rs index 29ddf53..e84ea63 100644 --- a/src/parser/selector_generation.rs +++ b/src/parser/selector_generation.rs @@ -47,7 +47,11 @@ pub fn generate_xpath_selector(selector: &Selector, full_path: bool) -> String { if let Some(id) = el.attrib().get("id") { if !full_path { - parts.push(format!("{}[@id='{}']", tag, id.as_str())); + parts.push(format!( + "{}[@id={}]", + tag, + xpath_string_literal(id.as_str()) + )); break; } } @@ -71,21 +75,64 @@ pub fn generate_xpath_selector(selector: &Selector, full_path: bool) -> String { } /// Count position of element among same-tag siblings. Returns 0 if unique. +/// +/// Uses the DOM `NodeId` for identity rather than comparing `outer_html`, +/// because identical sibling HTML would otherwise collapse to the same +/// position. fn get_nth_of_type(selector: &Selector) -> usize { let tag = selector.tag(); + let self_id = selector.node_id(); if let Some(parent) = selector.parent() { let children = parent.children(); let same_tag: Vec<_> = children.into_iter().filter(|c| c.tag() == tag).collect(); if same_tag.len() <= 1 { - return 0; // only child of this type, no disambiguation needed + return 0; } - // Find our position by comparing node identity via outer_html - let our_html = selector.outer_html(); for (i, child) in same_tag.iter().enumerate() { - if child.outer_html().as_str() == our_html.as_str() { + if child.node_id() == self_id { return i + 1; } } } 0 } + +/// Quote a string for safe embedding inside an XPath expression. Single +/// quotes are the default; double quotes are used when the value contains +/// single quotes; `concat(...)` is used when it contains both. +fn xpath_string_literal(value: &str) -> String { + let has_single = value.contains('\''); + let has_double = value.contains('"'); + if !has_single { + format!("'{}'", value) + } else if !has_double { + format!("\"{}\"", value) + } else { + // Both quote types present. Split on single quotes and rejoin via + // concat() so neither delimiter has to be escaped. + let parts: Vec = value.split('\'').map(|p| format!("'{}'", p)).collect(); + format!("concat({})", parts.join(", \"'\", ")) + } +} + +#[cfg(test)] +mod tests { + use super::xpath_string_literal; + + #[test] + fn xpath_literal_plain() { + assert_eq!(xpath_string_literal("foo"), "'foo'"); + } + + #[test] + fn xpath_literal_with_single_quote_uses_double() { + assert_eq!(xpath_string_literal("foo' or '1'='1"), "\"foo' or '1'='1\""); + } + + #[test] + fn xpath_literal_with_both_quotes_uses_concat() { + // Contains both ' and ", so concat() must be used. + let lit = xpath_string_literal("a'b\"c"); + assert_eq!(lit, "concat('a', \"'\", 'b\"c')"); + } +} diff --git a/tests/parser_selector_generation.rs b/tests/parser_selector_generation.rs index efd2bc2..97a893a 100644 --- a/tests/parser_selector_generation.rs +++ b/tests/parser_selector_generation.rs @@ -51,3 +51,47 @@ fn test_full_css_selector_includes_body() { css ); } + +#[test] +fn identical_siblings_get_distinct_nth_of_type_positions() { + // Three
  • with identical inner content. The bug compared outer_html + // and returned position 1 for all of them. The fix uses NodeId. + let html = r#"
    • Apple
    • Apple
    • Apple
    "#; + let sel = Selector::from_html(html); + let items = sel.css("li"); + assert_eq!(items.len(), 3); + + let css1 = generate_css_selector(&items[0], true); + let css2 = generate_css_selector(&items[1], true); + let css3 = generate_css_selector(&items[2], true); + assert!(css1.contains("nth-of-type(1)"), "got: {}", css1); + assert!(css2.contains("nth-of-type(2)"), "got: {}", css2); + assert!(css3.contains("nth-of-type(3)"), "got: {}", css3); + + let xp1 = generate_xpath_selector(&items[0], true); + let xp2 = generate_xpath_selector(&items[1], true); + let xp3 = generate_xpath_selector(&items[2], true); + assert!(xp1.ends_with("li[1]"), "got: {}", xp1); + assert!(xp2.ends_with("li[2]"), "got: {}", xp2); + assert!(xp3.ends_with("li[3]"), "got: {}", xp3); +} + +#[test] +fn xpath_selector_escapes_id_with_single_quote() { + // Adversarial id containing a single quote. Without escaping the + // generated XPath would be `div[@id='foo' or '1'='1']`, which is + // syntactically valid but semantically attacker-controlled. + let html = r#"
    x
    "#; + let sel = Selector::from_html(html); + let div = sel.css("div"); + assert_eq!(div.len(), 1); + let xp = generate_xpath_selector(&div[0], false); + // Because the id contains a single quote, the generator switches to + // double-quoted delimiters; the original `'` no longer terminates the + // string. + assert!( + xp.contains("@id=\"foo' or '1'='1\""), + "expected double-quoted id literal, got: {}", + xp + ); +} From 8298c1f76742895a4c8ea62de6f61b6ecc2d8fb9 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 28 May 2026 22:11:13 +0000 Subject: [PATCH 2/2] docs: keep tag() rustdoc attached to tag() after node_id() insertion Address CodeRabbit review on #21: the tag() doc comment was hovering above node_id() because the new method was inserted between them. Move the tag() doc back to immediately above tag(). https://claude.ai/code/session_012RmdaovmNWZVAim4XxCWwn --- src/parser/selector.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parser/selector.rs b/src/parser/selector.rs index 98fe1ed..6402721 100644 --- a/src/parser/selector.rs +++ b/src/parser/selector.rs @@ -66,8 +66,6 @@ impl Selector { self.node_ref().value().is_document() || self.node_ref().value().is_fragment() } - /// Return the tag name of this element. - /// Returns "html" for the document root, "#text" for text nodes. /// Stable identity of the underlying DOM node. Useful for comparing two /// `Selector`s that reference the same node without resorting to HTML /// string equality (which is ambiguous for identical siblings). @@ -75,6 +73,8 @@ impl Selector { self.node_id } + /// Return the tag name of this element. + /// Returns "html" for the document root, "#text" for text nodes. pub fn tag(&self) -> &str { let node = self.node_ref(); match node.value() {