From 37e70f6a355f1ac2abbc8d7c8a362620a1de5c58 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 28 May 2026 22:04:17 +0000
Subject: [PATCH 1/2] fix: parser selector generation correctness and XPath
 injection

- Use NodeId identity in get_nth_of_type instead of comparing outer_html, so
  identical sibling HTML no longer collapses every position to 1
- Escape attribute values when embedding into XPath: single quotes by
  default, double quotes when the value contains single quotes, and
  concat() when it contains both. Adversarial id values can no longer
  inject XPath operators
- Expose Selector::node_id() for the identity check

Closes #5
Closes #14

https://claude.ai/code/session_012RmdaovmNWZVAim4XxCWwn
---
 src/parser/selector.rs              |  7 ++++
 src/parser/selector_generation.rs   | 57 ++++++++++++++++++++++++++---
 tests/parser_selector_generation.rs | 44 ++++++++++++++++++++++
 3 files changed, 103 insertions(+), 5 deletions(-)

diff --git a/src/parser/selector.rs b/src/parser/selector.rs
index 53da318..98fe1ed 100644
--- a/src/parser/selector.rs
+++ b/src/parser/selector.rs
@@ -68,6 +68,13 @@ impl Selector {
 
     /// Return the tag name of this element.
     /// Returns "html" for the document root, "#text" for text nodes.
+    /// Stable identity of the underlying DOM node. Useful for comparing two
+    /// `Selector`s that reference the same node without resorting to HTML
+    /// string equality (which is ambiguous for identical siblings).
+    pub fn node_id(&self) -> NodeId {
+        self.node_id
+    }
+
     pub fn tag(&self) -> &str {
         let node = self.node_ref();
         match node.value() {
diff --git a/src/parser/selector_generation.rs b/src/parser/selector_generation.rs
index 29ddf53..e84ea63 100644
--- a/src/parser/selector_generation.rs
+++ b/src/parser/selector_generation.rs
@@ -47,7 +47,11 @@ pub fn generate_xpath_selector(selector: &Selector, full_path: bool) -> String {
 
         if let Some(id) = el.attrib().get("id") {
             if !full_path {
-                parts.push(format!("{}[@id='{}']", tag, id.as_str()));
+                parts.push(format!(
+                    "{}[@id={}]",
+                    tag,
+                    xpath_string_literal(id.as_str())
+                ));
                 break;
             }
         }
@@ -71,21 +75,64 @@ pub fn generate_xpath_selector(selector: &Selector, full_path: bool) -> String {
 }
 
 /// Count position of element among same-tag siblings. Returns 0 if unique.
+///
+/// Uses the DOM `NodeId` for identity rather than comparing `outer_html`,
+/// because identical sibling HTML would otherwise collapse to the same
+/// position.
 fn get_nth_of_type(selector: &Selector) -> usize {
     let tag = selector.tag();
+    let self_id = selector.node_id();
     if let Some(parent) = selector.parent() {
         let children = parent.children();
         let same_tag: Vec<_> = children.into_iter().filter(|c| c.tag() == tag).collect();
         if same_tag.len() <= 1 {
-            return 0; // only child of this type, no disambiguation needed
+            return 0;
         }
-        // Find our position by comparing node identity via outer_html
-        let our_html = selector.outer_html();
         for (i, child) in same_tag.iter().enumerate() {
-            if child.outer_html().as_str() == our_html.as_str() {
+            if child.node_id() == self_id {
                 return i + 1;
             }
         }
     }
     0
 }
+
+/// Quote a string for safe embedding inside an XPath expression. Single
+/// quotes are the default; double quotes are used when the value contains
+/// single quotes; `concat(...)` is used when it contains both.
+fn xpath_string_literal(value: &str) -> String {
+    let has_single = value.contains('\'');
+    let has_double = value.contains('"');
+    if !has_single {
+        format!("'{}'", value)
+    } else if !has_double {
+        format!("\"{}\"", value)
+    } else {
+        // Both quote types present. Split on single quotes and rejoin via
+        // concat() so neither delimiter has to be escaped.
+        let parts: Vec<String> = value.split('\'').map(|p| format!("'{}'", p)).collect();
+        format!("concat({})", parts.join(", \"'\", "))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::xpath_string_literal;
+
+    #[test]
+    fn xpath_literal_plain() {
+        assert_eq!(xpath_string_literal("foo"), "'foo'");
+    }
+
+    #[test]
+    fn xpath_literal_with_single_quote_uses_double() {
+        assert_eq!(xpath_string_literal("foo' or '1'='1"), "\"foo' or '1'='1\"");
+    }
+
+    #[test]
+    fn xpath_literal_with_both_quotes_uses_concat() {
+        // Contains both ' and ", so concat() must be used.
+        let lit = xpath_string_literal("a'b\"c");
+        assert_eq!(lit, "concat('a', \"'\", 'b\"c')");
+    }
+}
diff --git a/tests/parser_selector_generation.rs b/tests/parser_selector_generation.rs
index efd2bc2..97a893a 100644
--- a/tests/parser_selector_generation.rs
+++ b/tests/parser_selector_generation.rs
@@ -51,3 +51,47 @@ fn test_full_css_selector_includes_body() {
         css
     );
 }
+
+#[test]
+fn identical_siblings_get_distinct_nth_of_type_positions() {
+    // Three <li> with identical inner content. The bug compared outer_html
+    // and returned position 1 for all of them. The fix uses NodeId.
+    let html = r#"<html><body><ul><li>Apple</li><li>Apple</li><li>Apple</li></ul></body></html>"#;
+    let sel = Selector::from_html(html);
+    let items = sel.css("li");
+    assert_eq!(items.len(), 3);
+
+    let css1 = generate_css_selector(&items[0], true);
+    let css2 = generate_css_selector(&items[1], true);
+    let css3 = generate_css_selector(&items[2], true);
+    assert!(css1.contains("nth-of-type(1)"), "got: {}", css1);
+    assert!(css2.contains("nth-of-type(2)"), "got: {}", css2);
+    assert!(css3.contains("nth-of-type(3)"), "got: {}", css3);
+
+    let xp1 = generate_xpath_selector(&items[0], true);
+    let xp2 = generate_xpath_selector(&items[1], true);
+    let xp3 = generate_xpath_selector(&items[2], true);
+    assert!(xp1.ends_with("li[1]"), "got: {}", xp1);
+    assert!(xp2.ends_with("li[2]"), "got: {}", xp2);
+    assert!(xp3.ends_with("li[3]"), "got: {}", xp3);
+}
+
+#[test]
+fn xpath_selector_escapes_id_with_single_quote() {
+    // Adversarial id containing a single quote. Without escaping the
+    // generated XPath would be `div[@id='foo' or '1'='1']`, which is
+    // syntactically valid but semantically attacker-controlled.
+    let html = r#"<html><body><div id="foo' or '1'='1">x</div></body></html>"#;
+    let sel = Selector::from_html(html);
+    let div = sel.css("div");
+    assert_eq!(div.len(), 1);
+    let xp = generate_xpath_selector(&div[0], false);
+    // Because the id contains a single quote, the generator switches to
+    // double-quoted delimiters; the original `'` no longer terminates the
+    // string.
+    assert!(
+        xp.contains("@id=\"foo' or '1'='1\""),
+        "expected double-quoted id literal, got: {}",
+        xp
+    );
+}

From 8298c1f76742895a4c8ea62de6f61b6ecc2d8fb9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 28 May 2026 22:11:13 +0000
Subject: [PATCH 2/2] docs: keep tag() rustdoc attached to tag() after
 node_id() insertion

Address CodeRabbit review on #21: the tag() doc comment was hovering above
node_id() because the new method was inserted between them. Move the
tag() doc back to immediately above tag().

https://claude.ai/code/session_012RmdaovmNWZVAim4XxCWwn
---
 src/parser/selector.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/parser/selector.rs b/src/parser/selector.rs
index 98fe1ed..6402721 100644
--- a/src/parser/selector.rs
+++ b/src/parser/selector.rs
@@ -66,8 +66,6 @@ impl Selector {
         self.node_ref().value().is_document() || self.node_ref().value().is_fragment()
     }
 
-    /// Return the tag name of this element.
-    /// Returns "html" for the document root, "#text" for text nodes.
     /// Stable identity of the underlying DOM node. Useful for comparing two
     /// `Selector`s that reference the same node without resorting to HTML
     /// string equality (which is ambiguous for identical siblings).
@@ -75,6 +73,8 @@ impl Selector {
         self.node_id
     }
 
+    /// Return the tag name of this element.
+    /// Returns "html" for the document root, "#text" for text nodes.
     pub fn tag(&self) -> &str {
         let node = self.node_ref();
         match node.value() {