Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/parser/selector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,13 @@ impl Selector {
self.node_ref().value().is_document() || self.node_ref().value().is_fragment()
}

/// Stable identity of the underlying DOM node. Useful for comparing two
/// `Selector`s that reference the same node without resorting to HTML
/// string equality (which is ambiguous for identical siblings).
pub fn node_id(&self) -> NodeId {
self.node_id
}

/// Return the tag name of this element.
/// Returns "html" for the document root, "#text" for text nodes.
pub fn tag(&self) -> &str {
Expand Down
57 changes: 52 additions & 5 deletions src/parser/selector_generation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@ pub fn generate_xpath_selector(selector: &Selector, full_path: bool) -> String {

if let Some(id) = el.attrib().get("id") {
if !full_path {
parts.push(format!("{}[@id='{}']", tag, id.as_str()));
parts.push(format!(
"{}[@id={}]",
tag,
xpath_string_literal(id.as_str())
));
break;
}
}
Expand All @@ -71,21 +75,64 @@ pub fn generate_xpath_selector(selector: &Selector, full_path: bool) -> String {
}

/// Count position of element among same-tag siblings. Returns 0 if unique.
///
/// Uses the DOM `NodeId` for identity rather than comparing `outer_html`,
/// because identical sibling HTML would otherwise collapse to the same
/// position.
fn get_nth_of_type(selector: &Selector) -> usize {
let tag = selector.tag();
let self_id = selector.node_id();
if let Some(parent) = selector.parent() {
let children = parent.children();
let same_tag: Vec<_> = children.into_iter().filter(|c| c.tag() == tag).collect();
if same_tag.len() <= 1 {
return 0; // only child of this type, no disambiguation needed
return 0;
}
// Find our position by comparing node identity via outer_html
let our_html = selector.outer_html();
for (i, child) in same_tag.iter().enumerate() {
if child.outer_html().as_str() == our_html.as_str() {
if child.node_id() == self_id {
return i + 1;
}
}
}
0
}

/// Quote a string for safe embedding inside an XPath expression. Single
/// quotes are the default; double quotes are used when the value contains
/// single quotes; `concat(...)` is used when it contains both.
fn xpath_string_literal(value: &str) -> String {
let has_single = value.contains('\'');
let has_double = value.contains('"');
if !has_single {
format!("'{}'", value)
} else if !has_double {
format!("\"{}\"", value)
} else {
// Both quote types present. Split on single quotes and rejoin via
// concat() so neither delimiter has to be escaped.
let parts: Vec<String> = value.split('\'').map(|p| format!("'{}'", p)).collect();
format!("concat({})", parts.join(", \"'\", "))
}
}

#[cfg(test)]
mod tests {
use super::xpath_string_literal;

#[test]
fn xpath_literal_plain() {
assert_eq!(xpath_string_literal("foo"), "'foo'");
}

#[test]
fn xpath_literal_with_single_quote_uses_double() {
assert_eq!(xpath_string_literal("foo' or '1'='1"), "\"foo' or '1'='1\"");
}

#[test]
fn xpath_literal_with_both_quotes_uses_concat() {
// Contains both ' and ", so concat() must be used.
let lit = xpath_string_literal("a'b\"c");
assert_eq!(lit, "concat('a', \"'\", 'b\"c')");
}
}
44 changes: 44 additions & 0 deletions tests/parser_selector_generation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,47 @@ fn test_full_css_selector_includes_body() {
css
);
}

#[test]
fn identical_siblings_get_distinct_nth_of_type_positions() {
// Three <li> with identical inner content. The bug compared outer_html
// and returned position 1 for all of them. The fix uses NodeId.
let html = r#"<html><body><ul><li>Apple</li><li>Apple</li><li>Apple</li></ul></body></html>"#;
let sel = Selector::from_html(html);
let items = sel.css("li");
assert_eq!(items.len(), 3);

let css1 = generate_css_selector(&items[0], true);
let css2 = generate_css_selector(&items[1], true);
let css3 = generate_css_selector(&items[2], true);
assert!(css1.contains("nth-of-type(1)"), "got: {}", css1);
assert!(css2.contains("nth-of-type(2)"), "got: {}", css2);
assert!(css3.contains("nth-of-type(3)"), "got: {}", css3);

let xp1 = generate_xpath_selector(&items[0], true);
let xp2 = generate_xpath_selector(&items[1], true);
let xp3 = generate_xpath_selector(&items[2], true);
assert!(xp1.ends_with("li[1]"), "got: {}", xp1);
assert!(xp2.ends_with("li[2]"), "got: {}", xp2);
assert!(xp3.ends_with("li[3]"), "got: {}", xp3);
}

#[test]
fn xpath_selector_escapes_id_with_single_quote() {
// Adversarial id containing a single quote. Without escaping the
// generated XPath would be `div[@id='foo' or '1'='1']`, which is
// syntactically valid but semantically attacker-controlled.
let html = r#"<html><body><div id="foo' or '1'='1">x</div></body></html>"#;
let sel = Selector::from_html(html);
let div = sel.css("div");
assert_eq!(div.len(), 1);
let xp = generate_xpath_selector(&div[0], false);
// Because the id contains a single quote, the generator switches to
// double-quoted delimiters; the original `'` no longer terminates the
// string.
assert!(
xp.contains("@id=\"foo' or '1'='1\""),
"expected double-quoted id literal, got: {}",
xp
);
}
Loading