From b7d96b302813e63f6e34f62ca8287c732f4026cc Mon Sep 17 00:00:00 2001 From: tompng Date: Thu, 21 May 2026 02:12:55 +0900 Subject: [PATCH 1/3] Optimize XPath step If a predicate of xpath is position-independent, we don't need to create nodesets that has many duplicated nodes with different positions. Implements optimization of preceding-sibling/following-sibling with simple positional predicates as an example. We can add more optimizations for other axis in the future. --- lib/rexml/xpath_parser.rb | 543 +++++++++++++++------- test/xpath/test_attribute.rb | 15 + test/xpath/test_axis_preceding_sibling.rb | 78 ++++ test/xpath/test_base.rb | 8 + test/xpath/test_predicate.rb | 38 ++ 5 files changed, 513 insertions(+), 169 deletions(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 170ff7d4..20d38a50 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -199,118 +199,48 @@ def expr( path_stack, nodeset, context=nil ) nodeset = [nodeset.first.root_node] when :self nodeset = step(path_stack) do - [nodeset] + [:iterate_nodesets, [nodeset]] end when :child nodeset = step(path_stack) do - child(nodeset) + [:iterate_nodesets, child(nodeset)] end when :literal trace(:literal, path_stack, nodeset) if @debug return path_stack.shift when :attribute nodeset = step(path_stack, any_type: :attribute) do - nodeset.map do |node| + nodesets = nodeset.map do |node| next unless node.node_type == :element attributes = node.attributes next if attributes.empty? attributes.each_attribute.to_a end.compact + [:iterate_nodesets, nodesets] end when :namespace warn 'Namespace axis is not supported in REXML::XPathParser', uplevel: 1 # TODO: We need to create NamespaceNode class to support this feature - nodeset = step(path_stack) { [] } + nodeset = step(path_stack) { [:iterate_nodesets, []] } when :parent nodeset = step(path_stack) do - nodesets = [] + parents = Set.new.compare_by_identity nodeset.each do |node| if node.node_type == :attribute parent = node.element else parent = node.parent end - nodesets << [parent] if parent + parents << parent if parent end - nodesets + [:iterate_nodesets, parents.map {|parent| [parent] }] end - when :ancestor - nodeset = step(path_stack, axis_order: :reverse) do - nodesets = [] - # new_nodes = {} - nodeset.each do |node| - new_nodeset = [] - while node.parent - node = node.parent - # next if new_nodes.key?(node) - new_nodeset << node - # new_nodes[node] = true - end - nodesets << new_nodeset unless new_nodeset.empty? - end - nodesets - end - when :ancestor_or_self - nodeset = step(path_stack, axis_order: :reverse) do - nodesets = [] - # new_nodes = {} - nodeset.each do |node| - next unless node.node_type == :element - new_nodeset = [node] - # new_nodes[node] = true - while node.parent - node = node.parent - # next if new_nodes.key?(node) - new_nodeset << node - # new_nodes[node] = true - end - nodesets << new_nodeset unless new_nodeset.empty? - end - nodesets - end - when :descendant_or_self - nodeset = step(path_stack) do - descendant(nodeset, true) - end - when :descendant - nodeset = step(path_stack) do - descendant(nodeset, false) - end - when :following_sibling + when :ancestor, :ancestor_or_self, + :descendant, :descendant_or_self, + :preceding, :preceding_sibling, + :following, :following_sibling nodeset = step(path_stack) do - nodeset.map do |node| - next unless node.respond_to?(:parent) - next if node.parent.nil? - all_siblings = node.parent.children - current_index = all_siblings.index(node) - following_siblings = all_siblings[(current_index + 1)..-1] - next if following_siblings.empty? - following_siblings - end.compact - end - when :preceding_sibling - nodeset = step(path_stack, axis_order: :reverse) do - nodeset.map do |node| - next unless node.respond_to?(:parent) - next if node.parent.nil? - all_siblings = node.parent.children - current_index = all_siblings.index(node) - preceding_siblings = all_siblings[0, current_index].reverse - next if preceding_siblings.empty? - preceding_siblings - end.compact - end - when :preceding - nodeset = step(path_stack, axis_order: :reverse) do - nodeset.map do |node| - preceding(node) - end - end - when :following - nodeset = step(path_stack) do - nodeset.map do |node| - following(node) - end + [op, nodeset] end when :variable var_name = path_stack.shift @@ -382,8 +312,7 @@ def expr( path_stack, nodeset, context=nil ) target_context[:position] = 1 end args = arguments.dclone.collect do |arg| - result = expr(arg, nodeset, target_context) - result + expr(arg, nodeset, target_context) end @functions.context = target_context return @functions.send(func_name, *args) @@ -394,7 +323,7 @@ def expr( path_stack, nodeset, context=nil ) # If result is a nodeset, apply following predicates path_stack.unshift(:node) nodeset = step(path_stack) do - [result] + [:iterate_nodesets, [result]] end else return result @@ -408,120 +337,352 @@ def expr( path_stack, nodeset, context=nil ) leave(:expr, path_stack, nodeset) if @debug end - def step(path_stack, any_type: :element, axis_order: :forward) - nodesets = yield + # Determines if a predicate expression is dependent on the position of nodes. + # nil if the predicate is position-independent, + # :simple if the predicate is a simple position query that can be optimized in axis scanning + # :complex if the predicate is a complex query that might be dependent on the position of nodes + def position_dependency(predicate_expr) + # [number], [position()=number], [position() < number], [position() > number] + return :simple if position_operation(predicate_expr) + + # expressions that return number. + return :complex if %i[div mod mult plus minus neg].include?(predicate_expr[0]) + return :complex if predicate_expr[0] == :function && %w[number ceiling round floor string-length sum count].include?(predicate_expr[1]) + # Numeric literal including Integer and Float: [2] means [position() = 2] + return :complex if predicate_expr[0] == :literal && Numeric === predicate_expr[1] + # A variable could resolve to a number at runtime: [$n] means [position() = $n]. + return :complex if predicate_expr[0] == :variable + + # expressions that contain position-dependent functions + return :complex if calls_position_dependent_function?(predicate_expr) + end + + # Recursively checks if the expression contains position-dependent functions such as position() or last() + def calls_position_dependent_function?(expr) + return false unless Array === expr + return true if expr[0] == :function && (expr[1] == 'position' || expr[1] == 'last') + expr.any? {|part| calls_position_dependent_function?(part) } + end + + # Detects simple position-based predicates that can be optimized in axis scanning, such as [1], [position()=1], [position() < 2], [position() > 3] + # Returns operators and values such as [:==, 1], [:<, 2], [:>, 3] + # Returns nil if the predicate is not a simple position-based predicate + def position_operation(predicate_expr) + return [:==, predicate_expr[1]] if predicate_expr[0] == :literal && predicate_expr[1].is_a?(Integer) + + op, left, right = predicate_expr + return unless op == :eq || op == :lt || op == :lteq || op == :gt || op == :gteq + return unless [left, right].include?([:function, 'position', []]) + + literal = [left, right].find {|part| part[0] == :literal && part[1].is_a?(Integer) } + return unless literal + + value = literal[1] + case op + when :eq + [:==, value] + when :lt + literal == right ? [:<, value] : [:>, value] + when :lteq + literal == right ? [:<, value + 1] : [:>, value - 1] + when :gt + literal == right ? [:>, value]: [:<, value] + when :gteq + literal == right ? [:>, value - 1] : [:<, value + 1] + end + end + + # Pseudo scanner for axis scanning step that nodesets are already collected + def iterate_nodesets(nodesets, tester, selector) + non_optimized_nodesets_select(nodesets, tester, selector) + end + + # Scanner for ancestor-or-self axis + def ancestor_or_self(nodeset, tester, selector) + ancestor(nodeset, tester, selector, include_self: true) + end + + # Scanner for preceding-sibling axis + def preceding_sibling(nodeset, tester, selector) + preceding_following_sibling(nodeset, tester, selector, reverse: true) + end + + # Scanner for following-sibling axis + def following_sibling(nodeset, tester, selector) + preceding_following_sibling(nodeset, tester, selector, reverse: false) + end + + def preceding_following_sibling(nodeset, tester, selector, reverse:) + nodeset = nodeset.select {|node| node.respond_to?(:parent) && node.parent } + case selector + when :uniq + nodeset.group_by(&:parent).flat_map do |parent, sibling_nodes| + sets = Set.new.compare_by_identity + sibling_nodes.each {|sibling| sets << sibling } + children = parent.children + children = children.reverse if reverse + children.drop_while {|child| !sets.include?(child) }.drop(1) + end.select(&tester) + when :nodesets + nodesets = nodeset.map do |node| + parent = node.parent + index = parent.children.index(node) + reverse ? parent.children[0...index].reverse : parent.children[index + 1..-1] + end + non_optimized_nodesets_select(nodesets, tester, selector) + else + operator, value = selector + nodeset.group_by(&:parent).flat_map do |parent, sibling_nodes| + anchors = Set.new.compare_by_identity + sibling_nodes.each {|sibling| anchors << sibling } + children = parent.children + children = children.reverse if reverse + followings = children.drop_while {|child| !anchors.include?(child) }.drop(1) + anchor_indexes = Set[0] + last_anchor = 0 + index = 0 + matched = [] + followings.each do |node| + if tester.call(node) + case operator + when :== + # anchor_indexes only contain values smaller or equal to `index`, + # so value <= 0 case doesn't accidentally match any node. + matched << node if anchor_indexes.include?(index - value + 1) + when :< + # Position from the last anchor will be the minimum possible position for the node + matched << node if index - last_anchor + 1 < value + when :> + # Position from the first anchor(==0) will be the maximum possible position for the node + matched << node if index + 1 > value + end + index += 1 + end + if anchors.include?(node) + anchor_indexes << index + last_anchor = index + end + end + matched + end + end + end + + # Scanner for ancestor axis + def ancestor(nodeset, tester, selector, include_self: false) + nodeset = nodeset.select {|node| node.respond_to?(:parent) && node.parent } + case selector + when :uniq + ancestors = Set.new.compare_by_identity + nodeset.each do |node| + ancestors << node if include_self + parent = node.parent + while parent + break if ancestors.include?(parent) + ancestors << parent + parent = parent.parent + end + end + ancestors.select(&tester) + else + # Slow pass + nodesets = nodeset.map do |node| + ancestors = [] + ancestors << node if include_self + parent = node.parent + while parent + ancestors << parent + parent = parent.parent + end + ancestors + end + non_optimized_nodesets_select(nodesets, tester, selector) + end + end + + # Scanner fallback step for axis that is not optimized for position-based predicates. + def non_optimized_nodesets_select(nodesets, tester, selector) + nodesets = nodesets.map do |nodeset| + nodeset.select(&tester) + end.reject(&:empty?) + case selector + when :nodesets + nodesets + when :uniq + seen = Set.new.compare_by_identity + nodesets.flatten.each {|node| seen << node } + seen.to_a + else + operator, value = selector + nodes = nodesets.flatten + nodes = + case operator + when :== + nodesets.map {|nodeset| nodeset[value - 1] if value >= 1 }.compact + when :< + nodesets.map {|nodeset| nodeset[0...value - 1] if value >= 1 }.compact.flatten + when :> + nodesets.map {|nodeset| value < 0 ? nodeset : nodeset[value..-1] }.flatten + end + seen = Set.new.compare_by_identity + nodes.each {|node| seen << node } + seen.to_a + end + end + + # Split predicates into several groups based on their dependency on the position of nodes + # If there are no position-based predicates, + # return [position_independent_predicates, nil, [], nil] + # If there are only one simple position-based predicate, + # return [position_independent_predicates, position_operator, post_position_independent_predicates, nil] + # If there are multiple position-based predicates or complex position-based predicates, + # return [position_independent_predicates, nil, nil, complex_predicates] + def split_positional_predicates(predicates) + pre_independent = predicates.take_while {|predicate| position_dependency(predicate).nil? } + predicates = predicates.drop(pre_independent.size) + return [pre_independent, nil, [], nil] if predicates.empty? + + op = position_operation(predicates.first) + if op && predicates[1..-1].all? {|predicate| position_dependency(predicate).nil? } + [pre_independent, op, predicates[1..-1], nil] + else + [pre_independent, nil, nil, predicates] + end + end + + # Performs an axis scanning step. + # The caller provides a scanner method and its argument, which determines the axis to scan and the nodes to scan from: + # step(path_stack) { [scanner_method, scanner_argument] } + # Scanner method are called with `(scanner_argument, tester_block, selector)` + # selector is a flag for the scanner to determine how to return the scan result. + # It can be: `:uniq`, `:nodesets` or `[position_comparator, value]`. + # `:uniq` means the scanner should return unique nodes. Predicates are position-independent. + # `:nodesets` means the scanner should return nodesets. Predicates are complex position queries that can't be optimized in axis scanning. + # `[position_comparator, value]` means the scanner should return nodes matching the position comparator and value. + # Each scanner method can implement optimized scanning strategy for each selector. + + def step(path_stack, any_type: :element) + scanner, scanner_argument = yield begin - enter(:step, path_stack, nodesets) if @debug - nodesets = node_test(path_stack, nodesets, any_type: any_type) - while path_stack[0] == :predicate - path_stack.shift # :predicate - predicate_expression = path_stack.shift.dclone - nodesets = evaluate_predicate(predicate_expression, nodesets) - end - if nodesets.size == 1 - new_nodeset = axis_order == :forward ? nodesets.first : nodesets.first.reverse - else - nodes = Set.new.compare_by_identity - nodesets.each do |nodeset| - nodeset.each do |node| - nodes << node + enter(:step, path_stack, scanner, scanner_argument) if @debug + tester = node_test(path_stack, any_type: any_type) + predicates = [] + while path_stack.first == :predicate + path_stack.shift + predicates << path_stack.shift + end + pre_predicates, position_operator, post_predicates, complex_predicates = split_positional_predicates(predicates) + + if pre_predicates.any? + original_tester = tester + tester = -> (node) do + original_tester.call(node) && + pre_predicates.all? do |predicate_expr| + evaluate_predicate(predicate_expr.dclone, [[node]]).flatten.size == 1 end end - new_nodeset = sort(nodes.to_a) end - new_nodeset + if complex_predicates + nodesets = send(scanner, scanner_argument, tester, :nodesets) + elsif position_operator + nodeset = send(scanner, scanner_argument, tester, position_operator) + nodesets = [nodeset] + else + nodeset = send(scanner, scanner_argument, tester, :uniq) + nodesets = [nodeset] + end + + (complex_predicates || post_predicates).each do |predicate_expr| + nodesets = evaluate_predicate(predicate_expr.dclone, nodesets) + end + nodes = Set.new.compare_by_identity + nodesets.each do |nodeset| + nodeset.each do |node| + nodes << node + end + end + new_nodeset = sort(nodes.to_a) ensure leave(:step, path_stack, new_nodeset) if @debug end end - def node_test(path_stack, nodesets, any_type: :element) - enter(:node_test, path_stack, nodesets) if @debug + def node_test(path_stack, any_type: :element) + enter(:node_test, path_stack) if @debug operator = path_stack.shift case operator when :qname prefix = path_stack.shift name = path_stack.shift - new_nodesets = nodesets.collect do |nodeset| - nodeset.select do |node| - case node.node_type - when :element - if prefix.nil? - node.name == name - elsif prefix.empty? - if strict? - node.name == name and node.namespace == "" - else - node.name == name and node.namespace == get_namespace(node, prefix) - end - else - node.name == name and node.namespace == get_namespace(node, prefix) - end - when :attribute - if prefix.nil? - node.name == name - elsif prefix.empty? + ->(node) do + case node.node_type + when :element + if prefix.nil? + node.name == name + elsif prefix.empty? + if strict? node.name == name and node.namespace == "" else - node.name == name and node.namespace == get_namespace(node.element, prefix) + node.name == name and node.namespace == get_namespace(node, prefix) end else - false + node.name == name and node.namespace == get_namespace(node, prefix) + end + when :attribute + if prefix.nil? + node.name == name + elsif prefix.empty? + node.name == name and node.namespace == "" + else + node.name == name and node.namespace == get_namespace(node.element, prefix) end + else + false end end when :namespace prefix = path_stack.shift - new_nodesets = nodesets.collect do |nodeset| - nodeset.select do |node| - case node.node_type - when :element - namespaces = @namespaces || node.namespaces - node.namespace == namespaces[prefix] - when :attribute - namespaces = @namespaces || node.element.namespaces - node.namespace == namespaces[prefix] - else - false - end + ->(node) do + case node.node_type + when :element + namespaces = @namespaces || node.namespaces + node.namespace == namespaces[prefix] + when :attribute + namespaces = @namespaces || node.element.namespaces + node.namespace == namespaces[prefix] + else + false end end when :any - new_nodesets = nodesets.collect do |nodeset| - nodeset.select do |node| - node.node_type == any_type - end + ->(node) do + node.node_type == any_type end when :comment - new_nodesets = nodesets.collect do |nodeset| - nodeset.select do |node| - node.node_type == :comment - end + ->(node) do + node.node_type == :comment end when :text - new_nodesets = nodesets.collect do |nodeset| - nodeset.select do |node| - node.node_type == :text - end + ->(node) do + node.node_type == :text end when :processing_instruction target = path_stack.shift - new_nodesets = nodesets.collect do |nodeset| - nodeset.select do |node| - (node.node_type == :processing_instruction) and - (target.empty? or (node.target == target)) - end + ->(node) do + (node.node_type == :processing_instruction) and + (target.empty? or (node.target == target)) end when :node - new_nodesets = nodesets + ->(_node) do + true + end else message = "[BUG] Unexpected node test: " + "<#{operator.inspect}>: <#{path_stack.inspect}>" raise message end - new_nodesets ensure - leave(:node_test, path_stack, new_nodesets) if @debug + leave(:node_test, path_stack) if @debug end def evaluate_predicate(expression, nodesets) @@ -581,6 +742,8 @@ def leave(tag, *args) # I wouldn't have to do this. Maybe add a document IDX for each node? # Problems with mutable documents. Or, rewrite everything. def sort(array_of_nodes) + return array_of_nodes if array_of_nodes.size <= 1 + new_arry = [] array_of_nodes.each { |node| node_idx = [] @@ -599,15 +762,43 @@ def sort(array_of_nodes) end end - def descendant(nodeset, include_self) - nodesets = [] - nodeset.each do |node| - new_nodeset = [] - new_nodes = {} - descendant_recursive(node, new_nodeset, new_nodes, include_self) - nodesets << new_nodeset unless new_nodeset.empty? + # Scanner for descendant-or-self axis + def descendant_or_self(nodeset, tester, selector) + descendant(nodeset, tester, selector, include_self: true) + end + + # Scanner for descendant axis + def descendant(nodeset, tester, selector, include_self: false) + nodeset = nodeset.select {|node| node.respond_to?(:children) } + case selector + when :uniq + seen = Set.new.compare_by_identity + recursive = ->(node) do + node_type = node.node_type + return if seen.include?(node) + seen << node if node_type != :xmldecl + return unless node_type == :element || node_type == :document + node.children.each do |child| + recursive.call(child) + end + end + nodeset.each do |node| + if include_self + recursive.call(node) + else + node.children.each(&recursive) + end + end + seen.select(&tester) + else + nodesets = nodeset.map do |node| + new_nodeset = [] + new_nodes = {} + descendant_recursive(node, new_nodeset, new_nodes, include_self) + new_nodeset + end + non_optimized_nodesets_select(nodesets, tester, selector) end - nodesets end def descendant_recursive(node, new_nodeset, new_nodes, include_self) @@ -625,11 +816,17 @@ def descendant_recursive(node, new_nodeset, new_nodes, include_self) end end + # Scanner for preceding axis + def preceding(nodeset, tester, selector) + nodesets = nodeset.select {|node| node.respond_to?(:parent) }.map {|node| preceding_nodes(node) } + non_optimized_nodesets_select(nodesets, tester, selector) + end + # Builds a nodeset of all of the preceding nodes of the supplied node, # in reverse document order # preceding:: includes every element in the document that precedes this node, # except for ancestors - def preceding(node) + def preceding_nodes(node) ancestors = [] parent = node.parent while parent @@ -665,7 +862,15 @@ def preceding_node_of( node ) psn end - def following(node) + # Scanner for following axis + def following(nodeset, tester, selector) + nodesets = nodeset.select {|node| node.respond_to?(:parent) }.map do |node| + following_nodes(node) + end + non_optimized_nodesets_select(nodesets, tester, selector) + end + + def following_nodes(node) followings = [] following_node = next_sibling_node(node) while following_node diff --git a/test/xpath/test_attribute.rb b/test/xpath/test_attribute.rb index b778ff81..e57bd915 100644 --- a/test/xpath/test_attribute.rb +++ b/test/xpath/test_attribute.rb @@ -32,5 +32,20 @@ def test_no_namespace "nothing" => "") assert_equal(["child2"], children.collect(&:text)) end + + def test_attribute_axis_with_other_axes_does_not_raise + # Some of those are not yet supported in REXML, but at least it shouldn't raise an error + assert_nothing_raised do + REXML::XPath.match(@document, '//attribute::*/parent::*') + REXML::XPath.match(@document, '//attribute::*/preceding::*') + REXML::XPath.match(@document, '//attribute::*/following::*') + REXML::XPath.match(@document, '//attribute::*/preceding-sibling::*') + REXML::XPath.match(@document, '//attribute::*/following-sibling::*') + REXML::XPath.match(@document, '//attribute::*/ancestor::*') + REXML::XPath.match(@document, '//attribute::*/descendant::*') + REXML::XPath.match(@document, '//attribute::*/ancestor-or-self::*') + REXML::XPath.match(@document, '//attribute::*/descendant-or-self::*') + end + end end end diff --git a/test/xpath/test_axis_preceding_sibling.rb b/test/xpath/test_axis_preceding_sibling.rb index 2f0ba5d1..0e1ddff2 100644 --- a/test/xpath/test_axis_preceding_sibling.rb +++ b/test/xpath/test_axis_preceding_sibling.rb @@ -34,5 +34,83 @@ def test_preceding_sibling_axis prev = XPath.first(context, "preceding-sibling::f[3]") assert_equal "3", prev.attributes["id"] end + + def test_preceding_sibling_position_less_than + context = XPath.first(@@doc, "/a/e/f[last()]") + assert_equal([], XPath.match(context, "preceding-sibling::f[position() < 1]")) + assert_equal(["5"], + XPath.match(context, "preceding-sibling::f[position() < 2]").map {|n| n.attributes["id"] }) + assert_equal(["4", "5"], + XPath.match(context, "preceding-sibling::f[position() < 3]").map {|n| n.attributes["id"] }) + end + + def test_preceding_sibling_position_less_than_or_equal + context = XPath.first(@@doc, "/a/e/f[last()]") + assert_equal([], XPath.match(context, "preceding-sibling::f[position() <= 0]")) + assert_equal(["4", "5"], + XPath.match(context, "preceding-sibling::f[position() <= 2]").map {|n| n.attributes["id"] }) + end + + def test_preceding_sibling_position_greater_than + context = XPath.first(@@doc, "/a/e/f[last()]") + assert_equal(["3", "4", "5"], + XPath.match(context, "preceding-sibling::f[position() > 0]").map {|n| n.attributes["id"] }) + assert_equal(["3", "4"], + XPath.match(context, "preceding-sibling::f[position() > 1]").map {|n| n.attributes["id"] }) + assert_equal(["3"], + XPath.match(context, "preceding-sibling::f[position() > 2]").map {|n| n.attributes["id"] }) + end + + def test_preceding_sibling_position_greater_than_or_equal + context = XPath.first(@@doc, "/a/e/f[last()]") + assert_equal(["3", "4", "5"], + XPath.match(context, "preceding-sibling::f[position() >= 0]").map {|n| n.attributes["id"] }) + assert_equal(["3", "4", "5"], + XPath.match(context, "preceding-sibling::f[position() >= 1]").map {|n| n.attributes["id"] }) + assert_equal(["3", "4"], + XPath.match(context, "preceding-sibling::f[position() >= 2]").map {|n| n.attributes["id"] }) + end + + def test_preceding_following_sibling_multiple_anchors + doc = Document.new(<<~XML) + + + + + + + + + + + + + + + + + + + + + + + + XML + + assert_equal(%w[2 7 9], XPath.match(doc, "/a/anchor/preceding-sibling::b[position() = 3]").map {|n| n.attributes["id"] }) + assert_equal(%w[2 3 4 7 8 9 10 11], XPath.match(doc, "/a/anchor/preceding-sibling::b[position() <= 3]").map {|n| n.attributes["id"] }) + assert_equal(%w[1 2 3 4 5 6 7 8], XPath.match(doc, "/a/anchor/preceding-sibling::b[position() >= 4]").map {|n| n.attributes["id"] }) + assert_equal(%w[2 7 a2], XPath.match(doc, "/a/anchor/preceding-sibling::*[@id][position() = 3]").map {|n| n.attributes["id"] }) + assert_equal(%w[2 3 4 7 8 9 a2 10 11], XPath.match(doc, "/a/anchor/preceding-sibling::*[@id][position() <= 3]").map {|n| n.attributes["id"] }) + assert_equal(%w[1 2 3 4 a1 5 6 7 8 9], XPath.match(doc, "/a/anchor/preceding-sibling::*[@id][position() >= 4]").map {|n| n.attributes["id"] }) + + assert_equal(%w[7 12], XPath.match(doc, "/a/anchor/following-sibling::b[position() = 3]").map {|n| n.attributes["id"] }) + assert_equal(%w[5 6 7 10 11 12], XPath.match(doc, "/a/anchor/following-sibling::b[position() <= 3]").map {|n| n.attributes["id"] }) + assert_equal(%w[8 9 10 11 12], XPath.match(doc, "/a/anchor/following-sibling::b[position() >= 4]").map {|n| n.attributes["id"] }) + assert_equal(%w[7 a3], XPath.match(doc, "/a/anchor/following-sibling::*[@id][position() = 3]").map {|n| n.attributes["id"] }) + assert_equal(%w[5 6 7 10 11 a3 12], XPath.match(doc, "/a/anchor/following-sibling::*[@id][position() <= 3]").map {|n| n.attributes["id"] }) + assert_equal(%w[8 9 a2 10 11 a3 12], XPath.match(doc, "/a/anchor/following-sibling::*[@id][position() >= 4]").map {|n| n.attributes["id"] }) + end end end diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 46b665b2..43fe3e99 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -500,6 +500,14 @@ def test_following_sibling_predicates assert_equal(["w", "x", "y", "z"], matches.map(&:name)) end + def test_following_sibling_position_less_than + source = "" + doc = REXML::Document.new(source) + assert_equal([], REXML::XPath.match(doc, "/r/a/following-sibling::*[position() < 1]")) + assert_equal(["b"], REXML::XPath.match(doc, "/r/a/following-sibling::*[position() < 2]").map(&:name)) + assert_equal(["b", "c"], REXML::XPath.match(doc, "/r/a/following-sibling::*[position() < 3]").map(&:name)) + end + def test_preceding_sibling_across_multiple_nodes source = <<-XML diff --git a/test/xpath/test_predicate.rb b/test/xpath/test_predicate.rb index 278e3765..f25bd758 100644 --- a/test/xpath/test_predicate.rb +++ b/test/xpath/test_predicate.rb @@ -59,12 +59,50 @@ def test_predicates_multi assert_equal( "1", m[0].attributes["id"] ) end + def test_predicate_multi_position + xml = <<~XML + + + + + XML + doc = REXML::Document.new(xml) + + result = REXML::XPath.match(doc, "/a/b/c[position()>1]") + assert_equal(%w[2 3 4 5 7 8 9 10], result.map { |node| node.attributes["id"] }) + + result = REXML::XPath.match(doc, "/a/b/c[position()>1][position()>1]") + assert_equal(%w[3 4 5 8 9 10], result.map { |node| node.attributes["id"] }) + + result = REXML::XPath.match(doc, "/a/b/c[position()>1][position()>1][@id!='3']") + assert_equal(%w[4 5 8 9 10], result.map { |node| node.attributes["id"] }) + + result = REXML::XPath.match(doc, "/a/b/c[position()>1][position()>1][@id!='3'][position()!=2]") + assert_equal(%w[4 8 10], result.map { |node| node.attributes["id"] }) + end + def do_path( path ) m = REXML::XPath.match( @doc, path ) #puts path, @parser.parse( path ).inspect return m end + def test_predicate_float_literal + doc = REXML::Document.new("") + # [N.0] is equivalent to [position() = N.0] = [position() = N] + assert_equal(["a"], REXML::XPath.match(doc, "/r/*[1.0]").map(&:name)) + assert_equal(["b"], REXML::XPath.match(doc, "/r/*[2.0]").map(&:name)) + # Non-integer numeric literals match no node. + assert_equal([], REXML::XPath.match(doc, "/r/*[1.5]")) + end + + def test_predicate_variable_as_position + doc = REXML::Document.new("") + parser = REXML::XPathParser.new + parser["x"] = 2 + assert_equal(["b"], parser.parse("/r/*[$x]", doc).map(&:name)) + end + def test_get_no_siblings_terminal_nodes source = <<-XML From 447c0a38a7360d8a290e70492d0f1bd7ff00ec16 Mon Sep 17 00:00:00 2001 From: tompng Date: Sat, 13 Jun 2026 16:15:20 +0900 Subject: [PATCH 2/3] Fix reviewed points --- lib/rexml/xpath_parser.rb | 49 +++++++++++++++++++++++++++--------- test/xpath/test_predicate.rb | 40 +++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 12 deletions(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 20d38a50..1069d36e 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -345,16 +345,42 @@ def position_dependency(predicate_expr) # [number], [position()=number], [position() < number], [position() > number] return :simple if position_operation(predicate_expr) - # expressions that return number. - return :complex if %i[div mod mult plus minus neg].include?(predicate_expr[0]) - return :complex if predicate_expr[0] == :function && %w[number ceiling round floor string-length sum count].include?(predicate_expr[1]) - # Numeric literal including Integer and Float: [2] means [position() = 2] - return :complex if predicate_expr[0] == :literal && Numeric === predicate_expr[1] - # A variable could resolve to a number at runtime: [$n] means [position() = $n]. - return :complex if predicate_expr[0] == :variable - # expressions that contain position-dependent functions return :complex if calls_position_dependent_function?(predicate_expr) + + # Even if expression is followed by path steps, the analysis of + # position dependency is the same as the expression itself. + case predicate_expr[0] + when :union, :or, :and, :eq, :neq, :lt, :lteq, :gt, :gteq, :not + # Expressions that don't evaluate to a number are position independent + # if it doesn't contain position-dependent functions. + nil + when :div, :mod, :mult, :plus, :minus, :neg + # expressions that return number. eg. `[position() = @attr + 1]` + :complex + when :literal + # Integer literal is handled in `position_operation(predicate_expr)`. + # We treat this as complex(no-optimization) because this is not a normal case + # eg. `/foo["hi"]` `/bar[true]` `/baz[3.14]` + :complex + when :variable + # A variable could resolve to a number at runtime. + # It's possible to optimize this by checking the actual value of the variable. + :complex + when :function + # functions that return number is position dependent. eg. `[position() = string-length(@attr)]` + %w[number ceiling round floor string-length sum count].include?(predicate_expr[1]) ? :complex : nil + when :group + position_dependency(predicate_expr[1]) + when :descendant, :descendant_or_self, :ancestor, :ancestor_or_self, + :following, :following_sibling, :preceding, :preceding_sibling, + :document, :child, :self, :parent, :attribute, :namespace + # paths are position independent. `foo[path[1]]` doesn't depend on the position of `foo` + nil + else + # Every other unhandled expressions are treated as complex for safety + :complex + end end # Recursively checks if the expression contains position-dependent functions such as position() or last() @@ -514,15 +540,14 @@ def non_optimized_nodesets_select(nodesets, tester, selector) seen.to_a else operator, value = selector - nodes = nodesets.flatten nodes = case operator when :== nodesets.map {|nodeset| nodeset[value - 1] if value >= 1 }.compact when :< - nodesets.map {|nodeset| nodeset[0...value - 1] if value >= 1 }.compact.flatten + nodesets.flat_map {|nodeset| nodeset[0...value - 1] if value >= 1 }.compact when :> - nodesets.map {|nodeset| value < 0 ? nodeset : nodeset[value..-1] }.flatten + nodesets.flat_map {|nodeset| value <= 0 ? nodeset : nodeset.drop(value) } end seen = Set.new.compare_by_identity nodes.each {|node| seen << node } @@ -553,7 +578,7 @@ def split_positional_predicates(predicates) # Performs an axis scanning step. # The caller provides a scanner method and its argument, which determines the axis to scan and the nodes to scan from: # step(path_stack) { [scanner_method, scanner_argument] } - # Scanner method are called with `(scanner_argument, tester_block, selector)` + # Scanner methods are called with `(scanner_argument, tester_block, selector)` # selector is a flag for the scanner to determine how to return the scan result. # It can be: `:uniq`, `:nodesets` or `[position_comparator, value]`. # `:uniq` means the scanner should return unique nodes. Predicates are position-independent. diff --git a/test/xpath/test_predicate.rb b/test/xpath/test_predicate.rb index f25bd758..164e3e96 100644 --- a/test/xpath/test_predicate.rb +++ b/test/xpath/test_predicate.rb @@ -103,6 +103,46 @@ def test_predicate_variable_as_position assert_equal(["b"], parser.parse("/r/*[$x]", doc).map(&:name)) end + def test_predicate_out_of_range_position + doc = REXML::Document.new("") + parser = REXML::XPathParser.new + base = '/r/*' + assert_equal([], parser.parse("#{base}[-1]", doc).map(&:name)) + assert_equal([], parser.parse("#{base}[0]", doc).map(&:name)) + assert_equal([], parser.parse("#{base}[5]", doc).map(&:name)) + assert_equal([], parser.parse("#{base}[position()>5]", doc).map(&:name)) + assert_equal([], parser.parse("#{base}[position()<0]", doc).map(&:name)) + assert_equal([], parser.parse("#{base}[position()<-1]", doc).map(&:name)) + assert_equal(%w[a b c d], parser.parse("#{base}[position()>0]", doc).map(&:name)) + assert_equal(%w[a b c d], parser.parse("#{base}[position()>-1]", doc).map(&:name)) + assert_equal(%w[a b c d], parser.parse("#{base}[position()<10]", doc).map(&:name)) + + # non-optimizable case + base_no_opt = '/r/*[position()!=name()]' + assert_equal([], parser.parse("#{base_no_opt}[-1]", doc).map(&:name)) + assert_equal([], parser.parse("#{base_no_opt}[0]", doc).map(&:name)) + assert_equal([], parser.parse("#{base_no_opt}[5]", doc).map(&:name)) + assert_equal([], parser.parse("#{base_no_opt}[position()>5]", doc).map(&:name)) + assert_equal([], parser.parse("#{base_no_opt}[position()<0]", doc).map(&:name)) + assert_equal([], parser.parse("#{base_no_opt}[position()<-1]", doc).map(&:name)) + assert_equal(%w[a b c d], parser.parse("#{base_no_opt}[position()>0]", doc).map(&:name)) + assert_equal(%w[a b c d], parser.parse("#{base_no_opt}[position()>-1]", doc).map(&:name)) + assert_equal(%w[a b c d], parser.parse("#{base_no_opt}[position()<10]", doc).map(&:name)) + end + + def test_predicate_parenthesized_position + doc = REXML::Document.new("") + parser = REXML::XPathParser.new + assert_equal(["b"], parser.parse("/r/*[(2)]", doc).map(&:name)) + end + + def test_position_dependent_function_predicates + doc = REXML::Document.new("") + parser = REXML::XPathParser.new + assert_equal(["b"], parser.parse("/r/*['2'=string(-(0 - position())*1)]", doc).map(&:name)) + assert_equal(%w[a b c d], parser.parse("/r/*['4'=string(-(0 - last())*1)]", doc).map(&:name)) + end + def test_get_no_siblings_terminal_nodes source = <<-XML From f02b871746e2fc9ea4d42fa487bd925465e83d6f Mon Sep 17 00:00:00 2001 From: tompng Date: Sat, 13 Jun 2026 17:05:06 +0900 Subject: [PATCH 3/3] Simplify position dependency check Stop classifying dependency pattern, just return true(maybe dependent) of false(guaranteed to be independent) --- lib/rexml/xpath_parser.rb | 43 +++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 1069d36e..821cca95 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -338,15 +338,11 @@ def expr( path_stack, nodeset, context=nil ) end # Determines if a predicate expression is dependent on the position of nodes. - # nil if the predicate is position-independent, - # :simple if the predicate is a simple position query that can be optimized in axis scanning - # :complex if the predicate is a complex query that might be dependent on the position of nodes - def position_dependency(predicate_expr) - # [number], [position()=number], [position() < number], [position() > number] - return :simple if position_operation(predicate_expr) - - # expressions that contain position-dependent functions - return :complex if calls_position_dependent_function?(predicate_expr) + # Returns false if the expression is guaranteed to be position-independent. + # Returns true if the expression might be position-dependent. + def position_dependent?(predicate_expr) + # expressions that contain position-dependent functions are position-dependent. + return true if calls_position_dependent_function?(predicate_expr) # Even if expression is followed by path steps, the analysis of # position dependency is the same as the expression itself. @@ -354,32 +350,31 @@ def position_dependency(predicate_expr) when :union, :or, :and, :eq, :neq, :lt, :lteq, :gt, :gteq, :not # Expressions that don't evaluate to a number are position independent # if it doesn't contain position-dependent functions. - nil + false when :div, :mod, :mult, :plus, :minus, :neg - # expressions that return number. eg. `[position() = @attr + 1]` - :complex + # expressions that return number. eg. `[@attr + 1]` + true when :literal - # Integer literal is handled in `position_operation(predicate_expr)`. - # We treat this as complex(no-optimization) because this is not a normal case - # eg. `/foo["hi"]` `/bar[true]` `/baz[3.14]` - :complex + # Numeric literal is position dependent. String and boolean literal is useless + # and not worth optimizing + true when :variable # A variable could resolve to a number at runtime. # It's possible to optimize this by checking the actual value of the variable. - :complex + true when :function # functions that return number is position dependent. eg. `[position() = string-length(@attr)]` - %w[number ceiling round floor string-length sum count].include?(predicate_expr[1]) ? :complex : nil + %w[number ceiling round floor string-length sum count].include?(predicate_expr[1]) when :group - position_dependency(predicate_expr[1]) + position_dependent?(predicate_expr[1]) when :descendant, :descendant_or_self, :ancestor, :ancestor_or_self, :following, :following_sibling, :preceding, :preceding_sibling, :document, :child, :self, :parent, :attribute, :namespace # paths are position independent. `foo[path[1]]` doesn't depend on the position of `foo` - nil + false else - # Every other unhandled expressions are treated as complex for safety - :complex + # Every other unhandled expressions are treated position dependent for safety + true end end @@ -563,12 +558,12 @@ def non_optimized_nodesets_select(nodesets, tester, selector) # If there are multiple position-based predicates or complex position-based predicates, # return [position_independent_predicates, nil, nil, complex_predicates] def split_positional_predicates(predicates) - pre_independent = predicates.take_while {|predicate| position_dependency(predicate).nil? } + pre_independent = predicates.take_while {|predicate| !position_dependent?(predicate) } predicates = predicates.drop(pre_independent.size) return [pre_independent, nil, [], nil] if predicates.empty? op = position_operation(predicates.first) - if op && predicates[1..-1].all? {|predicate| position_dependency(predicate).nil? } + if op && predicates[1..-1].all? {|predicate| !position_dependent?(predicate) } [pre_independent, op, predicates[1..-1], nil] else [pre_independent, nil, nil, predicates]