From 72c7ae196da7879d1d85f1010b31656c189d886e Mon Sep 17 00:00:00 2001 From: slow-J <32519034+slow-J@users.noreply.github.com> Date: Wed, 10 Jun 2026 17:58:14 +0100 Subject: [PATCH] GraphTokenStreamFiniteStrings#articulationPoints replacing recursion with iterative approach, removing the 1000-depth limit Reimplementing Tarjan's articulation points as an interative DFS using a IntArrayList stack. Output is unchanged. --- lucene/CHANGES.txt | 3 +- .../graph/GraphTokenStreamFiniteStrings.java | 105 ++++++++++-------- .../TestGraphTokenStreamFiniteStrings.java | 9 +- 3 files changed, 66 insertions(+), 51 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 42a6e8073576..f95c5e3ccc13 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -287,7 +287,8 @@ New Features Improvements --------------------- -(No changes) +* GITHUB#16239: GraphTokenStreamFiniteStrings#articulationPoints now uses an iterative + approach instead of recursion, removing the previous MAX_RECURSION_LEVEL of 1000. (Jakub Slowinski) Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java index fa5895e30085..2f3d02627df5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java +++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java @@ -45,9 +45,6 @@ * different paths of the {@link Automaton}. */ public final class GraphTokenStreamFiniteStrings { - /** Maximum level of recursion allowed in recursive operations. */ - private static final int MAX_RECURSION_LEVEL = 1000; - private AttributeSource[] tokens = new AttributeSource[4]; private final Automaton det; private final Transition transition = new Transition(); @@ -176,15 +173,7 @@ public int[] articulationPoints() { undirect.addTransition(transition.dest, i, transition.min); } } - int numStates = det.getNumStates(); - BitSet visited = new BitSet(numStates); - int[] depth = new int[det.getNumStates()]; - int[] low = new int[det.getNumStates()]; - int[] parent = new int[det.getNumStates()]; - Arrays.fill(parent, -1); - IntArrayList points = new IntArrayList(); - articulationPointsRecurse(undirect.finish(), 0, 0, depth, low, parent, visited, points); - return points.reverse().toArray(); + return articulationPointsIterative(undirect.finish()).reverse().toArray(); } /** Build an automaton from the provided {@link TokenStream}. */ @@ -252,43 +241,67 @@ private Automaton build(final TokenStream in) throws IOException { return builder.finish(); } - private static void articulationPointsRecurse( - Automaton a, - int state, - int d, - int[] depth, - int[] low, - int[] parent, - BitSet visited, - IntArrayList points) { - visited.set(state); - depth[state] = d; - low[state] = d; - int childCount = 0; - boolean isArticulation = false; - Transition t = new Transition(); - int numT = a.initTransition(state, t); - for (int i = 0; i < numT; i++) { - a.getNextTransition(t); - if (visited.get(t.dest) == false) { - parent[t.dest] = state; - if (d < MAX_RECURSION_LEVEL) { - articulationPointsRecurse(a, t.dest, d + 1, depth, low, parent, visited, points); - } else { - throw new IllegalArgumentException( - "Exceeded maximum recursion level during graph analysis"); + /** + * Iterative (IntArrayList stack) implementation of Tarjan's articulation points DFS, so that deep + * undirected graphs can be analyzed without hitting a {@link StackOverflowError}. Each state + * (vertex) is pushed at most once over the whole traversal. {@code transitionIndex[state]} holds + * how many of that state's transitions (edges) we've already traversed. We resume at the next one + * after exploring the current child and all its descendants. + */ + private static IntArrayList articulationPointsIterative(Automaton a) { + final int numStates = a.getNumStates(); + final int[] depth = new int[numStates]; + final int[] low = new int[numStates]; + final int[] parent = new int[numStates]; + Arrays.fill(parent, -1); + final int[] transitionIndex = new int[numStates]; + final BitSet visited = new BitSet(numStates); + final BitSet isArticulation = new BitSet(numStates); + final IntArrayList stack = new IntArrayList(); + final IntArrayList points = new IntArrayList(); + final Transition t = new Transition(); + + // The DFS starts at the root (state 0). depth[0] and low[0] are by default set to 0. + stack.add(0); + visited.set(0); + // The root vertex must be handled separately: it is a cut vertex if and only if it has at least + // two children in the DFS tree. + int rootChildCount = 0; + + while (stack.isEmpty() == false) { + int state = stack.get(stack.size() - 1); + int i = transitionIndex[state]; + if (i < a.getNumTransitions(state)) { + transitionIndex[state] = i + 1; + a.getTransition(state, i, t); + if (visited.get(t.dest) == false) { + visited.set(t.dest); + parent[t.dest] = state; + depth[t.dest] = low[t.dest] = depth[state] + 1; + if (state == 0) { + rootChildCount++; + } + stack.add(t.dest); + } else if (t.dest != parent[state]) { + // backedge. + low[state] = Math.min(low[state], depth[t.dest]); + } + } else { + // Current state has been fully explored. + stack.removeLast(); + // Only root (state 0) has no parent (-1). + int par = parent[state]; + if (par != -1) { + low[par] = Math.min(low[par], low[state]); + if (low[state] >= depth[par]) { + isArticulation.set(par); + } } - childCount++; - if (low[t.dest] >= depth[state]) { - isArticulation = true; + if ((par != -1 && isArticulation.get(state)) || (par == -1 && rootChildCount > 1)) { + points.add(state); } - low[state] = Math.min(low[state], low[t.dest]); - } else if (t.dest != parent[state]) { - low[state] = Math.min(low[state], depth[t.dest]); } } - if ((parent[state] != -1 && isArticulation) || (parent[state] == -1 && childCount > 1)) { - points.add(state); - } + return points; } } diff --git a/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java index 77875bc5038c..ad079d379f7c 100644 --- a/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java +++ b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java @@ -662,7 +662,7 @@ public void testMultipleSidePathsWithGaps() throws Exception { assertFalse(it.hasNext()); } - public void testLongTokenStreamStackOverflowError() throws Exception { + public void testLongTokenStreamDoesNotOverflow() throws Exception { ArrayList tokens = new ArrayList<>() { @@ -674,14 +674,15 @@ public void testLongTokenStreamStackOverflowError() throws Exception { } }; - // Add in too many tokens to get a high depth graph - for (int i = 0; i < 1024 + 1; i++) { + // Add in many tokens to get a very deep graph. + for (int i = 0; i < 50_000; i++) { tokens.add(token("network", 1, 1)); } TokenStream ts = new CannedTokenStream(tokens.toArray(Token[]::new)); GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts); - assertThrows(IllegalArgumentException.class, graph::articulationPoints); + int[] points = graph.articulationPoints(); + assertEquals(50_001, points.length); } }