From 9242c969a5a4693602d512261b40764b5661aee2 Mon Sep 17 00:00:00 2001 From: Andy Date: Mon, 9 Mar 2026 00:11:56 +0300 Subject: [PATCH 1/2] fix: BoundedBacktracker span-based CanHandle + ReplaceAllStringFunc O(n) performance SearchAtWithState checked CanHandle(len(haystack)) against full haystack, rejecting valid searches when remaining span [at, len] fit within budget. LogParser on 7MB input returned 22004 matches instead of 33089. Fix: span-based visited table sizing matching Rust regex's Input span model. Visited positions stored relative to SpanStart. Full haystack preserved for zero-width assertions (\b) needing backward context. ReplaceAllStringFunc replaced O(n^2) string concatenation with strings.Builder for O(n) performance (2m19s -> 1.3s on 150K matches). Fixes #127 --- CHANGELOG.md | 21 +++++++++++ nfa/backtrack.go | 26 +++++++++++--- nfa/backtrack_getters_test.go | 68 +++++++++++++++++++++++++++++++++++ regex.go | 16 ++++----- 4 files changed, 118 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5fac3c..783c226 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120)) - SIMD prefilter for CompositeSequenceDFA (#83) +## [0.12.6] - 2026-03-08 + +### Fixed +- **BoundedBacktracker rejected valid searches on large inputs** (Issue [#127](https://github.com/coregx/coregex/issues/127)) — + `SearchAtWithState(haystack, at, state)` checked `CanHandle(len(haystack))` + against the full haystack length, rejecting inputs >2.4MB even when the + remaining search span `[at, len(haystack)]` easily fit within the visited + table budget. This caused `FindAllStringIndex` with UseNFA strategy to miss + matches in the second half of large inputs (e.g., LogParser on 7MB log files + returned 22004 matches instead of 33089). Fix: span-based visited table + sizing — `CanHandle` now checks `len(haystack) - at`, and visited positions + are stored relative to `SpanStart`. This matches Rust regex's `Input` span + model (`backtrack.rs` line 1848). Full haystack is preserved for zero-width + assertions like `\b` that need backward context. + Reported by [@kostya](https://github.com/kostya). +- **`ReplaceAllStringFunc` O(n²) performance on large inputs** — + Used `result += string` concatenation in a loop, causing quadratic memory + allocation for inputs with many matches (e.g., 150K replacements on 6MB + string took 2m19s). Fix: replaced with `strings.Builder` for O(n) performance + (now completes in ~1.3s). + ## [0.12.5] - 2026-03-08 ### Fixed diff --git a/nfa/backtrack.go b/nfa/backtrack.go index 51fecf4..d8b8352 100644 --- a/nfa/backtrack.go +++ b/nfa/backtrack.go @@ -52,6 +52,14 @@ type BacktrackerState struct { // InputLen is cached for bounds checking InputLen int + // SpanStart is the starting offset within the haystack for this search. + // The visited table is sized for the span [SpanStart, SpanStart+InputLen], + // and positions are stored relative to SpanStart. + // This follows Rust regex's Input span model: the visited table covers + // only the search span, while the full haystack remains available for + // zero-width assertions like \b that need backward context. + SpanStart int + // Longest enables leftmost-longest match semantics (POSIX/AWK compatibility). // When true, explores all branches to find the longest match instead of // returning on the first match found. @@ -116,6 +124,7 @@ func (b *BoundedBacktracker) CanHandle(haystackLen int) bool { func (b *BoundedBacktracker) reset(state *BacktrackerState, haystackLen int) { state.InputLen = haystackLen state.NumStates = b.numStates + state.SpanStart = 0 // Default: span starts at beginning of haystack // Calculate required size in entries entriesNeeded := b.numStates * (haystackLen + 1) @@ -147,8 +156,10 @@ func (b *BoundedBacktracker) reset(state *BacktrackerState, haystackLen int) { // Layout: Visited[pos * numStates + state] provides cache locality when // checking multiple states at the same position (common in epsilon traversal). func (b *BoundedBacktracker) shouldVisit(s *BacktrackerState, state StateID, pos int) bool { - // Calculate index: pos * numStates + state (cache-friendly layout) - idx := pos*s.NumStates + int(state) + // Calculate index using position relative to span start. + // This matches Rust regex's visited.insert(sid, at - input.start()). + relPos := pos - s.SpanStart + idx := relPos*s.NumStates + int(state) // Check if visited in current generation if s.Visited[idx] == s.Generation { @@ -228,11 +239,18 @@ func (b *BoundedBacktracker) SearchAt(haystack []byte, at int) (int, int, bool) // Returns (start, end, true) if found, (-1, -1, false) otherwise. // This method uses external state and IS thread-safe when each goroutine uses its own state. func (b *BoundedBacktracker) SearchAtWithState(haystack []byte, at int, state *BacktrackerState) (int, int, bool) { - if !b.CanHandle(len(haystack)) { + // Use span-based sizing: only the remaining portion [at, len(haystack)] + // needs visited table entries. This matches Rust regex's Input span model + // (backtrack.rs line 1848: haylen = input.get_span().len()). + // The full haystack is kept for byte matching so zero-width assertions + // like \b can see context before 'at'. + spanLen := len(haystack) - at + if !b.CanHandle(spanLen) { return -1, -1, false } - b.reset(state, len(haystack)) + b.reset(state, spanLen) + state.SpanStart = at // Visited table positions are relative to this offset // Try to match starting at each position from 'at' for startPos := at; startPos <= len(haystack); startPos++ { diff --git a/nfa/backtrack_getters_test.go b/nfa/backtrack_getters_test.go index 242fc02..b91c85a 100644 --- a/nfa/backtrack_getters_test.go +++ b/nfa/backtrack_getters_test.go @@ -274,6 +274,74 @@ func TestBoundedBacktracker_SearchAtWithState(t *testing.T) { } } +// TestBoundedBacktracker_SearchAtWithState_SpanBased tests the span-based +// CanHandle fix. When the full haystack exceeds CanHandle but the span +// [at, len(haystack)] fits, SearchAtWithState should still find matches. +// This is the bug that caused LogParser to miss matches on 7MB inputs. +func TestBoundedBacktracker_SearchAtWithState_SpanBased(t *testing.T) { + nfa := compileNFAForTest(`\d+`) + bt := NewBoundedBacktracker(nfa) + state := NewBacktrackerState() + + // Create a haystack where full length exceeds CanHandle + // but the remaining span from 'at' fits. + maxInput := bt.MaxInputSize() + haystackLen := maxInput + 1000 // Exceeds CanHandle for full haystack + + haystack := make([]byte, haystackLen) + for i := range haystack { + haystack[i] = 'x' // Fill with non-matching bytes + } + // Place digits near the end (within the span that fits) + copy(haystack[haystackLen-10:], []byte("abc123def!")) + + // Full haystack should NOT be handleable + if bt.CanHandle(haystackLen) { + t.Skip("haystack fits entirely, can't test span-based behavior") + } + + // But searching from a position near the end should work + at := haystackLen - 500 // Remaining span is 500 bytes — easily fits + if !bt.CanHandle(haystackLen - at) { + t.Fatal("span should be handleable") + } + + start, end, found := bt.SearchAtWithState(haystack, at, state) + if !found { + t.Fatal("SearchAtWithState should find digits near end of large haystack") + } + if start != haystackLen-7 || end != haystackLen-4 { + t.Errorf("SearchAtWithState = (%d, %d), want (%d, %d)", start, end, haystackLen-7, haystackLen-4) + } +} + +// TestBoundedBacktracker_SearchAtWithState_WordBoundary verifies that +// span-based visited sizing preserves full haystack context for \b. +func TestBoundedBacktracker_SearchAtWithState_WordBoundary(t *testing.T) { + nfa := compileNFAForTest(`\bfoo\b`) + bt := NewBoundedBacktracker(nfa) + state := NewBacktrackerState() + + haystack := []byte("hello foo bar") + + // Search from position 5 (the space before "foo") + start, end, found := bt.SearchAtWithState(haystack, 5, state) + if !found { + t.Fatal("SearchAtWithState should find \\bfoo\\b when starting from position 5") + } + if start != 6 || end != 9 { + t.Errorf("SearchAtWithState = (%d, %d), want (6, 9)", start, end) + } + + // Search from position 7 (inside "foo") — \b at position 6 needs to see + // the space at position 5, which is before 'at'. The full haystack + // must be preserved for this to work. + start2, end2, found2 := bt.SearchAtWithState(haystack, 7, state) + if found2 { + t.Errorf("SearchAtWithState from inside 'foo' should not find \\bfoo\\b, got (%d, %d)", start2, end2) + } +} + func TestBoundedBacktracker_LargeInputNotHandled(t *testing.T) { nfa := compileNFAForTest(`\w+`) bt := NewBoundedBacktracker(nfa) diff --git a/regex.go b/regex.go index a89ad21..0a86eb0 100644 --- a/regex.go +++ b/regex.go @@ -48,6 +48,7 @@ package coregex import ( "io" "regexp/syntax" + "strings" "unsafe" "github.com/coregx/coregex/meta" @@ -1150,21 +1151,18 @@ func (r *Regex) ReplaceAllStringFunc(src string, repl func(string) string) strin return src } - var result string + var buf strings.Builder + buf.Grow(len(src)) lastEnd := 0 for _, idx := range indices { - // Append text before match - result += src[lastEnd:idx[0]] - // Apply replacement function - replacement := repl(src[idx[0]:idx[1]]) - result += replacement + buf.WriteString(src[lastEnd:idx[0]]) + buf.WriteString(repl(src[idx[0]:idx[1]])) lastEnd = idx[1] } - // Append remaining text - result += src[lastEnd:] - return result + buf.WriteString(src[lastEnd:]) + return buf.String() } // Split slices s into substrings separated by the expression and returns a slice From 84504e74a10362432b1091ec63ab1768b36f7e35 Mon Sep 17 00:00:00 2001 From: Andy Date: Mon, 9 Mar 2026 00:15:52 +0300 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20lint=20=E2=80=94=20simplify=20[]byte?= =?UTF-8?q?=20to=20string=20in=20copy()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nfa/backtrack_getters_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nfa/backtrack_getters_test.go b/nfa/backtrack_getters_test.go index b91c85a..c3202f1 100644 --- a/nfa/backtrack_getters_test.go +++ b/nfa/backtrack_getters_test.go @@ -293,7 +293,7 @@ func TestBoundedBacktracker_SearchAtWithState_SpanBased(t *testing.T) { haystack[i] = 'x' // Fill with non-matching bytes } // Place digits near the end (within the span that fits) - copy(haystack[haystackLen-10:], []byte("abc123def!")) + copy(haystack[haystackLen-10:], "abc123def!") // Full haystack should NOT be handleable if bt.CanHandle(haystackLen) {