Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120))
- SIMD prefilter for CompositeSequenceDFA (#83)

## [0.12.6] - 2026-03-08

### Fixed
- **BoundedBacktracker rejected valid searches on large inputs** (Issue [#127](https://github.com/coregx/coregex/issues/127)) —
`SearchAtWithState(haystack, at, state)` checked `CanHandle(len(haystack))`
against the full haystack length, rejecting inputs >2.4MB even when the
remaining search span `[at, len(haystack)]` easily fit within the visited
table budget. This caused `FindAllStringIndex` with UseNFA strategy to miss
matches in the second half of large inputs (e.g., LogParser on 7MB log files
returned 22004 matches instead of 33089). Fix: span-based visited table
sizing — `CanHandle` now checks `len(haystack) - at`, and visited positions
are stored relative to `SpanStart`. This matches Rust regex's `Input` span
model (`backtrack.rs` line 1848). Full haystack is preserved for zero-width
assertions like `\b` that need backward context.
Reported by [@kostya](https://github.com/kostya).
- **`ReplaceAllStringFunc` O(n²) performance on large inputs** —
The previous implementation built its result with `result += string`
concatenation in a loop, which re-copies the accumulated string on every
append and is therefore quadratic for inputs with many matches (e.g., 150K
replacements on a 6MB string took 2m19s). Fix: replaced with
`strings.Builder` for O(n) performance (now completes in ~1.3s).

## [0.12.5] - 2026-03-08

### Fixed
Expand Down
26 changes: 22 additions & 4 deletions nfa/backtrack.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ type BacktrackerState struct {
// InputLen is cached for bounds checking
InputLen int

// SpanStart is the starting offset within the haystack for this search.
// The visited table is sized for the span [SpanStart, SpanStart+InputLen],
// and positions are stored relative to SpanStart.
// This follows Rust regex's Input span model: the visited table covers
// only the search span, while the full haystack remains available for
// zero-width assertions like \b that need backward context.
SpanStart int

// Longest enables leftmost-longest match semantics (POSIX/AWK compatibility).
// When true, explores all branches to find the longest match instead of
// returning on the first match found.
Expand Down Expand Up @@ -116,6 +124,7 @@ func (b *BoundedBacktracker) CanHandle(haystackLen int) bool {
func (b *BoundedBacktracker) reset(state *BacktrackerState, haystackLen int) {
state.InputLen = haystackLen
state.NumStates = b.numStates
state.SpanStart = 0 // Default: span starts at beginning of haystack

// Calculate required size in entries
entriesNeeded := b.numStates * (haystackLen + 1)
Expand Down Expand Up @@ -147,8 +156,10 @@ func (b *BoundedBacktracker) reset(state *BacktrackerState, haystackLen int) {
// Layout: Visited[pos * numStates + state] provides cache locality when
// checking multiple states at the same position (common in epsilon traversal).
func (b *BoundedBacktracker) shouldVisit(s *BacktrackerState, state StateID, pos int) bool {
// Calculate index: pos * numStates + state (cache-friendly layout)
idx := pos*s.NumStates + int(state)
// Calculate index using position relative to span start.
// This matches Rust regex's visited.insert(sid, at - input.start()).
relPos := pos - s.SpanStart
idx := relPos*s.NumStates + int(state)

// Check if visited in current generation
if s.Visited[idx] == s.Generation {
Expand Down Expand Up @@ -228,11 +239,18 @@ func (b *BoundedBacktracker) SearchAt(haystack []byte, at int) (int, int, bool)
// Returns (start, end, true) if found, (-1, -1, false) otherwise.
// This method uses external state and IS thread-safe when each goroutine uses its own state.
func (b *BoundedBacktracker) SearchAtWithState(haystack []byte, at int, state *BacktrackerState) (int, int, bool) {
if !b.CanHandle(len(haystack)) {
// Use span-based sizing: only the remaining portion [at, len(haystack)]
// needs visited table entries. This matches Rust regex's Input span model
// (backtrack.rs line 1848: haylen = input.get_span().len()).
// The full haystack is kept for byte matching so zero-width assertions
// like \b can see context before 'at'.
spanLen := len(haystack) - at
if !b.CanHandle(spanLen) {
return -1, -1, false
}

b.reset(state, len(haystack))
b.reset(state, spanLen)
state.SpanStart = at // Visited table positions are relative to this offset

// Try to match starting at each position from 'at'
for startPos := at; startPos <= len(haystack); startPos++ {
Expand Down
68 changes: 68 additions & 0 deletions nfa/backtrack_getters_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,74 @@ func TestBoundedBacktracker_SearchAtWithState(t *testing.T) {
}
}

// TestBoundedBacktracker_SearchAtWithState_SpanBased tests the span-based
// CanHandle fix. When the full haystack exceeds CanHandle but the span
// [at, len(haystack)] fits, SearchAtWithState should still find matches.
// This is the bug that caused LogParser to miss matches on 7MB inputs.
func TestBoundedBacktracker_SearchAtWithState_SpanBased(t *testing.T) {
nfa := compileNFAForTest(`\d+`)
bt := NewBoundedBacktracker(nfa)
state := NewBacktrackerState()

// Create a haystack where full length exceeds CanHandle
// but the remaining span from 'at' fits.
maxInput := bt.MaxInputSize()
haystackLen := maxInput + 1000 // Exceeds CanHandle for full haystack

haystack := make([]byte, haystackLen)
for i := range haystack {
haystack[i] = 'x' // Fill with non-matching bytes
}
// Place digits near the end (within the span that fits)
copy(haystack[haystackLen-10:], "abc123def!")

// Full haystack should NOT be handleable
if bt.CanHandle(haystackLen) {
t.Skip("haystack fits entirely, can't test span-based behavior")
}

// But searching from a position near the end should work
at := haystackLen - 500 // Remaining span is 500 bytes — easily fits
if !bt.CanHandle(haystackLen - at) {
t.Fatal("span should be handleable")
}

start, end, found := bt.SearchAtWithState(haystack, at, state)
if !found {
t.Fatal("SearchAtWithState should find digits near end of large haystack")
}
if start != haystackLen-7 || end != haystackLen-4 {
t.Errorf("SearchAtWithState = (%d, %d), want (%d, %d)", start, end, haystackLen-7, haystackLen-4)
}
}

// TestBoundedBacktracker_SearchAtWithState_WordBoundary verifies that
// span-based visited sizing preserves full haystack context for \b.
//
// Three scenarios are covered:
//  1. A search starting on the space before "foo" finds the word.
//  2. A search starting inside "foo" finds nothing (no full "foo" remains).
//  3. A search starting exactly on an "foo" that is glued to a preceding word
//     character finds nothing. This is the case that actually depends on
//     look-behind: the only thing preventing a match is the byte before 'at'.
func TestBoundedBacktracker_SearchAtWithState_WordBoundary(t *testing.T) {
	nfa := compileNFAForTest(`\bfoo\b`)
	bt := NewBoundedBacktracker(nfa)
	state := NewBacktrackerState()

	haystack := []byte("hello foo bar")

	// Search from position 5 (the space before "foo").
	start, end, found := bt.SearchAtWithState(haystack, 5, state)
	if !found {
		t.Fatal("SearchAtWithState should find \\bfoo\\b when starting from position 5")
	}
	if start != 6 || end != 9 {
		t.Errorf("SearchAtWithState = (%d, %d), want (6, 9)", start, end)
	}

	// Search from position 7 (inside "foo"). The remaining text "oo bar"
	// contains no complete "foo", so no match may be reported. This is a
	// sanity check on the start offset; it does not depend on look-behind.
	start2, end2, found2 := bt.SearchAtWithState(haystack, 7, state)
	if found2 {
		t.Errorf("SearchAtWithState from inside 'foo' should not find \\bfoo\\b, got (%d, %d)", start2, end2)
	}

	// Look-behind case: "foo" begins exactly at 'at' but is preceded by the
	// word character 'o' at position 4. The leading \b must consult that
	// byte, which lies before the span, and reject the match. If the search
	// treated 'at' as the start of text, position 5 would look like a word
	// boundary and (5, 8) would be reported incorrectly.
	glued := []byte("hellofoo bar")
	start3, end3, found3 := bt.SearchAtWithState(glued, 5, state)
	if found3 {
		t.Errorf("SearchAtWithState on %q from 5 should not find \\bfoo\\b (preceded by a word byte), got (%d, %d)", glued, start3, end3)
	}
}

func TestBoundedBacktracker_LargeInputNotHandled(t *testing.T) {
nfa := compileNFAForTest(`\w+`)
bt := NewBoundedBacktracker(nfa)
Expand Down
16 changes: 7 additions & 9 deletions regex.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ package coregex
import (
"io"
"regexp/syntax"
"strings"
"unsafe"

"github.com/coregx/coregex/meta"
Expand Down Expand Up @@ -1150,21 +1151,18 @@ func (r *Regex) ReplaceAllStringFunc(src string, repl func(string) string) strin
return src
}

var result string
var buf strings.Builder
buf.Grow(len(src))
lastEnd := 0

for _, idx := range indices {
// Append text before match
result += src[lastEnd:idx[0]]
// Apply replacement function
replacement := repl(src[idx[0]:idx[1]])
result += replacement
buf.WriteString(src[lastEnd:idx[0]])
buf.WriteString(repl(src[idx[0]:idx[1]]))
lastEnd = idx[1]
}

// Append remaining text
result += src[lastEnd:]
return result
buf.WriteString(src[lastEnd:])
return buf.String()
}

// Split slices s into substrings separated by the expression and returns a slice
Expand Down
Loading