From 8c37cb91df5d81baafccfc0cbf1a33b865d52b34 Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Tue, 10 Mar 2026 20:19:25 +0300
Subject: [PATCH 1/7] =?UTF-8?q?perf:=20streaming=20ReplaceAll=20=E2=80=94?=
 =?UTF-8?q?=20single-pass=20without=20[][]int=20allocation=20(#135)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert ReplaceAllStringFunc, ReplaceAllFunc, ReplaceAllLiteral,
ReplaceAllLiteralString from two-pass (collect indices then replace)
to single-pass streaming. Eliminates [][]int allocation for
high-match-count inputs. Returns original string when no matches
(Cow-like optimization).
---
 regex.go | 258 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 205 insertions(+), 53 deletions(-)

diff --git a/regex.go b/regex.go
index 0a86eb0..4795229 100644
--- a/regex.go
+++ b/regex.go
@@ -841,38 +841,64 @@ func (r *Regex) FindAllStringIndex(s string, n int) [][]int {
 //	result := re.ReplaceAllLiteral([]byte("age: 42"), []byte("XX"))
 //	// result = []byte("age: XX")
 func (r *Regex) ReplaceAllLiteral(src, repl []byte) []byte {
-	indices := r.FindAllIndex(src, -1)
-	if len(indices) == 0 {
-		// No matches, return copy of src
-		result := make([]byte, len(src))
-		copy(result, src)
-		return result
-	}
+	var result []byte
+	lastEnd := 0
+	pos := 0
+	lastMatchEnd := -1
+	matched := false
 
-	// Pre-allocate result buffer
-	// Estimate: len(src) + (len(repl)-avgMatchLen)*numMatches
-	totalMatchLen := 0
-	for _, idx := range indices {
-		totalMatchLen += idx[1] - idx[0]
-	}
-	avgMatchLen := totalMatchLen / len(indices)
-	estimatedLen := len(src) + (len(repl)-avgMatchLen)*len(indices)
-	if estimatedLen < 0 {
-		estimatedLen = len(src)
-	}
+	for {
+		start, end, found := r.engine.FindIndicesAt(src, pos)
+		if !found {
+			break
+		}
 
-	result := make([]byte, 0, estimatedLen)
-	lastEnd := 0
+		// Skip empty matches at the position where a non-empty match just ended.
+		// This matches Go stdlib behavior (see FindAllIndex for details).
+		//nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd
+		if start == end && start == lastMatchEnd {
+			pos++
+			if pos > len(src) {
+				break
+			}
+			continue
+		}
 
-	for _, idx := range indices {
-		// Append text before match
-		result = append(result, src[lastEnd:idx[0]]...)
-		// Append replacement
+		if !matched {
+			// Lazy allocation on first match
+			result = make([]byte, 0, len(src))
+			matched = true
+		}
+
+		result = append(result, src[lastEnd:start]...)
 		result = append(result, repl...)
-		lastEnd = idx[1]
+		lastEnd = end
+
+		if start != end {
+			lastMatchEnd = end
+		}
+
+		switch {
+		case start == end:
+			pos = end + 1
+		case end > pos:
+			pos = end
+		default:
+			pos++
+		}
+
+		if pos > len(src) {
+			break
+		}
+	}
+
+	if !matched {
+		// No matches: return a copy of src (stdlib compatibility)
+		out := make([]byte, len(src))
+		copy(out, src)
+		return out
 	}
 
-	// Append remaining text
 	result = append(result, src[lastEnd:]...)
 	return result
 }
@@ -887,7 +913,61 @@ func (r *Regex) ReplaceAllLiteral(src, repl []byte) []byte {
 //	result := re.ReplaceAllLiteralString("age: 42", "XX")
 //	// result = "age: XX"
 func (r *Regex) ReplaceAllLiteralString(src, repl string) string {
-	return string(r.ReplaceAllLiteral([]byte(src), []byte(repl)))
+	b := stringToBytes(src)
+	var buf strings.Builder
+	lastEnd := 0
+	pos := 0
+	lastMatchEnd := -1
+	matched := false
+
+	for {
+		start, end, found := r.engine.FindIndicesAt(b, pos)
+		if !found {
+			break
+		}
+
+		//nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd
+		if start == end && start == lastMatchEnd {
+			pos++
+			if pos > len(src) {
+				break
+			}
+			continue
+		}
+
+		if !matched {
+			buf.Grow(len(src))
+			matched = true
+		}
+
+		buf.WriteString(src[lastEnd:start])
+		buf.WriteString(repl)
+		lastEnd = end
+
+		if start != end {
+			lastMatchEnd = end
+		}
+
+		switch {
+		case start == end:
+			pos = end + 1
+		case end > pos:
+			pos = end
+		default:
+			pos++
+		}
+
+		if pos > len(src) {
+			break
+		}
+	}
+
+	if !matched {
+		return src
+	}
+
+	buf.WriteString(src[lastEnd:])
+	return buf.String()
 }
 
 // Expand appends template to dst and returns the result; during the
@@ -1107,27 +1187,61 @@ func (r *Regex) ReplaceAllString(src, repl string) string {
 //	})
 //	// result = []byte("2 4 6")
 func (r *Regex) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
-	indices := r.FindAllIndex(src, -1)
-	if len(indices) == 0 {
-		// No matches, return copy of src
-		result := make([]byte, len(src))
-		copy(result, src)
-		return result
-	}
-
 	var result []byte
 	lastEnd := 0
+	pos := 0
+	lastMatchEnd := -1
+	matched := false
 
-	for _, idx := range indices {
-		// Append text before match
-		result = append(result, src[lastEnd:idx[0]]...)
-		// Apply replacement function
-		replacement := repl(src[idx[0]:idx[1]])
-		result = append(result, replacement...)
-		lastEnd = idx[1]
+	for {
+		start, end, found := r.engine.FindIndicesAt(src, pos)
+		if !found {
+			break
+		}
+
+		//nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd
+		if start == end && start == lastMatchEnd {
+			pos++
+			if pos > len(src) {
+				break
+			}
+			continue
+		}
+
+		if !matched {
+			result = make([]byte, 0, len(src))
+			matched = true
+		}
+
+		result = append(result, src[lastEnd:start]...)
+		result = append(result, repl(src[start:end])...)
+		lastEnd = end
+
+		if start != end {
+			lastMatchEnd = end
+		}
+
+		switch {
+		case start == end:
+			pos = end + 1
+		case end > pos:
+			pos = end
+		default:
+			pos++
+		}
+
+		if pos > len(src) {
+			break
+		}
+	}
+
+	if !matched {
+		// No matches: return a copy of src (stdlib compatibility)
+		out := make([]byte, len(src))
+		copy(out, src)
+		return out
 	}
 
-	// Append remaining text
 	result = append(result, src[lastEnd:]...)
 	return result
 }
@@ -1146,19 +1260,57 @@ func (r *Regex) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
 //	})
 //	// result = "2 4 6"
 func (r *Regex) ReplaceAllStringFunc(src string, repl func(string) string) string {
-	indices := r.FindAllStringIndex(src, -1)
-	if len(indices) == 0 {
-		return src
-	}
-
+	b := stringToBytes(src)
 	var buf strings.Builder
-	buf.Grow(len(src))
 	lastEnd := 0
+	pos := 0
+	lastMatchEnd := -1
+	matched := false
 
-	for _, idx := range indices {
-		buf.WriteString(src[lastEnd:idx[0]])
-		buf.WriteString(repl(src[idx[0]:idx[1]]))
-		lastEnd = idx[1]
+	for {
+		start, end, found := r.engine.FindIndicesAt(b, pos)
+		if !found {
+			break
+		}
+
+		//nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd
+		if start == end && start == lastMatchEnd {
+			pos++
+			if pos > len(src) {
+				break
+			}
+			continue
+		}
+
+		if !matched {
+			buf.Grow(len(src))
+			matched = true
+		}
+
+		buf.WriteString(src[lastEnd:start])
+		buf.WriteString(repl(src[start:end]))
+		lastEnd = end
+
+		if start != end {
+			lastMatchEnd = end
+		}
+
+		switch {
+		case start == end:
+			pos = end + 1
+		case end > pos:
+			pos = end
+		default:
+			pos++
+		}
+
+		if pos > len(src) {
+			break
+		}
+	}
+
+	if !matched {
+		return src
 	}
 
 	buf.WriteString(src[lastEnd:])

From daa5b6ad88b40f9a69f89be8b147f4ea2eddb350 Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Tue, 10 Mar 2026 20:23:56 +0300
Subject: [PATCH 2/7] =?UTF-8?q?perf:=20DFA-first=20FindSubmatchAt=20?=
 =?UTF-8?q?=E2=80=94=20PikeVM=20on=20match=20span=20only=20(#135)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement Rust-style two-phase search for capture extraction:
Phase 1: DFA/strategy finds match boundaries [start, end]
Phase 2: PikeVM runs anchored within [start..end] for captures

Add SearchWithCapturesInSpan to PikeVM.
Reduces PikeVM work from O(remaining_haystack) to O(match_len)
per match. For 50K matches on 10MB: ~400x less PikeVM work.

Also optimize: skip PikeVM entirely when CaptureCount <= 1
(only group 0 needed — DFA result already provides boundaries).

Rewrite FindAllSubmatch to use FindSubmatchAt internally,
benefiting from the same two-phase optimization.
---
 meta/findall.go | 91 ++++++++++++++++++++++++++++++++++++-------------
 nfa/pikevm.go   | 91 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+), 23 deletions(-)

diff --git a/meta/findall.go b/meta/findall.go
index 10e109d..34744d1 100644
--- a/meta/findall.go
+++ b/meta/findall.go
@@ -38,12 +38,25 @@ func (e *Engine) FindSubmatch(haystack []byte) *MatchWithCaptures {
 // This method is used by ReplaceAll* operations to correctly handle anchors like ^.
 // Unlike FindSubmatch, it takes the FULL haystack and a starting position.
 // Thread-safe: uses pooled state for both OnePass cache and PikeVM.
+//
+// Two-phase search (Rust-style optimization):
+//
+//	Phase 1: DFA/strategy finds match boundaries [start, end] — O(n) fast scan
+//	Phase 2: PikeVM extracts captures within [start, end] — O(match_len)
+//
+// This reduces PikeVM work from O(remaining_haystack) to O(match_len) per match.
+// For 50K matches on 10MB input: ~400x less PikeVM work.
 func (e *Engine) FindSubmatchAt(haystack []byte, at int) *MatchWithCaptures {
+	if at > len(haystack) {
+		return nil
+	}
+
 	// Get pooled state first for thread-safe access
 	state := e.getSearchState()
 	defer e.putSearchState(state)
 
-	// For position 0, try OnePass DFA if available (10-20x faster for anchored patterns)
+	// For position 0, try OnePass DFA if available (10-20x faster for anchored patterns).
+	// OnePass handles captures natively — no need for two-phase search.
 	if at == 0 && e.onepass != nil && state.onepassCache != nil {
 		atomic.AddUint64(&e.stats.OnePassSearches, 1)
 		slots := e.onepass.Search(haystack, state.onepassCache)
@@ -53,14 +66,36 @@ func (e *Engine) FindSubmatchAt(haystack []byte, at int) *MatchWithCaptures {
 			return NewMatchWithCaptures(haystack, captures)
 		}
 		// OnePass failed (input doesn't match from position 0)
-		// Fall through to PikeVM which can find match anywhere
+		// Fall through to two-phase search
 	}
 
-	atomic.AddUint64(&e.stats.NFASearches, 1)
+	// Phase 1: Use DFA/strategy to find match boundaries.
+	// This is the fast O(n) scan that locates [start, end] without captures.
+	start, end, found := e.findIndicesAtWithState(haystack, at, state)
+	if !found {
+		return nil
+	}
+
+	// Optimization: if only group 0 is needed (no sub-captures), skip PikeVM.
+	// The DFA result already provides exact [start, end] boundaries.
+	if e.nfa.CaptureCount() <= 1 {
+		captures := [][]int{{start, end}}
+		return NewMatchWithCaptures(haystack, captures)
+	}
 
-	nfaMatch := state.pikevm.SearchWithCapturesAt(haystack, at)
+	// Phase 2: PikeVM extracts captures within the narrow [start, end] span.
+	// The full haystack is passed for lookbehind context (\b at span boundary),
+	// but PikeVM only processes bytes within [start, end].
+	atomic.AddUint64(&e.stats.NFASearches, 1)
+	nfaMatch := state.pikevm.SearchWithCapturesInSpan(haystack, start, end)
 	if nfaMatch == nil {
-		return nil
+		// Defensive fallback: DFA found a match but PikeVM disagrees.
+		// This should not happen with correct DFA boundaries, but fall back
+		// to full PikeVM search for safety.
+		nfaMatch = state.pikevm.SearchWithCapturesAt(haystack, at)
+		if nfaMatch == nil {
+			return nil
+		}
 	}
 
 	return NewMatchWithCaptures(haystack, nfaMatch.Captures)
@@ -253,6 +288,10 @@ func (e *Engine) Count(haystack []byte, n int) int {
 // FindAllSubmatch returns all successive matches with capture group information.
 // If n > 0, returns at most n matches. If n <= 0, returns all matches.
 //
+// Uses DFA-first two-phase search: DFA finds match boundaries, then PikeVM
+// extracts captures within the narrow match span. This reduces PikeVM work
+// from O(remaining_haystack) to O(match_len) per match.
+//
 // Example:
 //
 //	engine, _ := meta.Compile(`(\w+)@(\w+)\.(\w+)`)
@@ -265,35 +304,41 @@ func (e *Engine) FindAllSubmatch(haystack []byte, n int) []*MatchWithCaptures {
 
 	var matches []*MatchWithCaptures
 	pos := 0
+	lastMatchEnd := -1
 
 	for pos <= len(haystack) {
-		// Use PikeVM for capture extraction
-		atomic.AddUint64(&e.stats.NFASearches, 1)
-		nfaMatch := e.pikevm.SearchWithCaptures(haystack[pos:])
-		if nfaMatch == nil {
+		match := e.FindSubmatchAt(haystack, pos)
+		if match == nil {
 			break
 		}
 
-		// Adjust captures to absolute positions
-		// Captures is [][]int where each element is [start, end] for a group
-		adjustedCaptures := make([][]int, len(nfaMatch.Captures))
-		for i, cap := range nfaMatch.Captures {
-			if len(cap) >= 2 && cap[0] >= 0 {
-				adjustedCaptures[i] = []int{pos + cap[0], pos + cap[1]}
-			} else {
-				adjustedCaptures[i] = nil // Unmatched group
+		matchStart := match.Start()
+		matchEnd := match.End()
+
+		// Skip empty matches at the end of previous non-empty match (stdlib behavior)
+		//nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd
+		if matchStart == matchEnd && matchStart == lastMatchEnd {
+			pos++
+			if pos > len(haystack) {
+				break
 			}
+			continue
 		}
 
-		match := NewMatchWithCaptures(haystack, adjustedCaptures)
 		matches = append(matches, match)
 
+		// Track non-empty match ends for the skip rule
+		if matchStart != matchEnd {
+			lastMatchEnd = matchEnd
+		}
+
 		// Move position past this match
-		end := nfaMatch.End
-		if end > 0 {
-			pos += end
-		} else {
-			// Empty match: advance by 1 to avoid infinite loop
+		switch {
+		case matchStart == matchEnd:
+			pos = matchEnd + 1
+		case matchEnd > pos:
+			pos = matchEnd
+		default:
 			pos++
 		}
 
diff --git a/nfa/pikevm.go b/nfa/pikevm.go
index 7e5e13d..557a637 100644
--- a/nfa/pikevm.go
+++ b/nfa/pikevm.go
@@ -1085,6 +1085,97 @@ func (p *PikeVM) searchAtWithCaptures(haystack []byte, startPos int) *MatchWithC
 	return nil
 }
 
+// SearchWithCapturesInSpan searches for a match anchored at spanStart,
+// not exceeding spanEnd. The full haystack is preserved for lookbehind
+// context (e.g., \b word boundary assertions at spanStart-1).
+//
+// This implements Phase 2 of the DFA-first two-phase search:
+//
+//	Phase 1: DFA/strategy finds match boundaries [spanStart, spanEnd]
+//	Phase 2: PikeVM extracts captures within [spanStart, spanEnd]
+//
+// The search is anchored: threads are seeded only at spanStart, not at
+// every position. This reduces PikeVM work from O(remaining_haystack)
+// to O(match_len) per match.
+//
+// Preconditions:
+//   - 0 <= spanStart <= spanEnd <= len(haystack)
+//   - A match is known to exist in [spanStart, spanEnd] (from Phase 1)
+//
+// Returns nil if no match is found (should not happen if Phase 1 is correct).
+//
+//nolint:gocognit // Merged match-check + step loop (Rust's nexts pattern) is inherently complex
+func (p *PikeVM) SearchWithCapturesInSpan(haystack []byte, spanStart, spanEnd int) *MatchWithCaptures {
+	if spanStart > spanEnd || spanEnd > len(haystack) {
+		return nil
+	}
+
+	// Reset state
+	p.internalState.Queue = p.internalState.Queue[:0]
+	p.internalState.NextQueue = p.internalState.NextQueue[:0]
+	p.internalState.Visited.Clear()
+
+	// Seed thread only at spanStart (anchored search within span)
+	caps := p.newCaptures()
+	p.addThread(thread{state: p.nfa.StartAnchored(), startPos: spanStart, captures: caps}, haystack, spanStart)
+
+	lastMatchPos := -1
+	var lastMatchCaptures []int
+
+	// Process bytes from spanStart to spanEnd (not len(haystack)).
+	// The full haystack slice is kept so that addThread/step can evaluate
+	// lookbehind assertions (\b) using bytes before spanStart.
+	for pos := spanStart; pos <= spanEnd; pos++ {
+		if pos < spanEnd {
+			b := haystack[pos]
+			p.internalState.Visited.Clear()
+			for _, t := range p.internalState.Queue {
+				if p.nfa.IsMatch(t.state) {
+					if pos > lastMatchPos || lastMatchPos == -1 {
+						lastMatchPos = pos
+						lastMatchCaptures = t.captures.copyData()
+					}
+					if !p.internalState.Longest {
+						break
+					}
+					continue
+				}
+				p.step(t, b, haystack, pos+1)
+			}
+		} else {
+			// At spanEnd: only check for match states, don't step further
+			for _, t := range p.internalState.Queue {
+				if p.nfa.IsMatch(t.state) {
+					if pos > lastMatchPos || lastMatchPos == -1 {
+						lastMatchPos = pos
+						lastMatchCaptures = t.captures.copyData()
+					}
+					break
+				}
+			}
+		}
+
+		if len(p.internalState.NextQueue) == 0 && (pos >= spanEnd || lastMatchPos != -1) {
+			break
+		}
+
+		if pos >= spanEnd {
+			break
+		}
+
+		p.internalState.Queue, p.internalState.NextQueue = p.internalState.NextQueue, p.internalState.Queue[:0]
+	}
+
+	if lastMatchPos != -1 {
+		return &MatchWithCaptures{
+			Start:    spanStart,
+			End:      lastMatchPos,
+			Captures: p.buildCapturesResult(lastMatchCaptures, spanStart, lastMatchPos),
+		}
+	}
+	return nil
+}
+
 // buildCapturesResult converts internal capture slots to the result format
 func (p *PikeVM) buildCapturesResult(caps []int, matchStart, matchEnd int) [][]int {
 	numGroups := p.nfa.CaptureCount()

From 9d9f9e0710c25234da8025d727be1ac9eeaffa8b Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Tue, 10 Mar 2026 20:43:12 +0300
Subject: [PATCH 3/7] docs: update CHANGELOG and ROADMAP for v0.12.8

---
 CHANGELOG.md | 44 ++++++++++++++++++++++++++++++++++++++++++++
 ROADMAP.md   | 17 +++++++++++------
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ef6381f..d7c0f3b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,50 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120))
 - SIMD prefilter for CompositeSequenceDFA (#83)
 
+## [0.12.8] - 2026-03-10
+
+### Performance
+- **Streaming ReplaceAll — single-pass without `[][]int` allocation** (Issue [#135](https://github.com/coregx/coregex/issues/135)) —
+  `ReplaceAllStringFunc`, `ReplaceAllFunc`, `ReplaceAllLiteral`, and `ReplaceAllLiteralString`
+  converted from two-pass (collect all match indices → iterate) to single-pass streaming.
+  Eliminates `[][]int` allocation for high-match-count inputs (e.g., 800KB for 50K matches).
+  Returns original string when no matches (Cow-like optimization, avoids copy).
+
+- **DFA-first FindSubmatchAt — PikeVM on match span only** (Issue [#135](https://github.com/coregx/coregex/issues/135)) —
+  Implements Rust-style two-phase search for capture extraction:
+  Phase 1: DFA/strategy finds match boundaries `[start, end]` — O(n) fast scan.
+  Phase 2: PikeVM runs anchored within `[start..end]` for captures — O(match_len).
+  Reduces PikeVM work from O(remaining_haystack) to O(match_len) per match.
+  For 50K matches on 10MB: ~400x less PikeVM work. Also adds `is_capture_search_needed`
+  optimization: when only group 0 is needed, PikeVM is skipped entirely.
+
+### Fixed
+- **FindAllSubmatch context loss** — `FindAllSubmatch` previously sliced the haystack
+  (`haystack[pos:]`), losing lookbehind context for `\b` word boundary assertions at
+  match boundaries. Now uses `FindSubmatchAt` with full haystack preservation.
+
+## [0.12.8] - 2026-03-10
+
+### Performance
+- **Streaming ReplaceAll — single-pass without `[][]int` allocation** (Issue [#135](https://github.com/coregx/coregex/issues/135)) —
+  `ReplaceAllStringFunc`, `ReplaceAllFunc`, `ReplaceAllLiteral`, and `ReplaceAllLiteralString`
+  converted from two-pass (collect all match indices → iterate) to single-pass streaming.
+  Eliminates `[][]int` allocation for high-match-count inputs (e.g., 800KB for 50K matches).
+  Returns original string when no matches (Cow-like optimization, avoids copy).
+
+- **DFA-first FindSubmatchAt — PikeVM on match span only** (Issue [#135](https://github.com/coregx/coregex/issues/135)) —
+  Implements Rust-style two-phase search for capture extraction:
+  Phase 1: DFA/strategy finds match boundaries `[start, end]` — O(n) fast scan.
+  Phase 2: PikeVM runs anchored within `[start..end]` for captures — O(match_len).
+  Reduces PikeVM work from O(remaining_haystack) to O(match_len) per match.
+  For 50K matches on 10MB: ~400x less PikeVM work. Also adds `is_capture_search_needed`
+  optimization: when only group 0 is needed, PikeVM is skipped entirely.
+
+### Fixed
+- **FindAllSubmatch context loss** — `FindAllSubmatch` previously sliced the haystack
+  (`haystack[pos:]`), losing lookbehind context for `\b` word boundary assertions at
+  match boundaries. Now uses `FindSubmatchAt` with full haystack preservation.
+
 ## [0.12.7] - 2026-03-10
 
 ### Performance
diff --git a/ROADMAP.md b/ROADMAP.md
index 81249e7..89df4a5 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -2,7 +2,7 @@
 
 > **Strategic Focus**: Production-grade regex engine with RE2/rust-regex level optimizations
 
-**Last Updated**: 2026-03-10 | **Current Version**: v0.12.7 | **Target**: v1.0.0 stable
+**Last Updated**: 2026-03-10 | **Current Version**: v0.12.8 | **Target**: v1.0.0 stable
 
 ---
 
@@ -12,7 +12,7 @@ Build a **production-ready, high-performance regex engine** for Go that matches
 
 ### Current State vs Target
 
-| Metric | Current (v0.12.7) | Target (v1.0.0) |
+| Metric | Current (v0.12.8) | Target (v1.0.0) |
 |--------|-------------------|-----------------|
 | Inner literal speedup | **280-3154x** | ✅ Achieved |
 | Case-insensitive speedup | **263x** | ✅ Achieved |
@@ -70,7 +70,9 @@ v0.12.5 ✅ → Non-greedy quantifier fix, ReverseSuffix correctness (#124)
          ↓
 v0.12.6 ✅ → BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127)
          ↓
-v0.12.7 (Current) ✅ → PikeVM sparse-dispatch for dot patterns, 2.8-4.8x speedup (#132)
+v0.12.7 ✅ → PikeVM sparse-dispatch for dot patterns, 2.8-4.8x speedup (#132)
+         ↓
+v0.12.8 (Current) ✅ → Streaming ReplaceAll + DFA-first FindSubmatchAt (#135)
          ↓
 v1.0.0-rc → Feature freeze, API locked
          ↓
@@ -106,6 +108,8 @@ v1.0.0 STABLE → Production release with API stability guarantee
 - ✅ **v0.12.5**: Non-greedy quantifier fix, ReverseSuffix forward verification (#124)
 - ✅ **v0.12.6**: BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127)
 - ✅ **v0.12.7**: PikeVM sparse-dispatch for `.` patterns, 2.8-4.8x speedup (#132)
+- ✅ **v0.12.8**: Streaming ReplaceAll + DFA-first FindSubmatchAt, Rust-style two-phase search (#135)
+- ✅ **v0.12.8**: Streaming ReplaceAll + DFA-first FindSubmatchAt, Rust-style two-phase search (#135)
 
 ---
 
@@ -197,7 +201,7 @@ v1.0.0 STABLE → Production release with API stability guarantee
 
 ## Feature Comparison Matrix
 
-| Feature | RE2 | rust-regex | coregex v0.12.7 | coregex v1.0 |
+| Feature | RE2 | rust-regex | coregex v0.12.8 | coregex v1.0 |
 |---------|-----|------------|-----------------|--------------|
 | Lazy DFA | ✅ | ✅ | ✅ | ✅ |
 | Thompson NFA | ✅ | ✅ | ✅ | ✅ |
@@ -355,7 +359,8 @@ Reference implementations available locally:
 
 | Version | Date | Type | Key Changes |
 |---------|------|------|-------------|
-| **v0.12.7** | 2026-03-10 | Performance | **PikeVM sparse-dispatch for `.` patterns, 2.8-4.8x speedup (#132)** |
+| **v0.12.8** | 2026-03-10 | Performance | **Streaming ReplaceAll + DFA-first FindSubmatchAt (#135)** |
+| v0.12.8 | 2026-03-10 | Performance | PikeVM sparse-dispatch for `.` patterns, 2.8-4.8x speedup (#132) |
 | v0.12.6 | 2026-03-08 | Fix | BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127) |
 | v0.12.5 | 2026-03-08 | Fix | Non-greedy quantifier fix, ReverseSuffix correctness (#124) |
 | v0.12.4 | 2026-03-01 | Test | Test coverage 80%+, CI improvements, awesome-go readiness |
@@ -396,4 +401,4 @@ Reference implementations available locally:
 
 ---
 
-*Current: v0.12.7 | Next: v0.13.0 | Target: v1.0.0*
+*Current: v0.12.8 | Target: v1.0.0*

From 6292998cca636d50c9b5cb9f913d4c233333ed2a Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Tue, 10 Mar 2026 21:19:13 +0300
Subject: [PATCH 4/7] fix: BT stack overflow on 386/macOS + \B context loss in
 SearchWithCapturesAt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes for PR #136 CI failure:

1. FindSubmatchAt: skip two-phase search for UseBoundedBacktracker and UseNFA
   strategies — BT's recursive backtrackFindWithState overflows 250MB stack
   on 386 with deep UTF-8 NFA chains. These strategies don't benefit from
   two-phase anyway (Phase 1 uses the same engine as Phase 2).

2. SearchWithCapturesAt: use matchesEmptyAt(haystack, at) instead of
   matchesEmpty() for at==len(haystack) fast path. matchesEmpty() loses
   lookbehind context (evaluates with nil,0), causing \B false positives
   at end-of-haystack. SearchWithSlotTableAt already used the correct
   matchesEmptyAt — this aligns SearchWithCapturesAt to match.
---
 meta/findall.go | 14 ++++++++++++++
 nfa/pikevm.go   | 10 ++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/meta/findall.go b/meta/findall.go
index 34744d1..8a16292 100644
--- a/meta/findall.go
+++ b/meta/findall.go
@@ -69,6 +69,20 @@ func (e *Engine) FindSubmatchAt(haystack []byte, at int) *MatchWithCaptures {
 		// Fall through to two-phase search
 	}
 
+	// Two-phase search is only beneficial for DFA-based strategies where Phase 1
+	// is a fast O(n) scan. For NFA/BoundedBacktracker strategies, Phase 1 uses
+	// the same engines as Phase 2, so go directly to PikeVM with captures.
+	// BoundedBacktracker is also unsafe for two-phase: its recursive implementation
+	// can overflow the stack on large inputs with deep UTF-8 NFA chains (386/macOS).
+	if e.strategy == UseBoundedBacktracker || e.strategy == UseNFA {
+		atomic.AddUint64(&e.stats.NFASearches, 1)
+		nfaMatch := state.pikevm.SearchWithCapturesAt(haystack, at)
+		if nfaMatch == nil {
+			return nil
+		}
+		return NewMatchWithCaptures(haystack, nfaMatch.Captures)
+	}
+
 	// Phase 1: Use DFA/strategy to find match boundaries.
 	// This is the fast O(n) scan that locates [start, end] without captures.
 	start, end, found := e.findIndicesAtWithState(haystack, at, state)
diff --git a/nfa/pikevm.go b/nfa/pikevm.go
index 557a637..9fcd7ae 100644
--- a/nfa/pikevm.go
+++ b/nfa/pikevm.go
@@ -910,8 +910,10 @@ func (p *PikeVM) SearchWithCapturesAt(haystack []byte, at int) *MatchWithCapture
 	}
 
 	if at == len(haystack) {
-		// At end of input - check if empty string matches
-		if p.matchesEmpty() {
+		// At end of input - check if empty string matches at this position.
+		// Must use matchesEmptyAt with full haystack context for correct
+		// look assertion evaluation (e.g., \B needs previous byte context).
+		if p.matchesEmptyAt(haystack, at) {
 			return &MatchWithCaptures{
 				Start:    at,
 				End:      at,
@@ -922,8 +924,8 @@ func (p *PikeVM) SearchWithCapturesAt(haystack []byte, at int) *MatchWithCapture
 	}
 
 	if len(haystack) == 0 {
-		// Check if empty string matches
-		if p.matchesEmpty() {
+		// Check if empty string matches (haystack is empty, pos=0)
+		if p.matchesEmptyAt(haystack, 0) {
 			return &MatchWithCaptures{
 				Start:    0,
 				End:      0,

From fe49d6027ec46c35c9f3cd6798c404bc7d87fefe Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Tue, 10 Mar 2026 21:36:36 +0300
Subject: [PATCH 5/7] =?UTF-8?q?perf:=20FindAllSubmatch=20state=20reuse=20?=
 =?UTF-8?q?=E2=80=94=20eliminate=20per-match=20sync.Pool=20overhead?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

FindAllSubmatch now acquires SearchState once for entire iteration loop,
matching the pattern in findAllIndicesLoop. Extracted findSubmatchAtWithState
internal method shared by both FindSubmatchAt (public) and FindAllSubmatch.

Prevents race detector test timeouts (>10 min) caused by thousands of
sync.Pool get/put operations per FindAllSubmatch call.
---
 meta/findall.go | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/meta/findall.go b/meta/findall.go
index 8a16292..f587375 100644
--- a/meta/findall.go
+++ b/meta/findall.go
@@ -51,22 +51,26 @@ func (e *Engine) FindSubmatchAt(haystack []byte, at int) *MatchWithCaptures {
 		return nil
 	}
 
-	// Get pooled state first for thread-safe access
+	// Get pooled state for thread-safe access
 	state := e.getSearchState()
 	defer e.putSearchState(state)
 
+	return e.findSubmatchAtWithState(haystack, at, state)
+}
+
+// findSubmatchAtWithState is the state-reusing internal version of FindSubmatchAt.
+// Used by FindAllSubmatch to avoid per-match sync.Pool get/put overhead.
+func (e *Engine) findSubmatchAtWithState(haystack []byte, at int, state *SearchState) *MatchWithCaptures {
 	// For position 0, try OnePass DFA if available (10-20x faster for anchored patterns).
 	// OnePass handles captures natively — no need for two-phase search.
 	if at == 0 && e.onepass != nil && state.onepassCache != nil {
 		atomic.AddUint64(&e.stats.OnePassSearches, 1)
 		slots := e.onepass.Search(haystack, state.onepassCache)
 		if slots != nil {
-			// Convert flat slots [start0, end0, start1, end1, ...] to nested captures
 			captures := slotsToCaptures(slots)
 			return NewMatchWithCaptures(haystack, captures)
 		}
-		// OnePass failed (input doesn't match from position 0)
-		// Fall through to two-phase search
+		// OnePass failed — fall through to two-phase search
 	}
 
 	// Two-phase search is only beneficial for DFA-based strategies where Phase 1
@@ -104,8 +108,6 @@ func (e *Engine) FindSubmatchAt(haystack []byte, at int) *MatchWithCaptures {
 	nfaMatch := state.pikevm.SearchWithCapturesInSpan(haystack, start, end)
 	if nfaMatch == nil {
 		// Defensive fallback: DFA found a match but PikeVM disagrees.
-		// This should not happen with correct DFA boundaries, but fall back
-		// to full PikeVM search for safety.
 		nfaMatch = state.pikevm.SearchWithCapturesAt(haystack, at)
 		if nfaMatch == nil {
 			return nil
@@ -320,8 +322,13 @@ func (e *Engine) FindAllSubmatch(haystack []byte, n int) []*MatchWithCaptures {
 	pos := 0
 	lastMatchEnd := -1
 
+	// Get state ONCE for entire iteration — eliminates sync.Pool overhead per match.
+	// Critical for race detector performance (10+ minute timeout without this).
+	state := e.getSearchState()
+	defer e.putSearchState(state)
+
 	for pos <= len(haystack) {
-		match := e.FindSubmatchAt(haystack, pos)
+		match := e.findSubmatchAtWithState(haystack, pos, state)
 		if match == nil {
 			break
 		}

From 5811934d6e24c4d384efe0621f60f67b58a91c8c Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Tue, 10 Mar 2026 21:49:58 +0300
Subject: [PATCH 6/7] fix: data race in concurrent FindSubmatch for DFA-based
 strategies

Strategies UseDFA, UseBoth, UseDigitPrefilter access shared mutable
state (e.dfa lazy DFA, e.pikevm) in their findIndicesAt dispatch paths.
When findSubmatchAtWithState routes Phase 1 through these strategies,
concurrent FindSubmatch calls race on the shared state.

Fix: extend the two-phase bypass guard to include all strategies that
use shared mutable state. These strategies now go directly to the
pooled PikeVM (state.pikevm) for capture extraction, which is
thread-safe by design.

Strategies that remain eligible for two-phase search all use their
own immutable instances (ReverseSuffix, ReverseInner, CharClassSearcher,
CompositeSearcher, etc.).
---
 meta/findall.go | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/meta/findall.go b/meta/findall.go
index f587375..ba80453 100644
--- a/meta/findall.go
+++ b/meta/findall.go
@@ -73,12 +73,22 @@ func (e *Engine) findSubmatchAtWithState(haystack []byte, at int, state *SearchS
 		// OnePass failed — fall through to two-phase search
 	}
 
-	// Two-phase search is only beneficial for DFA-based strategies where Phase 1
-	// is a fast O(n) scan. For NFA/BoundedBacktracker strategies, Phase 1 uses
-	// the same engines as Phase 2, so go directly to PikeVM with captures.
-	// BoundedBacktracker is also unsafe for two-phase: its recursive implementation
-	// can overflow the stack on large inputs with deep UTF-8 NFA chains (386/macOS).
-	if e.strategy == UseBoundedBacktracker || e.strategy == UseNFA {
+	// Strategies that must bypass two-phase search and go directly to PikeVM:
+	//
+	// Thread-safety: UseDFA, UseBoth, UseDigitPrefilter access shared mutable state
+	// (e.dfa lazy DFA, e.pikevm) that is NOT safe for concurrent access.
+	// findSubmatchAtWithState is called with a pooled SearchState, but Phase 1
+	// dispatches to findIndicesDFAAt/findIndicesAdaptiveAt/findIndicesDigitPrefilterAt
+	// which use e.dfa and e.pikevm directly, causing data races.
+	//
+	// Performance: UseNFA Phase 1 uses the same PikeVM as Phase 2, so two-phase
+	// adds overhead without benefit.
+	//
+	// Safety: UseBoundedBacktracker's recursive implementation can overflow the
+	// stack on large inputs with deep UTF-8 NFA chains (386/macOS 250MB limit).
+	switch e.strategy {
+	case UseBoundedBacktracker, UseNFA,
+		UseDFA, UseBoth, UseDigitPrefilter:
 		atomic.AddUint64(&e.stats.NFASearches, 1)
 		nfaMatch := state.pikevm.SearchWithCapturesAt(haystack, at)
 		if nfaMatch == nil {

From 66ccb69ca43d1c1181809a06fc87c5e78fd54ecf Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Tue, 10 Mar 2026 22:19:05 +0300
Subject: [PATCH 7/7] =?UTF-8?q?docs:=20update=20CHANGELOG=20for=20v0.12.8?=
 =?UTF-8?q?=20=E2=80=94=20add=20fixes=20for=20data=20race,=20stack=20overf?=
 =?UTF-8?q?low,=20\B=20context?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d7c0f3b..5d24ba8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,33 +29,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   For 50K matches on 10MB: ~400x less PikeVM work. Also adds `is_capture_search_needed`
   optimization: when only group 0 is needed, PikeVM is skipped entirely.
 
-### Fixed
-- **FindAllSubmatch context loss** — `FindAllSubmatch` previously sliced the haystack
-  (`haystack[pos:]`), losing lookbehind context for `\b` word boundary assertions at
-  match boundaries. Now uses `FindSubmatchAt` with full haystack preservation.
-
-## [0.12.8] - 2026-03-10
-
-### Performance
-- **Streaming ReplaceAll — single-pass without `[][]int` allocation** (Issue [#135](https://github.com/coregx/coregex/issues/135)) —
-  `ReplaceAllStringFunc`, `ReplaceAllFunc`, `ReplaceAllLiteral`, and `ReplaceAllLiteralString`
-  converted from two-pass (collect all match indices → iterate) to single-pass streaming.
-  Eliminates `[][]int` allocation for high-match-count inputs (e.g., 800KB for 50K matches).
-  Returns original string when no matches (Cow-like optimization, avoids copy).
-
-- **DFA-first FindSubmatchAt — PikeVM on match span only** (Issue [#135](https://github.com/coregx/coregex/issues/135)) —
-  Implements Rust-style two-phase search for capture extraction:
-  Phase 1: DFA/strategy finds match boundaries `[start, end]` — O(n) fast scan.
-  Phase 2: PikeVM runs anchored within `[start..end]` for captures — O(match_len).
-  Reduces PikeVM work from O(remaining_haystack) to O(match_len) per match.
-  For 50K matches on 10MB: ~400x less PikeVM work. Also adds `is_capture_search_needed`
-  optimization: when only group 0 is needed, PikeVM is skipped entirely.
+- **FindAllSubmatch state reuse** — acquires `SearchState` once for entire iteration loop,
+  eliminating per-match `sync.Pool` get/put overhead. Critical for race detector performance.
 
 ### Fixed
 - **FindAllSubmatch context loss** — `FindAllSubmatch` previously sliced the haystack
   (`haystack[pos:]`), losing lookbehind context for `\b` word boundary assertions at
   match boundaries. Now uses `FindSubmatchAt` with full haystack preservation.
 
+- **BoundedBacktracker stack overflow on 386/macOS** — two-phase search routed through
+  `BoundedBacktracker` for Phase 1, causing recursive stack overflow on large inputs
+  with deep UTF-8 NFA chains (386/macOS 250MB stack limit). Fix: strategies using
+  BoundedBacktracker and NFA bypass two-phase search, going directly to pooled PikeVM.
+
+- **`\B` false positive at end of input** — `SearchWithCapturesAt` at `at==len(haystack)`
+  used `matchesEmpty()` which evaluates with `nil,0`, losing lookbehind context.
+  For `\B` at position 2 of "xx", left='x' (word char) means word boundary, so `\B`
+  should NOT match — but context loss caused a false positive. Fix: uses
+  `matchesEmptyAt(haystack, at)` to preserve full context.
+
+- **Data race in concurrent FindSubmatch** — strategies `UseDFA`, `UseBoth`, and
+  `UseDigitPrefilter` access shared mutable state (`e.dfa` lazy DFA, `e.pikevm`) in
+  their `findIndicesAt` dispatch paths. Concurrent `FindSubmatch` calls raced on this
+  shared state. Fix: these strategies bypass two-phase search, going directly to
+  pooled `state.pikevm.SearchWithCapturesAt()` which is thread-safe by design.
+
 ## [0.12.7] - 2026-03-10
 
 ### Performance