diff --git a/CHANGELOG.md b/CHANGELOG.md index ef6381f..5d24ba8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,48 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ARM NEON SIMD support (Go 1.26 `simd/archsimd` intrinsics — [#120](https://github.com/coregx/coregex/issues/120)) - SIMD prefilter for CompositeSequenceDFA (#83) +## [0.12.8] - 2026-03-10 + +### Performance +- **Streaming ReplaceAll — single-pass without `[][]int` allocation** (Issue [#135](https://github.com/coregx/coregex/issues/135)) — + `ReplaceAllStringFunc`, `ReplaceAllFunc`, `ReplaceAllLiteral`, and `ReplaceAllLiteralString` + converted from two-pass (collect all match indices → iterate) to single-pass streaming. + Eliminates `[][]int` allocation for high-match-count inputs (e.g., 800KB for 50K matches). + Returns original string when no matches (Cow-like optimization, avoids copy). + +- **DFA-first FindSubmatchAt — PikeVM on match span only** (Issue [#135](https://github.com/coregx/coregex/issues/135)) — + Implements Rust-style two-phase search for capture extraction: + Phase 1: DFA/strategy finds match boundaries `[start, end]` — O(n) fast scan. + Phase 2: PikeVM runs anchored within `[start..end]` for captures — O(match_len). + Reduces PikeVM work from O(remaining_haystack) to O(match_len) per match. + For 50K matches on 10MB: ~400x less PikeVM work. Also adds `is_capture_search_needed` + optimization: when only group 0 is needed, PikeVM is skipped entirely. + +- **FindAllSubmatch state reuse** — acquires `SearchState` once for entire iteration loop, + eliminating per-match `sync.Pool` get/put overhead. Critical for race detector performance. + +### Fixed +- **FindAllSubmatch context loss** — `FindAllSubmatch` previously sliced the haystack + (`haystack[pos:]`), losing lookbehind context for `\b` word boundary assertions at + match boundaries. Now uses `FindSubmatchAt` with full haystack preservation. 
+ +- **BoundedBacktracker stack overflow on 386/macOS** — two-phase search routed through + `BoundedBacktracker` for Phase 1, causing recursive stack overflow on large inputs + with deep UTF-8 NFA chains (386/macOS 250MB stack limit). Fix: strategies using + BoundedBacktracker and NFA bypass two-phase search, going directly to pooled PikeVM. + +- **`\B` false positive at end of input** — `SearchWithCapturesAt` at `at==len(haystack)` + used `matchesEmpty()` which evaluates with `nil,0`, losing lookbehind context. + For `\B` at position 2 of "xx", left='x' (word char) means word boundary, so `\B` + should NOT match — but context loss caused a false positive. Fix: uses + `matchesEmptyAt(haystack, at)` to preserve full context. + +- **Data race in concurrent FindSubmatch** — strategies `UseDFA`, `UseBoth`, and + `UseDigitPrefilter` access shared mutable state (`e.dfa` lazy DFA, `e.pikevm`) in + their `findIndicesAt` dispatch paths. Concurrent `FindSubmatch` calls raced on this + shared state. Fix: these strategies bypass two-phase search, going directly to + pooled `state.pikevm.SearchWithCapturesAt()` which is thread-safe by design. 
+ ## [0.12.7] - 2026-03-10 ### Performance diff --git a/ROADMAP.md b/ROADMAP.md index 81249e7..89df4a5 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -2,7 +2,7 @@ > **Strategic Focus**: Production-grade regex engine with RE2/rust-regex level optimizations -**Last Updated**: 2026-03-10 | **Current Version**: v0.12.7 | **Target**: v1.0.0 stable +**Last Updated**: 2026-03-10 | **Current Version**: v0.12.8 | **Target**: v1.0.0 stable --- @@ -12,7 +12,7 @@ Build a **production-ready, high-performance regex engine** for Go that matches ### Current State vs Target -| Metric | Current (v0.12.7) | Target (v1.0.0) | +| Metric | Current (v0.12.8) | Target (v1.0.0) | |--------|-------------------|-----------------| | Inner literal speedup | **280-3154x** | ✅ Achieved | | Case-insensitive speedup | **263x** | ✅ Achieved | @@ -70,7 +70,9 @@ v0.12.5 ✅ → Non-greedy quantifier fix, ReverseSuffix correctness (#124) ↓ v0.12.6 ✅ → BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127) ↓ -v0.12.7 (Current) ✅ → PikeVM sparse-dispatch for dot patterns, 2.8-4.8x speedup (#132) +v0.12.7 ✅ → PikeVM sparse-dispatch for dot patterns, 2.8-4.8x speedup (#132) + ↓ +v0.12.8 (Current) ✅ → Streaming ReplaceAll + DFA-first FindSubmatchAt (#135) ↓ v1.0.0-rc → Feature freeze, API locked ↓ @@ -106,6 +108,8 @@ v1.0.0 STABLE → Production release with API stability guarantee - ✅ **v0.12.5**: Non-greedy quantifier fix, ReverseSuffix forward verification (#124) - ✅ **v0.12.6**: BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127) - ✅ **v0.12.7**: PikeVM sparse-dispatch for `.` patterns, 2.8-4.8x speedup (#132) +- ✅ **v0.12.8**: Streaming ReplaceAll + DFA-first FindSubmatchAt, Rust-style two-phase search (#135) +- ✅ **v0.12.8**: Streaming ReplaceAll + DFA-first FindSubmatchAt, Rust-style two-phase search (#135) --- @@ -197,7 +201,7 @@ v1.0.0 STABLE → Production release with API stability guarantee ## Feature Comparison Matrix -| Feature | RE2 | rust-regex | coregex 
v0.12.7 | coregex v1.0 | +| Feature | RE2 | rust-regex | coregex v0.12.8 | coregex v1.0 | |---------|-----|------------|-----------------|--------------| | Lazy DFA | ✅ | ✅ | ✅ | ✅ | | Thompson NFA | ✅ | ✅ | ✅ | ✅ | @@ -355,7 +359,8 @@ Reference implementations available locally: | Version | Date | Type | Key Changes | |---------|------|------|-------------| -| **v0.12.7** | 2026-03-10 | Performance | **PikeVM sparse-dispatch for `.` patterns, 2.8-4.8x speedup (#132)** | +| **v0.12.8** | 2026-03-10 | Performance | **Streaming ReplaceAll + DFA-first FindSubmatchAt (#135)** | +| v0.12.7 | 2026-03-10 | Performance | PikeVM sparse-dispatch for `.` patterns, 2.8-4.8x speedup (#132) | | v0.12.6 | 2026-03-08 | Fix | BoundedBacktracker span-based CanHandle, ReplaceAllStringFunc O(n) (#127) | | v0.12.5 | 2026-03-08 | Fix | Non-greedy quantifier fix, ReverseSuffix correctness (#124) | | v0.12.4 | 2026-03-01 | Test | Test coverage 80%+, CI improvements, awesome-go readiness | @@ -396,4 +401,4 @@ Reference implementations available locally: --- -*Current: v0.12.7 | Next: v0.13.0 | Target: v1.0.0* +*Current: v0.12.8 | Target: v1.0.0* diff --git a/meta/findall.go b/meta/findall.go index 10e109d..ba80453 100644 --- a/meta/findall.go +++ b/meta/findall.go @@ -38,31 +38,92 @@ func (e *Engine) FindSubmatch(haystack []byte) *MatchWithCaptures { // This method is used by ReplaceAll* operations to correctly handle anchors like ^. // Unlike FindSubmatch, it takes the FULL haystack and a starting position. // Thread-safe: uses pooled state for both OnePass cache and PikeVM. +// +// Two-phase search (Rust-style optimization): +// +// Phase 1: DFA/strategy finds match boundaries [start, end] — O(n) fast scan +// Phase 2: PikeVM extracts captures within [start, end] — O(match_len) +// +// This reduces PikeVM work from O(remaining_haystack) to O(match_len) per match. +// For 50K matches on 10MB input: ~400x less PikeVM work. 
func (e *Engine) FindSubmatchAt(haystack []byte, at int) *MatchWithCaptures { - // Get pooled state first for thread-safe access + if at > len(haystack) { + return nil + } + + // Get pooled state for thread-safe access state := e.getSearchState() defer e.putSearchState(state) - // For position 0, try OnePass DFA if available (10-20x faster for anchored patterns) + return e.findSubmatchAtWithState(haystack, at, state) +} + +// findSubmatchAtWithState is the state-reusing internal version of FindSubmatchAt. +// Used by FindAllSubmatch to avoid per-match sync.Pool get/put overhead. +func (e *Engine) findSubmatchAtWithState(haystack []byte, at int, state *SearchState) *MatchWithCaptures { + // For position 0, try OnePass DFA if available (10-20x faster for anchored patterns). + // OnePass handles captures natively — no need for two-phase search. if at == 0 && e.onepass != nil && state.onepassCache != nil { atomic.AddUint64(&e.stats.OnePassSearches, 1) slots := e.onepass.Search(haystack, state.onepassCache) if slots != nil { - // Convert flat slots [start0, end0, start1, end1, ...] to nested captures captures := slotsToCaptures(slots) return NewMatchWithCaptures(haystack, captures) } - // OnePass failed (input doesn't match from position 0) - // Fall through to PikeVM which can find match anywhere + // OnePass failed — fall through to two-phase search } - atomic.AddUint64(&e.stats.NFASearches, 1) + // Strategies that must bypass two-phase search and go directly to PikeVM: + // + // Thread-safety: UseDFA, UseBoth, UseDigitPrefilter access shared mutable state + // (e.dfa lazy DFA, e.pikevm) that is NOT safe for concurrent access. + // findSubmatchAtWithState is called with a pooled SearchState, but Phase 1 + // dispatches to findIndicesDFAAt/findIndicesAdaptiveAt/findIndicesDigitPrefilterAt + // which use e.dfa and e.pikevm directly, causing data races. + // + // Performance: UseNFA Phase 1 uses the same PikeVM as Phase 2, so two-phase + // adds overhead without benefit. 
+ // + // Safety: UseBoundedBacktracker's recursive implementation can overflow the + // stack on large inputs with deep UTF-8 NFA chains (386/macOS 250MB limit). + switch e.strategy { + case UseBoundedBacktracker, UseNFA, + UseDFA, UseBoth, UseDigitPrefilter: + atomic.AddUint64(&e.stats.NFASearches, 1) + nfaMatch := state.pikevm.SearchWithCapturesAt(haystack, at) + if nfaMatch == nil { + return nil + } + return NewMatchWithCaptures(haystack, nfaMatch.Captures) + } - nfaMatch := state.pikevm.SearchWithCapturesAt(haystack, at) - if nfaMatch == nil { + // Phase 1: Use DFA/strategy to find match boundaries. + // This is the fast O(n) scan that locates [start, end] without captures. + start, end, found := e.findIndicesAtWithState(haystack, at, state) + if !found { return nil } + // Optimization: if only group 0 is needed (no sub-captures), skip PikeVM. + // The DFA result already provides exact [start, end] boundaries. + if e.nfa.CaptureCount() <= 1 { + captures := [][]int{{start, end}} + return NewMatchWithCaptures(haystack, captures) + } + + // Phase 2: PikeVM extracts captures within the narrow [start, end] span. + // The full haystack is passed for lookbehind context (\b at span boundary), + // but PikeVM only processes bytes within [start, end]. + atomic.AddUint64(&e.stats.NFASearches, 1) + nfaMatch := state.pikevm.SearchWithCapturesInSpan(haystack, start, end) + if nfaMatch == nil { + // Defensive fallback: DFA found a match but PikeVM disagrees. + nfaMatch = state.pikevm.SearchWithCapturesAt(haystack, at) + if nfaMatch == nil { + return nil + } + } + return NewMatchWithCaptures(haystack, nfaMatch.Captures) } @@ -253,6 +314,10 @@ func (e *Engine) Count(haystack []byte, n int) int { // FindAllSubmatch returns all successive matches with capture group information. // If n > 0, returns at most n matches. If n <= 0, returns all matches. // +// Uses DFA-first two-phase search: DFA finds match boundaries, then PikeVM +// extracts captures within the narrow match span. 
This reduces PikeVM work +// from O(remaining_haystack) to O(match_len) per match. +// // Example: // // engine, _ := meta.Compile(`(\w+)@(\w+)\.(\w+)`) @@ -265,35 +330,46 @@ func (e *Engine) FindAllSubmatch(haystack []byte, n int) []*MatchWithCaptures { var matches []*MatchWithCaptures pos := 0 + lastMatchEnd := -1 + + // Get state ONCE for entire iteration — eliminates sync.Pool overhead per match. + // Critical for race detector performance (10+ minute timeout without this). + state := e.getSearchState() + defer e.putSearchState(state) for pos <= len(haystack) { - // Use PikeVM for capture extraction - atomic.AddUint64(&e.stats.NFASearches, 1) - nfaMatch := e.pikevm.SearchWithCaptures(haystack[pos:]) - if nfaMatch == nil { + match := e.findSubmatchAtWithState(haystack, pos, state) + if match == nil { break } - // Adjust captures to absolute positions - // Captures is [][]int where each element is [start, end] for a group - adjustedCaptures := make([][]int, len(nfaMatch.Captures)) - for i, cap := range nfaMatch.Captures { - if len(cap) >= 2 && cap[0] >= 0 { - adjustedCaptures[i] = []int{pos + cap[0], pos + cap[1]} - } else { - adjustedCaptures[i] = nil // Unmatched group + matchStart := match.Start() + matchEnd := match.End() + + // Skip empty matches at the end of previous non-empty match (stdlib behavior) + //nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd + if matchStart == matchEnd && matchStart == lastMatchEnd { + pos++ + if pos > len(haystack) { + break } + continue } - match := NewMatchWithCaptures(haystack, adjustedCaptures) matches = append(matches, match) + // Track non-empty match ends for the skip rule + if matchStart != matchEnd { + lastMatchEnd = matchEnd + } + // Move position past this match - end := nfaMatch.End - if end > 0 { - pos += end - } else { - // Empty match: advance by 1 to avoid infinite loop + switch { + case matchStart == matchEnd: + pos = matchEnd + 1 + case matchEnd > pos: + pos = matchEnd + default: 
pos++ } diff --git a/nfa/pikevm.go b/nfa/pikevm.go index 7e5e13d..9fcd7ae 100644 --- a/nfa/pikevm.go +++ b/nfa/pikevm.go @@ -910,8 +910,10 @@ func (p *PikeVM) SearchWithCapturesAt(haystack []byte, at int) *MatchWithCapture } if at == len(haystack) { - // At end of input - check if empty string matches - if p.matchesEmpty() { + // At end of input - check if empty string matches at this position. + // Must use matchesEmptyAt with full haystack context for correct + // look assertion evaluation (e.g., \B needs previous byte context). + if p.matchesEmptyAt(haystack, at) { return &MatchWithCaptures{ Start: at, End: at, @@ -922,8 +924,8 @@ func (p *PikeVM) SearchWithCapturesAt(haystack []byte, at int) *MatchWithCapture } if len(haystack) == 0 { - // Check if empty string matches - if p.matchesEmpty() { + // Check if empty string matches (haystack is empty, pos=0) + if p.matchesEmptyAt(haystack, 0) { return &MatchWithCaptures{ Start: 0, End: 0, @@ -1085,6 +1087,97 @@ func (p *PikeVM) searchAtWithCaptures(haystack []byte, startPos int) *MatchWithC return nil } +// SearchWithCapturesInSpan searches for a match anchored at spanStart, +// not exceeding spanEnd. The full haystack is preserved for lookbehind +// context (e.g., \b word boundary assertions at spanStart-1). +// +// This implements Phase 2 of the DFA-first two-phase search: +// +// Phase 1: DFA/strategy finds match boundaries [spanStart, spanEnd] +// Phase 2: PikeVM extracts captures within [spanStart, spanEnd] +// +// The search is anchored: threads are seeded only at spanStart, not at +// every position. This reduces PikeVM work from O(remaining_haystack) +// to O(match_len) per match. +// +// Preconditions: +// - 0 <= spanStart <= spanEnd <= len(haystack) +// - A match is known to exist in [spanStart, spanEnd] (from Phase 1) +// +// Returns nil if no match is found (should not happen if Phase 1 is correct). 
+// +//nolint:gocognit // Merged match-check + step loop (Rust's nexts pattern) is inherently complex +func (p *PikeVM) SearchWithCapturesInSpan(haystack []byte, spanStart, spanEnd int) *MatchWithCaptures { + if spanStart > spanEnd || spanEnd > len(haystack) { + return nil + } + + // Reset state + p.internalState.Queue = p.internalState.Queue[:0] + p.internalState.NextQueue = p.internalState.NextQueue[:0] + p.internalState.Visited.Clear() + + // Seed thread only at spanStart (anchored search within span) + caps := p.newCaptures() + p.addThread(thread{state: p.nfa.StartAnchored(), startPos: spanStart, captures: caps}, haystack, spanStart) + + lastMatchPos := -1 + var lastMatchCaptures []int + + // Process bytes from spanStart to spanEnd (not len(haystack)). + // The full haystack slice is kept so that addThread/step can evaluate + // lookbehind assertions (\b) using bytes before spanStart. + for pos := spanStart; pos <= spanEnd; pos++ { + if pos < spanEnd { + b := haystack[pos] + p.internalState.Visited.Clear() + for _, t := range p.internalState.Queue { + if p.nfa.IsMatch(t.state) { + if pos > lastMatchPos || lastMatchPos == -1 { + lastMatchPos = pos + lastMatchCaptures = t.captures.copyData() + } + if !p.internalState.Longest { + break + } + continue + } + p.step(t, b, haystack, pos+1) + } + } else { + // At spanEnd: only check for match states, don't step further + for _, t := range p.internalState.Queue { + if p.nfa.IsMatch(t.state) { + if pos > lastMatchPos || lastMatchPos == -1 { + lastMatchPos = pos + lastMatchCaptures = t.captures.copyData() + } + break + } + } + } + + if len(p.internalState.NextQueue) == 0 && (pos >= spanEnd || lastMatchPos != -1) { + break + } + + if pos >= spanEnd { + break + } + + p.internalState.Queue, p.internalState.NextQueue = p.internalState.NextQueue, p.internalState.Queue[:0] + } + + if lastMatchPos != -1 { + return &MatchWithCaptures{ + Start: spanStart, + End: lastMatchPos, + Captures: p.buildCapturesResult(lastMatchCaptures, 
spanStart, lastMatchPos), + } + } + return nil +} + // buildCapturesResult converts internal capture slots to the result format func (p *PikeVM) buildCapturesResult(caps []int, matchStart, matchEnd int) [][]int { numGroups := p.nfa.CaptureCount() diff --git a/regex.go b/regex.go index 0a86eb0..4795229 100644 --- a/regex.go +++ b/regex.go @@ -841,38 +841,64 @@ func (r *Regex) FindAllStringIndex(s string, n int) [][]int { // result := re.ReplaceAllLiteral([]byte("age: 42"), []byte("XX")) // // result = []byte("age: XX") func (r *Regex) ReplaceAllLiteral(src, repl []byte) []byte { - indices := r.FindAllIndex(src, -1) - if len(indices) == 0 { - // No matches, return copy of src - result := make([]byte, len(src)) - copy(result, src) - return result - } + var result []byte + lastEnd := 0 + pos := 0 + lastMatchEnd := -1 + matched := false - // Pre-allocate result buffer - // Estimate: len(src) + (len(repl)-avgMatchLen)*numMatches - totalMatchLen := 0 - for _, idx := range indices { - totalMatchLen += idx[1] - idx[0] - } - avgMatchLen := totalMatchLen / len(indices) - estimatedLen := len(src) + (len(repl)-avgMatchLen)*len(indices) - if estimatedLen < 0 { - estimatedLen = len(src) - } + for { + start, end, found := r.engine.FindIndicesAt(src, pos) + if !found { + break + } - result := make([]byte, 0, estimatedLen) - lastEnd := 0 + // Skip empty matches at the position where a non-empty match just ended. + // This matches Go stdlib behavior (see FindAllIndex for details). + //nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd + if start == end && start == lastMatchEnd { + pos++ + if pos > len(src) { + break + } + continue + } - for _, idx := range indices { - // Append text before match - result = append(result, src[lastEnd:idx[0]]...) - // Append replacement + if !matched { + // Lazy allocation on first match + result = make([]byte, 0, len(src)) + matched = true + } + + result = append(result, src[lastEnd:start]...) 
result = append(result, repl...) - lastEnd = idx[1] + lastEnd = end + + if start != end { + lastMatchEnd = end + } + + switch { + case start == end: + pos = end + 1 + case end > pos: + pos = end + default: + pos++ + } + + if pos > len(src) { + break + } + } + + if !matched { + // No matches: return a copy of src (stdlib compatibility) + out := make([]byte, len(src)) + copy(out, src) + return out } - // Append remaining text result = append(result, src[lastEnd:]...) return result } @@ -887,7 +913,61 @@ func (r *Regex) ReplaceAllLiteral(src, repl []byte) []byte { // result := re.ReplaceAllLiteralString("age: 42", "XX") // // result = "age: XX" func (r *Regex) ReplaceAllLiteralString(src, repl string) string { - return string(r.ReplaceAllLiteral([]byte(src), []byte(repl))) + b := stringToBytes(src) + var buf strings.Builder + lastEnd := 0 + pos := 0 + lastMatchEnd := -1 + matched := false + + for { + start, end, found := r.engine.FindIndicesAt(b, pos) + if !found { + break + } + + //nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd + if start == end && start == lastMatchEnd { + pos++ + if pos > len(src) { + break + } + continue + } + + if !matched { + buf.Grow(len(src)) + matched = true + } + + buf.WriteString(src[lastEnd:start]) + buf.WriteString(repl) + lastEnd = end + + if start != end { + lastMatchEnd = end + } + + switch { + case start == end: + pos = end + 1 + case end > pos: + pos = end + default: + pos++ + } + + if pos > len(src) { + break + } + } + + if !matched { + return src + } + + buf.WriteString(src[lastEnd:]) + return buf.String() } // Expand appends template to dst and returns the result; during the @@ -1107,27 +1187,61 @@ func (r *Regex) ReplaceAllString(src, repl string) string { // }) // // result = []byte("2 4 6") func (r *Regex) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { - indices := r.FindAllIndex(src, -1) - if len(indices) == 0 { - // No matches, return copy of src - result := make([]byte, 
len(src)) - copy(result, src) - return result - } - var result []byte lastEnd := 0 + pos := 0 + lastMatchEnd := -1 + matched := false - for _, idx := range indices { - // Append text before match - result = append(result, src[lastEnd:idx[0]]...) - // Apply replacement function - replacement := repl(src[idx[0]:idx[1]]) - result = append(result, replacement...) - lastEnd = idx[1] + for { + start, end, found := r.engine.FindIndicesAt(src, pos) + if !found { + break + } + + //nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd + if start == end && start == lastMatchEnd { + pos++ + if pos > len(src) { + break + } + continue + } + + if !matched { + result = make([]byte, 0, len(src)) + matched = true + } + + result = append(result, src[lastEnd:start]...) + result = append(result, repl(src[start:end])...) + lastEnd = end + + if start != end { + lastMatchEnd = end + } + + switch { + case start == end: + pos = end + 1 + case end > pos: + pos = end + default: + pos++ + } + + if pos > len(src) { + break + } + } + + if !matched { + // No matches: return a copy of src (stdlib compatibility) + out := make([]byte, len(src)) + copy(out, src) + return out } - // Append remaining text result = append(result, src[lastEnd:]...) 
return result } @@ -1146,19 +1260,57 @@ func (r *Regex) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { // }) // // result = "2 4 6" func (r *Regex) ReplaceAllStringFunc(src string, repl func(string) string) string { - indices := r.FindAllStringIndex(src, -1) - if len(indices) == 0 { - return src - } - + b := stringToBytes(src) var buf strings.Builder - buf.Grow(len(src)) lastEnd := 0 + pos := 0 + lastMatchEnd := -1 + matched := false - for _, idx := range indices { - buf.WriteString(src[lastEnd:idx[0]]) - buf.WriteString(repl(src[idx[0]:idx[1]])) - lastEnd = idx[1] + for { + start, end, found := r.engine.FindIndicesAt(b, pos) + if !found { + break + } + + //nolint:gocritic // badCond: intentional - checking empty match at lastMatchEnd + if start == end && start == lastMatchEnd { + pos++ + if pos > len(src) { + break + } + continue + } + + if !matched { + buf.Grow(len(src)) + matched = true + } + + buf.WriteString(src[lastEnd:start]) + buf.WriteString(repl(src[start:end])) + lastEnd = end + + if start != end { + lastMatchEnd = end + } + + switch { + case start == end: + pos = end + 1 + case end > pos: + pos = end + default: + pos++ + } + + if pos > len(src) { + break + } + } + + if !matched { + return src } buf.WriteString(src[lastEnd:])