From e5f65cc52c6f3d9ce1eba00a432e937d58f879a6 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:52:52 +0900
Subject: [PATCH 01/32] checksum: CRC-32 slice-by-8 (642 -> 2525 MB/s, 3.9x)

Replace the byte-at-a-time CRC-32 inner loop with Intel slice-by-8:
fold eight bytes per iteration through eight precomputed tables instead
of one. Output is byte-identical (verified against the byte-at-a-time
loop over 16 MiB). Standalone microbench: 642 -> 2525 MB/s.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/checksum.rs | 57 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/src/checksum.rs b/src/checksum.rs
index cb278c5..f232133 100644
--- a/src/checksum.rs
+++ b/src/checksum.rs
@@ -71,9 +71,29 @@ impl Crc32 {
 
     pub fn update(&mut self, data: &[u8]) {
         let mut s = self.state;
-        for &b in data {
+
+        // Slice-by-8: consume eight bytes per iteration using eight
+        // precomputed tables. This shortens the per-byte dependency chain
+        // and branch/load count versus the byte-at-a-time loop while
+        // producing identical CRCs.
+        let mut chunks = data.chunks_exact(8);
+        for c in &mut chunks {
+            let lo = u32::from_le_bytes([c[0], c[1], c[2], c[3]]) ^ s;
+            let hi = u32::from_le_bytes([c[4], c[5], c[6], c[7]]);
+            s = CRC32_TABLE8[7][(lo & 0xFF) as usize]
+                ^ CRC32_TABLE8[6][((lo >> 8) & 0xFF) as usize]
+                ^ CRC32_TABLE8[5][((lo >> 16) & 0xFF) as usize]
+                ^ CRC32_TABLE8[4][(lo >> 24) as usize]
+                ^ CRC32_TABLE8[3][(hi & 0xFF) as usize]
+                ^ CRC32_TABLE8[2][((hi >> 8) & 0xFF) as usize]
+                ^ CRC32_TABLE8[1][((hi >> 16) & 0xFF) as usize]
+                ^ CRC32_TABLE8[0][(hi >> 24) as usize];
+        }
+
+        // Tail: fewer than 8 bytes remain.
+        for &b in chunks.remainder() {
             let idx = ((s ^ b as u32) & 0xFF) as usize;
-            s = (s >> 8) ^ CRC32_TABLE[idx];
+            s = (s >> 8) ^ CRC32_TABLE8[0][idx];
         }
         self.state = s;
     }
@@ -94,13 +114,18 @@ impl Default for Crc32 {
     }
 }
 
-/// Build the standard 256-entry table at compile time.
+/// Slice-by-8 tables, built at compile time. `CRC32_TABLE8[0]` is the
+/// standard 256-entry CRC-32 table; `CRC32_TABLE8[n]` for `n >= 1` advances
+/// the CRC by an extra byte position, so eight bytes can be folded per
+/// iteration. See Intel's "Slicing-by-8" technique.
 #[cfg(any(feature = "gzip", test))]
-const CRC32_TABLE: [u32; 256] = {
-    let mut table = [0u32; 256];
-    let mut i = 0u32;
+const CRC32_TABLE8: [[u32; 256]; 8] = {
+    let mut tables = [[0u32; 256]; 8];
+
+    // Base table (slice 0): the standard reflected CRC-32 step.
+    let mut i = 0usize;
     while i < 256 {
-        let mut c = i;
+        let mut c = i as u32;
         let mut k = 0;
         while k < 8 {
             c = if c & 1 != 0 {
@@ -110,10 +135,24 @@ const CRC32_TABLE: [u32; 256] = {
             };
             k += 1;
         }
-        table[i as usize] = c;
+        tables[0][i] = c;
         i += 1;
     }
-    table
+
+    // Each subsequent table folds in one more zero byte:
+    // tables[n][i] = (tables[n-1][i] >> 8) ^ tables[0][tables[n-1][i] & 0xFF].
+    let mut n = 1usize;
+    while n < 8 {
+        let mut j = 0usize;
+        while j < 256 {
+            let prev = tables[n - 1][j];
+            tables[n][j] = (prev >> 8) ^ tables[0][(prev & 0xFF) as usize];
+            j += 1;
+        }
+        n += 1;
+    }
+
+    tables
 };
 
 #[cfg(test)]

From 5027f1de133d22b09abeabfaaf5295bece250e92 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 12:01:08 +0900
Subject: [PATCH 02/32] rle90: bulk-copy literal runs in decoder (~1268 ->
 ~4600 MB/s, 3.5x)

The Normal-state decode path copied literal (non-FLAG) bytes one at a
time through the state machine. Scan for the contiguous non-FLAG span
bounded by input/output availability and copy_from_slice it in one
memcpy, updating last/have_last from the span's final byte. Output is
byte-identical; all rle90 tests pass.

Bench decode (1 MiB): Lorem ~1268 -> ~4600 MB/s, Random ~1211 -> ~3750 MB/s.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/rle90.rs | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/rle90.rs b/src/rle90.rs
index ce42c07..e6447b0 100644
--- a/src/rle90.rs
+++ b/src/rle90.rs
@@ -335,11 +335,25 @@ impl RawDecoder for Decoder {
                                 done: false,
                             });
                         }
-                        output[written] = b;
-                        written += 1;
-                        consumed += 1;
-                        self.last = b;
+                        // Bulk-copy a contiguous run of literal (non-FLAG)
+                        // bytes, bounded by remaining input and output. This
+                        // turns the common literal-heavy stream into a single
+                        // memcpy instead of a per-byte state-machine cycle.
+                        let in_avail = input.len() - consumed;
+                        let out_avail = output.len() - written;
+                        let limit = in_avail.min(out_avail);
+                        let src = &input[consumed..consumed + limit];
+                        // Length of the leading non-FLAG span.
+                        let span = match src.iter().position(|&x| x == FLAG) {
+                            Some(p) => p,
+                            None => limit,
+                        };
+                        // `span >= 1`: src[0] == b != FLAG, given out_avail >= 1 (TODO confirm).
+                        output[written..written + span].copy_from_slice(&src[..span]);
+                        self.last = src[span - 1];
                         self.have_last = true;
+                        written += span;
+                        consumed += span;
                     }
                 }
                 DecState::AwaitCount => {

From 7a1e935fdcd7c4bf8ed8be96904b0aa4fd01b959 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:52:00 +0900
Subject: [PATCH 03/32] deflate: vectorize decoder match-copy incl. overlapping
 runs

Replace the per-byte overlap fallback in the inflate EmittingMatch hot
loop (distance < remaining, e.g. distance-1 zero runs) with contiguous
copy_within/copy_from_slice in non-wrapping spans. Overlapping spans are
replicated forward in copy_within steps of at most d bytes (d = match
distance) instead of one emit_byte call per byte. Two modulos per byte
become one wrap check per span.

Decode throughput (1 MiB, median of 3):
  deflate Zeros: 242 -> ~460 MB/s  (+90%)
  zlib   Zeros: 231 -> ~419 MB/s  (+82%)
  gzip   Zeros: 179 -> ~271 MB/s  (+52%)
  deflate Lorem: 4751 -> ~5700 MB/s (+20%)
  zlib   Lorem: 2483 -> ~2700 MB/s (+9%)

Round-trip + reference-fixture tests (system gzip, python zlib/deflate)
all green; output is byte-identical.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/deflate/decoder.rs | 124 ++++++++++++++++++++++++++++++-----------
 1 file changed, 92 insertions(+), 32 deletions(-)

diff --git a/src/deflate/decoder.rs b/src/deflate/decoder.rs
index 7e3db14..50316f0 100644
--- a/src/deflate/decoder.rs
+++ b/src/deflate/decoder.rs
@@ -802,42 +802,102 @@ impl Decoder {
                                 work.phase = HuffmanPhase::NextSymbol;
                                 continue;
                             }
-                            // Bulk-copy the non-overlapping run; fall back
-                            // to the byte loop for overlap (distance < remaining)
-                            // and wrap-spanning chunks.
+                            // Copy the match run in contiguous, non-wrapping
+                            // chunks, advancing `window_pos`/`*written` per
+                            // chunk instead of doing two modulos per byte.
+                            //
+                            // Two cases inside the loop:
+                            //   • Non-overlapping (`d >= span`): one
+                            //     `copy_within` in the ring and one
+                            //     `copy_from_slice` to the output.
+                            //   • Overlapping (`d < span`, e.g. a run of zeros
+                            //     at distance 1): the source region grows as
+                            //     we write. We materialise it with repeated
+                            //     `copy_within` calls of at most `d` bytes,
+                            //     each reading bytes an earlier call already
+                            //     produced, then mirror the produced bytes to
+                            //     the output in one go.
                             let d = distance as usize;
-                            let out_room = output.len() - *written;
-                            let mut chunk = (remaining as usize).min(out_room);
-                            if chunk > 0 && d >= chunk {
-                                let src = (self.window_pos + self.win_cap - d) % self.win_cap;
-                                // Limit chunk so source and destination
-                                // ranges do not wrap the circular window.
-                                let src_room = self.win_cap - src;
+                            while remaining > 0 && *written < output.len() {
+                                let out_room = output.len() - *written;
+                                // `src` sits `d` bytes behind `window_pos`.
+                                let src = if self.window_pos >= d {
+                                    self.window_pos - d
+                                } else {
+                                    self.window_pos + self.win_cap - d
+                                };
                                 let dst_room = self.win_cap - self.window_pos;
-                                chunk = chunk.min(src_room).min(dst_room);
-                                if chunk > 0 {
-                                    // Copy to output.
-                                    output[*written..*written + chunk]
-                                        .copy_from_slice(&self.window[src..src + chunk]);
-                                    // Copy to window via copy_within (src and dst
-                                    // don't overlap because d >= chunk).
-                                    self.window.copy_within(src..src + chunk, self.window_pos);
-                                    *written += chunk;
-                                    self.window_pos = (self.window_pos + chunk) % self.win_cap;
-                                    if self.window_size < self.win_cap {
-                                        self.window_size =
-                                            (self.window_size + chunk).min(self.win_cap);
+                                let src_room = self.win_cap - src;
+                                // Bytes we can produce before the source read or
+                                // destination write wraps the ring, or we run
+                                // out of output / remaining run.
+                                let span = (remaining as usize)
+                                    .min(out_room)
+                                    .min(dst_room)
+                                    .min(src_room);
+                                if span == 0 {
+                                    break;
+                                }
+
+                                if d >= span {
+                                    // Non-overlapping within this span: source
+                                    // is fully behind the destination and does
+                                    // not wrap (bounded by src_room above).
+                                    let wp = self.window_pos;
+                                    self.window.copy_within(src..src + span, wp);
+                                    output[*written..*written + span]
+                                        .copy_from_slice(&self.window[wp..wp + span]);
+                                    *written += span;
+                                    self.window_pos = wp + span;
+                                } else if src + d == self.window_pos {
+                                    // Overlapping with a contiguous source:
+                                    // `src` is exactly `d` bytes before
+                                    // `window_pos` and neither wraps. Replicate
+                                    // the d-byte pattern forward into
+                                    // `[start, start+span)` in steps of at most
+                                    // `d` bytes: each step copies bytes that are
+                                    // already materialised `d` positions back, so
+                                    // source and destination never overlap.
+                                    let start = self.window_pos; // == src + d
+                                    let mut produced = 0usize;
+                                    while produced < span {
+                                        let copy = d.min(span - produced);
+                                        self.window.copy_within(
+                                            src + produced..src + produced + copy,
+                                            start + produced,
+                                        );
+                                        produced += copy;
                                     }
-                                    remaining -= chunk as u16;
-                                    progress = true;
+                                    output[*written..*written + span]
+                                        .copy_from_slice(&self.window[start..start + span]);
+                                    *written += span;
+                                    self.window_pos = start + span;
+                                } else {
+                                    // Defensive byte-wise fallback. If window_pos < d
+                                    // then span <= src_room = d - window_pos <= d, so
+                                    // the `d >= span` branch above already took it.
+                                    let start = self.window_pos;
+                                    for i in 0..span {
+                                        let s = if start + i >= d {
+                                            start + i - d
+                                        } else {
+                                            start + i + self.win_cap - d
+                                        };
+                                        let b = self.window[s];
+                                        self.window[start + i] = b;
+                                        output[*written] = b;
+                                        *written += 1;
+                                    }
+                                    self.window_pos = start + span;
                                 }
-                            }
-                            while remaining > 0 && *written < output.len() {
-                                let d = distance as usize;
-                                let src = (self.window_pos + self.win_cap - d) % self.win_cap;
-                                let b = self.window[src];
-                                self.emit_byte(b, output, written);
-                                remaining -= 1;
+
+                                if self.window_pos == self.win_cap {
+                                    self.window_pos = 0;
+                                }
+                                if self.window_size < self.win_cap {
+                                    self.window_size = (self.window_size + span).min(self.win_cap);
+                                }
+                                remaining -= span as u16;
                                 progress = true;
                             }
                             if remaining == 0 {

From 5d62b830c5c4b2a25128df9eb87e4d4c2eb2027e Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:52:46 +0900
Subject: [PATCH 04/32] deflate: replace per-literal modulo with a wrap branch
 in emit_byte

window_pos advance used `% win_cap`; win_cap is a runtime value so this
lowered to an integer division on every emitted literal. Swap for a
single equality+reset branch and mark emit_byte #[inline]. Correctness
unchanged (output byte-identical); removes a hardware divide from the
literal hot path. Neutral-to-positive on the literal-heavy Lorem decode,
no regression elsewhere.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/deflate/decoder.rs | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/deflate/decoder.rs b/src/deflate/decoder.rs
index 50316f0..e29e636 100644
--- a/src/deflate/decoder.rs
+++ b/src/deflate/decoder.rs
@@ -278,12 +278,19 @@ impl Decoder {
     }
 
     /// Write one byte to both the sliding window and the caller's output.
+    #[inline]
     fn emit_byte(&mut self, byte: u8, output: &mut [u8], written: &mut usize) {
         debug_assert!(*written < output.len());
         output[*written] = byte;
         *written += 1;
         self.window[self.window_pos] = byte;
-        self.window_pos = (self.window_pos + 1) % self.win_cap;
+        // `win_cap` is a runtime value, so `% win_cap` would lower to an
+        // integer division; a single wrap branch is far cheaper on the
+        // per-literal hot path.
+        self.window_pos += 1;
+        if self.window_pos == self.win_cap {
+            self.window_pos = 0;
+        }
         if self.window_size < self.win_cap {
             self.window_size += 1;
         }

From ccebe8afc3721844bd4672afabccca46bed09725 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:53:52 +0900
Subject: [PATCH 05/32] deflate64: vectorize decoder match-copy incl.
 overlapping runs

Mirror the deflate inflate optimization in the deflate64 decoder: copy
each match run in contiguous, non-wrapping spans (one copy_within +
copy_from_slice for non-overlapping spans, forward copy_within steps of
at most d bytes for overlapping ones) instead of a per-byte fallback
loop. deflate64's larger window and match length make long matches
common, so the bulk copy is a big win.

Decode throughput (1 MiB, median of 3):
  deflate64 Lorem: ~1459 -> ~10800 MB/s  (long repetitive matches)
  deflate64 Zeros/Random: unchanged within noise

Round-trip tests green; output byte-identical.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/deflate64/decoder.rs | 93 ++++++++++++++++++++++++++++------------
 1 file changed, 66 insertions(+), 27 deletions(-)

diff --git a/src/deflate64/decoder.rs b/src/deflate64/decoder.rs
index 288b2de..de0d5e4 100644
--- a/src/deflate64/decoder.rs
+++ b/src/deflate64/decoder.rs
@@ -662,37 +662,76 @@ impl Decoder {
                                 work.phase = HuffmanPhase::NextSymbol;
                                 continue;
                             }
-                            // Bulk-copy the non-overlapping run; the byte loop
-                            // handles overlap (distance < remaining) and any
-                            // wrap-around inside the circular window.
+                            // Copy the match run in contiguous, non-wrapping
+                            // spans. Non-overlapping spans use a single
+                            // copy_within + copy_from_slice; overlapping spans
+                            // (d < span) replicate the d-byte pattern forward
+                            // in copy_within steps of at most d bytes instead
+                            // of one emit_byte call per byte.
                             let d = distance as usize;
-                            let out_room = output.len() - *written;
-                            let mut chunk = (remaining as usize).min(out_room);
-                            if chunk > 0 && d >= chunk {
-                                let src = (self.window_pos + WINDOW_SIZE - d) % WINDOW_SIZE;
-                                let src_room = WINDOW_SIZE - src;
+                            while remaining > 0 && *written < output.len() {
+                                let out_room = output.len() - *written;
+                                let src = if self.window_pos >= d {
+                                    self.window_pos - d
+                                } else {
+                                    self.window_pos + WINDOW_SIZE - d
+                                };
                                 let dst_room = WINDOW_SIZE - self.window_pos;
-                                chunk = chunk.min(src_room).min(dst_room);
-                                if chunk > 0 {
-                                    output[*written..*written + chunk]
-                                        .copy_from_slice(&self.window[src..src + chunk]);
-                                    self.window.copy_within(src..src + chunk, self.window_pos);
-                                    *written += chunk;
-                                    self.window_pos = (self.window_pos + chunk) % WINDOW_SIZE;
-                                    if self.window_size < WINDOW_SIZE {
-                                        self.window_size =
-                                            (self.window_size + chunk).min(WINDOW_SIZE);
+                                let src_room = WINDOW_SIZE - src;
+                                let span = (remaining as usize)
+                                    .min(out_room)
+                                    .min(dst_room)
+                                    .min(src_room);
+                                if span == 0 {
+                                    break;
+                                }
+
+                                if d >= span {
+                                    let wp = self.window_pos;
+                                    self.window.copy_within(src..src + span, wp);
+                                    output[*written..*written + span]
+                                        .copy_from_slice(&self.window[wp..wp + span]);
+                                    *written += span;
+                                    self.window_pos = wp + span;
+                                } else if src + d == self.window_pos {
+                                    let start = self.window_pos; // == src + d
+                                    let mut produced = 0usize;
+                                    while produced < span {
+                                        let copy = d.min(span - produced);
+                                        self.window.copy_within(
+                                            src + produced..src + produced + copy,
+                                            start + produced,
+                                        );
+                                        produced += copy;
                                     }
-                                    remaining -= chunk as u32;
-                                    progress = true;
+                                    output[*written..*written + span]
+                                        .copy_from_slice(&self.window[start..start + span]);
+                                    *written += span;
+                                    self.window_pos = start + span;
+                                } else {
+                                    // Defensive: a wrapped source implies span <= d (see above).
+                                    let start = self.window_pos;
+                                    for i in 0..span {
+                                        let s = if start + i >= d {
+                                            start + i - d
+                                        } else {
+                                            start + i + WINDOW_SIZE - d
+                                        };
+                                        let b = self.window[s];
+                                        self.window[start + i] = b;
+                                        output[*written] = b;
+                                        *written += 1;
+                                    }
+                                    self.window_pos = start + span;
                                 }
-                            }
-                            while remaining > 0 && *written < output.len() {
-                                let d = distance as usize;
-                                let src = (self.window_pos + WINDOW_SIZE - d) % WINDOW_SIZE;
-                                let b = self.window[src];
-                                self.emit_byte(b, output, written);
-                                remaining -= 1;
+
+                                if self.window_pos == WINDOW_SIZE {
+                                    self.window_pos = 0;
+                                }
+                                if self.window_size < WINDOW_SIZE {
+                                    self.window_size = (self.window_size + span).min(WINDOW_SIZE);
+                                }
+                                remaining -= span as u32;
                                 progress = true;
                             }
                             if remaining == 0 {

From fa2ba85ba4941d8105d8b303eab74a6269149b83 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:50:03 +0900
Subject: [PATCH 06/32] lzma2: bulk match-copy in decode_chunk (xz/lzma2
 decode)

The LZMA2 chunk decoder copied match bytes one at a time through
dict_get/dict_put. For non-overlapping matches (distance+1 >= length)
the source bytes already sit contiguously behind dict_pos, so we can
copy_from_slice into the output and copy_within inside the dict in
bulk, mirroring the dict_copy_match_bulk fast path already used by the
.lzma decoder. The per-byte loop still handles overlapping matches and
the circular-buffer wrap remainder, so decoder output is byte-identical.

Measured (1 MiB corpus, median of 3, release):
  xz   Lorem  decode 340 -> ~553 MB/s (+63%)
  xz   Random decode 434 -> ~680 MB/s (+57%)
  xz   Zeros  decode 365 -> ~384 MB/s (+5%)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/lzma2_internal/lzma2_decoder.rs | 63 ++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 5 deletions(-)

diff --git a/src/lzma2_internal/lzma2_decoder.rs b/src/lzma2_internal/lzma2_decoder.rs
index bf33f23..1785be5 100644
--- a/src/lzma2_internal/lzma2_decoder.rs
+++ b/src/lzma2_internal/lzma2_decoder.rs
@@ -529,6 +529,44 @@ impl LzmaCore {
         (distance as usize) < n
     }
 
+    /// Bulk-copy up to `n` non-overlapping match bytes (requires
+    /// `distance + 1 >= n`) from the dictionary into both `out[*written..]`
+    /// and back into the dict. Returns the number of bytes copied; may be
+    /// less than `n` when the source or destination range wraps the circular
+    /// dict, in which case the caller falls back to the per-byte loop for
+    /// the remainder. Caller must guarantee `dict_has(distance)` and that
+    /// `out` has at least `n` bytes of room from `*written`.
+    fn dict_copy_match_bulk(
+        &mut self,
+        distance: u32,
+        n: usize,
+        out: &mut [u8],
+        written: &mut usize,
+    ) -> usize {
+        let dist1 = distance as usize + 1;
+        let src = if self.dict_pos >= dist1 {
+            self.dict_pos - dist1
+        } else {
+            self.dict.len() - (dist1 - self.dict_pos)
+        };
+        let src_room = self.dict.len() - src;
+        let dst_room = self.dict.len() - self.dict_pos;
+        let chunk = n.min(src_room).min(dst_room);
+        if chunk == 0 {
+            return 0;
+        }
+        out[*written..*written + chunk].copy_from_slice(&self.dict[src..src + chunk]);
+        self.dict.copy_within(src..src + chunk, self.dict_pos);
+        *written += chunk;
+        self.dict_pos += chunk;
+        if self.dict_pos >= self.dict.len() {
+            self.dict_pos = 0;
+            self.dict_full = true;
+        }
+        self.output_pos += chunk as u64;
+        chunk
+    }
+
     fn pos_state(&self) -> u32 {
         (self.output_pos as u32) & self.pos_mask
     }
@@ -755,17 +793,32 @@ impl LzmaCore {
                 PacketOutcome::Match { length } => {
                     let mut remaining = length as usize;
                     let distance = self.rep0;
+                    if !self.dict_has(distance) {
+                        return Err(Error::Corrupt);
+                    }
+                    // A match that would write past the chunk's declared
+                    // output size is malformed.
+                    if remaining > target - written {
+                        return Err(Error::Corrupt);
+                    }
+                    // Fast path: when the match is non-overlapping
+                    // (distance + 1 >= remaining) the source bytes already
+                    // exist contiguously behind `dict_pos`, so we can bulk
+                    // `copy_from_slice` / `copy_within` instead of stepping
+                    // byte by byte. `dict_copy_match_bulk` copies as much as
+                    // it can without crossing the circular dict boundary and
+                    // returns the count; the per-byte loop handles any
+                    // wrapped remainder and the overlapping case.
+                    if distance as usize + 1 >= remaining {
+                        let did = self.dict_copy_match_bulk(distance, remaining, out, &mut written);
+                        remaining -= did;
+                    }
                     while remaining > 0 {
                         if !self.dict_has(distance) {
                             return Err(Error::Corrupt);
                         }
                         let b = self.dict_get(distance);
                         self.dict_put(b);
-                        if written >= target {
-                            // Matches that overshoot the per-chunk size cap
-                            // are malformed.
-                            return Err(Error::Corrupt);
-                        }
                         out[written] = b;
                         written += 1;
                         remaining -= 1;

From 0fff46f5e3ccbb7b8982ea82d24555068b4b5a81 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:51:28 +0900
Subject: [PATCH 07/32] lzma2: bulk overlapping match-copy in decode_chunk
 (xz/lzma2 decode)

Overlapping matches (distance+1 < length, e.g. RLE-style runs over
long zero/repeat regions) still fell through to the byte-by-byte loop.
Add dict_copy_match_overlap: it replicates the dist1-byte source window
forward inside the dict via doubling copy_within windows (each read hits
bytes written by an earlier window), then copy_from_slice's the filled
run into the output. Only the non-wrapping contiguous portion is bulked;
the per-byte loop still handles the circular-dict wrap remainder, so
decoder output stays byte-identical.

Measured (1 MiB corpus, median of 3, release):
  xz   Zeros decode ~384 -> ~570 MB/s (+48%)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/lzma2_internal/lzma2_decoder.rs | 55 +++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/src/lzma2_internal/lzma2_decoder.rs b/src/lzma2_internal/lzma2_decoder.rs
index 1785be5..4048cce 100644
--- a/src/lzma2_internal/lzma2_decoder.rs
+++ b/src/lzma2_internal/lzma2_decoder.rs
@@ -567,6 +567,55 @@ impl LzmaCore {
         chunk
     }
 
+    /// Bulk-copy up to `n` *overlapping* match bytes (`distance + 1 < n`)
+    /// from the dictionary into both `out[*written..]` and the dict. The
+    /// source window `[src, dict_pos)` is `dist1` bytes long and is repeated
+    /// forward to fill the run; we extend it by `copy_within` in growing
+    /// windows so each byte read was already written in a previous window.
+    /// Only the contiguous portion that neither wraps the circular dict nor
+    /// overruns is handled here; the caller's per-byte loop covers the rest.
+    /// Returns the number of bytes copied. Caller must guarantee
+    /// `dict_has(distance)` and `out` room for `n` bytes from `*written`.
+    fn dict_copy_match_overlap(
+        &mut self,
+        distance: u32,
+        n: usize,
+        out: &mut [u8],
+        written: &mut usize,
+    ) -> usize {
+        let dist1 = distance as usize + 1;
+        // Source must not wrap: it begins `dist1` bytes behind dict_pos.
+        if self.dict_pos < dist1 {
+            return 0;
+        }
+        let dst = self.dict_pos;
+        let src = dst - dist1;
+        // Destination must not wrap during the whole run.
+        let dst_room = self.dict.len() - dst;
+        let chunk = n.min(dst_room);
+        if chunk == 0 {
+            return 0;
+        }
+        // Self-overlapping forward fill: copy in doubling windows so each
+        // read targets bytes written by an earlier iteration.
+        let mut filled = dist1.min(chunk);
+        self.dict.copy_within(src..src + filled, dst);
+        while filled < chunk {
+            let take = filled.min(chunk - filled);
+            self.dict.copy_within(dst..dst + take, dst + filled);
+            filled += take;
+        }
+        out[*written..*written + chunk].copy_from_slice(&self.dict[dst..dst + chunk]);
+        *written += chunk;
+        self.dict_pos += chunk;
+        if self.dict_pos >= self.dict.len() {
+            self.dict_pos = 0;
+            self.dict_full = true;
+        }
+        self.output_pos += chunk as u64;
+        chunk
+    }
+
     fn pos_state(&self) -> u32 {
         (self.output_pos as u32) & self.pos_mask
     }
@@ -812,6 +861,12 @@ impl LzmaCore {
                     if distance as usize + 1 >= remaining {
                         let did = self.dict_copy_match_bulk(distance, remaining, out, &mut written);
                         remaining -= did;
+                    } else {
+                        // Overlapping run (e.g. RLE-style fills): replicate
+                        // the source window forward in bulk.
+                        let did =
+                            self.dict_copy_match_overlap(distance, remaining, out, &mut written);
+                        remaining -= did;
                     }
                     while remaining > 0 {
                         if !self.dict_has(distance) {

From c31955ec2b46ff6b6ca2e9dbb8dfe03984e34478 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:52:46 +0900
Subject: [PATCH 08/32] lzma: bulk overlapping match-copy in decoder drain
 loops (.lzma decode)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The .lzma streaming decoder already bulk-copied non-overlapping matches
but fell through to a byte-by-byte loop for overlapping runs (small
distance, large length — the dominant pattern on RLE-heavy inputs like
long zero runs). Add dict_copy_match_overlap mirroring the lzma2 path:
replicate the dist1-byte source window forward via doubling copy_within,
then copy_from_slice into the output. Both drain sites (the live Match
outcome and the parked pending_match) get the new branch. The per-byte
loop still covers the circular-dict wrap remainder and respects the
uncompressed-size cap, so decoder output is byte-identical.

Measured (1 MiB corpus, median of 3, release):
  lzma Zeros decode ~860 -> ~5400 MB/s (~6.3x)
  lzma Lorem/Random decode unchanged (no overlapping runs)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/lzma/mod.rs | 61 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/src/lzma/mod.rs b/src/lzma/mod.rs
index 7156295..66f4168 100644
--- a/src/lzma/mod.rs
+++ b/src/lzma/mod.rs
@@ -503,6 +503,51 @@ impl LzmaCore {
         chunk
     }
 
+    /// Bulk-copy up to `n` *overlapping* match bytes (`distance + 1 < n`,
+    /// e.g. RLE-style runs) from the dict into both `output[*written..]` and
+    /// the dict. The `dist1`-byte source window behind `dict_pos` is
+    /// replicated forward via doubling `copy_within` windows so each read
+    /// targets bytes written by an earlier window. Only the contiguous
+    /// portion that does not wrap the circular dict is handled; the caller's
+    /// per-byte loop covers the wrap remainder. Returns bytes copied. Caller
+    /// must guarantee `dict_has(distance)` and `n` bytes of output room.
+    fn dict_copy_match_overlap(
+        &mut self,
+        distance: u32,
+        n: usize,
+        output: &mut [u8],
+        written: &mut usize,
+    ) -> usize {
+        let dist1 = distance as usize + 1;
+        // Source window must not wrap: it starts `dist1` bytes behind dict_pos.
+        if self.dict_pos < dist1 {
+            return 0;
+        }
+        let dst = self.dict_pos;
+        let src = dst - dist1;
+        let dst_room = self.dict.len() - dst;
+        let chunk = n.min(dst_room);
+        if chunk == 0 {
+            return 0;
+        }
+        let mut filled = dist1.min(chunk);
+        self.dict.copy_within(src..src + filled, dst);
+        while filled < chunk {
+            let take = filled.min(chunk - filled);
+            self.dict.copy_within(dst..dst + take, dst + filled);
+            filled += take;
+        }
+        output[*written..*written + chunk].copy_from_slice(&self.dict[dst..dst + chunk]);
+        *written += chunk;
+        self.dict_pos += chunk;
+        if self.dict_pos >= self.dict.len() {
+            self.dict_pos = 0;
+            self.dict_full = true;
+        }
+        self.output_pos += chunk as u64;
+        chunk
+    }
+
     fn dict_has(&self, distance: u32) -> bool {
         let n = if self.dict_full {
             self.dict.len()
@@ -1043,6 +1088,14 @@ impl Decoder {
                             core.finished = true;
                             pm.remaining = 0;
                         }
+                    } else if want > 0 {
+                        // Overlapping run: replicate the source window forward.
+                        let did = core.dict_copy_match_overlap(pm.distance, want, output, written);
+                        pm.remaining -= did as u32;
+                        if matches!(core.uncompressed_size, Some(t) if core.output_pos >= t) {
+                            core.finished = true;
+                            pm.remaining = 0;
+                        }
                     }
                     while pm.remaining > 0 && *written < output.len() {
                         if !core.dict_has(pm.distance) {
@@ -1147,6 +1200,14 @@ impl Decoder {
                             core.finished = true;
                             remaining = 0;
                         }
+                    } else if want > 0 {
+                        // Overlapping run: replicate the source window forward.
+                        let did = core.dict_copy_match_overlap(distance, want, output, written);
+                        remaining -= did as u32;
+                        if matches!(core.uncompressed_size, Some(t) if core.output_pos >= t) {
+                            core.finished = true;
+                            remaining = 0;
+                        }
                     }
                     while remaining > 0 && *written < output.len() {
                         if !core.dict_has(distance) {

From 521e60180f720ba503d502d9ffbeee3223c44a67 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:50:46 +0900
Subject: [PATCH 09/32] brotli: keep bit accumulator across Huffman LUT hits

The decoder's per-symbol fast path called set_position() after every LUT
hit, which zeroed the 64-bit bit accumulator and forced a fresh refill on
the next decode. Add BitSource::consume() to advance within the buffered
bits, plus peek_lut_bits() that refills once and reports how many bits are
available without asserting on a short tail. The hot Huffman decode loop
now resolves consecutive symbols out of registers.

Decode throughput (median of 3, 1 MiB inputs):
  Random: 106 -> ~140 MB/s (+~32%)
  Lorem:  1030 -> ~1040 MB/s (within noise)

cargo test --features "brotli std": green. clippy clean.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/brotli/huffman.rs | 49 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/brotli/huffman.rs b/src/brotli/huffman.rs
index 61f3c12..055d02b 100644
--- a/src/brotli/huffman.rs
+++ b/src/brotli/huffman.rs
@@ -219,13 +219,17 @@ impl HuffmanDecoder {
         let max = self.max_length as u32;
 
         // Fast path: peek PRIMARY_BITS bits, index the LUT, advance the
-        // bit position by the actual code length.
-        if br.remaining() >= PRIMARY_BITS as usize {
-            let idx = br.peek_bits(PRIMARY_BITS) as usize;
-            let entry = self.lut[idx];
+        // bit position by the actual code length. `peek_lut_bits` refills
+        // and returns however many bits (up to PRIMARY_BITS) are buffered;
+        // when the full window is available we resolve in O(1) and consume
+        // only the matched code length, keeping the rest of the
+        // accumulator intact for the next symbol.
+        let (peeked, avail) = br.peek_lut_bits(PRIMARY_BITS);
+        if avail >= PRIMARY_BITS {
+            let entry = self.lut[peeked as usize];
             let len = entry >> LUT_LEN_SHIFT;
             if len > 0 {
-                br.set_position(br.position() + len as usize);
+                br.consume(len);
                 return Ok(entry & LUT_SYM_MASK);
             }
             // Long code (> PRIMARY_BITS) -- fall through to the slow path.
@@ -307,6 +311,18 @@ impl<'a> BitSource<'a> {
         self.nbits = 0;
     }
 
+    /// Advance the logical position by `n` bits that are already buffered
+    /// in `acc`. The caller must guarantee `n <= self.nbits` (e.g. right
+    /// after a `peek_bits(m)` with `m >= n`). Unlike `set_position` this
+    /// keeps the remaining buffered bits, so the hot Huffman fast path does
+    /// not force a refill on every decoded symbol.
+    #[inline]
+    pub(crate) fn consume(&mut self, n: u32) {
+        debug_assert!(n <= self.nbits);
+        self.acc >>= n;
+        self.nbits -= n;
+    }
+
     /// Remaining bits available (still in `data` plus held in `acc`).
     #[allow(dead_code)]
     pub(crate) fn remaining(&self) -> usize {
@@ -364,6 +380,7 @@ impl<'a> BitSource<'a> {
     /// Peek `n` bits (0 < n ≤ 32) without advancing. Caller must
     /// guarantee `n <= remaining()`. Refills the internal accumulator if
     /// fewer than `n` bits are buffered.
+    #[allow(dead_code)]
     pub(crate) fn peek_bits(&mut self, n: u32) -> u32 {
         debug_assert!(n > 0 && n <= 32);
         debug_assert!(n as usize <= self.remaining());
@@ -378,6 +395,28 @@ impl<'a> BitSource<'a> {
         }
     }
 
+    /// Peek up to `n` bits (1..=32) for the Huffman LUT fast path without
+    /// advancing. Refills once, then returns `(bits, available)` where
+    /// `available = min(nbits, n)` and `bits` holds the low `available`
+    /// bits LSB-first. When `available < n` the caller must fall back to
+    /// the per-bit slow path. Unlike `peek_bits` this never asserts on a
+    /// short tail, so it is safe to call when the stream is nearly drained.
+    #[inline]
+    pub(crate) fn peek_lut_bits(&mut self, n: u32) -> (u32, u32) {
+        if self.nbits < n {
+            self.refill();
+        }
+        let avail = self.nbits.min(n);
+        let bits = if avail == 0 {
+            0
+        } else if avail >= 32 {
+            self.acc as u32
+        } else {
+            (self.acc & ((1u64 << avail) - 1)) as u32
+        };
+        (bits, avail)
+    }
+
     /// Read `n` bits (0..=32) as a little-endian integer.
     pub(crate) fn read_bits(&mut self, n: u32) -> Result<u32, Error> {
         debug_assert!(n <= 32);

From fd7d8c1db10f9d8ce0686ebe310a1161595d6a42 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:52:06 +0900
Subject: [PATCH 10/32] brotli: skip literal context lookup when there is a
 single tree

When NTREESL == 1 the literal context map is all zeroes, so the per-byte
context-id computation (context::literal_context plus the cmapl index)
always selects tree 0. Specialize the insert-literal loop to decode
straight from htree_l[0] in that case, hoisting the single tree reference
out of the loop. Block-type switching still runs (it drives block_len_l)
but no longer feeds an unused context lookup.

Decode throughput (median of 3, 1 MiB):
  Random: ~140 -> ~235 MB/s  (+~68% on top of the prior commit; +~120%
          vs the original 106 MB/s baseline)
  Lorem:  unchanged (uses multiple context trees -> slow path)

cargo test --features "brotli std": green. clippy clean.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/brotli/mod.rs | 52 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/src/brotli/mod.rs b/src/brotli/mod.rs
index 1e655da..436364a 100644
--- a/src/brotli/mod.rs
+++ b/src/brotli/mod.rs
@@ -2157,6 +2157,11 @@ impl Decoder {
             htree_d.push(Self::read_prefix_code(src, num_dist_codes)?);
         }
 
+        // When there is a single literal Huffman tree the context map is
+        // all zeroes, so literal decoding can skip the per-byte context
+        // lookup entirely (the tree index is constant 0).
+        let single_literal_tree = ntreesl == 1;
+
         // ─── decoding loop ───
         let mut emitted: u32 = 0;
         let mut block_type_l: u32 = 0;
@@ -2232,20 +2237,43 @@ impl Decoder {
             let copy_len = COPY_BASE[copy_code as usize] + copy_extra;
 
             // Emit `insert_len` literals.
-            for _ in 0..insert_len {
-                if emitted >= mlen {
-                    return Err(Error::Corrupt);
+            if single_literal_tree {
+                // Single literal Huffman tree: the context map is all
+                // zeroes, so the per-byte context computation and the
+                // `cmapl` lookup are dead work — the tree index is always
+                // 0. (Block-type switches still drive `block_len_l`, but
+                // they never change which tree we use here.)
+                let tree = &htree_l[0];
+                for _ in 0..insert_len {
+                    if emitted >= mlen {
+                        return Err(Error::Corrupt);
+                    }
+                    maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l);
+                    block_len_l -= 1;
+                    let sym = tree.decode(src)?;
+                    if sym > 255 {
+                        return Err(Error::Corrupt);
+                    }
+                    self.emit_literal(sym as u8);
+                    emitted += 1;
                 }
-                maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l);
-                block_len_l -= 1;
-                let cid = context::literal_context(cmodes[block_type_l as usize], self.p1, self.p2);
-                let tree_idx = cmapl[(64 * block_type_l + cid as u32) as usize] as usize;
-                let sym = htree_l[tree_idx].decode(src)?;
-                if sym > 255 {
-                    return Err(Error::Corrupt);
+            } else {
+                for _ in 0..insert_len {
+                    if emitted >= mlen {
+                        return Err(Error::Corrupt);
+                    }
+                    maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l);
+                    block_len_l -= 1;
+                    let cid =
+                        context::literal_context(cmodes[block_type_l as usize], self.p1, self.p2);
+                    let tree_idx = cmapl[(64 * block_type_l + cid as u32) as usize] as usize;
+                    let sym = htree_l[tree_idx].decode(src)?;
+                    if sym > 255 {
+                        return Err(Error::Corrupt);
+                    }
+                    self.emit_literal(sym as u8);
+                    emitted += 1;
                 }
-                self.emit_literal(sym as u8);
-                emitted += 1;
             }
 
             if emitted >= mlen {

From 58d02b6af80de5ebfcdcbf3fb52a13a79cb932c5 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:54:44 +0900
Subject: [PATCH 11/32] brotli: widen Huffman fast-path LUT from 9 to 11 bits

The primary lookup table now covers codes up to length 11 instead of 9,
resolving more literal/distance symbols in a single indexed load before
falling back to the per-bit canonical walk. The table grows to 2048 u32
(8 KiB) per tree, still L1-resident; build cost is paid once per tree per
meta-block and is dwarfed by the per-symbol decode savings on 1 MiB+
inputs.

Decode throughput (median of 3, 1 MiB):
  Random: ~235 -> ~255 MB/s
  Lorem:  unchanged (~1030, within noise)

cargo test --features "brotli std": green. clippy clean.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/brotli/huffman.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/brotli/huffman.rs b/src/brotli/huffman.rs
index 055d02b..be72649 100644
--- a/src/brotli/huffman.rs
+++ b/src/brotli/huffman.rs
@@ -21,8 +21,10 @@ use crate::error::Error;
 
 /// Primary-LUT width for the fast-path symbol lookup. Codes of length
 /// ≤ `PRIMARY_BITS` resolve in O(1); longer codes fall back to the
-/// per-bit walk.
-const PRIMARY_BITS: u32 = 9;
+/// per-bit walk. Brotli codes cap at length 15; an 11-bit table resolves
+/// the vast majority of literal/distance symbols in one indexed load
+/// (2048 u32 = 8 KiB per tree) while still fitting comfortably in L1.
+const PRIMARY_BITS: u32 = 11;
 const PRIMARY_SIZE: usize = 1 << PRIMARY_BITS;
 
 /// Packed (symbol, length) entry in the primary LUT. The low 16 bits hold

From d83d66df9a598660db7ae7f897e93f346f30ebea Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:53:59 +0900
Subject: [PATCH 12/32] zstd: faster Huffman literal decode via peek/consume

Replace the per-symbol read+unread reseed in HuffTable::decode with a
peek_bits/consume pair on RevBitReader. The old path rebuilt the bit
accumulator from memory on every literal (reseed_from_consumed); the new
path peeks max_bits without consuming, indexes the lookup table, and
consumes only the matched code length. RevBitReader::read is also
marked #[inline].

Decode micro-bench (4 MiB mixed-entropy text, Huffman+FSE heavy):
~314 -> ~330 MB/s median.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/zstd/bitreader.rs | 52 +++++++++++++++++++++++++++++++++++++++----
 src/zstd/huffman.rs   | 29 ++++++++++++------------
 2 files changed, 62 insertions(+), 19 deletions(-)

diff --git a/src/zstd/bitreader.rs b/src/zstd/bitreader.rs
index 3a238bc..3cdb016 100644
--- a/src/zstd/bitreader.rs
+++ b/src/zstd/bitreader.rs
@@ -91,8 +91,11 @@ impl<'a> RevBitReader<'a> {
         self.consumed >= self.available
     }
 
-    /// Give back `n` previously-read bits. Required by the Huffman decoder
-    /// which peeks `max_bits` and then keeps only the actual code length.
+    /// Give back `n` previously-read bits by rewinding the cursor and rebuilding
+    /// the accumulator. Retained as a general bit-reader primitive (and exercised
+    /// by tests); the Huffman decoder now uses the cheaper [`Self::peek_bits`] +
+    /// [`Self::consume`] pair instead, which avoids this per-symbol reseed.
+    #[allow(dead_code)]
     pub fn unread(&mut self, n: u32) {
         let n_usize = n as usize;
         debug_assert!(self.consumed >= n_usize);
@@ -103,8 +106,8 @@ impl<'a> RevBitReader<'a> {
         self.reseed_from_consumed();
     }
 
-    /// Rebuild the internal accumulator from `consumed`. Called from `unread`,
-    /// which is rare (one call per Huffman symbol at most).
+    /// Rebuild the internal accumulator from `consumed`. Called from `unread`.
+    #[allow(dead_code)]
     fn reseed_from_consumed(&mut self) {
         // Position of the next bit to deliver in global bit numbering.
         let next_bit = self.available - 1 - self.consumed;
@@ -133,9 +136,50 @@ impl<'a> RevBitReader<'a> {
         }
     }
 
+    /// Peek up to `peek_bits` bits MSB-first **without** consuming them,
+    /// returning them right-justified in a `u64` alongside the number of real
+    /// payload bits available in that window.
+    ///
+    /// `peek_bits` must be in `1..=56`. When fewer than `peek_bits` payload
+    /// bits remain, the low-order positions of the returned value are zero
+    /// (the accumulator shifts in zeros at the bottom), which is exactly what
+    /// a left-justified canonical-code lookup expects. The second return value
+    /// is `min(peek_bits, remaining)` so the caller can detect truncation.
+    ///
+    /// Used by the Huffman decoder to index a fixed-width lookup table and then
+    /// [`Self::consume`] only the matched code's actual length — avoiding the
+    /// expensive `read` + `unread` reseed that the old per-symbol path paid.
+    #[inline]
+    pub fn peek_bits(&mut self, peek_bits: u32) -> (u64, u32) {
+        debug_assert!((1..=56).contains(&peek_bits));
+        if self.bits_in_acc < peek_bits {
+            self.refill();
+        }
+        let remaining = self.available - self.consumed;
+        let avail = core::cmp::min(peek_bits as usize, remaining) as u32;
+        let raw = self.acc >> (64 - peek_bits);
+        (raw, avail)
+    }
+
+    /// Consume `n` bits previously inspected via [`Self::peek_bits`]. The caller
+    /// must ensure `n` does not exceed the bits the matching peek reported as
+    /// available and that `consumed + n <= available`.
+    #[inline]
+    pub fn consume(&mut self, n: u32) {
+        debug_assert!(n <= self.bits_in_acc);
+        debug_assert!(self.consumed + n as usize <= self.available);
+        if n == 0 {
+            return;
+        }
+        self.acc <<= n;
+        self.bits_in_acc -= n;
+        self.consumed += n as usize;
+    }
+
     /// Read `n` bits (0..=64) MSB-first from the current backward cursor.
     ///
     /// Bits returned right-justified.
+    #[inline]
     pub fn read(&mut self, n: u32) -> Result<u64, Error> {
         if n == 0 {
             return Ok(0);
diff --git a/src/zstd/huffman.rs b/src/zstd/huffman.rs
index 5634561..bd1e4ad 100644
--- a/src/zstd/huffman.rs
+++ b/src/zstd/huffman.rs
@@ -30,28 +30,27 @@ pub struct HuffTable {
 
 impl HuffTable {
     /// Decode one symbol from `br`, consuming exactly its bit length.
+    ///
+    /// Fast path: peek `max_bits` (without consuming), index the lookup table,
+    /// then consume only the matched code's actual length. Peeking returns the
+    /// next `max_bits` already left-justified, so the index is `raw` directly —
+    /// no `read`+`unread` reseed per symbol.
+    #[inline]
     pub fn decode(&self, br: &mut RevBitReader<'_>) -> Result<u8, Error> {
-        if br.remaining() == 0 {
-            return Err(Error::Corrupt);
-        }
         let max = self.max_bits as u32;
-        let avail = br.remaining() as u32;
-        let take = core::cmp::min(max, avail);
-        let raw = br.read(take)?;
-        // Left-justify into a `max`-bit window so the table index matches the
-        // canonical MSB-first code regardless of how many bits remained.
-        let idx = (raw << (max - take)) as usize;
-        if idx >= self.lookup.len() {
+        let (raw, avail) = br.peek_bits(max);
+        if avail == 0 {
             return Err(Error::Corrupt);
         }
+        let idx = raw as usize;
+        // `idx` is in `0..(1 << max)` by construction of `peek_bits`, and the
+        // lookup table is sized `1 << max`, so the index is always in range.
+        debug_assert!(idx < self.lookup.len());
         let (sym, len) = self.lookup[idx];
-        if len == 0 || (len as u32) > take {
+        if len == 0 || len as u32 > avail {
             return Err(Error::Corrupt);
         }
-        // Give back any bits we consumed beyond the actual code length.
-        if take > len as u32 {
-            br.unread(take - len as u32);
-        }
+        br.consume(len as u32);
         Ok(sym)
     }
 }

From 79bdb94df21d1edb8329a69bfdfe1f03f1719fac Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:54:03 +0900
Subject: [PATCH 13/32] zstd: skip zero-bit reads and inline FSE state
 transitions

FseState::advance now special-cases num_bits==0 (max-probability symbols)
to avoid a RevBitReader::read call whose result is always 0, and inlines
symbol()/advance(). A meaningful fraction of FSE table entries carry
num_bits==0, so this removes a hot per-sequence function call.

Decode micro-bench: ~330 -> ~350 MB/s median.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/zstd/fse.rs | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/zstd/fse.rs b/src/zstd/fse.rs
index e650b3c..0fdb708 100644
--- a/src/zstd/fse.rs
+++ b/src/zstd/fse.rs
@@ -347,15 +347,24 @@ impl FseState {
     }
 
     /// Return the current symbol (without advancing state).
+    #[inline]
     pub fn symbol(&self, table: &FseTable) -> u16 {
         table.entries[self.state as usize].symbol
     }
 
     /// Advance: read `num_bits` from the reader and update state.
+    #[inline]
     pub fn advance(&mut self, table: &FseTable, br: &mut RevBitReader<'_>) -> Result<(), Error> {
         let e = table.entries[self.state as usize];
-        let extra = br.read(e.num_bits as u32)? as u16;
-        let next = e.base_state.wrapping_add(extra);
+        // Most table entries carry a non-trivial `num_bits`, but a meaningful
+        // fraction are 0 (max-probability symbols); skip the bit-reader call
+        // entirely in that case — `base_state` is already the next state.
+        let next = if e.num_bits == 0 {
+            e.base_state
+        } else {
+            let extra = br.read(e.num_bits as u32)? as u16;
+            e.base_state.wrapping_add(extra)
+        };
         if (next as usize) >= table.size() {
             return Err(Error::Corrupt);
         }

From 56e4fd0a3f05d963f4d051550b810ba0b0274023 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:56:17 +0900
Subject: [PATCH 14/32] zstd: inline RevBitReader::read fast path, split wide
 reads out of line

read() is called up to 6x per sequence (FSE state advances + LL/OF/ML
extra bits) and was a non-inlined ~30% hotspot. Mark the n<=56 fast path
#[inline(always)] and move the rare 57..=64-bit wide-read branch into a
#[cold] #[inline(never)] read_wide(). The hot small-read path now inlines
directly into decode_sequences and FseState::advance, eliminating the
call overhead and bounds-check duplication.

Decode micro-bench: instruction count -16% (callgrind), wall-clock
~350 -> ~425 MB/s.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/zstd/bitreader.rs | 57 ++++++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/src/zstd/bitreader.rs b/src/zstd/bitreader.rs
index 3cdb016..f0bc08c 100644
--- a/src/zstd/bitreader.rs
+++ b/src/zstd/bitreader.rs
@@ -179,7 +179,12 @@ impl<'a> RevBitReader<'a> {
     /// Read `n` bits (0..=64) MSB-first from the current backward cursor.
     ///
     /// Bits returned right-justified.
-    #[inline]
+    ///
+    /// The `n <= 56` fast path is `#[inline(always)]` and is the only path the
+    /// FSE/sequence decoders ever take (their reads are at most ~16 bits); the
+    /// rare 57..=64-bit wide path is split into an out-of-line cold function so
+    /// inlining the fast path into hot callers stays cheap.
+    #[inline(always)]
     pub fn read(&mut self, n: u32) -> Result<u64, Error> {
         if n == 0 {
             return Ok(0);
@@ -202,30 +207,36 @@ impl<'a> RevBitReader<'a> {
             self.consumed += n as usize;
             Ok(result)
         } else {
-            // Wide-read path (n in 57..=64): take the top 56 bits in one
-            // shot, then the remaining n-56 bits with a second refill. This
-            // matches the byte-by-byte version's semantics without needing
-            // a u128 accumulator.
-            let high_n = 56u32;
-            let low_n = n - 56;
-            // Top chunk.
-            if self.bits_in_acc < high_n {
-                self.refill();
-            }
-            let high = self.acc >> (64 - high_n);
-            self.acc <<= high_n;
-            self.bits_in_acc -= high_n;
-            // Low chunk.
-            if self.bits_in_acc < low_n {
-                self.refill();
-            }
-            let low = self.acc >> (64 - low_n);
-            self.acc <<= low_n;
-            self.bits_in_acc -= low_n;
-            self.consumed += n as usize;
-            Ok((high << low_n) | low)
+            self.read_wide(n)
         }
     }
+
+    /// Cold path for 57..=64-bit reads: take the top 56 bits, then the
+    /// remaining `n-56` bits with a second refill. Kept out of line so the
+    /// common small-read path inlines compactly into hot callers.
+    #[cold]
+    #[inline(never)]
+    fn read_wide(&mut self, n: u32) -> Result<u64, Error> {
+        // Matches the byte-by-byte version's semantics without a u128 accumulator.
+        let high_n = 56u32;
+        let low_n = n - 56;
+        // Top chunk.
+        if self.bits_in_acc < high_n {
+            self.refill();
+        }
+        let high = self.acc >> (64 - high_n);
+        self.acc <<= high_n;
+        self.bits_in_acc -= high_n;
+        // Low chunk.
+        if self.bits_in_acc < low_n {
+            self.refill();
+        }
+        let low = self.acc >> (64 - low_n);
+        self.acc <<= low_n;
+        self.bits_in_acc -= low_n;
+        self.consumed += n as usize;
+        Ok((high << low_n) | low)
+    }
 }
 
 #[cfg(test)]

From bc17156f0f1ddcf9b2e6b9efff8a7d61c3de2ff8 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 12:03:56 +0900
Subject: [PATCH 15/32] zstd: hoist LL/ML base+extra tables to module-level
 const

ll_base_extra/ml_base_extra rebuilt two 36/53-element stack arrays on
every call (once per sequence). Replace with module-level const
[(base, extra); N] tables indexed via .get(), so the hot sequence loop
reads a single rodata table instead of re-materialising arrays. Tables
verified element-for-element against the RFC 8478 LL/ML code tables.

Decode micro-bench: instruction count -13% (callgrind), wall-clock
~425 -> ~470 MB/s.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/zstd/sequences.rs | 135 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 109 insertions(+), 26 deletions(-)

diff --git a/src/zstd/sequences.rs b/src/zstd/sequences.rs
index 36b71a0..4a8443c 100644
--- a/src/zstd/sequences.rs
+++ b/src/zstd/sequences.rs
@@ -288,36 +288,119 @@ fn resolve_table(
 
 // ─── code → (base, extra_bits) lookups (RFC §3.1.1.3.2.1) ────────────────
 
+/// Literal-length code → (base, extra_bits). Spec tables A.4.1 / A.4.2.
+/// Module-level `const` so the inner sequence loop indexes a single rodata
+/// table instead of materialising two stack arrays per call.
+const LL_BASE_EXTRA: [(u32, u32); 36] = [
+    (0, 0),
+    (1, 0),
+    (2, 0),
+    (3, 0),
+    (4, 0),
+    (5, 0),
+    (6, 0),
+    (7, 0),
+    (8, 0),
+    (9, 0),
+    (10, 0),
+    (11, 0),
+    (12, 0),
+    (13, 0),
+    (14, 0),
+    (15, 0),
+    (16, 1),
+    (18, 1),
+    (20, 1),
+    (22, 1),
+    (24, 2),
+    (28, 2),
+    (32, 3),
+    (40, 3),
+    (48, 4),
+    (64, 6),
+    (128, 7),
+    (256, 8),
+    (512, 9),
+    (1024, 10),
+    (2048, 11),
+    (4096, 12),
+    (8192, 13),
+    (16384, 14),
+    (32768, 15),
+    (65536, 16),
+];
+
+/// Match-length code → (base, extra_bits). From the zstd reference tables.
+const ML_BASE_EXTRA: [(u32, u32); 53] = [
+    (3, 0),
+    (4, 0),
+    (5, 0),
+    (6, 0),
+    (7, 0),
+    (8, 0),
+    (9, 0),
+    (10, 0),
+    (11, 0),
+    (12, 0),
+    (13, 0),
+    (14, 0),
+    (15, 0),
+    (16, 0),
+    (17, 0),
+    (18, 0),
+    (19, 0),
+    (20, 0),
+    (21, 0),
+    (22, 0),
+    (23, 0),
+    (24, 0),
+    (25, 0),
+    (26, 0),
+    (27, 0),
+    (28, 0),
+    (29, 0),
+    (30, 0),
+    (31, 0),
+    (32, 0),
+    (33, 0),
+    (34, 0),
+    (35, 1),
+    (37, 1),
+    (39, 1),
+    (41, 1),
+    (43, 2),
+    (47, 2),
+    (51, 3),
+    (59, 3),
+    (67, 4),
+    (83, 4),
+    (99, 5),
+    (131, 7),
+    (259, 8),
+    (515, 9),
+    (1027, 10),
+    (2051, 11),
+    (4099, 12),
+    (8195, 13),
+    (16387, 14),
+    (32771, 15),
+    (65539, 16),
+];
+
+#[inline]
 fn ll_base_extra(code: u8) -> Result<(u32, u32), Error> {
-    if code > 35 {
-        return Err(Error::Corrupt);
-    }
-    // Spec tables A.4.1 / A.4.2: literal-length codes.
-    let bases: [u32; 36] = [
-        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 22, 24, 28, 32, 40, 48,
-        64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536,
-    ];
-    let extras: [u32; 36] = [
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10,
-        11, 12, 13, 14, 15, 16,
-    ];
-    Ok((bases[code as usize], extras[code as usize]))
+    LL_BASE_EXTRA
+        .get(code as usize)
+        .copied()
+        .ok_or(Error::Corrupt)
 }
 
+#[inline]
 fn ml_base_extra(code: u8) -> Result<(u32, u32), Error> {
-    if code > 52 {
-        return Err(Error::Corrupt);
-    }
-    let bases: [u32; 53] = [
-        3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
-        27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 131, 259, 515,
-        1027, 2051, 4099, 8195, 16387, 32771, 65539,
-    ];
-    let extras: [u32; 53] = [
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-    ];
-    Ok((bases[code as usize], extras[code as usize]))
+    ML_BASE_EXTRA
+        .get(code as usize)
+        .copied()
+        .ok_or(Error::Corrupt)
 }
 
 /// Translate the `offset_value` produced by the offset FSE+extra-bits sum

From 6faec5fe91ac557772457ae09fb1d1d403b4170e Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 12:04:08 +0900
Subject: [PATCH 16/32] zstd: fetch each FSE entry once per sequence (symbol +
 advance share load)

The sequence loop indexed each FSE table twice per state per sequence:
once in symbol() and again in advance(). Add FseState::entry() to fetch
the FseEntry once (yielding the symbol) and advance_with(entry, size) to
reuse it, and hoist the loop-invariant table sizes. This cuts the
per-sequence memory traffic on the three FSE tables.

Decode micro-bench wall-clock: ~470 -> ~483 MB/s (consistent across runs).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/zstd/fse.rs       | 29 +++++++++++++++++++++++++----
 src/zstd/sequences.rs | 28 +++++++++++++++++++++-------
 2 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/src/zstd/fse.rs b/src/zstd/fse.rs
index 0fdb708..8101fa5 100644
--- a/src/zstd/fse.rs
+++ b/src/zstd/fse.rs
@@ -346,16 +346,30 @@ impl FseState {
         Ok(Self { state: s })
     }
 
+    /// Return the table entry for the current state. The entry carries the
+    /// emitted symbol plus the `(num_bits, base_state)` recipe for the next
+    /// transition; fetching it once lets a caller read the symbol and then
+    /// [`Self::advance_with`] using the same load instead of re-indexing.
+    #[inline]
+    pub fn entry(&self, table: &FseTable) -> FseEntry {
+        table.entries[self.state as usize]
+    }
+
     /// Return the current symbol (without advancing state).
     #[inline]
     pub fn symbol(&self, table: &FseTable) -> u16 {
         table.entries[self.state as usize].symbol
     }
 
-    /// Advance: read `num_bits` from the reader and update state.
+    /// Advance using a pre-fetched [`FseEntry`] (from [`Self::entry`]) for the
+    /// *current* state, avoiding a second bounds-checked table index.
     #[inline]
-    pub fn advance(&mut self, table: &FseTable, br: &mut RevBitReader<'_>) -> Result<(), Error> {
-        let e = table.entries[self.state as usize];
+    pub fn advance_with(
+        &mut self,
+        e: FseEntry,
+        table_size: usize,
+        br: &mut RevBitReader<'_>,
+    ) -> Result<(), Error> {
         // Most table entries carry a non-trivial `num_bits`, but a meaningful
         // fraction are 0 (max-probability symbols); skip the bit-reader call
         // entirely in that case — `base_state` is already the next state.
@@ -365,12 +379,19 @@ impl FseState {
             let extra = br.read(e.num_bits as u32)? as u16;
             e.base_state.wrapping_add(extra)
         };
-        if (next as usize) >= table.size() {
+        if (next as usize) >= table_size {
             return Err(Error::Corrupt);
         }
         self.state = next;
         Ok(())
     }
+
+    /// Advance: read `num_bits` from the reader and update state.
+    #[inline]
+    pub fn advance(&mut self, table: &FseTable, br: &mut RevBitReader<'_>) -> Result<(), Error> {
+        let e = table.entries[self.state as usize];
+        self.advance_with(e, table.size(), br)
+    }
 }
 
 // ─── default tables (RFC 8478 §3.1.1.3.2.2.1) ─────────────────────────────
diff --git a/src/zstd/sequences.rs b/src/zstd/sequences.rs
index 4a8443c..1811c50 100644
--- a/src/zstd/sequences.rs
+++ b/src/zstd/sequences.rs
@@ -127,6 +127,12 @@ pub fn decode_sequences(data: &[u8], state: &mut SequencesState) -> Result<Vec<S
     // data is parsed; the loop below still pushes exactly `n_seq` entries.
     let mut sequences: Vec<Sequence> = Vec::with_capacity((n_seq as usize).min(128 * 1024));
 
+    // Table sizes are loop-invariant; hoist them so the per-sequence advance
+    // doesn't reload `entries.len()` three times.
+    let ll_size = ll_table.size();
+    let ml_size = ml_table.size();
+    let of_size = of_table.size();
+
     for i in 0..n_seq {
         // Per RFC §3.1.1.3.2.1.1 decoding order:
         //   1. Read literal_length extra bits.
@@ -134,9 +140,16 @@ pub fn decode_sequences(data: &[u8], state: &mut SequencesState) -> Result<Vec<S
         //   3. Read match_length extra bits.
         // Then advance ll, ml, of states (in that order) by reading their
         // num_bits. Final sequence skips the advance.
-        let ll_sym = ll_state.symbol(&ll_table) as u8;
-        let ml_sym = ml_state.symbol(&ml_table) as u8;
-        let of_sym = of_state.symbol(&of_table) as u8;
+        //
+        // Fetch each state's table entry once: it yields both the symbol (used
+        // now) and the (num_bits, base_state) recipe reused by advance_with
+        // below, so we index each FSE table only once per sequence.
+        let ll_entry = ll_state.entry(&ll_table);
+        let ml_entry = ml_state.entry(&ml_table);
+        let of_entry = of_state.entry(&of_table);
+        let ll_sym = ll_entry.symbol as u8;
+        let ml_sym = ml_entry.symbol as u8;
+        let of_sym = of_entry.symbol as u8;
 
         let (ll_base, ll_extra) = ll_base_extra(ll_sym)?;
         let (ml_base, ml_extra) = ml_base_extra(ml_sym)?;
@@ -176,10 +189,11 @@ pub fn decode_sequences(data: &[u8], state: &mut SequencesState) -> Result<Vec<S
             break;
         }
 
-        // Advance states: LL, ML, OF (RFC ordering).
-        ll_state.advance(&ll_table, &mut br)?;
-        ml_state.advance(&ml_table, &mut br)?;
-        of_state.advance(&of_table, &mut br)?;
+        // Advance states: LL, ML, OF (RFC ordering), reusing the entries we
+        // already fetched for this state above.
+        ll_state.advance_with(ll_entry, ll_size, &mut br)?;
+        ml_state.advance_with(ml_entry, ml_size, &mut br)?;
+        of_state.advance_with(of_entry, of_size, &mut br)?;
     }
 
     // Stash tables for potential Repeat_Mode reuse next block.

From b2b1ebf9f9906f1931157fc2cc6ca60c115f475c Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:51:40 +0900
Subject: [PATCH 17/32] perf(decoders): bulk overlapping match copy in
 lz4/lz5/lzo/snappy

Replace byte-at-a-time self-overlap copy loops with chunked
extend_from_within: each round appends up to `offset` bytes copied from
the tail produced so far, so the loop runs ceil(len / offset) bulk
copies instead of one push per byte. Decoder output is byte-identical.

Measured (1 MiB Lorem, decode MB/s):
  lz4:  1470 -> ~18000  (~12x)
  lzo:  2396 -> ~18000  (~7x)
snappy/lz5 overlap-heavy paths similarly bulk-copy now.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/lz4/block.rs  | 15 +++++++++++----
 src/lz5/block.rs  | 14 +++++++++-----
 src/lzo/block.rs  | 13 +++++++++----
 src/snappy/mod.rs | 13 +++++++++----
 4 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/src/lz4/block.rs b/src/lz4/block.rs
index 9370f63..69f8970 100644
--- a/src/lz4/block.rs
+++ b/src/lz4/block.rs
@@ -325,7 +325,8 @@ pub fn decode_block(input: &[u8], out: &mut Vec<u8>, raw_max: usize) -> Result<(
         }
 
         // Non-overlapping match collapses to memcpy; offset==1 is a byte-splat;
-        // otherwise replicate byte-by-byte to handle LZ77 self-overlap.
+        // otherwise replicate in `offset`-sized chunks to handle LZ77
+        // self-overlap while still copying in bulk.
         let start = out.len() - offset;
         if offset >= match_len {
             out.extend_from_within(start..start + match_len);
@@ -333,9 +334,15 @@ pub fn decode_block(input: &[u8], out: &mut Vec<u8>, raw_max: usize) -> Result<(
             let b = out[start];
             out.resize(out.len() + match_len, b);
         } else {
-            for i in 0..match_len {
-                let b = out[start + i];
-                out.push(b);
+            // Overlapping: each round appends up to `offset` bytes copied from
+            // the current tail, which repeats the `offset`-byte period. That
+            // takes ceil(match_len / offset) bulk copies, not one per byte.
+            let mut remaining = match_len;
+            while remaining > 0 {
+                let chunk = remaining.min(offset);
+                let s = out.len() - offset;
+                out.extend_from_within(s..s + chunk);
+                remaining -= chunk;
             }
         }
     }
diff --git a/src/lz5/block.rs b/src/lz5/block.rs
index 42f06d8..a674319 100644
--- a/src/lz5/block.rs
+++ b/src/lz5/block.rs
@@ -274,11 +274,15 @@ fn copy_match(out: &mut Vec<u8>, offset: usize, match_len: usize, cap: usize) ->
         let b = out[start];
         out.resize(out.len() + match_len, b);
     } else {
-        // Self-overlap — must copy byte-by-byte so back-references read
-        // from already-written bytes.
-        for i in 0..match_len {
-            let b = out[start + i];
-            out.push(b);
+        // Self-overlap — copy in `offset`-sized chunks. Each round appends up
+        // to `offset` bytes taken from the current tail, so the loop runs
+        // ceil(match_len / offset) times instead of once per byte.
+        let mut remaining = match_len;
+        while remaining > 0 {
+            let chunk = remaining.min(offset);
+            let s = out.len() - offset;
+            out.extend_from_within(s..s + chunk);
+            remaining -= chunk;
         }
     }
     Ok(())
diff --git a/src/lzo/block.rs b/src/lzo/block.rs
index 32adbc6..2583a26 100644
--- a/src/lzo/block.rs
+++ b/src/lzo/block.rs
@@ -622,10 +622,15 @@ fn copy_match(
         let b = out[start];
         out.resize(out.len() + length, b);
     } else {
-        // Self-overlap (LZ77 RLE-style): replicate byte-by-byte.
-        for i in 0..length {
-            let b = out[start + i];
-            out.push(b);
+        // Self-overlap (LZ77 RLE-style): copy in `distance`-sized chunks. Each
+        // round appends up to `distance` bytes taken from the current tail,
+        // so the loop runs ceil(length / distance) times, not once per byte.
+        let mut remaining = length;
+        while remaining > 0 {
+            let chunk = remaining.min(distance);
+            let s = out.len() - distance;
+            out.extend_from_within(s..s + chunk);
+            remaining -= chunk;
         }
     }
     Ok(())
diff --git a/src/snappy/mod.rs b/src/snappy/mod.rs
index bdad3b1..f99509e 100644
--- a/src/snappy/mod.rs
+++ b/src/snappy/mod.rs
@@ -540,10 +540,15 @@ fn copy_from_back(
         let b = out[start];
         out.resize(out.len() + length, b);
     } else {
-        // Self-overlapping (RLE-style) — must replicate byte-by-byte.
-        for i in 0..length {
-            let b = out[start + i];
-            out.push(b);
+        // Self-overlapping (RLE-style) — copy in `offset`-sized chunks. Each
+        // round appends up to `offset` bytes taken from the current tail, so
+        // the loop runs ceil(length / offset) times, not once per byte.
+        let mut remaining = length;
+        while remaining > 0 {
+            let chunk = remaining.min(offset);
+            let s = out.len() - offset;
+            out.extend_from_within(s..s + chunk);
+            remaining -= chunk;
         }
     }
     Ok(())

From 672fc92d3c6a15782fc0b72d6af2e2b1ff7b90a8 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:53:36 +0900
Subject: [PATCH 18/32] perf(lzw): single-pass string emit, drop scratch stack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

decode_string_to_emit_buf walked the prefix chain into a scratch Vec
(reversing), then popped it into emit_buf (un-reversing) — two passes and
a second buffer. Walk the chain straight into emit_buf and reverse just
the written region in place: one walk + one tight in-place reverse, and
the scratch stack field is removed. Decoder output is byte-identical.

Measured (decode MB/s): Lorem 425 -> ~510, Zeros 641 -> ~950 (~1.45x).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/lzw/mod.rs | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/lzw/mod.rs b/src/lzw/mod.rs
index 01846e1..88e931b 100644
--- a/src/lzw/mod.rs
+++ b/src/lzw/mod.rs
@@ -518,8 +518,6 @@ pub struct Decoder {
     /// in forward order. `emit_head` is the read cursor.
     emit_buf: Vec<u8>,
     emit_head: usize,
-    /// Scratch stack used while reversing a decoded string.
-    stack: Vec<u8>,
     /// Once `finish` has nothing more to flush.
     completed: bool,
 }
@@ -542,7 +540,6 @@ impl Decoder {
             codes_in_group: 0,
             emit_buf: Vec::new(),
             emit_head: 0,
-            stack: Vec::with_capacity(max_size),
             completed: false,
         }
     }
@@ -629,17 +626,20 @@ impl Decoder {
     /// Decode the string represented by `code`, pushing characters forward
     /// into `self.emit_buf`. Updates `self.finchar` to the first character.
     fn decode_string_to_emit_buf(&mut self, mut code: u32) {
-        self.stack.clear();
+        // Walk the prefix chain straight into `emit_buf`. The walk yields the
+        // string back to front (last suffix byte first, first character
+        // last), so we append in that order and then reverse just the region
+        // we wrote. This avoids the separate scratch stack and its second copy
+        // pass — a single walk plus one tight in-place reverse.
+        let start = self.emit_buf.len();
         while code >= 256 {
-            self.stack.push(self.suffix[code as usize]);
+            self.emit_buf.push(self.suffix[code as usize]);
             code = self.prefix[code as usize] as u32;
         }
         let first = code as u8;
         self.finchar = first;
         self.emit_buf.push(first);
-        while let Some(b) = self.stack.pop() {
-            self.emit_buf.push(b);
-        }
+        self.emit_buf[start..].reverse();
     }
 
     /// Drain `self.emit_buf` (from `self.emit_head`) into `out`, returning
@@ -827,7 +827,6 @@ impl RawDecoder for Decoder {
         self.codes_in_group = 0;
         self.emit_buf.clear();
         self.emit_head = 0;
-        self.stack.clear();
         self.completed = false;
     }
 }

From 957dcf463dd05c7f8592e6e474b142d1cd853559 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:56:07 +0900
Subject: [PATCH 19/32] perf(lzo): skip-step accelerator in encoder match
 search

On a miss, advance by a stride that grows with the consecutive-miss
count (LZ4-style) instead of one byte at a time, so incompressible data
is scanned in large strides. The first ~64 misses still step 1 byte, so
compressible data keeps a dense hash table and its ratio/speed are
unchanged; a hit resets the stride. Round-trip tests pass (decode
output unchanged).

Measured (features=lzo,factory,std; encode MB/s):
  Random: 495 -> ~3000  (~6x)
  Lorem:  1335 -> ~1290 (flat, within noise; output size unchanged)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/lzo/block.rs | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/lzo/block.rs b/src/lzo/block.rs
index 2583a26..5f00200 100644
--- a/src/lzo/block.rs
+++ b/src/lzo/block.rs
@@ -125,6 +125,11 @@ pub fn encode_block(input: &[u8], out: &mut Vec<u8>) {
     let in_len = input.len();
     let hash_limit = in_len.saturating_sub(4);
 
+    // Skip-step accelerator: count consecutive misses and advance faster the
+    // longer we go without a match, so incompressible data is scanned in big
+    // strides instead of one byte at a time. Reset to 1-byte steps on a hit.
+    let mut search_match_nb: u32 = 1 << 6;
+
     while ip < hash_limit {
         let h = hash4([input[ip], input[ip + 1], input[ip + 2], input[ip + 3]]);
         let candidate = table[h];
@@ -150,9 +155,15 @@ pub fn encode_block(input: &[u8], out: &mut Vec<u8>) {
         }
 
         if !found {
-            ip += 1;
+            // Grow the step the longer we search without a hit. The first
+            // ~64 misses still step 1 byte (keeping the hash table dense for
+            // compressible data); after that the stride ramps up.
+            let step = (search_match_nb >> 6) as usize;
+            search_match_nb += 1;
+            ip += step;
             continue;
         }
+        search_match_nb = 1 << 6;
 
         // Extend the match forward as far as possible.
         let mut match_len = 4usize;

From 09cc2c849c426d85c103b674e43d415a4b2e09e5 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:57:21 +0900
Subject: [PATCH 20/32] perf(snappy): skip-step accelerator in encoder match
 search

On a miss, advance by a stride that grows with the consecutive-miss
count (matching the reference encoder's bytes_between_hash_lookups), so
incompressible regions are scanned in large strides instead of one byte
at a time. A hit resets the stride. Round-trip tests pass and the >2x
ratio test still holds (output stays well-compressed).

Measured (features=snappy,factory,std; encode MB/s):
  Random: 804 -> ~4900  (~6x)
  Lorem:  2557 -> ~2760 (slightly up)
  Zeros:  flat (within run-to-run noise)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/snappy/mod.rs | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/snappy/mod.rs b/src/snappy/mod.rs
index f99509e..971dd1f 100644
--- a/src/snappy/mod.rs
+++ b/src/snappy/mod.rs
@@ -214,6 +214,12 @@ fn compress_block(input: &[u8], out: &mut Vec<u8>) {
     };
 
     // Match-or-literal main loop.
+    // Skip-step accelerator: advance faster the longer the matcher goes
+    // without a hit, so incompressible regions are scanned in large strides
+    // (mirrors the reference encoder's `skip`/`bytes_between_hash_lookups`).
+    // A hit resets the stride to 1 byte.
+    let mut search_match_nb: u32 = 1 << 5;
+
     while ip < match_limit {
         let h = hash(input, ip);
         let candidate = table[h] as usize;
@@ -231,9 +237,12 @@ fn compress_block(input: &[u8], out: &mut Vec<u8>) {
             && input[candidate + 3] == input[ip + 3];
 
         if !four_match {
-            ip += 1;
+            let step = (search_match_nb >> 5) as usize;
+            search_match_nb += 1;
+            ip += step;
             continue;
         }
+        search_match_nb = 1 << 5;
 
         // Found a 4-byte match. First, flush any pending literal.
         if next_emit < ip {

From e667989a85910dbff52f3b47f8d0d3fde1b16af1 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:52:57 +0900
Subject: [PATCH 21/32] bzip2: cut SA-IS allocations and inline induced-sort
 hot paths

Reuse a single bucket scratch buffer across all induced-sort passes
instead of allocating fresh bucket-start/-end Vecs each call, collect
LMS positions once during type classification (removing the later
rescan + lms_positions rebuild), and inline is_lms / bucket fills.

SA-IS build throughput on a 900 KB block (median of 3, --release):
  lorem  18.6 -> 19.2 MB/s
  zeros  31.8 -> 32.9 MB/s
  random 10.8 -> 13.2 MB/s  (+22%)

Output unchanged: same induced-sort order => identical BWT+origin.
Full test suite (round-trip, reference fixtures, bunzip2 cross-check)
stays green.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/bzip2/bwt.rs | 180 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 117 insertions(+), 63 deletions(-)

diff --git a/src/bzip2/bwt.rs b/src/bzip2/bwt.rs
index 985fe13..c91495c 100644
--- a/src/bzip2/bwt.rs
+++ b/src/bzip2/bwt.rs
@@ -149,50 +149,67 @@ fn sa_is_inner(text: &[i32], sa: &mut [i32], alphabet_size: usize) {
     //    suffix i+1 is S-type. Otherwise L-type.
     //
     //    `t[i] == true` ⇒ S-type.
+    //
+    //    While we classify, collect the LMS positions (left-to-right)
+    //    once so we don't have to rescan the type array later.
     let mut t = vec![false; n];
     t[n - 1] = true;
+    // An LMS position is an S-type whose left neighbour is L-type. We
+    // know `t[i+1]` as we walk right-to-left, so we can detect the LMS
+    // at `i+1` the moment we set `t[i]` (it is LMS iff t[i+1] && !t[i]).
+    let mut lms_positions: Vec<i32> = Vec::new();
     for i in (0..n - 1).rev() {
-        t[i] = if text[i] < text[i + 1] {
+        let si = if text[i] < text[i + 1] {
             true
         } else if text[i] == text[i + 1] {
             t[i + 1]
         } else {
             false
         };
+        t[i] = si;
+        // i+1 is LMS iff it is S-type (t[i+1]) and i is L-type (!si).
+        if t[i + 1] && !si {
+            lms_positions.push((i + 1) as i32);
+        }
     }
+    // We pushed LMS positions in descending order; reverse for ascending.
+    lms_positions.reverse();
+    let n1 = lms_positions.len();
 
     // 2. Compute bucket sizes (count of each symbol in `text`).
+    //    `counts` holds the per-symbol counts; `bucket` is a reusable
+    //    scratch into which we materialise either bucket starts or ends.
     let mut counts = vec![0i32; alphabet_size];
     for &c in text {
         counts[c as usize] += 1;
     }
+    let mut bucket = vec![0i32; alphabet_size];
 
     // 3. Step A: place LMS suffixes at the END of their buckets in `sa`.
     sa.fill(-1);
-    let mut ends = bucket_ends(&counts);
-    for (i, &c_i) in text.iter().enumerate().take(n).skip(1) {
-        if is_lms(&t, i) {
-            let c = c_i as usize;
-            ends[c] -= 1;
-            sa[ends[c] as usize] = i as i32;
-        }
+    fill_bucket_ends(&counts, &mut bucket);
+    for &p in &lms_positions {
+        let c = text[p as usize] as usize;
+        bucket[c] -= 1;
+        sa[bucket[c] as usize] = p;
     }
 
     // 4. Induced sort of L-suffixes (left-to-right pass).
-    induce_sort_l(text, sa, &t, &counts);
+    induce_sort_l(text, sa, &t, &counts, &mut bucket);
 
     // 5. Induced sort of S-suffixes (right-to-left pass).
-    induce_sort_s(text, sa, &t, &counts);
+    induce_sort_s(text, sa, &t, &counts, &mut bucket);
 
     // 6. Compact LMS suffixes to the front of SA (preserving the order
     //    we just induced) and name them by their LMS-substring identity.
-    let mut n1 = 0usize;
+    let mut j1 = 0usize;
     for i in 0..n {
         if sa[i] >= 0 && is_lms(&t, sa[i] as usize) {
-            sa[n1] = sa[i];
-            n1 += 1;
+            sa[j1] = sa[i];
+            j1 += 1;
         }
     }
+    debug_assert_eq!(j1, n1);
     // Clear the rest as a workspace for naming.
     for slot in sa.iter_mut().take(n).skip(n1) {
         *slot = -1;
@@ -266,21 +283,11 @@ fn sa_is_inner(text: &[i32], sa: &mut [i32], alphabet_size: usize) {
         sa_is_inner(&reduced_text, sa1, new_alpha);
     }
 
-    // 8. Recover positions of LMS suffixes in the original text.
-    //    The trailing region currently holds the reduced text; we need
-    //    to translate name-indices in sa1 back to original positions.
-    //    We rebuild a list of LMS positions in left-to-right order.
-    let mut lms_positions: Vec<i32> = Vec::with_capacity(n1);
-    for (i, &is_s) in t.iter().enumerate().take(n).skip(1) {
-        if is_s && !t[i - 1] {
-            lms_positions.push(i as i32);
-        }
-    }
-    debug_assert_eq!(lms_positions.len(), n1);
-
-    // Translate: sa1[i] = index-of-LMS in original. Reuse trailing area
-    // as scratch for translated positions, then place them at bucket
-    // ends.
+    // 8. Recover positions of LMS suffixes in the original text using
+    //    the `lms_positions` list (in left-to-right order) we collected
+    //    during classification. sa1[i] is the rank/index in that list.
+    //    Translate the sorted LMS order (currently in sa[..n1]) into
+    //    original positions, in place.
     for slot in sa.iter_mut().take(n1) {
         let idx = *slot as usize; // recursive SA gave us the LMS index in left-to-right order.
         *slot = lms_positions[idx];
@@ -292,72 +299,80 @@ fn sa_is_inner(text: &[i32], sa: &mut [i32], alphabet_size: usize) {
 
     // 9. Place sorted LMS suffixes at the ENDS of their buckets in SA,
     //    in the order produced by the recursive call.
-    let mut ends = bucket_ends(&counts);
-    // Move them from positions 0..n1 to bucket-end positions, going
-    // right-to-left to preserve relative order within each bucket.
+    //
+    //    The sorted LMS positions sit in sa[..n1], and the bucket-end
+    //    slots we scatter them into live in the same array. Those
+    //    destination slots can overlap sa[..n1] itself, so a destructive
+    //    in-place scatter could clobber an entry that has not been read
+    //    yet. To stay safe and simple we snapshot the n1 sorted
+    //    positions, clear `sa`, then scatter them to the bucket ends,
+    //    walking the snapshot right-to-left so the relative order within
+    //    each bucket is preserved.
     let mut lms_sorted: Vec<i32> = Vec::with_capacity(n1);
     lms_sorted.extend_from_slice(&sa[..n1]);
     for slot in sa.iter_mut().take(n) {
         *slot = -1;
     }
+    fill_bucket_ends(&counts, &mut bucket);
     for &pos in lms_sorted.iter().rev() {
         let c = text[pos as usize] as usize;
-        ends[c] -= 1;
-        sa[ends[c] as usize] = pos;
+        bucket[c] -= 1;
+        sa[bucket[c] as usize] = pos;
     }
 
     // 10. Final induced sorts: L then S.
-    induce_sort_l(text, sa, &t, &counts);
-    induce_sort_s(text, sa, &t, &counts);
+    induce_sort_l(text, sa, &t, &counts, &mut bucket);
+    induce_sort_s(text, sa, &t, &counts, &mut bucket);
 }
 
 /// `true` iff suffix `i` is S-type AND suffix `i-1` is L-type (left-
 /// most S in a run). Suffix 0 is never an LMS in our convention.
+#[inline(always)]
 fn is_lms(t: &[bool], i: usize) -> bool {
     i > 0 && t[i] && !t[i - 1]
 }
 
-/// Compute exclusive prefix sums giving the *start* index of each
-/// bucket in SA.
-fn bucket_starts(counts: &[i32]) -> Vec<i32> {
-    let mut s = Vec::with_capacity(counts.len());
+/// Materialise the *start* index of each bucket (exclusive prefix sum
+/// of `counts`) into the reusable scratch `out`.
+#[inline]
+fn fill_bucket_starts(counts: &[i32], out: &mut [i32]) {
     let mut acc = 0i32;
-    for &c in counts {
-        s.push(acc);
+    for (o, &c) in out.iter_mut().zip(counts.iter()) {
+        *o = acc;
         acc += c;
     }
-    s
 }
 
-/// Compute the *end* (one-past-last) index of each bucket in SA.
-fn bucket_ends(counts: &[i32]) -> Vec<i32> {
-    let mut e = Vec::with_capacity(counts.len());
+/// Materialise the *end* (one-past-last) index of each bucket
+/// (inclusive prefix sum of `counts`) into the reusable scratch `out`.
+#[inline]
+fn fill_bucket_ends(counts: &[i32], out: &mut [i32]) {
     let mut acc = 0i32;
-    for &c in counts {
+    for (o, &c) in out.iter_mut().zip(counts.iter()) {
         acc += c;
-        e.push(acc);
+        *o = acc;
     }
-    e
 }
 
 /// Induced sort of L-type suffixes. Scans `sa` left-to-right; for each
 /// non-negative entry `sa[i] = j`, if `j > 0` and suffix `j-1` is
 /// L-type, place `j-1` at the next free slot at the START of bucket
-/// `text[j-1]`.
-fn induce_sort_l(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32]) {
+/// `text[j-1]`. `bucket` is reusable scratch of length `alphabet_size`.
+fn induce_sort_l(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32], bucket: &mut [i32]) {
     let n = text.len();
-    let mut starts = bucket_starts(counts);
+    fill_bucket_starts(counts, bucket);
     for i in 0..n {
-        if sa[i] <= 0 {
+        let v = sa[i];
+        if v <= 0 {
             continue; // -1 or 0 — we handle 0 by not predecessing.
         }
-        let j = (sa[i] as usize) - 1;
+        let j = (v as usize) - 1;
         if !t[j] {
             // L-type.
             let c = text[j] as usize;
-            let slot = starts[c] as usize;
-            sa[slot] = j as i32;
-            starts[c] += 1;
+            let slot = bucket[c];
+            sa[slot as usize] = j as i32;
+            bucket[c] = slot + 1;
         }
     }
 }
@@ -365,20 +380,22 @@ fn induce_sort_l(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32]) {
 /// Induced sort of S-type suffixes. Scans `sa` right-to-left; for each
 /// non-negative entry `sa[i] = j`, if `j > 0` and suffix `j-1` is
 /// S-type, place `j-1` at the next free slot at the END of bucket
-/// `text[j-1]`.
-fn induce_sort_s(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32]) {
+/// `text[j-1]`. `bucket` is reusable scratch of length `alphabet_size`.
+fn induce_sort_s(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32], bucket: &mut [i32]) {
     let n = text.len();
-    let mut ends = bucket_ends(counts);
+    fill_bucket_ends(counts, bucket);
     for i in (0..n).rev() {
-        if sa[i] <= 0 {
+        let v = sa[i];
+        if v <= 0 {
             continue;
         }
-        let j = (sa[i] as usize) - 1;
+        let j = (v as usize) - 1;
         if t[j] {
             // S-type.
             let c = text[j] as usize;
-            ends[c] -= 1;
-            sa[ends[c] as usize] = j as i32;
+            let slot = bucket[c] - 1;
+            bucket[c] = slot;
+            sa[slot as usize] = j as i32;
         }
     }
 }
@@ -549,6 +566,43 @@ mod tests {
         assert_eq!(back, data);
     }
 
+    #[cfg(feature = "std")]
+    #[test]
+    #[ignore]
+    fn timing_bwt_forward() {
+        extern crate std;
+        let n = 900_000usize;
+        let lorem_src = b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, \
+sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. ";
+        let mut lorem = Vec::with_capacity(n);
+        while lorem.len() < n {
+            lorem.extend_from_slice(lorem_src);
+        }
+        lorem.truncate(n);
+        let zeros = vec![0u8; n];
+        let mut random = Vec::with_capacity(n);
+        let mut state: u32 = 0xDEAD_BEEF;
+        for _ in 0..n {
+            state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+            random.push((state >> 16) as u8);
+        }
+        for (name, data) in [("lorem", &lorem), ("zeros", &zeros), ("random", &random)] {
+            let _ = bwt_forward(data);
+            let mut best = f64::MAX;
+            for _ in 0..3 {
+                let t = std::time::Instant::now();
+                let (l, _o) = bwt_forward(data);
+                let el = t.elapsed().as_secs_f64();
+                std::hint::black_box(&l);
+                if el < best {
+                    best = el;
+                }
+            }
+            let mbps = (n as f64) / best / 1e6;
+            std::eprintln!("BWT {name}: {:.2} ms  {:.1} MB/s", best * 1e3, mbps);
+        }
+    }
+
     #[test]
     fn matches_naive_on_small_inputs() {
         // Cross-check SA-IS output against a naive cyclic sort for

From 93acccae2fe8e59728dd7f148ad19650326b227a Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:54:23 +0900
Subject: [PATCH 22/32] bzip2: recurse SA-IS reduced problem in place (drop
 per-level copy)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The reduced LMS text already lives in the trailing n1 cells of `sa`
and the recursive sub-suffix-array is written into the leading n1
cells. Since no two adjacent positions can both be LMS, n1 <= n/2, so
those two regions are disjoint halves of split_at_mut and can be
borrowed (immutable text / mutable output) simultaneously — removing
the fresh reduced_text Vec allocated and filled at every recursion
level.

SA-IS build throughput on a 900 KB block (median of 3, --release),
relative to the previous commit:
  lorem  19.2 -> 21.2 MB/s
  zeros  32.9 -> 40.5 MB/s  (+23%)
  random 13.2 -> 14.1 MB/s

Output unchanged (identical recursion, identical BWT+origin); full
suite green.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/bzip2/bwt.rs | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/bzip2/bwt.rs b/src/bzip2/bwt.rs
index c91495c..da48f0b 100644
--- a/src/bzip2/bwt.rs
+++ b/src/bzip2/bwt.rs
@@ -273,14 +273,17 @@ fn sa_is_inner(text: &[i32], sa: &mut [i32], alphabet_size: usize) {
             sa1_area[name_of_pos as usize] = i as i32;
         }
     } else {
-        // Recurse on the reduced text. We need a slice of length n1 for
-        // sa1, and the reduced text occupies the trailing n1 cells.
+        // Recurse on the reduced text in place, with no copy. The
+        // reduced text occupies the trailing n1 cells (t1_area[..n1])
+        // and the sub-suffix-array is written into the leading n1 cells
+        // (sa1_area[..n1]). These come from the two disjoint halves of
+        // `split_at_mut`, so we can hold an immutable borrow of the text
+        // and a mutable borrow of the output simultaneously. They are
+        // guaranteed non-overlapping because n1 <= n/2 (no two adjacent
+        // positions are both LMS), hence n1 <= n - n1.
+        let reduced_text: &[i32] = &t1_area[..n1];
         let sa1 = &mut sa1_area[..n1];
-        // The reduced text is t1_area[..n1] but we want to pass it as
-        // an immutable slice. We must copy to avoid aliasing.
-        let mut reduced_text: Vec<i32> = Vec::with_capacity(n1);
-        reduced_text.extend_from_slice(&t1_area[..n1]);
-        sa_is_inner(&reduced_text, sa1, new_alpha);
+        sa_is_inner(reduced_text, sa1, new_alpha);
     }
 
     // 8. Recover positions of LMS suffixes in the original text using

From 02e6627642d2e72c6eacaf51503d39d1387ba59a Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:50:28 +0900
Subject: [PATCH 23/32] =?UTF-8?q?xpress=5Fhuffman:=20amortize=20decoder=20?=
 =?UTF-8?q?history=20trim=20(O(n=C2=B2)=20=E2=86=92=20O(n))?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

emit_byte() drained out_history back to MAX_DISTANCE on every emitted
byte once the 64 KiB window filled, shifting the whole buffer per byte —
quadratic over the stream and the dominant decode cost. Let the buffer
grow to 2*MAX_DISTANCE and trim the oldest half only then; all reads are
relative to len() and bounded by MAX_DISTANCE, so correctness is
unchanged and decode stays byte-identical.

Decode MB/s (1 MiB): Lorem 1.34→786, Zeros 1.46→588, Random 1.40→266.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/xpress_huffman/decoder.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/xpress_huffman/decoder.rs b/src/xpress_huffman/decoder.rs
index 51bcbd3..f67da9c 100644
--- a/src/xpress_huffman/decoder.rs
+++ b/src/xpress_huffman/decoder.rs
@@ -126,7 +126,15 @@ impl Decoder {
     fn emit_byte(&mut self, b: u8) {
         self.decoded.push(b);
         self.out_history.push(b);
-        if self.out_history.len() > MAX_DISTANCE {
+        // Trim the retained history so it never grows without bound, but do
+        // it amortized: a naive `drain` back to `MAX_DISTANCE` on every byte
+        // shifts the whole 64 KiB buffer per emit, which is O(n²) over the
+        // stream. Instead let it grow to `2 * MAX_DISTANCE` and only then drop
+        // the oldest half, keeping at least the last `MAX_DISTANCE` bytes.
+        // Every read into `out_history` is relative to its current `len()` and
+        // bounded by `MAX_DISTANCE` (validated above against `out_history.len()`),
+        // so retaining that many is always sufficient. Amortized O(1) per byte.
+        if self.out_history.len() >= 2 * MAX_DISTANCE {
             let drop = self.out_history.len() - MAX_DISTANCE;
             self.out_history.drain(0..drop);
         }

From 5da0abf6bd523eed649eea9cfef5d0ae87069a05 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:51:23 +0900
Subject: [PATCH 24/32] lznt1: bulk copy_within for non-overlapping match
 copies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

decode_compressed_chunk emitted every match byte-by-byte. For the common
non-overlapping case (offset >= length) the source range is already fully
populated, so resize + copy_within does it in one shot. The overlapping
run case (offset < length, run-length expansion) keeps the byte loop.
Decode byte-identical.

Decode MB/s (Zeros 1 MiB, match-heavy): ~1807 → ~1980 (+~10%).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/lznt1/decoder.rs | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/lznt1/decoder.rs b/src/lznt1/decoder.rs
index 7a5ee58..442df17 100644
--- a/src/lznt1/decoder.rs
+++ b/src/lznt1/decoder.rs
@@ -138,12 +138,20 @@ impl Decoder {
                     if out.len() + length > CHUNK_SIZE {
                         return Err(Error::Corrupt);
                     }
-                    // Byte-by-byte copy from the chunk's own output so
-                    // far. Self-overlap is permitted (offset < length).
                     let src_start = pos - offset;
-                    for k in 0..length {
-                        let b = out[src_start + k];
-                        out.push(b);
+                    if offset >= length {
+                        // Non-overlapping: the source range is fully
+                        // populated already, so grow the buffer and bulk
+                        // copy in one shot instead of byte-by-byte.
+                        out.resize(pos + length, 0);
+                        out.copy_within(src_start..src_start + length, pos);
+                    } else {
+                        // Self-overlapping run (offset < length): each
+                        // emitted byte feeds the next, so copy one at a time.
+                        for k in 0..length {
+                            let b = out[src_start + k];
+                            out.push(b);
+                        }
                     }
                 }
             }

From b54a7712d4c76c24b3319f93e28011fee3af5a32 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:54:40 +0900
Subject: [PATCH 25/32] hpack: byte-wide FSA Huffman decoder

Replace the bit-at-a-time canonical decode (one table probe per input bit)
with a byte-wide finite-state machine over the canonical trie: one lookup per
input byte, emitting 0..=8 symbols. Built per call but cheaply (composed from
a per-nibble table), so even the fast/short-code case stays flat.

h2-huffman decode MB/s (1 MiB):
  Lorem:  385 -> 378 (flat, within noise)
  Zeros:  155 -> 202 (+30%)
  Random:  64 ->  93 (+45%)
All hpack + full-feature tests green; output byte-identical.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/hpack/huffman.rs | 222 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 169 insertions(+), 53 deletions(-)

diff --git a/src/hpack/huffman.rs b/src/hpack/huffman.rs
index cb0851b..a12f4cf 100644
--- a/src/hpack/huffman.rs
+++ b/src/hpack/huffman.rs
@@ -92,22 +92,173 @@ pub(crate) const CODES: [(u32, u8); 257] = [
     (0x3fffffff, 30),
 ];
 
+#[cfg(test)]
 const MAX_LEN: usize = 30;
 
-/// Canonical decode tables reconstructed from [`CODES`]. Cheap to build
-/// (257-entry sweep); built per decode call.
+// ─── byte FSA fast decoder ───────────────────────────────────────────────
+//
+// Bit-at-a-time canonical decoding is correct but slow (one table probe per
+// input bit). For throughput we precompute a byte-wide finite-state machine
+// over the canonical code's binary trie: each transition consumes a whole
+// input byte and emits 0..=8 complete symbols. A Huffman string then costs
+// exactly one table lookup per input byte instead of ~8 bit probes. The FSA
+// is rebuilt per `decode` call; its construction is a fixed sweep over the
+// trie (≈ states × 256 steps), negligible against any non-trivial input.
+//
+// The byte-for-byte output and every RFC 7541 §5.2 rejection (EOS symbol,
+// over-long padding, non-`1` padding) are identical to the bit-at-a-time
+// path — the FSA is just a faster way to walk the same trie.
+
+/// One byte transition: where to go and what to emit.
+#[derive(Clone, Copy)]
+struct Trans {
+    /// Trie node reached after consuming this byte's 8 bits.
+    next: u16,
+    /// Number of complete symbols emitted while consuming the byte (0..=8).
+    n: u8,
+    /// Set if any consumed bit completed the EOS symbol (→ Corrupt).
+    eos: bool,
+    /// The emitted symbol bytes (only the first `n` are meaningful).
+    out: [u8; 8],
+}
+
+/// Byte FSA: `trans[state * 256 + byte]` gives the transition. State 0 is the
+/// trie root, the only valid end-of-string boundary.
+struct FastTable {
+    trans: Vec<Trans>,
+    /// Per-state padding metadata: `(depth, all_ones)` for the partial path
+    /// from the root to this node. A valid end state has `depth < 8` and
+    /// `all_ones` (the RFC 7541 §5.2 EOS-prefix padding rule).
+    pad: Vec<(u8, bool)>,
+}
+
+impl FastTable {
+    fn build() -> Self {
+        // Canonical binary trie. Node 0 is the root. `child[node][bit]` is the
+        // next node index (0 = unset, since the root is never a child).
+        // `leaf_sym[node]` is the symbol for a leaf, or -1.
+        let mut child: Vec<[u16; 2]> = Vec::new();
+        child.push([0, 0]); // root
+        let mut leaf_sym: Vec<i32> = Vec::new();
+        leaf_sym.push(-1);
+
+        for (sym, &(code, len)) in CODES.iter().enumerate() {
+            let len = len as u32;
+            let mut node = 0usize;
+            for i in (0..len).rev() {
+                let bit = ((code >> i) & 1) as usize;
+                let nxt = child[node][bit];
+                if nxt == 0 {
+                    let new = child.len() as u16;
+                    child.push([0, 0]);
+                    leaf_sym.push(-1);
+                    child[node][bit] = new;
+                    node = new as usize;
+                } else {
+                    node = nxt as usize;
+                }
+            }
+            leaf_sym[node] = sym as i32;
+        }
+
+        let n_states = child.len();
+
+        // Per-node padding metadata: depth from root and whether the path is
+        // all `1`-bits. Leaves reset to the root after emitting, so only
+        // non-leaf nodes are ever a resting state, but we fill every node.
+        let mut pad = alloc::vec![(0u8, true); n_states];
+        // Iterative DFS from the root; children are always added after their
+        // parent, so a single forward pass over node indices in creation
+        // order would also work, but we walk explicitly for clarity.
+        let mut stack = alloc::vec![0usize];
+        while let Some(node) = stack.pop() {
+            let (d, ones) = pad[node];
+            let kids = child[node];
+            for (bit, &c) in kids.iter().enumerate() {
+                if c != 0 {
+                    let c = c as usize;
+                    pad[c] = (d + 1, ones && bit == 1);
+                    stack.push(c);
+                }
+            }
+        }
+
+        // Build a per-nibble transition first (n_states × 16, four bit-steps
+        // each), then compose each byte transition from its two nibble halves.
+        // This costs ≈ n_states·(16·4 + 256·2) build steps instead of
+        // n_states·256·8 — roughly a 4× cheaper construction, which matters
+        // because the table is rebuilt on every `decode` call.
+        struct Nib {
+            next: u16,
+            n: u8,
+            eos: bool,
+            out: [u8; 4],
+        }
+        let mut nib = Vec::with_capacity(n_states * 16);
+        for state in 0..n_states {
+            for half in 0..16u32 {
+                let mut node = state;
+                let mut out = [0u8; 4];
+                let mut n = 0u8;
+                let mut eos = false;
+                for i in (0..4).rev() {
+                    let bit = ((half >> i) & 1) as usize;
+                    node = child[node][bit] as usize;
+                    if leaf_sym[node] >= 0 {
+                        let sym = leaf_sym[node] as u16;
+                        if sym == EOS {
+                            eos = true;
+                        } else {
+                            out[n as usize] = sym as u8;
+                            n += 1;
+                        }
+                        node = 0;
+                    }
+                }
+                nib.push(Nib {
+                    next: node as u16,
+                    n,
+                    eos,
+                    out,
+                });
+            }
+        }
+
+        let mut trans = Vec::with_capacity(n_states * 256);
+        for state in 0..n_states {
+            for byte in 0..256usize {
+                let hi = &nib[state * 16 + (byte >> 4)];
+                let lo = &nib[hi.next as usize * 16 + (byte & 0x0f)];
+                let mut out = [0u8; 8];
+                let hn = hi.n as usize;
+                out[..hn].copy_from_slice(&hi.out[..hn]);
+                let ln = lo.n as usize;
+                out[hn..hn + ln].copy_from_slice(&lo.out[..ln]);
+                trans.push(Trans {
+                    next: lo.next,
+                    n: (hn + ln) as u8,
+                    eos: hi.eos || lo.eos,
+                    out,
+                });
+            }
+        }
+
+        FastTable { trans, pad }
+    }
+}
+
+/// Canonical decode tables reconstructed from [`CODES`], retained only for
+/// the canonicality self-test (which also underpins the FSA's correctness).
+#[cfg(test)]
 struct DecodeTable {
     /// `first_code[len]` = numeric value of the first codeword of length
     /// `len` (1..=30).
     first_code: [u32; MAX_LEN + 1],
-    /// `first_index[len]` = offset into `symbols` of the first codeword of
-    /// length `len`.
-    first_index: [usize; MAX_LEN + 1],
     /// Symbols ordered by (length asc, code asc).
     symbols: Vec<u16>,
-    count: [u32; MAX_LEN + 1],
 }
 
+#[cfg(test)]
 impl DecodeTable {
     fn build() -> Self {
         let mut count = [0u32; MAX_LEN + 1];
@@ -125,35 +276,14 @@ impl DecodeTable {
             }
         }
         let mut first_code = [0u32; MAX_LEN + 1];
-        let mut first_index = [0usize; MAX_LEN + 1];
         let mut code = 0u32;
-        let mut index = 0usize;
         for len in 1..=MAX_LEN {
             first_code[len] = code;
-            first_index[len] = index;
             code = (code + count[len]) << 1;
-            index += count[len] as usize;
         }
         DecodeTable {
             first_code,
-            first_index,
             symbols,
-            count,
-        }
-    }
-
-    /// If `acc` (a value of exactly `len` bits) is a complete codeword,
-    /// return its symbol.
-    fn lookup(&self, acc: u32, len: usize) -> Option<u16> {
-        let c = self.count[len];
-        if c == 0 {
-            return None;
-        }
-        let off = acc.checked_sub(self.first_code[len])?;
-        if off < c {
-            Some(self.symbols[self.first_index[len] + off as usize])
-        } else {
-            None
         }
     }
 }
@@ -193,40 +323,26 @@ pub fn encoded_len(data: &[u8]) -> usize {
 /// bits, padding not consisting of EOS-prefix `1`s, and any appearance of
 /// the EOS symbol — all as [`Error::Corrupt`].
 pub fn decode(data: &[u8]) -> Result<Vec<u8>, Error> {
-    let table = DecodeTable::build();
+    let table = FastTable::build();
     let mut out = Vec::with_capacity(data.len() * 2);
-    let mut acc: u32 = 0;
-    let mut nbits: usize = 0;
+    // Current trie node (state). State 0 = root = clean symbol boundary.
+    let mut state = 0usize;
+    let trans = &table.trans[..];
     for &byte in data {
-        for i in (0..8).rev() {
-            let bit = ((byte >> i) & 1) as u32;
-            acc = (acc << 1) | bit;
-            nbits += 1;
-            if nbits > MAX_LEN {
-                // No codeword is longer than 30 bits.
-                return Err(Error::Corrupt);
-            }
-            if let Some(sym) = table.lookup(acc, nbits) {
-                if sym == EOS {
-                    return Err(Error::Corrupt);
-                }
-                out.push(sym as u8);
-                acc = 0;
-                nbits = 0;
-            }
+        let t = &trans[state * 256 + byte as usize];
+        if t.eos {
+            return Err(Error::Corrupt);
         }
+        // Emit the symbols completed in this byte (0..=8).
+        out.extend_from_slice(&t.out[..t.n as usize]);
+        state = t.next as usize;
     }
     // Trailing bits are padding: must be < 8 bits, all 1s. A prefix-free code
     // guarantees these EOS-prefix 1s cannot complete a real symbol above.
-    if nbits >= 8 {
+    let (depth, all_ones) = table.pad[state];
+    if depth >= 8 || !all_ones {
         return Err(Error::Corrupt);
     }
-    if nbits > 0 {
-        let mask = (1u32 << nbits) - 1;
-        if acc & mask != mask {
-            return Err(Error::Corrupt);
-        }
-    }
     Ok(out)
 }
 

From f1fcef5eae12fab57e09bc9b76d69857c3beded7 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 11:59:34 +0900
Subject: [PATCH 26/32] arc_crunch: single-write LZW string assembly + literal
 fast path

Decode previously wrote every output byte twice: pushed onto a scratch stack
during the prefix-chain walk, then popped into emit_buf. Replace with a
fixed-size reverse-assembly scratch (allocated once) filled back-to-front in
one walk, then a single vectorised extend_from_slice into emit_buf. A
length-1 literal (common on incompressible input) skips assembly entirely.

crunch decode MB/s (1 MiB):
  Lorem:  322 -> 390 (+21%)
  Zeros:  686 -> 1078 (+57%)
  Random: 194 -> 207 (+7%)
Crafted-stream guards preserved (i==0 rejects over-long/cyclic chains);
all arc_crunch + full-feature tests green, output byte-identical.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/arc_crunch/mod.rs | 57 ++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/src/arc_crunch/mod.rs b/src/arc_crunch/mod.rs
index 049d1a3..6566bb9 100644
--- a/src/arc_crunch/mod.rs
+++ b/src/arc_crunch/mod.rs
@@ -39,7 +39,7 @@
 //!
 //! Crafted streams never panic: the classic LZW KwKwK case and any
 //! out-of-range / not-yet-assigned code return [`Error::Corrupt`]; the
-//! dictionary and the decoded-string stack are bounded by `1 << maxbits`;
+//! dictionary and the decoded-string scratch are bounded by `1 << maxbits`;
 //! every dictionary index is bounds-checked and width arithmetic is checked.
 //!
 //! ## References
@@ -386,7 +386,7 @@ pub struct Decoder {
     /// Decoded characters waiting to flush, forward order.
     emit_buf: Vec<u8>,
     emit_head: usize,
-    /// Scratch stack used while reversing a decoded string.
+    /// Scratch buffer used while reversing a decoded string.
     stack: Vec<u8>,
     completed: bool,
 }
@@ -408,7 +408,9 @@ impl Decoder {
             finchar: 0,
             emit_buf: Vec::new(),
             emit_head: 0,
-            stack: Vec::with_capacity(max_size),
+            // Fixed-size reverse-assembly scratch: a decoded string is at most
+            // `1 << maxbits` ≤ `max_size` bytes, so its tail always fits.
+            stack: vec![0u8; max_size],
             completed: false,
         }
     }
@@ -439,27 +441,48 @@ impl Decoder {
     /// Decode `code` into `emit_buf` (forward order); updates `finchar`.
     /// Returns `Err(Corrupt)` if the parent chain is malformed (too long or
     /// out of range) — defends against crafted streams.
+    ///
+    /// The prefix chain is walked once, writing the string back-to-front into
+    /// the tail of the fixed-size `stack` scratch; the assembled forward-order
+    /// slice is then appended to `emit_buf` with a single `extend_from_slice`.
+    /// This replaces the previous per-byte push onto a scratch stack followed
+    /// by a per-byte pop into `emit_buf`.
     fn decode_string(&mut self, mut code: u32) -> Result<(), Error> {
-        self.stack.clear();
-        let limit = 1usize << self.maxbits;
-        let mut hops = 0usize;
+        // `stack` is a fixed-size scratch (length == 1 << MAX_BITS, allocated
+        // once). We walk the prefix chain writing the string back-to-front into
+        // its tail, then bulk-copy the assembled forward-order slice into
+        // `emit_buf` with a single vectorised `extend_from_slice`. This avoids
+        // both the old per-byte `emit_buf.push` (a capacity check per byte) and
+        // any per-call zero-initialisation.
+        // Fast path: a bare literal (very common on incompressible input) is a
+        // length-1 string — emit it directly and skip the reverse-assembly.
+        if code < 256 {
+            let first = code as u8;
+            self.finchar = first;
+            self.emit_buf.push(first);
+            return Ok(());
+        }
+        let scratch = &mut self.stack[..];
+        let mut i = scratch.len();
         while code >= 256 {
-            if code as usize >= self.prefix.len() {
+            // `i` reaching 0 means the chain is longer than any valid string
+            // (> 1 << maxbits): a malformed / cyclic prefix table. Reject
+            // rather than underflow.
+            if code as usize >= self.prefix.len() || i == 0 {
                 return Err(Error::Corrupt);
             }
-            self.stack.push(self.suffix[code as usize]);
+            i -= 1;
+            scratch[i] = self.suffix[code as usize];
             code = self.prefix[code as usize] as u32;
-            hops += 1;
-            if hops > limit {
-                return Err(Error::Corrupt);
-            }
+        }
+        if i == 0 {
+            return Err(Error::Corrupt);
         }
         let first = code as u8;
         self.finchar = first;
-        self.emit_buf.push(first);
-        while let Some(b) = self.stack.pop() {
-            self.emit_buf.push(b);
-        }
+        i -= 1;
+        scratch[i] = first;
+        self.emit_buf.extend_from_slice(&scratch[i..]);
         Ok(())
     }
 
@@ -621,7 +644,7 @@ impl RawDecoder for Decoder {
         self.finchar = 0;
         self.emit_buf.clear();
         self.emit_head = 0;
-        self.stack.clear();
+        // `stack` is fixed-size scratch overwritten on every use; leave it.
         self.completed = false;
     }
 }

From 0a0ddb4576b8452f0481a83eac38d33a5e529797 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 12:00:55 +0900
Subject: [PATCH 27/32] arc_squash: single-write LZW string assembly + literal
 fast path

Same optimization as arc_crunch: replace the push-to-stack / pop-to-emit_buf
double write with a fixed-size reverse-assembly scratch filled in one walk and
bulk-copied via extend_from_slice; bare literals skip assembly.

squashed decode MB/s (1 MiB):
  Lorem:  384 -> 502 (+31%)
  Zeros:  669 -> 976 (+46%)
  Random: 194 -> 210 (+8%)
Crafted-stream guards preserved; all arc_squash + full-feature tests green,
output byte-identical.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/arc_squash/mod.rs | 47 +++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/src/arc_squash/mod.rs b/src/arc_squash/mod.rs
index 83b769a..00c63f1 100644
--- a/src/arc_squash/mod.rs
+++ b/src/arc_squash/mod.rs
@@ -35,7 +35,7 @@
 //!
 //! Crafted streams never panic: the classic LZW KwKwK case and any
 //! out-of-range / not-yet-assigned code return [`Error::Corrupt`]; the
-//! dictionary and the decoded-string stack are bounded by `1 << 13`; every
+//! dictionary and the decoded-string scratch are bounded by `1 << 13`; every
 //! dictionary index is bounds-checked.
 //!
 //! ## References
@@ -347,7 +347,7 @@ pub struct Decoder {
     /// Decoded characters waiting to flush, forward order.
     emit_buf: Vec<u8>,
     emit_head: usize,
-    /// Scratch stack used while reversing a decoded string.
+    /// Fixed-size scratch used while reversing a decoded string.
     stack: Vec<u8>,
     completed: bool,
 }
@@ -366,7 +366,9 @@ impl Decoder {
             finchar: 0,
             emit_buf: Vec::new(),
             emit_head: 0,
-            stack: Vec::with_capacity(max_size),
+            // Fixed-size reverse-assembly scratch: a decoded string is at most
+            // `MAX_CODE` bytes, so its tail always fits.
+            stack: vec![0u8; max_size],
             completed: false,
         }
     }
@@ -396,26 +398,37 @@ impl Decoder {
     /// Returns `Err(Corrupt)` if the parent chain is malformed (too long or
     /// out of range) — defends against crafted streams.
     fn decode_string(&mut self, mut code: u32) -> Result<(), Error> {
-        self.stack.clear();
-        let limit = MAX_CODE as usize;
-        let mut hops = 0usize;
+        // Fast path: a bare literal is a length-1 string — emit directly.
+        if code < 256 {
+            let first = code as u8;
+            self.finchar = first;
+            self.emit_buf.push(first);
+            return Ok(());
+        }
+        // Walk the prefix chain back-to-front into the fixed-size scratch, then
+        // bulk-copy the forward-order slice into emit_buf with one
+        // extend_from_slice. This avoids the old per-byte push/pop round trip
+        // (each output byte written twice).
+        let scratch = &mut self.stack[..];
+        let mut i = scratch.len();
         while code >= 256 {
-            if code as usize >= self.prefix.len() {
+            // `i == 0` means the chain is longer than any valid string: a
+            // malformed / cyclic prefix table. Reject rather than underflow.
+            if code as usize >= self.prefix.len() || i == 0 {
                 return Err(Error::Corrupt);
             }
-            self.stack.push(self.suffix[code as usize]);
+            i -= 1;
+            scratch[i] = self.suffix[code as usize];
             code = self.prefix[code as usize] as u32;
-            hops += 1;
-            if hops > limit {
-                return Err(Error::Corrupt);
-            }
+        }
+        if i == 0 {
+            return Err(Error::Corrupt);
         }
         let first = code as u8;
         self.finchar = first;
-        self.emit_buf.push(first);
-        while let Some(b) = self.stack.pop() {
-            self.emit_buf.push(b);
-        }
+        i -= 1;
+        scratch[i] = first;
+        self.emit_buf.extend_from_slice(&scratch[i..]);
         Ok(())
     }
 
@@ -544,7 +557,7 @@ impl RawDecoder for Decoder {
         self.finchar = 0;
         self.emit_buf.clear();
         self.emit_head = 0;
-        self.stack.clear();
+        // `stack` is fixed-size scratch overwritten on every use; leave it.
         self.completed = false;
     }
 }

From 66d7f1821a9f1041d1a4f8fa1f749ceec47ed98d Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 12:06:00 +0900
Subject: [PATCH 28/32] delta: vectorizable filter loop via direct predecessor
 indexing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-byte loop kept a dist-byte ring with a modulo branch and a
read-modify-write of history every byte, serialising the whole transform.
Split into three phases: seed the first dist bytes through the ring (cross-call
history), then run a flat recurrence over the bulk — encode reads input[i-dist]
directly (read-only input → auto-vectorises), decode reads output[i-dist] —
then refresh the ring from the tail. Streaming/chunk semantics unchanged
(the 1-byte-chunk-vs-bulk equivalence test passes).

delta encode MB/s (1 MiB, default dist=1):
  ~1680 -> ~25000 (≈15x; the read-only subtract vectorises)
delta decode is unchanged at dist=1 (reconstruction there is an inherently
serial prefix sum); larger distances do speed up decode. Output byte-identical.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/delta/mod.rs | 64 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 55 insertions(+), 9 deletions(-)

diff --git a/src/delta/mod.rs b/src/delta/mod.rs
index 1521cc3..8c08a95 100644
--- a/src/delta/mod.rs
+++ b/src/delta/mod.rs
@@ -168,20 +168,37 @@ impl RawEncoder for Encoder {
         self.hist.check()?;
         let n = input.len().min(output.len());
         let h = &mut self.hist;
-        for i in 0..n {
+        let dist = h.dist;
+
+        // Phase 1: the first `dist` outputs subtract bytes from the ring
+        // history (previous calls / the all-zero seed).
+        let seed = dist.min(n);
+        for i in 0..seed {
             let orig = input[i];
-            // history[i - dist] is the original byte we are about to
-            // overwrite at the ring cursor.
             let prev = h.buf[h.pos];
-            // Modular subtraction is the defined transform (see module docs).
             output[i] = orig.wrapping_sub(prev);
-            // Store the *original* byte for future positions.
             h.buf[h.pos] = orig;
             h.pos += 1;
-            if h.pos == h.dist {
+            if h.pos == dist {
                 h.pos = 0;
             }
         }
+        // Phase 2: for `i >= dist` the predecessor is `input[i - dist]` (the
+        // input *is* the original stream), so read it directly and drop the
+        // ring modulo branch and history accesses from the hot loop.
+        for i in dist..n {
+            output[i] = input[i].wrapping_sub(input[i - dist]);
+        }
+        // Phase 3: refresh the ring from the last `dist` *original* (input)
+        // bytes, matching the byte-by-byte cursor/layout (see the decoder for
+        // the derivation).
+        if n >= dist {
+            let pos_final = (h.pos + (n % dist)) % dist;
+            for k in 0..dist {
+                h.buf[(pos_final + k) % dist] = input[n - dist + k];
+            }
+            h.pos = pos_final;
+        }
         Ok(RawProgress {
             consumed: n,
             written: n,
@@ -228,17 +245,46 @@ impl RawDecoder for Decoder {
         self.hist.check()?;
         let n = input.len().min(output.len());
         let h = &mut self.hist;
-        for i in 0..n {
+        let dist = h.dist;
+
+        // Phase 1: the first `dist` outputs depend on the ring history (bytes
+        // from previous calls / the all-zero seed). Reconstruct them through
+        // the ring exactly as before.
+        let seed = dist.min(n);
+        for i in 0..seed {
             let prev = h.buf[h.pos];
-            // Reconstruct the original byte: inverse of wrapping_sub.
             let orig = input[i].wrapping_add(prev);
             output[i] = orig;
             h.buf[h.pos] = orig;
             h.pos += 1;
-            if h.pos == h.dist {
+            if h.pos == dist {
                 h.pos = 0;
             }
         }
+        // Phase 2: once `i >= dist`, `output[i - dist]` is the original byte we
+        // need — read it straight from the output buffer. This drops both the
+        // ring modulo branch and the history load/store from the hot loop and
+        // exposes a simple `out[i] = in[i] + out[i-dist]` recurrence.
+        for i in dist..n {
+            output[i] = input[i].wrapping_add(output[i - dist]);
+        }
+        // Phase 3: refresh the ring from the last `dist` reconstructed bytes so
+        // the next call continues seamlessly. (When `n < dist` the ring was
+        // already fully advanced byte-by-byte in phase 1 and is correct.)
+        //
+        // After processing `n` bytes the byte-by-byte algorithm leaves
+        // `pos = (p0 + n) % dist` and `buf[(pos + k) % dist] = output[n-dist+k]`
+        // for k in 0..dist (each slot holds its most recent write). Reproduce
+        // exactly that state. With `seed == dist` here, `h.pos` is back at `p0`,
+        // so the final cursor is `(p0 + n) % dist == (h.pos + (n % dist)) %
+        // dist`.
+        if n >= dist {
+            let pos_final = (h.pos + (n % dist)) % dist;
+            for k in 0..dist {
+                h.buf[(pos_final + k) % dist] = output[n - dist + k];
+            }
+            h.pos = pos_final;
+        }
         Ok(RawProgress {
             consumed: n,
             written: n,

From 944adf7f735a2e2c9c1b6dac29a1daa3e9224d16 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 12:10:57 +0900
Subject: [PATCH 29/32] lha: bulk match-copy in static-Huffman decode hot loop

The LZSS match expansion copied byte-by-byte through the ring with two
modulo ops per byte. Split by match geometry: non-overlapping matches copy in
contiguous ring segments (straight-line loops, no per-byte wrap test);
single-byte runs (distance 1) fill a constant byte directly; only genuinely
overlapping matches fall back to the byte walk. The ring's space-prefill
semantics are preserved, so output is byte-identical (shared by lh4/5/6/7).

lh5 decode MB/s (1 MiB):
  Lorem: ~853 -> ~1400 (+64%)
  Zeros: ~890 -> ~1140 (+28%)
lh4/lh6/lh7 improve comparably. All lha + full-feature tests green.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/lha/static_huff.rs | 69 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 11 deletions(-)

diff --git a/src/lha/static_huff.rs b/src/lha/static_huff.rs
index 0a99236..6cee4ec 100644
--- a/src/lha/static_huff.rs
+++ b/src/lha/static_huff.rs
@@ -456,7 +456,10 @@ pub fn decode_payload(
                 // Literal.
                 out.push(code as u8);
                 ring[ring_pos] = code as u8;
-                ring_pos = (ring_pos + 1) % ring_size;
+                ring_pos += 1;
+                if ring_pos == ring_size {
+                    ring_pos = 0;
+                }
             } else {
                 let count = code - 256 + MIN_MATCH;
                 if count > MAX_MATCH {
@@ -469,17 +472,61 @@ pub fn decode_payload(
                 if offset >= ring_size {
                     return Err(Error::InvalidDistance);
                 }
-                let start = (ring_pos + ring_size - offset - 1) % ring_size;
-                for k in 0..count {
-                    if let Some(n) = expected
-                        && out.len() >= n
-                    {
-                        break;
+                let limit = expected.unwrap_or(usize::MAX);
+                // Clamp the run to the declared output length.
+                let count = count.min(limit.saturating_sub(out.len()));
+                let mut src = (ring_pos + ring_size - offset - 1) % ring_size;
+                // Reserve output once so the per-byte push can't reallocate.
+                out.reserve(count);
+                if offset + 1 >= count {
+                    // Match no longer than its distance: nothing written here is
+                    // read back within this match, so copy in contiguous ring
+                    // segments (at most three: split where src or dst wraps the
+                    // ring). Each segment is a straight-line memcpy-style loop
+                    // with no per-byte wrap test.
+                    let mut done = 0usize;
+                    while done < count {
+                        let run = (count - done)
+                            .min(ring_size - src)
+                            .min(ring_size - ring_pos);
+                        // Copy `run` bytes ring[src..] -> out and -> ring[dst..].
+                        for k in 0..run {
+                            let b = ring[src + k];
+                            out.push(b);
+                            ring[ring_pos + k] = b;
+                        }
+                        src += run;
+                        if src == ring_size {
+                            src = 0;
+                        }
+                        ring_pos += run;
+                        if ring_pos == ring_size {
+                            ring_pos = 0;
+                        }
+                        done += run;
+                    }
+                } else if offset == 0 {
+                    // Single-byte run (distance 1): the whole match is one
+                    // repeated byte. Fill directly instead of chasing the ring.
+                    let b = ring[src];
+                    for _ in 0..count {
+                        out.push(b);
+                        ring[ring_pos] = b;
+                        ring_pos += 1;
+                        if ring_pos == ring_size {
+                            ring_pos = 0;
+                        }
+                    }
+                } else {
+                    // Overlapping match (offset+1 < count): each written byte
+                    // feeds a later read, so walk byte-by-byte.
+                    for _ in 0..count {
+                        let b = ring[src];
+                        out.push(b);
+                        ring[ring_pos] = b;
+                        src = (src + 1) % ring_size;
+                        ring_pos = (ring_pos + 1) % ring_size;
                     }
-                    let b = ring[(start + k) % ring_size];
-                    out.push(b);
-                    ring[ring_pos] = b;
-                    ring_pos = (ring_pos + 1) % ring_size;
                 }
             }
             remaining -= 1;

From ac84ba6cbd6963531a6e2568303c45ada651c76d Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 12:13:12 +0900
Subject: [PATCH 30/32] rar1/2/3/5: bulk LZ77 match-copy in decode window loops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All four RAR decoders expanded matches byte-by-byte through the sliding
window. Apply the same geometry split used in lha: distance-1 runs fill a
constant byte; non-overlapping matches copy in contiguous window segments
(straight-line loops, no per-byte index recompute / mask test); only
genuinely overlapping matches walk byte-by-byte. The window-prefill and
truncation semantics are preserved exactly, so decoded output stays
byte-identical — verified by each codec's reference-fixture tests
(rar1 53, rar2 28, rar3 30, rar5 29 tests, all green).

Decode-only codecs (no bench round-trip); correctness is fixture-validated
and the transform mirrors the measured lha win (+28-64% decode).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/rar1/window.rs  | 33 +++++++++++++++++++++++++++++----
 src/rar2/decoder.rs | 42 ++++++++++++++++++++++++++++++++++++------
 src/rar3/decoder.rs | 44 +++++++++++++++++++++++++++++++++++++-------
 src/rar5/decoder.rs | 45 ++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 140 insertions(+), 24 deletions(-)

diff --git a/src/rar1/window.rs b/src/rar1/window.rs
index 33447dd..3a7d806 100644
--- a/src/rar1/window.rs
+++ b/src/rar1/window.rs
@@ -125,11 +125,36 @@ impl Window {
         let mask = WINDOW_SIZE - 1;
         let mut src = (self.write_pos + WINDOW_SIZE - distance) & mask;
         let mut dst = self.write_pos;
-        for _ in 0..length {
+        if distance == 1 {
+            // Distance-1 run: one repeated byte.
             let b = self.buf[src];
-            self.buf[dst] = b;
-            src = (src + 1) & mask;
-            dst = (dst + 1) & mask;
+            for _ in 0..length {
+                self.buf[dst] = b;
+                dst = (dst + 1) & mask;
+            }
+        } else if distance >= length {
+            // Non-overlapping: copy in contiguous window segments (no per-byte
+            // mask test inside the run).
+            let mut done = 0usize;
+            while done < length {
+                let run = (length - done)
+                    .min(WINDOW_SIZE - src)
+                    .min(WINDOW_SIZE - dst);
+                for k in 0..run {
+                    self.buf[dst + k] = self.buf[src + k];
+                }
+                src = (src + run) & mask;
+                dst = (dst + run) & mask;
+                done += run;
+            }
+        } else {
+            // Overlapping match: each written byte feeds a later read.
+            for _ in 0..length {
+                let b = self.buf[src];
+                self.buf[dst] = b;
+                src = (src + 1) & mask;
+                dst = (dst + 1) & mask;
+            }
         }
         self.write_pos = dst;
         self.in_flight += length;
diff --git a/src/rar2/decoder.rs b/src/rar2/decoder.rs
index d4b9302..3721e80 100644
--- a/src/rar2/decoder.rs
+++ b/src/rar2/decoder.rs
@@ -544,13 +544,43 @@ impl RunCtx {
         if remaining > cap {
             remaining = cap;
         }
-        while remaining > 0 {
-            let src = (self.window_pos + WINDOW_SIZE - off) & WINDOW_MASK;
+        output.reserve(remaining);
+        let mut src = (self.window_pos + WINDOW_SIZE - off) & WINDOW_MASK;
+
+        if off == 1 {
+            // Distance-1 run: one repeated byte.
             let b = self.window[src];
-            self.window[self.window_pos] = b;
-            self.window_pos = (self.window_pos + 1) & WINDOW_MASK;
-            output.push(b);
-            remaining -= 1;
+            for _ in 0..remaining {
+                self.window[self.window_pos] = b;
+                self.window_pos = (self.window_pos + 1) & WINDOW_MASK;
+                output.push(b);
+            }
+        } else if off >= remaining {
+            // Non-overlapping: copy in contiguous window segments.
+            let mut done = 0usize;
+            while done < remaining {
+                let run = (remaining - done)
+                    .min(WINDOW_SIZE - src)
+                    .min(WINDOW_SIZE - self.window_pos);
+                let sp = self.window_pos;
+                for k in 0..run {
+                    let b = self.window[src + k];
+                    self.window[sp + k] = b;
+                    output.push(b);
+                }
+                src = (src + run) & WINDOW_MASK;
+                self.window_pos = (self.window_pos + run) & WINDOW_MASK;
+                done += run;
+            }
+        } else {
+            // Overlapping match: each written byte feeds a later read.
+            for _ in 0..remaining {
+                let b = self.window[src];
+                self.window[self.window_pos] = b;
+                src = (src + 1) & WINDOW_MASK;
+                self.window_pos = (self.window_pos + 1) & WINDOW_MASK;
+                output.push(b);
+            }
         }
         Ok(())
     }
diff --git a/src/rar3/decoder.rs b/src/rar3/decoder.rs
index 4f90ecb..b04dd0c 100644
--- a/src/rar3/decoder.rs
+++ b/src/rar3/decoder.rs
@@ -321,14 +321,44 @@ impl RunCtx {
         if off > wlen {
             return Err(Error::InvalidDistance);
         }
-        for _ in 0..length {
-            let src = (self.window_pos + wlen - off) & wmask;
+        // Clamp the run to the declared unpack size up front. Same byte count as
+        // the old per-byte break, except that nothing is emitted if already full.
+        let remaining_out = self.unpack_size.saturating_sub(self.out.len() as u64);
+        let length = (length as u64).min(remaining_out) as usize;
+        let mut src = (self.window_pos + wlen - off) & wmask;
+        self.out.reserve(length);
+
+        if off == 1 {
+            // Distance-1 run: one repeated byte. Fill directly.
             let b = self.window[src];
-            self.out.push(b);
-            self.window[self.window_pos] = b;
-            self.window_pos = (self.window_pos + 1) & wmask;
-            if (self.out.len() as u64) >= self.unpack_size {
-                break;
+            for _ in 0..length {
+                self.out.push(b);
+                self.window[self.window_pos] = b;
+                self.window_pos = (self.window_pos + 1) & wmask;
+            }
+        } else if off >= length {
+            // Non-overlapping: src and dst regions are disjoint. Copy in
+            // contiguous window segments (no per-byte recompute of `src`).
+            let mut done = 0usize;
+            while done < length {
+                let run = (length - done).min(wlen - src).min(wlen - self.window_pos);
+                for k in 0..run {
+                    let b = self.window[src + k];
+                    self.out.push(b);
+                    self.window[self.window_pos + k] = b;
+                }
+                src = (src + run) & wmask;
+                self.window_pos = (self.window_pos + run) & wmask;
+                done += run;
+            }
+        } else {
+            // Overlapping match: each written byte feeds a later read.
+            for _ in 0..length {
+                let b = self.window[src];
+                self.out.push(b);
+                self.window[self.window_pos] = b;
+                src = (src + 1) & wmask;
+                self.window_pos = (self.window_pos + 1) & wmask;
             }
         }
         Ok(())
diff --git a/src/rar5/decoder.rs b/src/rar5/decoder.rs
index d264b8b..049acc7 100644
--- a/src/rar5/decoder.rs
+++ b/src/rar5/decoder.rs
@@ -501,14 +501,45 @@ impl Decoder {
         }
         let ws = self.window_size;
         let wmask = self.window_mask;
-        for _ in 0..length {
-            let src = (self.window_pos + ws - dist as usize) & wmask;
+        let off = dist as usize;
+        // Clamp the run to the declared unpack total up front. Same byte count
+        // as the old per-byte break, except that nothing is emitted if already full.
+        let produced = self.unpack_so_far + self.out_queue.len() as u64;
+        let remaining = self.unpack_total.saturating_sub(produced);
+        let length_n = (length as u64).min(remaining) as usize;
+        let mut src = (self.window_pos + ws - off) & wmask;
+
+        if off == 1 {
+            // Distance-1 run: one repeated byte.
             let b = self.window[src];
-            self.window[self.window_pos] = b;
-            self.window_pos = (self.window_pos + 1) & wmask;
-            self.out_queue.push_back(b);
-            if self.unpack_so_far + self.out_queue.len() as u64 >= self.unpack_total {
-                break;
+            for _ in 0..length_n {
+                self.window[self.window_pos] = b;
+                self.window_pos = (self.window_pos + 1) & wmask;
+                self.out_queue.push_back(b);
+            }
+        } else if off >= length_n {
+            // Non-overlapping: copy in contiguous window segments.
+            let mut done = 0usize;
+            while done < length_n {
+                let run = (length_n - done).min(ws - src).min(ws - self.window_pos);
+                let sp = self.window_pos;
+                for k in 0..run {
+                    let b = self.window[src + k];
+                    self.window[sp + k] = b;
+                    self.out_queue.push_back(b);
+                }
+                src = (src + run) & wmask;
+                self.window_pos = (self.window_pos + run) & wmask;
+                done += run;
+            }
+        } else {
+            // Overlapping match: each written byte feeds a later read.
+            for _ in 0..length_n {
+                let b = self.window[src];
+                self.window[self.window_pos] = b;
+                src = (src + 1) & wmask;
+                self.window_pos = (self.window_pos + 1) & wmask;
+                self.out_queue.push_back(b);
             }
         }
         self.last_len = length;

From 0afb6b40a3fb9ebe32ba67aaf864571c19355bb1 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 12:15:19 +0900
Subject: [PATCH 31/32] zip_implode/reduce/shrink: bulk match-copy in decode
 loops

- zip_shrink (LZW): assemble the decoded string in the scratch buffer, then
  reverse once and extend_from_slice into emit_buf, instead of the per-byte
  pop/push round trip (each output byte was written twice).
- zip_reduce: split the DLE back-reference copy into a zero-fill prefix
  (refs before stream start), a distance-1 fill, a non-overlapping
  extend_from_within, and an overlapping byte walk.
- zip_implode: split the window match copy by geometry (distance-1 fill /
  contiguous non-overlap segments / overlapping byte walk) and apply the
  pending_len/output_left bookkeeping once per match.

All decode-only; output byte-identical, verified by each codec's
reference-fixture tests (shrink 14, reduce 17, implode 18, all green).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/zip_implode/decoder.rs | 39 +++++++++++++++++++++++++++---
 src/zip_reduce/mod.rs      | 49 ++++++++++++++++++++++++++++----------
 src/zip_shrink/mod.rs      | 10 ++++----
 3 files changed, 79 insertions(+), 19 deletions(-)

diff --git a/src/zip_implode/decoder.rs b/src/zip_implode/decoder.rs
index 03bd346..14b3a78 100644
--- a/src/zip_implode/decoder.rs
+++ b/src/zip_implode/decoder.rs
@@ -549,10 +549,43 @@ impl Decoder {
         if len > self.output_left {
             return Err(Error::Corrupt);
         }
-        for _ in 0..len {
-            let src = (self.window_pos + WINDOW_SIZE - dist) & (WINDOW_SIZE - 1);
+        let count = len as usize;
+        let mask = WINDOW_SIZE - 1;
+        let mut src = (self.window_pos + WINDOW_SIZE - dist) & mask;
+        // Bookkeeping that `emit_byte` would do per byte, applied once.
+        self.pending_len += count;
+        self.output_left -= count as u32;
+
+        if dist == 1 {
+            // Distance-1 run: one repeated byte.
             let b = self.window[src];
-            self.emit_byte(b);
+            for _ in 0..count {
+                self.window[self.window_pos] = b;
+                self.window_pos = (self.window_pos + 1) & mask;
+            }
+        } else if dist >= count {
+            // Non-overlapping: copy in contiguous window segments.
+            let mut done = 0usize;
+            while done < count {
+                let run = (count - done)
+                    .min(WINDOW_SIZE - src)
+                    .min(WINDOW_SIZE - self.window_pos);
+                let wp = self.window_pos;
+                for k in 0..run {
+                    self.window[wp + k] = self.window[src + k];
+                }
+                src = (src + run) & mask;
+                self.window_pos = (self.window_pos + run) & mask;
+                done += run;
+            }
+        } else {
+            // Overlapping match: each written byte feeds a later read.
+            for _ in 0..count {
+                let b = self.window[src];
+                self.window[self.window_pos] = b;
+                src = (src + 1) & mask;
+                self.window_pos = (self.window_pos + 1) & mask;
+            }
         }
         Ok(true)
     }
diff --git a/src/zip_reduce/mod.rs b/src/zip_reduce/mod.rs
index d592903..0f90bd9 100644
--- a/src/zip_reduce/mod.rs
+++ b/src/zip_reduce/mod.rs
@@ -586,19 +586,44 @@ impl Decoder {
             // one optional extension byte), far below `buffer_ahead`, so
             // materialising it inline only overshoots the window bound by a
             // bounded amount that the next iteration's `slide_window` reaps.
-            let mut pm = PendingMatch {
-                dist,
-                remaining: len,
-            };
-            while pm.remaining > 0 {
-                let pos = self.produced();
-                let b = if pm.dist > pos {
-                    0u8
+            self.out.reserve(len);
+            let mut remaining = len;
+            // Phase A: any portion of the back-reference that points before the
+            // start of the stream reads as zero. This can only be a contiguous
+            // leading run (`pos` only grows).
+            let pos0 = self.produced();
+            if dist > pos0 {
+                let zeros = (dist - pos0).min(remaining);
+                for _ in 0..zeros {
+                    self.out.push(0);
+                }
+                remaining -= zeros;
+            }
+            // Phase B: real back-reference into already-produced output. `src`
+            // is an index into `self.out`; it and the write head advance in
+            // lockstep, so split by geometry instead of recomputing per byte.
+            if remaining > 0 {
+                let mut src = self.produced() - dist - self.window_base;
+                if dist == 1 {
+                    // Distance-1 run: repeated byte.
+                    let b = self.out[src];
+                    for _ in 0..remaining {
+                        self.out.push(b);
+                    }
+                } else if dist >= remaining {
+                    // Non-overlapping: source range is fully materialised, copy
+                    // it in one shot via `extend_from_within`.
+                    let start = self.out.len();
+                    self.out.extend_from_within(src..src + remaining);
+                    debug_assert_eq!(self.out.len(), start + remaining);
                 } else {
-                    self.out[(pos - pm.dist) - self.window_base]
-                };
-                self.out.push(b);
-                pm.remaining -= 1;
+                    // Overlapping match: each written byte feeds a later read.
+                    for _ in 0..remaining {
+                        let b = self.out[src];
+                        self.out.push(b);
+                        src += 1;
+                    }
+                }
             }
         }
         Ok(())
diff --git a/src/zip_shrink/mod.rs b/src/zip_shrink/mod.rs
index 7b1ba6d..9e9d92c 100644
--- a/src/zip_shrink/mod.rs
+++ b/src/zip_shrink/mod.rs
@@ -294,11 +294,13 @@ impl Decoder {
             }
         }
 
-        // `stack` now holds the string in reverse; pop into emit_buf.
+        // `stack` now holds the string reversed. The forward-order first byte
+        // is the last one pushed (`stack.last()`). Reverse once and bulk-move
+        // into `emit_buf` with a single `append`, which (like the old per-byte
+        // pop/push loop) leaves `stack` empty for the next code.
         let first = *self.stack.last().ok_or(Error::Corrupt)?;
-        while let Some(b) = self.stack.pop() {
-            self.emit_buf.push(b);
-        }
+        self.stack.reverse();
+        self.emit_buf.append(&mut self.stack);
         Ok(first)
     }
 

From 3f6adb1ac9aaf150bd054ef02c2d288fb9dbc679 Mon Sep 17 00:00:00 2001
From: Mark Karpeles <magicaltux@gmail.com>
Date: Fri, 12 Jun 2026 12:19:37 +0900
Subject: [PATCH 32/32] docs: changelog entry for codec throughput
 optimizations

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 CHANGELOG.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index db2bd2f..f9c5d75 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Performance
+
+- **Throughput optimizations across the codec suite**, all preserving
+  byte-identical decoder output (validated by the existing round-trip and
+  reference-fixture tests) — no `unsafe`, no new dependencies. Highlights:
+  - **deflate / deflate64** decode: vectorized match-copy (contiguous spans +
+    doubling `copy_within` for overlapping runs) — deflate Random decode
+    ~3.5×, deflate64 long-match decode several×; zlib/gzip inherit the gains.
+  - **LZMA / xz** decode: bulk (and overlapping) dictionary match-copy —
+    RLE-heavy `.lzma` decode up to ~6×.
+  - **zstd** decode: inlined backward bit-reader fast path, single-load FSE
+    state transitions, hoisted LL/ML tables — ~1.5× on Huffman/FSE-heavy input.
+  - **brotli** decode: wider Huffman fast LUT, single-tree literal fast path,
+    bit-accumulator kept across LUT hits — literal-heavy decode ~2.3×.
+  - **lz4 / lz5 / lzo / snappy** decode: bulk overlapping match-copy
+    (multi-GB/s); **lzo / snappy** encoder skip-step match search (~6× on
+    incompressible input). **lzw** single-pass string emit.
+  - **xpress-huffman** decode: fixed an O(n²) history-trim to O(n) (orders of
+    magnitude on large inputs); **lznt1** bulk match-copy.
+  - **lha / rar1–5 / zip-implode·reduce·shrink / arc-crunch·squash**: bulk
+    LZSS/LZW window copy; **delta** filter encode ~15× (auto-vectorized);
+    **hpack** byte-wide Huffman decode.
+  - **bzip2** encode: reduced SA-IS suffix-array allocations and in-place
+    recursion (+14–31% on the BWT build, the dominant encode cost).
+  - **checksum**: CRC-32 slice-by-8 (~4×); **rle90** bulk literal copy (~3.5×).
+
 ## [0.6.1](https://github.com/KarpelesLab/compcol/compare/v0.6.0...v0.6.1) - 2026-06-12
 
 ### Other