KarpelesLab · MagicalTux · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Performance
+
+- **Throughput optimizations across the codec suite**, all preserving
+  byte-identical decoder output (validated by the existing round-trip and
+  reference-fixture tests) — no `unsafe`, no new dependencies. Highlights:
+  - **deflate / deflate64** decode: vectorized match-copy (contiguous spans +
+    doubling `copy_within` for overlapping runs) — deflate Random decode
+    ~3.5×, deflate64 long-match decode several×; zlib/gzip inherit the gains.
+  - **LZMA / xz** decode: bulk (and overlapping) dictionary match-copy —
+    RLE-heavy `.lzma` decode up to ~6×.
+  - **zstd** decode: inlined backward bit-reader fast path, single-load FSE
+    state transitions, hoisted LL/ML tables — ~1.5× on Huffman/FSE-heavy input.
+  - **brotli** decode: wider Huffman fast LUT, single-tree literal fast path,
+    bit-accumulator kept across LUT hits — literal-heavy decode ~2.3×.
+  - **lz4 / lz5 / lzo / snappy** decode: bulk overlapping match-copy
+    (multi-GB/s); **lzo / snappy** encoder skip-step match search (~6× on
+    incompressible input). **lzw** single-pass string emit.
+  - **xpress-huffman** decode: fixed an O(n²) history-trim to O(n) (orders of
+    magnitude on large inputs); **lznt1** bulk match-copy.
+  - **lha / rar1–5 / zip-implode·reduce·shrink / arc-crunch·squash**: bulk
+    LZSS/LZW window copy; **delta** filter encode ~15× (auto-vectorized);
+    **hpack** byte-wide Huffman decode.
+  - **bzip2** encode: reduced SA-IS suffix-array allocations and in-place
+    recursion (+14–31% on the BWT build, the dominant encode cost).
+  - **checksum**: CRC-32 slice-by-8 (~4×); **rle90** bulk literal copy (~3.5×).
+
 ## [0.6.1](https://github.com/KarpelesLab/compcol/compare/v0.6.0...v0.6.1) - 2026-06-12
 
 ### Other

diff --git a/src/arc_crunch/mod.rs b/src/arc_crunch/mod.rs
@@ -39,7 +39,7 @@
 //!
 //! Crafted streams never panic: the classic LZW KwKwK case and any
 //! out-of-range / not-yet-assigned code return [`Error::Corrupt`]; the
-//! dictionary and the decoded-string stack are bounded by `1 << maxbits`;
+//! dictionary and the decoded-string scratch are bounded by `1 << maxbits`;
 //! every dictionary index is bounds-checked and width arithmetic is checked.
 //!
 //! ## References
@@ -386,7 +386,7 @@ pub struct Decoder {
     /// Decoded characters waiting to flush, forward order.
     emit_buf: Vec<u8>,
     emit_head: usize,
-    /// Scratch stack used while reversing a decoded string.
+    /// Scratch buffer used while reversing a decoded string.
     stack: Vec<u8>,
     completed: bool,
 }
@@ -408,7 +408,9 @@ impl Decoder {
             finchar: 0,
             emit_buf: Vec::new(),
             emit_head: 0,
-            stack: Vec::with_capacity(max_size),
+            // Fixed-size reverse-assembly scratch: a decoded string is at most
+            // `1 << maxbits` ≤ `max_size` bytes, so its tail always fits.
+            stack: vec![0u8; max_size],
             completed: false,
         }
     }
@@ -439,27 +441,48 @@ impl Decoder {
     /// Decode `code` into `emit_buf` (forward order); updates `finchar`.
     /// Returns `Err(Corrupt)` if the parent chain is malformed (too long or
     /// out of range) — defends against crafted streams.
+    ///
+    /// The chain is walked once, writing the reversed string straight into a
+    /// reserved tail region of `emit_buf` (deepest suffix last). This avoids
+    /// the previous scratch-stack round trip (every byte was written twice:
+    /// once pushed, once popped) — each output byte is now written exactly
+    /// once.
     fn decode_string(&mut self, mut code: u32) -> Result<(), Error> {
-        self.stack.clear();
-        let limit = 1usize << self.maxbits;
-        let mut hops = 0usize;
+        // `stack` is a fixed-size scratch (length == 1 << MAX_BITS, allocated
+        // once). We walk the prefix chain writing the string back-to-front into
+        // its tail, then bulk-copy the assembled forward-order slice into
+        // `emit_buf` with a single vectorised `extend_from_slice`. This avoids
+        // both the old per-byte `emit_buf.push` (a capacity check per byte) and
+        // any per-call zero-initialisation.
+        // Fast path: a bare literal (very common on incompressible input) is a
+        // length-1 string — emit it directly and skip the reverse-assembly.
+        if code < 256 {
+            let first = code as u8;
+            self.finchar = first;
+            self.emit_buf.push(first);
+            return Ok(());
+        }
+        let scratch = &mut self.stack[..];
+        let mut i = scratch.len();
         while code >= 256 {
-            if code as usize >= self.prefix.len() {
+            // `i` reaching 0 means the chain is longer than any valid string
+            // (> 1 << maxbits): a malformed / cyclic prefix table. Reject
+            // rather than underflow.
+            if code as usize >= self.prefix.len() || i == 0 {
                 return Err(Error::Corrupt);
             }
-            self.stack.push(self.suffix[code as usize]);
+            i -= 1;
+            scratch[i] = self.suffix[code as usize];
             code = self.prefix[code as usize] as u32;
-            hops += 1;
-            if hops > limit {
-                return Err(Error::Corrupt);
-            }
+        }
+        if i == 0 {
+            return Err(Error::Corrupt);
         }
         let first = code as u8;
         self.finchar = first;
-        self.emit_buf.push(first);
-        while let Some(b) = self.stack.pop() {
-            self.emit_buf.push(b);
-        }
+        i -= 1;
+        scratch[i] = first;
+        self.emit_buf.extend_from_slice(&scratch[i..]);
         Ok(())
     }
 
@@ -621,7 +644,7 @@ impl RawDecoder for Decoder {
         self.finchar = 0;
         self.emit_buf.clear();
         self.emit_head = 0;
-        self.stack.clear();
+        // `stack` is fixed-size scratch overwritten on every use; leave it.
         self.completed = false;
     }
 }
diff --git a/src/arc_squash/mod.rs b/src/arc_squash/mod.rs
@@ -35,7 +35,7 @@
 //!
 //! Crafted streams never panic: the classic LZW KwKwK case and any
 //! out-of-range / not-yet-assigned code return [`Error::Corrupt`]; the
-//! dictionary and the decoded-string stack are bounded by `1 << 13`; every
+//! dictionary and the decoded-string scratch are bounded by `1 << 13`; every
 //! dictionary index is bounds-checked.
 //!
 //! ## References
@@ -347,7 +347,7 @@ pub struct Decoder {
     /// Decoded characters waiting to flush, forward order.
     emit_buf: Vec<u8>,
     emit_head: usize,
-    /// Scratch stack used while reversing a decoded string.
+    /// Fixed-size scratch used while reversing a decoded string.
     stack: Vec<u8>,
     completed: bool,
 }
@@ -366,7 +366,9 @@ impl Decoder {
             finchar: 0,
             emit_buf: Vec::new(),
             emit_head: 0,
-            stack: Vec::with_capacity(max_size),
+            // Fixed-size reverse-assembly scratch: a decoded string is at most
+            // `MAX_CODE` bytes, so its tail always fits.
+            stack: vec![0u8; max_size],
             completed: false,
         }
     }
@@ -396,26 +398,37 @@ impl Decoder {
     /// Returns `Err(Corrupt)` if the parent chain is malformed (too long or
     /// out of range) — defends against crafted streams.
     fn decode_string(&mut self, mut code: u32) -> Result<(), Error> {
-        self.stack.clear();
-        let limit = MAX_CODE as usize;
-        let mut hops = 0usize;
+        // Fast path: a bare literal is a length-1 string — emit directly.
+        if code < 256 {
+            let first = code as u8;
+            self.finchar = first;
+            self.emit_buf.push(first);
+            return Ok(());
+        }
+        // Walk the prefix chain back-to-front into the fixed-size scratch, then
+        // bulk-copy the forward-order slice into emit_buf with one
+        // extend_from_slice. This avoids the old per-byte push/pop round trip
+        // (each output byte written twice).
+        let scratch = &mut self.stack[..];
+        let mut i = scratch.len();
         while code >= 256 {
-            if code as usize >= self.prefix.len() {
+            // `i == 0` means the chain is longer than any valid string: a
+            // malformed / cyclic prefix table. Reject rather than underflow.
+            if code as usize >= self.prefix.len() || i == 0 {
                 return Err(Error::Corrupt);
             }
-            self.stack.push(self.suffix[code as usize]);
+            i -= 1;
+            scratch[i] = self.suffix[code as usize];
             code = self.prefix[code as usize] as u32;
-            hops += 1;
-            if hops > limit {
-                return Err(Error::Corrupt);
-            }
+        }
+        if i == 0 {
+            return Err(Error::Corrupt);
         }
         let first = code as u8;
         self.finchar = first;
-        self.emit_buf.push(first);
-        while let Some(b) = self.stack.pop() {
-            self.emit_buf.push(b);
-        }
+        i -= 1;
+        scratch[i] = first;
+        self.emit_buf.extend_from_slice(&scratch[i..]);
         Ok(())
     }
 
@@ -544,7 +557,7 @@ impl RawDecoder for Decoder {
         self.finchar = 0;
         self.emit_buf.clear();
         self.emit_head = 0;
-        self.stack.clear();
+        // `stack` is fixed-size scratch overwritten on every use; leave it.
         self.completed = false;
     }
 }
diff --git a/src/brotli/huffman.rs b/src/brotli/huffman.rs
@@ -21,8 +21,10 @@ use crate::error::Error;
 
 /// Primary-LUT width for the fast-path symbol lookup. Codes of length
 /// ≤ `PRIMARY_BITS` resolve in O(1); longer codes fall back to the
-/// per-bit walk.
-const PRIMARY_BITS: u32 = 9;
+/// per-bit walk. Brotli codes cap at length 15; an 11-bit table resolves
+/// the vast majority of literal/distance symbols in one indexed load
+/// (2048 u32 = 8 KiB per tree) while still fitting comfortably in L1.
+const PRIMARY_BITS: u32 = 11;
 const PRIMARY_SIZE: usize = 1 << PRIMARY_BITS;
 
 /// Packed (symbol, length) entry in the primary LUT. The low 16 bits hold
@@ -219,13 +221,17 @@ impl HuffmanDecoder {
         let max = self.max_length as u32;
 
         // Fast path: peek PRIMARY_BITS bits, index the LUT, advance the
-        // bit position by the actual code length.
-        if br.remaining() >= PRIMARY_BITS as usize {
-            let idx = br.peek_bits(PRIMARY_BITS) as usize;
-            let entry = self.lut[idx];
+        // bit position by the actual code length. `peek_lut_bits` refills
+        // and returns however many bits (up to PRIMARY_BITS) are buffered;
+        // when the full window is available we resolve in O(1) and consume
+        // only the matched code length, keeping the rest of the
+        // accumulator intact for the next symbol.
+        let (peeked, avail) = br.peek_lut_bits(PRIMARY_BITS);
+        if avail >= PRIMARY_BITS {
+            let entry = self.lut[peeked as usize];
             let len = entry >> LUT_LEN_SHIFT;
             if len > 0 {
-                br.set_position(br.position() + len as usize);
+                br.consume(len);
                 return Ok(entry & LUT_SYM_MASK);
             }
             // Long code (> PRIMARY_BITS) -- fall through to the slow path.
@@ -307,6 +313,18 @@ impl<'a> BitSource<'a> {
         self.nbits = 0;
     }
 
+    /// Advance the logical position by `n` bits that are already buffered
+    /// in `acc`. The caller must guarantee `n <= self.nbits` (e.g. right
+    /// after a `peek_bits(m)` with `m >= n`). Unlike `set_position` this
+    /// keeps the remaining buffered bits, so the hot Huffman fast path does
+    /// not force a refill on every decoded symbol.
+    #[inline]
+    pub(crate) fn consume(&mut self, n: u32) {
+        debug_assert!(n <= self.nbits);
+        self.acc >>= n;
+        self.nbits -= n;
+    }
+
     /// Remaining bits available (still in `data` plus held in `acc`).
     #[allow(dead_code)]
     pub(crate) fn remaining(&self) -> usize {
@@ -364,6 +382,7 @@ impl<'a> BitSource<'a> {
     /// Peek `n` bits (0 < n ≤ 32) without advancing. Caller must
     /// guarantee `n <= remaining()`. Refills the internal accumulator if
     /// fewer than `n` bits are buffered.
+    #[allow(dead_code)]
     pub(crate) fn peek_bits(&mut self, n: u32) -> u32 {
         debug_assert!(n > 0 && n <= 32);
         debug_assert!(n as usize <= self.remaining());
@@ -378,6 +397,28 @@ impl<'a> BitSource<'a> {
         }
     }
 
+    /// Peek up to `n` bits (1..=32) for the Huffman LUT fast path without
+    /// advancing. Refills once, then returns `(bits, available)` where
+    /// `available = min(nbits, n)` and `bits` holds the low `available`
+    /// bits LSB-first. When `available < n` the caller must fall back to
+    /// the per-bit slow path. Unlike `peek_bits` this never asserts on a
+    /// short tail, so it is safe to call when the stream is nearly drained.
+    #[inline]
+    pub(crate) fn peek_lut_bits(&mut self, n: u32) -> (u32, u32) {
+        if self.nbits < n {
+            self.refill();
+        }
+        let avail = self.nbits.min(n);
+        let bits = if avail == 0 {
+            0
+        } else if avail >= 32 {
+            self.acc as u32
+        } else {
+            (self.acc & ((1u64 << avail) - 1)) as u32
+        };
+        (bits, avail)
+    }
+
     /// Read `n` bits (0..=32) as a little-endian integer.
     pub(crate) fn read_bits(&mut self, n: u32) -> Result<u32, Error> {
         debug_assert!(n <= 32);

diff --git a/src/brotli/mod.rs b/src/brotli/mod.rs
@@ -2157,6 +2157,11 @@ impl Decoder {
             htree_d.push(Self::read_prefix_code(src, num_dist_codes)?);
         }
 
+        // When there is a single literal Huffman tree the context map is
+        // all zeroes, so literal decoding can skip the per-byte context
+        // lookup entirely (the tree index is constant 0).
+        let single_literal_tree = ntreesl == 1;
+
         // ─── decoding loop ───
         let mut emitted: u32 = 0;
         let mut block_type_l: u32 = 0;
@@ -2232,20 +2237,43 @@ impl Decoder {
             let copy_len = COPY_BASE[copy_code as usize] + copy_extra;
 
             // Emit `insert_len` literals.
-            for _ in 0..insert_len {
-                if emitted >= mlen {
-                    return Err(Error::Corrupt);
+            if single_literal_tree {
+                // Single literal Huffman tree: the context map is all
+                // zeroes, so the per-byte context computation and the
+                // `cmapl` lookup are dead work — the tree index is always
+                // 0. (Block-type switches still drive `block_len_l`, but
+                // they never change which tree we use here.)
+                let tree = &htree_l[0];
+                for _ in 0..insert_len {
+                    if emitted >= mlen {
+                        return Err(Error::Corrupt);
+                    }
+                    maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l);
+                    block_len_l -= 1;
+                    let sym = tree.decode(src)?;
+                    if sym > 255 {
+                        return Err(Error::Corrupt);
+                    }
+                    self.emit_literal(sym as u8);
+                    emitted += 1;
                 }
-                maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l);
-                block_len_l -= 1;
-                let cid = context::literal_context(cmodes[block_type_l as usize], self.p1, self.p2);
-                let tree_idx = cmapl[(64 * block_type_l + cid as u32) as usize] as usize;
-                let sym = htree_l[tree_idx].decode(src)?;
-                if sym > 255 {
-                    return Err(Error::Corrupt);
+            } else {
+                for _ in 0..insert_len {
+                    if emitted >= mlen {
+                        return Err(Error::Corrupt);
+                    }
+                    maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l);
+                    block_len_l -= 1;
+                    let cid =
+                        context::literal_context(cmodes[block_type_l as usize], self.p1, self.p2);
+                    let tree_idx = cmapl[(64 * block_type_l + cid as u32) as usize] as usize;
+                    let sym = htree_l[tree_idx].decode(src)?;
+                    if sym > 255 {
+                        return Err(Error::Corrupt);
+                    }
+                    self.emit_literal(sym as u8);
+                    emitted += 1;
                 }
-                self.emit_literal(sym as u8);
-                emitted += 1;
             }
 
             if emitted >= mlen {