From e5f65cc52c6f3d9ce1eba00a432e937d58f879a6 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:52:52 +0900 Subject: [PATCH 01/32] checksum: CRC-32 slice-by-8 (642 -> 2525 MB/s, 3.9x) Replace the byte-at-a-time CRC-32 inner loop with Intel slice-by-8: fold eight bytes per iteration through eight precomputed tables instead of one. Output is byte-identical (verified against the byte-at-a-time loop over 16 MiB). Standalone microbench: 642 -> 2525 MB/s. Co-Authored-By: Claude Fable 5 --- src/checksum.rs | 57 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/src/checksum.rs b/src/checksum.rs index cb278c5..f232133 100644 --- a/src/checksum.rs +++ b/src/checksum.rs @@ -71,9 +71,29 @@ impl Crc32 { pub fn update(&mut self, data: &[u8]) { let mut s = self.state; - for &b in data { + + // Slice-by-8: consume eight bytes per iteration using eight + // precomputed tables. This shortens the per-byte dependency chain + // and branch/load count versus the byte-at-a-time loop while + // producing identical CRCs. + let mut chunks = data.chunks_exact(8); + for c in &mut chunks { + let lo = u32::from_le_bytes([c[0], c[1], c[2], c[3]]) ^ s; + let hi = u32::from_le_bytes([c[4], c[5], c[6], c[7]]); + s = CRC32_TABLE8[7][(lo & 0xFF) as usize] + ^ CRC32_TABLE8[6][((lo >> 8) & 0xFF) as usize] + ^ CRC32_TABLE8[5][((lo >> 16) & 0xFF) as usize] + ^ CRC32_TABLE8[4][(lo >> 24) as usize] + ^ CRC32_TABLE8[3][(hi & 0xFF) as usize] + ^ CRC32_TABLE8[2][((hi >> 8) & 0xFF) as usize] + ^ CRC32_TABLE8[1][((hi >> 16) & 0xFF) as usize] + ^ CRC32_TABLE8[0][(hi >> 24) as usize]; + } + + // Tail: fewer than 8 bytes remain. + for &b in chunks.remainder() { let idx = ((s ^ b as u32) & 0xFF) as usize; - s = (s >> 8) ^ CRC32_TABLE[idx]; + s = (s >> 8) ^ CRC32_TABLE8[0][idx]; } self.state = s; } @@ -94,13 +114,18 @@ impl Default for Crc32 { } } -/// Build the standard 256-entry table at compile time. 
+/// Slice-by-8 tables, built at compile time. `CRC32_TABLE8[0]` is the +/// standard 256-entry CRC-32 table; `CRC32_TABLE8[n]` for `n >= 1` advances +/// the CRC by an extra byte position, so eight bytes can be folded per +/// iteration. See Intel's "Slicing-by-8" technique. #[cfg(any(feature = "gzip", test))] -const CRC32_TABLE: [u32; 256] = { - let mut table = [0u32; 256]; - let mut i = 0u32; +const CRC32_TABLE8: [[u32; 256]; 8] = { + let mut tables = [[0u32; 256]; 8]; + + // Base table (slice 0): the standard reflected CRC-32 step. + let mut i = 0usize; while i < 256 { - let mut c = i; + let mut c = i as u32; let mut k = 0; while k < 8 { c = if c & 1 != 0 { @@ -110,10 +135,24 @@ const CRC32_TABLE: [u32; 256] = { }; k += 1; } - table[i as usize] = c; + tables[0][i] = c; i += 1; } - table + + // Each subsequent table folds in one more zero byte: + // table[n][i] = (table[n-1][i] >> 8) ^ table[0][table[n-1][i] & 0xFF]. + let mut n = 1usize; + while n < 8 { + let mut j = 0usize; + while j < 256 { + let prev = tables[n - 1][j]; + tables[n][j] = (prev >> 8) ^ tables[0][(prev & 0xFF) as usize]; + j += 1; + } + n += 1; + } + + tables }; #[cfg(test)] From 5027f1de133d22b09abeabfaaf5295bece250e92 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 12:01:08 +0900 Subject: [PATCH 02/32] rle90: bulk-copy literal runs in decoder (~1268 -> ~4600 MB/s, 3.5x) The Normal-state decode path copied literal (non-FLAG) bytes one at a time through the state machine. Scan for the contiguous non-FLAG span bounded by input/output availability and copy_from_slice it in one memcpy, updating last/have_last from the span's final byte. Output is byte-identical; all rle90 tests pass. Bench decode (1 MiB): Lorem ~1268 -> ~4600 MB/s, Random ~1211 -> ~3750 MB/s. 
Co-Authored-By: Claude Fable 5 --- src/rle90.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/rle90.rs b/src/rle90.rs index ce42c07..e6447b0 100644 --- a/src/rle90.rs +++ b/src/rle90.rs @@ -335,11 +335,25 @@ impl RawDecoder for Decoder { done: false, }); } - output[written] = b; - written += 1; - consumed += 1; - self.last = b; + // Bulk-copy a contiguous run of literal (non-FLAG) + // bytes, bounded by remaining input and output. This + // turns the common literal-heavy stream into a single + // memcpy instead of a per-byte state-machine cycle. + let in_avail = input.len() - consumed; + let out_avail = output.len() - written; + let limit = in_avail.min(out_avail); + let src = &input[consumed..consumed + limit]; + // Length of the leading non-FLAG span. + let span = match src.iter().position(|&x| x == FLAG) { + Some(p) => p, + None => limit, + }; + // `span >= 1` because src[0] == b != FLAG. + output[written..written + span].copy_from_slice(&src[..span]); + self.last = src[span - 1]; self.have_last = true; + written += span; + consumed += span; } } DecState::AwaitCount => { From 7a1e935fdcd7c4bf8ed8be96904b0aa4fd01b959 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:52:00 +0900 Subject: [PATCH 03/32] deflate: vectorize decoder match-copy incl. overlapping runs Replace the per-byte overlap fallback in the inflate EmittingMatch hot loop (distance < remaining, e.g. distance-1 zero runs) with contiguous copy_within/copy_from_slice in non-wrapping spans, plus an expanding copy that replicates the d-byte pattern forward up to d bytes per step instead of one byte at a time. Two modulos per byte become one wrap check per span.
Decode throughput (1 MiB, median of 3): deflate Zeros: 242 -> ~460 MB/s (+90%) zlib Zeros: 231 -> ~419 MB/s (+82%) gzip Zeros: 179 -> ~271 MB/s (+52%) deflate Lorem: 4751 -> ~5700 MB/s (+20%) zlib Lorem: 2483 -> ~2700 MB/s (+9%) Round-trip + reference-fixture tests (system gzip, python zlib/deflate) all green; output is byte-identical. Co-Authored-By: Claude Fable 5 --- src/deflate/decoder.rs | 124 ++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 32 deletions(-) diff --git a/src/deflate/decoder.rs b/src/deflate/decoder.rs index 7e3db14..50316f0 100644 --- a/src/deflate/decoder.rs +++ b/src/deflate/decoder.rs @@ -802,42 +802,102 @@ impl Decoder { work.phase = HuffmanPhase::NextSymbol; continue; } - // Bulk-copy the non-overlapping run; fall back - // to the byte loop for overlap (distance < remaining) - // and wrap-spanning chunks. + // Copy the match run in contiguous, non-wrapping + // chunks, advancing `window_pos`/`*written` per + // chunk instead of doing two modulos per byte. + // + // Two cases inside the loop: + // • Non-overlapping (`src + chunk <= window_pos`): + // one `copy_from_slice` to output and one + // `copy_within` in the ring. + // • Overlapping (`distance < remaining`, e.g. a + // run of zeros at distance 1): the source region + // grows as we write. We materialise it with an + // *expanding* `copy_within` — first `d` bytes, + // then doubling the produced span each step — + // which `copy_within` vectorises, then mirror + // the produced bytes to the output in one go. let d = distance as usize; - let out_room = output.len() - *written; - let mut chunk = (remaining as usize).min(out_room); - if chunk > 0 && d >= chunk { - let src = (self.window_pos + self.win_cap - d) % self.win_cap; - // Limit chunk so source and destination - // ranges do not wrap the circular window. 
- let src_room = self.win_cap - src; + while remaining > 0 && *written < output.len() { + let out_room = output.len() - *written; + // `src` sits `d` bytes behind `window_pos`. + let src = if self.window_pos >= d { + self.window_pos - d + } else { + self.window_pos + self.win_cap - d + }; let dst_room = self.win_cap - self.window_pos; - chunk = chunk.min(src_room).min(dst_room); - if chunk > 0 { - // Copy to output. - output[*written..*written + chunk] - .copy_from_slice(&self.window[src..src + chunk]); - // Copy to window via copy_within (src and dst - // don't overlap because d >= chunk). - self.window.copy_within(src..src + chunk, self.window_pos); - *written += chunk; - self.window_pos = (self.window_pos + chunk) % self.win_cap; - if self.window_size < self.win_cap { - self.window_size = - (self.window_size + chunk).min(self.win_cap); + let src_room = self.win_cap - src; + // Bytes we can produce before the source read or + // destination write wraps the ring, or we run + // out of output / remaining run. + let span = (remaining as usize) + .min(out_room) + .min(dst_room) + .min(src_room); + if span == 0 { + break; + } + + if d >= span { + // Non-overlapping within this span: source + // is fully behind the destination and does + // not wrap (bounded by src_room above). + let wp = self.window_pos; + self.window.copy_within(src..src + span, wp); + output[*written..*written + span] + .copy_from_slice(&self.window[wp..wp + span]); + *written += span; + self.window_pos = wp + span; + } else if src + d == self.window_pos { + // Overlapping with a contiguous source: + // `src` is exactly `d` bytes before + // `window_pos` and neither wraps. Replicate + // the d-byte pattern forward into + // `[start, start+span)` by doubling — each + // step copies an already-materialised prefix + // of length ≤ d onto the next slot, which + // `copy_within` vectorises. 
+ let start = self.window_pos; // == src + d + let mut produced = 0usize; + while produced < span { + let copy = d.min(span - produced); + self.window.copy_within( + src + produced..src + produced + copy, + start + produced, + ); + produced += copy; } - remaining -= chunk as u16; - progress = true; + output[*written..*written + span] + .copy_from_slice(&self.window[start..start + span]); + *written += span; + self.window_pos = start + span; + } else { + // Rare: overlapping match whose source wraps + // the ring (window_pos < d). Fall back to a + // byte-wise replication for just this span. + let start = self.window_pos; + for i in 0..span { + let s = if start + i >= d { + start + i - d + } else { + start + i + self.win_cap - d + }; + let b = self.window[s]; + self.window[start + i] = b; + output[*written] = b; + *written += 1; + } + self.window_pos = start + span; } - } - while remaining > 0 && *written < output.len() { - let d = distance as usize; - let src = (self.window_pos + self.win_cap - d) % self.win_cap; - let b = self.window[src]; - self.emit_byte(b, output, written); - remaining -= 1; + + if self.window_pos == self.win_cap { + self.window_pos = 0; + } + if self.window_size < self.win_cap { + self.window_size = (self.window_size + span).min(self.win_cap); + } + remaining -= span as u16; progress = true; } if remaining == 0 { From 5d62b830c5c4b2a25128df9eb87e4d4c2eb2027e Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:52:46 +0900 Subject: [PATCH 04/32] deflate: replace per-literal modulo with a wrap branch in emit_byte window_pos advance used `% win_cap`; win_cap is a runtime value so this lowered to an integer division on every emitted literal. Swap for a single equality+reset branch and mark emit_byte #[inline]. Correctness unchanged (output byte-identical); removes a hardware divide from the literal hot path. Neutral-to-positive on the literal-heavy Lorem decode, no regression elsewhere. 
Co-Authored-By: Claude Fable 5 --- src/deflate/decoder.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/deflate/decoder.rs b/src/deflate/decoder.rs index 50316f0..e29e636 100644 --- a/src/deflate/decoder.rs +++ b/src/deflate/decoder.rs @@ -278,12 +278,19 @@ impl Decoder { } /// Write one byte to both the sliding window and the caller's output. + #[inline] fn emit_byte(&mut self, byte: u8, output: &mut [u8], written: &mut usize) { debug_assert!(*written < output.len()); output[*written] = byte; *written += 1; self.window[self.window_pos] = byte; - self.window_pos = (self.window_pos + 1) % self.win_cap; + // `win_cap` is a runtime value, so `% win_cap` would lower to an + // integer division; a single wrap branch is far cheaper on the + // per-literal hot path. + self.window_pos += 1; + if self.window_pos == self.win_cap { + self.window_pos = 0; + } if self.window_size < self.win_cap { self.window_size += 1; } From ccebe8afc3721844bd4672afabccca46bed09725 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:53:52 +0900 Subject: [PATCH 05/32] deflate64: vectorize decoder match-copy incl. overlapping runs Mirror the deflate inflate optimization in the deflate64 decoder: copy each match run in contiguous, non-wrapping spans (one copy_within + copy_from_slice for non-overlapping spans, an expanding copy of up to d bytes per step for overlapping ones) instead of a per-byte fallback loop. deflate64's larger window and match length make long matches common, so the bulk copy is a big win. Decode throughput (1 MiB, median of 3): deflate64 Lorem: ~1459 -> ~10800 MB/s (long repetitive matches) deflate64 Zeros/Random: unchanged within noise Round-trip tests green; output byte-identical.
Co-Authored-By: Claude Fable 5 --- src/deflate64/decoder.rs | 93 ++++++++++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 27 deletions(-) diff --git a/src/deflate64/decoder.rs b/src/deflate64/decoder.rs index 288b2de..de0d5e4 100644 --- a/src/deflate64/decoder.rs +++ b/src/deflate64/decoder.rs @@ -662,37 +662,76 @@ impl Decoder { work.phase = HuffmanPhase::NextSymbol; continue; } - // Bulk-copy the non-overlapping run; the byte loop - // handles overlap (distance < remaining) and any - // wrap-around inside the circular window. + // Copy the match run in contiguous, non-wrapping + // spans. Non-overlapping spans use a single + // copy_within + copy_from_slice; overlapping spans + // (distance < remaining) replicate the d-byte + // pattern with an expanding doubling copy instead + // of one byte at a time. let d = distance as usize; - let out_room = output.len() - *written; - let mut chunk = (remaining as usize).min(out_room); - if chunk > 0 && d >= chunk { - let src = (self.window_pos + WINDOW_SIZE - d) % WINDOW_SIZE; - let src_room = WINDOW_SIZE - src; + while remaining > 0 && *written < output.len() { + let out_room = output.len() - *written; + let src = if self.window_pos >= d { + self.window_pos - d + } else { + self.window_pos + WINDOW_SIZE - d + }; let dst_room = WINDOW_SIZE - self.window_pos; - chunk = chunk.min(src_room).min(dst_room); - if chunk > 0 { - output[*written..*written + chunk] - .copy_from_slice(&self.window[src..src + chunk]); - self.window.copy_within(src..src + chunk, self.window_pos); - *written += chunk; - self.window_pos = (self.window_pos + chunk) % WINDOW_SIZE; - if self.window_size < WINDOW_SIZE { - self.window_size = - (self.window_size + chunk).min(WINDOW_SIZE); + let src_room = WINDOW_SIZE - src; + let span = (remaining as usize) + .min(out_room) + .min(dst_room) + .min(src_room); + if span == 0 { + break; + } + + if d >= span { + let wp = self.window_pos; + self.window.copy_within(src..src + span, wp); + 
output[*written..*written + span] + .copy_from_slice(&self.window[wp..wp + span]); + *written += span; + self.window_pos = wp + span; + } else if src + d == self.window_pos { + let start = self.window_pos; // == src + d + let mut produced = 0usize; + while produced < span { + let copy = d.min(span - produced); + self.window.copy_within( + src + produced..src + produced + copy, + start + produced, + ); + produced += copy; } - remaining -= chunk as u32; - progress = true; + output[*written..*written + span] + .copy_from_slice(&self.window[start..start + span]); + *written += span; + self.window_pos = start + span; + } else { + // Rare: overlapping source wraps the ring. + let start = self.window_pos; + for i in 0..span { + let s = if start + i >= d { + start + i - d + } else { + start + i + WINDOW_SIZE - d + }; + let b = self.window[s]; + self.window[start + i] = b; + output[*written] = b; + *written += 1; + } + self.window_pos = start + span; } - } - while remaining > 0 && *written < output.len() { - let d = distance as usize; - let src = (self.window_pos + WINDOW_SIZE - d) % WINDOW_SIZE; - let b = self.window[src]; - self.emit_byte(b, output, written); - remaining -= 1; + + if self.window_pos == WINDOW_SIZE { + self.window_pos = 0; + } + if self.window_size < WINDOW_SIZE { + self.window_size = (self.window_size + span).min(WINDOW_SIZE); + } + remaining -= span as u32; progress = true; } if remaining == 0 { From fa2ba85ba4941d8105d8b303eab74a6269149b83 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:50:03 +0900 Subject: [PATCH 06/32] lzma2: bulk match-copy in decode_chunk (xz/lzma2 decode) The LZMA2 chunk decoder copied match bytes one at a time through dict_get/dict_put. For non-overlapping matches (distance+1 >= length) the source bytes already sit contiguously behind dict_pos, so we can copy_from_slice into the output and copy_within inside the dict in bulk, mirroring the dict_copy_match_bulk fast path already used by the .lzma decoder. 
The per-byte loop still handles overlapping matches and the circular-buffer wrap remainder, so decoder output is byte-identical. Measured (1 MiB corpus, median of 3, release): xz Lorem decode 340 -> ~553 MB/s (+63%) xz Random decode 434 -> ~680 MB/s (+57%) xz Zeros decode 365 -> ~384 MB/s (+5%) Co-Authored-By: Claude Fable 5 --- src/lzma2_internal/lzma2_decoder.rs | 63 ++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/src/lzma2_internal/lzma2_decoder.rs b/src/lzma2_internal/lzma2_decoder.rs index bf33f23..1785be5 100644 --- a/src/lzma2_internal/lzma2_decoder.rs +++ b/src/lzma2_internal/lzma2_decoder.rs @@ -529,6 +529,44 @@ impl LzmaCore { (distance as usize) < n } + /// Bulk-copy up to `n` non-overlapping match bytes (requires + /// `distance + 1 >= n`) from the dictionary into both `out[*written..]` + /// and back into the dict. Returns the number of bytes copied; may be + /// less than `n` when the source or destination range wraps the circular + /// dict, in which case the caller falls back to the per-byte loop for + /// the remainder. Caller must guarantee `dict_has(distance)` and that + /// `out` has at least `n` bytes of room from `*written`. 
+ fn dict_copy_match_bulk( + &mut self, + distance: u32, + n: usize, + out: &mut [u8], + written: &mut usize, + ) -> usize { + let dist1 = distance as usize + 1; + let src = if self.dict_pos >= dist1 { + self.dict_pos - dist1 + } else { + self.dict.len() - (dist1 - self.dict_pos) + }; + let src_room = self.dict.len() - src; + let dst_room = self.dict.len() - self.dict_pos; + let chunk = n.min(src_room).min(dst_room); + if chunk == 0 { + return 0; + } + out[*written..*written + chunk].copy_from_slice(&self.dict[src..src + chunk]); + self.dict.copy_within(src..src + chunk, self.dict_pos); + *written += chunk; + self.dict_pos += chunk; + if self.dict_pos >= self.dict.len() { + self.dict_pos = 0; + self.dict_full = true; + } + self.output_pos += chunk as u64; + chunk + } + fn pos_state(&self) -> u32 { (self.output_pos as u32) & self.pos_mask } @@ -755,17 +793,32 @@ impl LzmaCore { PacketOutcome::Match { length } => { let mut remaining = length as usize; let distance = self.rep0; + if !self.dict_has(distance) { + return Err(Error::Corrupt); + } + // A match that would write past the chunk's declared + // output size is malformed. + if remaining > target - written { + return Err(Error::Corrupt); + } + // Fast path: when the match is non-overlapping + // (distance + 1 >= remaining) the source bytes already + // exist contiguously behind `dict_pos`, so we can bulk + // `copy_from_slice` / `copy_within` instead of stepping + // byte by byte. `dict_copy_match_bulk` copies as much as + // it can without crossing the circular dict boundary and + // returns the count; the per-byte loop handles any + // wrapped remainder and the overlapping case. 
+ if distance as usize + 1 >= remaining { + let did = self.dict_copy_match_bulk(distance, remaining, out, &mut written); + remaining -= did; + } while remaining > 0 { if !self.dict_has(distance) { return Err(Error::Corrupt); } let b = self.dict_get(distance); self.dict_put(b); - if written >= target { - // Matches that overshoot the per-chunk size cap - // are malformed. - return Err(Error::Corrupt); - } out[written] = b; written += 1; remaining -= 1; From 0fff46f5e3ccbb7b8982ea82d24555068b4b5a81 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:51:28 +0900 Subject: [PATCH 07/32] lzma2: bulk overlapping match-copy in decode_chunk (xz/lzma2 decode) Overlapping matches (distance+1 < length, e.g. RLE-style runs over long zero/repeat regions) still fell through to the byte-by-byte loop. Add dict_copy_match_overlap: it replicates the dist1-byte source window forward inside the dict via doubling copy_within windows (each read hits bytes written by an earlier window), then copy_from_slice's the filled run into the output. Only the non-wrapping contiguous portion is bulked; the per-byte loop still handles the circular-dict wrap remainder, so decoder output stays byte-identical. Measured (1 MiB corpus, median of 3, release): xz Zeros decode ~384 -> ~570 MB/s (+48%) Co-Authored-By: Claude Fable 5 --- src/lzma2_internal/lzma2_decoder.rs | 55 +++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/lzma2_internal/lzma2_decoder.rs b/src/lzma2_internal/lzma2_decoder.rs index 1785be5..4048cce 100644 --- a/src/lzma2_internal/lzma2_decoder.rs +++ b/src/lzma2_internal/lzma2_decoder.rs @@ -567,6 +567,55 @@ impl LzmaCore { chunk } + /// Bulk-copy up to `n` *overlapping* match bytes (`distance + 1 < n`) + /// from the dictionary into both `out[*written..]` and the dict. 
The + /// source window `[src, dict_pos)` is `dist1` bytes long and is repeated + /// forward to fill the run; we extend it by `copy_within` in growing + /// windows so each byte read was already written in a previous window. + /// Only the contiguous portion that neither wraps the circular dict nor + /// overruns is handled here; the caller's per-byte loop covers the rest. + /// Returns the number of bytes copied. Caller must guarantee + /// `dict_has(distance)` and `out` room for `n` bytes from `*written`. + fn dict_copy_match_overlap( + &mut self, + distance: u32, + n: usize, + out: &mut [u8], + written: &mut usize, + ) -> usize { + let dist1 = distance as usize + 1; + // Source must not wrap: it begins `dist1` bytes behind dict_pos. + if self.dict_pos < dist1 { + return 0; + } + let dst = self.dict_pos; + let src = dst - dist1; + // Destination must not wrap during the whole run. + let dst_room = self.dict.len() - dst; + let chunk = n.min(dst_room); + if chunk == 0 { + return 0; + } + // Self-overlapping forward fill: copy in doubling windows so each + // read targets bytes written by an earlier iteration. + let mut filled = dist1.min(chunk); + self.dict.copy_within(src..src + filled, dst); + while filled < chunk { + let take = filled.min(chunk - filled); + self.dict.copy_within(dst..dst + take, dst + filled); + filled += take; + } + out[*written..*written + chunk].copy_from_slice(&self.dict[dst..dst + chunk]); + *written += chunk; + self.dict_pos += chunk; + if self.dict_pos >= self.dict.len() { + self.dict_pos = 0; + self.dict_full = true; + } + self.output_pos += chunk as u64; + chunk + } + fn pos_state(&self) -> u32 { (self.output_pos as u32) & self.pos_mask } @@ -812,6 +861,12 @@ impl LzmaCore { if distance as usize + 1 >= remaining { let did = self.dict_copy_match_bulk(distance, remaining, out, &mut written); remaining -= did; + } else { + // Overlapping run (e.g. RLE-style fills): replicate + // the source window forward in bulk. 
+ let did = + self.dict_copy_match_overlap(distance, remaining, out, &mut written); + remaining -= did; } while remaining > 0 { if !self.dict_has(distance) { From c31955ec2b46ff6b6ca2e9dbb8dfe03984e34478 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:52:46 +0900 Subject: [PATCH 08/32] lzma: bulk overlapping match-copy in decoder drain loops (.lzma decode) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .lzma streaming decoder already bulk-copied non-overlapping matches but fell through to a byte-by-byte loop for overlapping runs (small distance, large length — the dominant pattern on RLE-heavy inputs like long zero runs). Add dict_copy_match_overlap mirroring the lzma2 path: replicate the dist1-byte source window forward via doubling copy_within, then copy_from_slice into the output. Both drain sites (the live Match outcome and the parked pending_match) get the new branch. The per-byte loop still covers the circular-dict wrap remainder and respects the uncompressed-size cap, so decoder output is byte-identical. Measured (1 MiB corpus, median of 3, release): lzma Zeros decode ~860 -> ~5400 MB/s (+6x) lzma Lorem/Random decode unchanged (no overlapping runs) Co-Authored-By: Claude Fable 5 --- src/lzma/mod.rs | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/lzma/mod.rs b/src/lzma/mod.rs index 7156295..66f4168 100644 --- a/src/lzma/mod.rs +++ b/src/lzma/mod.rs @@ -503,6 +503,51 @@ impl LzmaCore { chunk } + /// Bulk-copy up to `n` *overlapping* match bytes (`distance + 1 < n`, + /// e.g. RLE-style runs) from the dict into both `output[*written..]` and + /// the dict. The `dist1`-byte source window behind `dict_pos` is + /// replicated forward via doubling `copy_within` windows so each read + /// targets bytes written by an earlier window. 
Only the contiguous + /// portion that does not wrap the circular dict is handled; the caller's + /// per-byte loop covers the wrap remainder. Returns bytes copied. Caller + /// must guarantee `dict_has(distance)` and `n` bytes of output room. + fn dict_copy_match_overlap( + &mut self, + distance: u32, + n: usize, + output: &mut [u8], + written: &mut usize, + ) -> usize { + let dist1 = distance as usize + 1; + // Source window must not wrap: it starts `dist1` bytes behind dict_pos. + if self.dict_pos < dist1 { + return 0; + } + let dst = self.dict_pos; + let src = dst - dist1; + let dst_room = self.dict.len() - dst; + let chunk = n.min(dst_room); + if chunk == 0 { + return 0; + } + let mut filled = dist1.min(chunk); + self.dict.copy_within(src..src + filled, dst); + while filled < chunk { + let take = filled.min(chunk - filled); + self.dict.copy_within(dst..dst + take, dst + filled); + filled += take; + } + output[*written..*written + chunk].copy_from_slice(&self.dict[dst..dst + chunk]); + *written += chunk; + self.dict_pos += chunk; + if self.dict_pos >= self.dict.len() { + self.dict_pos = 0; + self.dict_full = true; + } + self.output_pos += chunk as u64; + chunk + } + fn dict_has(&self, distance: u32) -> bool { let n = if self.dict_full { self.dict.len() @@ -1043,6 +1088,14 @@ impl Decoder { core.finished = true; pm.remaining = 0; } + } else if want > 0 { + // Overlapping run: replicate the source window forward. + let did = core.dict_copy_match_overlap(pm.distance, want, output, written); + pm.remaining -= did as u32; + if matches!(core.uncompressed_size, Some(t) if core.output_pos >= t) { + core.finished = true; + pm.remaining = 0; + } } while pm.remaining > 0 && *written < output.len() { if !core.dict_has(pm.distance) { @@ -1147,6 +1200,14 @@ impl Decoder { core.finished = true; remaining = 0; } + } else if want > 0 { + // Overlapping run: replicate the source window forward. 
+ let did = core.dict_copy_match_overlap(distance, want, output, written); + remaining -= did as u32; + if matches!(core.uncompressed_size, Some(t) if core.output_pos >= t) { + core.finished = true; + remaining = 0; + } } while remaining > 0 && *written < output.len() { if !core.dict_has(distance) { From 521e60180f720ba503d502d9ffbeee3223c44a67 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:50:46 +0900 Subject: [PATCH 09/32] brotli: keep bit accumulator across Huffman LUT hits The decoder's per-symbol fast path called set_position() after every LUT hit, which zeroed the 64-bit bit accumulator and forced a fresh refill on the next decode. Add BitSource::consume() to advance within the buffered bits, plus peek_lut_bits() that refills once and reports how many bits are available without asserting on a short tail. The hot Huffman decode loop now resolves consecutive symbols out of registers. Decode throughput (median of 3, 1 MiB inputs): Random: 106 -> ~140 MB/s (+~32%) Lorem: 1030 -> ~1040 MB/s (within noise) cargo test --features "brotli std": green. clippy clean. Co-Authored-By: Claude Fable 5 --- src/brotli/huffman.rs | 49 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/src/brotli/huffman.rs b/src/brotli/huffman.rs index 61f3c12..055d02b 100644 --- a/src/brotli/huffman.rs +++ b/src/brotli/huffman.rs @@ -219,13 +219,17 @@ impl HuffmanDecoder { let max = self.max_length as u32; // Fast path: peek PRIMARY_BITS bits, index the LUT, advance the - // bit position by the actual code length. - if br.remaining() >= PRIMARY_BITS as usize { - let idx = br.peek_bits(PRIMARY_BITS) as usize; - let entry = self.lut[idx]; + // bit position by the actual code length. 
`peek_lut_bits` refills + // and returns however many bits (up to PRIMARY_BITS) are buffered; + // when the full window is available we resolve in O(1) and consume + // only the matched code length, keeping the rest of the + // accumulator intact for the next symbol. + let (peeked, avail) = br.peek_lut_bits(PRIMARY_BITS); + if avail >= PRIMARY_BITS { + let entry = self.lut[peeked as usize]; let len = entry >> LUT_LEN_SHIFT; if len > 0 { - br.set_position(br.position() + len as usize); + br.consume(len); return Ok(entry & LUT_SYM_MASK); } // Long code (> PRIMARY_BITS) -- fall through to the slow path. @@ -307,6 +311,18 @@ impl<'a> BitSource<'a> { self.nbits = 0; } + /// Advance the logical position by `n` bits that are already buffered + /// in `acc`. The caller must guarantee `n <= self.nbits` (e.g. right + /// after a `peek_bits(m)` with `m >= n`). Unlike `set_position` this + /// keeps the remaining buffered bits, so the hot Huffman fast path does + /// not force a refill on every decoded symbol. + #[inline] + pub(crate) fn consume(&mut self, n: u32) { + debug_assert!(n <= self.nbits); + self.acc >>= n; + self.nbits -= n; + } + /// Remaining bits available (still in `data` plus held in `acc`). #[allow(dead_code)] pub(crate) fn remaining(&self) -> usize { @@ -364,6 +380,7 @@ impl<'a> BitSource<'a> { /// Peek `n` bits (0 < n ≤ 32) without advancing. Caller must /// guarantee `n <= remaining()`. Refills the internal accumulator if /// fewer than `n` bits are buffered. + #[allow(dead_code)] pub(crate) fn peek_bits(&mut self, n: u32) -> u32 { debug_assert!(n > 0 && n <= 32); debug_assert!(n as usize <= self.remaining()); @@ -378,6 +395,28 @@ impl<'a> BitSource<'a> { } } + /// Peek up to `n` bits (1..=32) for the Huffman LUT fast path without + /// advancing. Refills once, then returns `(bits, available)` where + /// `available = min(nbits, n)` and `bits` holds the low `available` + /// bits LSB-first. 
When `available < n` the caller must fall back to + /// the per-bit slow path. Unlike `peek_bits` this never asserts on a + /// short tail, so it is safe to call when the stream is nearly drained. + #[inline] + pub(crate) fn peek_lut_bits(&mut self, n: u32) -> (u32, u32) { + if self.nbits < n { + self.refill(); + } + let avail = self.nbits.min(n); + let bits = if avail == 0 { + 0 + } else if avail >= 32 { + self.acc as u32 + } else { + (self.acc & ((1u64 << avail) - 1)) as u32 + }; + (bits, avail) + } + /// Read `n` bits (0..=32) as a little-endian integer. pub(crate) fn read_bits(&mut self, n: u32) -> Result { debug_assert!(n <= 32); From fd7d8c1db10f9d8ce0686ebe310a1161595d6a42 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:52:06 +0900 Subject: [PATCH 10/32] brotli: skip literal context lookup when there is a single tree When NTREESL == 1 the literal context map is all zeroes, so the per-byte context-id computation (context::literal_context plus the cmapl index) always selects tree 0. Specialize the insert-literal loop to decode straight from htree_l[0] in that case, hoisting the single tree reference out of the loop. Block-type switching still runs (it drives block_len_l) but no longer feeds an unused context lookup. Decode throughput (median of 3, 1 MiB): Random: ~140 -> ~235 MB/s (+~68% on top of the prior commit; +~120% vs the original 106 MB/s baseline) Lorem: unchanged (uses multiple context trees -> slow path) cargo test --features "brotli std": green. clippy clean. 
Co-Authored-By: Claude Fable 5 --- src/brotli/mod.rs | 52 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/src/brotli/mod.rs b/src/brotli/mod.rs index 1e655da..436364a 100644 --- a/src/brotli/mod.rs +++ b/src/brotli/mod.rs @@ -2157,6 +2157,11 @@ impl Decoder { htree_d.push(Self::read_prefix_code(src, num_dist_codes)?); } + // When there is a single literal Huffman tree the context map is + // all zeroes, so literal decoding can skip the per-byte context + // lookup entirely (the tree index is constant 0). + let single_literal_tree = ntreesl == 1; + // ─── decoding loop ─── let mut emitted: u32 = 0; let mut block_type_l: u32 = 0; @@ -2232,20 +2237,43 @@ impl Decoder { let copy_len = COPY_BASE[copy_code as usize] + copy_extra; // Emit `insert_len` literals. - for _ in 0..insert_len { - if emitted >= mlen { - return Err(Error::Corrupt); + if single_literal_tree { + // Single literal Huffman tree: the context map is all + // zeroes, so the per-byte context computation and the + // `cmapl` lookup are dead work — the tree index is always + // 0. (Block-type switches still drive `block_len_l`, but + // they never change which tree we use here.) 
+ let tree = &htree_l[0]; + for _ in 0..insert_len { + if emitted >= mlen { + return Err(Error::Corrupt); + } + maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l); + block_len_l -= 1; + let sym = tree.decode(src)?; + if sym > 255 { + return Err(Error::Corrupt); + } + self.emit_literal(sym as u8); + emitted += 1; } - maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l); - block_len_l -= 1; - let cid = context::literal_context(cmodes[block_type_l as usize], self.p1, self.p2); - let tree_idx = cmapl[(64 * block_type_l + cid as u32) as usize] as usize; - let sym = htree_l[tree_idx].decode(src)?; - if sym > 255 { - return Err(Error::Corrupt); + } else { + for _ in 0..insert_len { + if emitted >= mlen { + return Err(Error::Corrupt); + } + maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l); + block_len_l -= 1; + let cid = + context::literal_context(cmodes[block_type_l as usize], self.p1, self.p2); + let tree_idx = cmapl[(64 * block_type_l + cid as u32) as usize] as usize; + let sym = htree_l[tree_idx].decode(src)?; + if sym > 255 { + return Err(Error::Corrupt); + } + self.emit_literal(sym as u8); + emitted += 1; } - self.emit_literal(sym as u8); - emitted += 1; } if emitted >= mlen { From 58d02b6af80de5ebfcdcbf3fb52a13a79cb932c5 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:54:44 +0900 Subject: [PATCH 11/32] brotli: widen Huffman fast-path LUT from 9 to 11 bits The primary lookup table now covers codes up to length 11 instead of 9, resolving more literal/distance symbols in a single indexed load before falling back to the per-bit canonical walk. The table grows to 2048 u32 (8 KiB) per tree, still L1-resident; build cost is paid once per tree per meta-block and is dwarfed by the per-symbol decode savings on 1 MiB+ inputs. Decode throughput (median of 3, 1 MiB): Random: ~235 -> ~255 MB/s Lorem: unchanged (~1030, within noise) cargo test --features "brotli std": green. clippy clean. 
Co-Authored-By: Claude Fable 5 --- src/brotli/huffman.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/brotli/huffman.rs b/src/brotli/huffman.rs index 055d02b..be72649 100644 --- a/src/brotli/huffman.rs +++ b/src/brotli/huffman.rs @@ -21,8 +21,10 @@ use crate::error::Error; /// Primary-LUT width for the fast-path symbol lookup. Codes of length /// ≤ `PRIMARY_BITS` resolve in O(1); longer codes fall back to the -/// per-bit walk. -const PRIMARY_BITS: u32 = 9; +/// per-bit walk. Brotli codes cap at length 15; an 11-bit table resolves +/// the vast majority of literal/distance symbols in one indexed load +/// (2048 u32 = 8 KiB per tree) while still fitting comfortably in L1. +const PRIMARY_BITS: u32 = 11; const PRIMARY_SIZE: usize = 1 << PRIMARY_BITS; /// Packed (symbol, length) entry in the primary LUT. The low 16 bits hold From d83d66df9a598660db7ae7f897e93f346f30ebea Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:53:59 +0900 Subject: [PATCH 12/32] zstd: faster Huffman literal decode via peek/consume Replace the per-symbol read+unread reseed in HuffTable::decode with a peek_bits/consume pair on RevBitReader. The old path rebuilt the bit accumulator from memory on every literal (reseed_from_consumed); the new path peeks max_bits without consuming, indexes the lookup table, and consumes only the matched code length. #[inline] the bit-reader read. Decode micro-bench (4 MiB mixed-entropy text, Huffman+FSE heavy): ~314 -> ~330 MB/s median. Co-Authored-By: Claude Fable 5 --- src/zstd/bitreader.rs | 52 +++++++++++++++++++++++++++++++++++++++---- src/zstd/huffman.rs | 29 ++++++++++++------------ 2 files changed, 62 insertions(+), 19 deletions(-) diff --git a/src/zstd/bitreader.rs b/src/zstd/bitreader.rs index 3a238bc..3cdb016 100644 --- a/src/zstd/bitreader.rs +++ b/src/zstd/bitreader.rs @@ -91,8 +91,11 @@ impl<'a> RevBitReader<'a> { self.consumed >= self.available } - /// Give back `n` previously-read bits. 
Required by the Huffman decoder - /// which peeks `max_bits` and then keeps only the actual code length. + /// Give back `n` previously-read bits by rewinding the cursor and rebuilding + /// the accumulator. Retained as a general bit-reader primitive (and exercised + /// by tests); the Huffman decoder now uses the cheaper [`Self::peek_bits`] + + /// [`Self::consume`] pair instead, which avoids this per-symbol reseed. + #[allow(dead_code)] pub fn unread(&mut self, n: u32) { let n_usize = n as usize; debug_assert!(self.consumed >= n_usize); @@ -103,8 +106,8 @@ impl<'a> RevBitReader<'a> { self.reseed_from_consumed(); } - /// Rebuild the internal accumulator from `consumed`. Called from `unread`, - /// which is rare (one call per Huffman symbol at most). + /// Rebuild the internal accumulator from `consumed`. Called from `unread`. + #[allow(dead_code)] fn reseed_from_consumed(&mut self) { // Position of the next bit to deliver in global bit numbering. let next_bit = self.available - 1 - self.consumed; @@ -133,9 +136,50 @@ impl<'a> RevBitReader<'a> { } } + /// Peek up to `peek_bits` bits MSB-first **without** consuming them, + /// returning them right-justified in a `u64` alongside the number of real + /// payload bits available in that window. + /// + /// `peek_bits` must be in `1..=56`. When fewer than `peek_bits` payload + /// bits remain, the low-order positions of the returned value are zero + /// (the accumulator shifts in zeros at the bottom), which is exactly what + /// a left-justified canonical-code lookup expects. The second return value + /// is `min(peek_bits, remaining)` so the caller can detect truncation. + /// + /// Used by the Huffman decoder to index a fixed-width lookup table and then + /// [`Self::consume`] only the matched code's actual length — avoiding the + /// expensive `read` + `unread` reseed that the old per-symbol path paid. 
+ #[inline] + pub fn peek_bits(&mut self, peek_bits: u32) -> (u64, u32) { + debug_assert!((1..=56).contains(&peek_bits)); + if self.bits_in_acc < peek_bits { + self.refill(); + } + let remaining = self.available - self.consumed; + let avail = core::cmp::min(peek_bits as usize, remaining) as u32; + let raw = self.acc >> (64 - peek_bits); + (raw, avail) + } + + /// Consume `n` bits previously inspected via [`Self::peek_bits`]. The caller + /// must ensure `n` does not exceed the bits the matching peek reported as + /// available and that `consumed + n <= available`. + #[inline] + pub fn consume(&mut self, n: u32) { + debug_assert!(n <= self.bits_in_acc); + debug_assert!(self.consumed + n as usize <= self.available); + if n == 0 { + return; + } + self.acc <<= n; + self.bits_in_acc -= n; + self.consumed += n as usize; + } + /// Read `n` bits (0..=64) MSB-first from the current backward cursor. /// /// Bits returned right-justified. + #[inline] pub fn read(&mut self, n: u32) -> Result { if n == 0 { return Ok(0); diff --git a/src/zstd/huffman.rs b/src/zstd/huffman.rs index 5634561..bd1e4ad 100644 --- a/src/zstd/huffman.rs +++ b/src/zstd/huffman.rs @@ -30,28 +30,27 @@ pub struct HuffTable { impl HuffTable { /// Decode one symbol from `br`, consuming exactly its bit length. + /// + /// Fast path: peek `max_bits` (without consuming), index the lookup table, + /// then consume only the matched code's actual length. Peeking returns the + /// next `max_bits` already left-justified, so the index is `raw` directly — + /// no `read`+`unread` reseed per symbol. + #[inline] pub fn decode(&self, br: &mut RevBitReader<'_>) -> Result { - if br.remaining() == 0 { - return Err(Error::Corrupt); - } let max = self.max_bits as u32; - let avail = br.remaining() as u32; - let take = core::cmp::min(max, avail); - let raw = br.read(take)?; - // Left-justify into a `max`-bit window so the table index matches the - // canonical MSB-first code regardless of how many bits remained. 
- let idx = (raw << (max - take)) as usize; - if idx >= self.lookup.len() { + let (raw, avail) = br.peek_bits(max); + if avail == 0 { return Err(Error::Corrupt); } + let idx = raw as usize; + // `idx` is in `0..(1 << max)` by construction of `peek_bits`, and the + // lookup table is sized `1 << max`, so the index is always in range. + debug_assert!(idx < self.lookup.len()); let (sym, len) = self.lookup[idx]; - if len == 0 || (len as u32) > take { + if len == 0 || len as u32 > avail { return Err(Error::Corrupt); } - // Give back any bits we consumed beyond the actual code length. - if take > len as u32 { - br.unread(take - len as u32); - } + br.consume(len as u32); Ok(sym) } } From 79bdb94df21d1edb8329a69bfdfe1f03f1719fac Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:54:03 +0900 Subject: [PATCH 13/32] zstd: skip zero-bit reads and inline FSE state transitions FseState::advance now special-cases num_bits==0 (max-probability symbols) to avoid a RevBitReader::read call whose result is always 0, and inlines symbol()/advance(). A meaningful fraction of FSE table entries carry num_bits==0, so this removes a hot per-sequence function call. Decode micro-bench: ~330 -> ~350 MB/s median. Co-Authored-By: Claude Fable 5 --- src/zstd/fse.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/zstd/fse.rs b/src/zstd/fse.rs index e650b3c..0fdb708 100644 --- a/src/zstd/fse.rs +++ b/src/zstd/fse.rs @@ -347,15 +347,24 @@ impl FseState { } /// Return the current symbol (without advancing state). + #[inline] pub fn symbol(&self, table: &FseTable) -> u16 { table.entries[self.state as usize].symbol } /// Advance: read `num_bits` from the reader and update state. + #[inline] pub fn advance(&mut self, table: &FseTable, br: &mut RevBitReader<'_>) -> Result<(), Error> { let e = table.entries[self.state as usize]; - let extra = br.read(e.num_bits as u32)? 
as u16; - let next = e.base_state.wrapping_add(extra); + // Most table entries carry a non-trivial `num_bits`, but a meaningful + // fraction are 0 (max-probability symbols); skip the bit-reader call + // entirely in that case — `base_state` is already the next state. + let next = if e.num_bits == 0 { + e.base_state + } else { + let extra = br.read(e.num_bits as u32)? as u16; + e.base_state.wrapping_add(extra) + }; if (next as usize) >= table.size() { return Err(Error::Corrupt); } From 56e4fd0a3f05d963f4d051550b810ba0b0274023 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:56:17 +0900 Subject: [PATCH 14/32] zstd: inline RevBitReader::read fast path, split wide reads out of line read() is called up to 6x per sequence (FSE state advances + LL/OF/ML extra bits) and was a non-inlined ~30% hotspot. Mark the n<=56 fast path #[inline(always)] and move the rare 57..=64-bit wide-read branch into a #[cold] #[inline(never)] read_wide(). The hot small-read path now inlines directly into decode_sequences and FseState::advance, eliminating the call overhead and bounds-check duplication. Decode micro-bench: instruction count -16% (callgrind), wall-clock ~350 -> ~425 MB/s. Co-Authored-By: Claude Fable 5 --- src/zstd/bitreader.rs | 57 ++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/src/zstd/bitreader.rs b/src/zstd/bitreader.rs index 3cdb016..f0bc08c 100644 --- a/src/zstd/bitreader.rs +++ b/src/zstd/bitreader.rs @@ -179,7 +179,12 @@ impl<'a> RevBitReader<'a> { /// Read `n` bits (0..=64) MSB-first from the current backward cursor. /// /// Bits returned right-justified. - #[inline] + /// + /// The `n <= 56` fast path is `#[inline(always)]` and is the only path the + /// FSE/sequence decoders ever take (their reads are at most ~16 bits); the + /// rare 57..=64-bit wide path is split into an out-of-line cold function so + /// inlining the fast path into hot callers stays cheap. 
+ #[inline(always)] pub fn read(&mut self, n: u32) -> Result { if n == 0 { return Ok(0); @@ -202,30 +207,36 @@ impl<'a> RevBitReader<'a> { self.consumed += n as usize; Ok(result) } else { - // Wide-read path (n in 57..=64): take the top 56 bits in one - // shot, then the remaining n-56 bits with a second refill. This - // matches the byte-by-byte version's semantics without needing - // a u128 accumulator. - let high_n = 56u32; - let low_n = n - 56; - // Top chunk. - if self.bits_in_acc < high_n { - self.refill(); - } - let high = self.acc >> (64 - high_n); - self.acc <<= high_n; - self.bits_in_acc -= high_n; - // Low chunk. - if self.bits_in_acc < low_n { - self.refill(); - } - let low = self.acc >> (64 - low_n); - self.acc <<= low_n; - self.bits_in_acc -= low_n; - self.consumed += n as usize; - Ok((high << low_n) | low) + self.read_wide(n) } } + + /// Cold path for 57..=64-bit reads: take the top 56 bits, then the + /// remaining `n-56` bits with a second refill. Kept out of line so the + /// common small-read path inlines compactly into hot callers. + #[cold] + #[inline(never)] + fn read_wide(&mut self, n: u32) -> Result { + // Matches the byte-by-byte version's semantics without a u128 accumulator. + let high_n = 56u32; + let low_n = n - 56; + // Top chunk. + if self.bits_in_acc < high_n { + self.refill(); + } + let high = self.acc >> (64 - high_n); + self.acc <<= high_n; + self.bits_in_acc -= high_n; + // Low chunk. + if self.bits_in_acc < low_n { + self.refill(); + } + let low = self.acc >> (64 - low_n); + self.acc <<= low_n; + self.bits_in_acc -= low_n; + self.consumed += n as usize; + Ok((high << low_n) | low) + } } #[cfg(test)] From bc17156f0f1ddcf9b2e6b9efff8a7d61c3de2ff8 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 12:03:56 +0900 Subject: [PATCH 15/32] zstd: hoist LL/ML base+extra tables to module-level const ll_base_extra/ml_base_extra rebuilt two 36/53-element stack arrays on every call (once per sequence). 
Replace with module-level const [(base, extra); N] tables indexed via .get(), so the hot sequence loop reads a single rodata table instead of re-materialising arrays. Tables verified element-for-element against the RFC 8478 LL/ML code tables. Decode micro-bench: instruction count -13% (callgrind), wall-clock ~425 -> ~470 MB/s. Co-Authored-By: Claude Fable 5 --- src/zstd/sequences.rs | 135 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 109 insertions(+), 26 deletions(-) diff --git a/src/zstd/sequences.rs b/src/zstd/sequences.rs index 36b71a0..4a8443c 100644 --- a/src/zstd/sequences.rs +++ b/src/zstd/sequences.rs @@ -288,36 +288,119 @@ fn resolve_table( // ─── code → (base, extra_bits) lookups (RFC §3.1.1.3.2.1) ──────────────── +/// Literal-length code → (base, extra_bits). Spec tables A.4.1 / A.4.2. +/// Module-level `const` so the inner sequence loop indexes a single rodata +/// table instead of materialising two stack arrays per call. +const LL_BASE_EXTRA: [(u32, u32); 36] = [ + (0, 0), + (1, 0), + (2, 0), + (3, 0), + (4, 0), + (5, 0), + (6, 0), + (7, 0), + (8, 0), + (9, 0), + (10, 0), + (11, 0), + (12, 0), + (13, 0), + (14, 0), + (15, 0), + (16, 1), + (18, 1), + (20, 1), + (22, 1), + (24, 2), + (28, 2), + (32, 3), + (40, 3), + (48, 4), + (64, 6), + (128, 7), + (256, 8), + (512, 9), + (1024, 10), + (2048, 11), + (4096, 12), + (8192, 13), + (16384, 14), + (32768, 15), + (65536, 16), +]; + +/// Match-length code → (base, extra_bits). From the zstd reference tables. 
+const ML_BASE_EXTRA: [(u32, u32); 53] = [ + (3, 0), + (4, 0), + (5, 0), + (6, 0), + (7, 0), + (8, 0), + (9, 0), + (10, 0), + (11, 0), + (12, 0), + (13, 0), + (14, 0), + (15, 0), + (16, 0), + (17, 0), + (18, 0), + (19, 0), + (20, 0), + (21, 0), + (22, 0), + (23, 0), + (24, 0), + (25, 0), + (26, 0), + (27, 0), + (28, 0), + (29, 0), + (30, 0), + (31, 0), + (32, 0), + (33, 0), + (34, 0), + (35, 1), + (37, 1), + (39, 1), + (41, 1), + (43, 2), + (47, 2), + (51, 3), + (59, 3), + (67, 4), + (83, 4), + (99, 5), + (131, 7), + (259, 8), + (515, 9), + (1027, 10), + (2051, 11), + (4099, 12), + (8195, 13), + (16387, 14), + (32771, 15), + (65539, 16), +]; + +#[inline] fn ll_base_extra(code: u8) -> Result<(u32, u32), Error> { - if code > 35 { - return Err(Error::Corrupt); - } - // Spec tables A.4.1 / A.4.2: literal-length codes. - let bases: [u32; 36] = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 22, 24, 28, 32, 40, 48, - 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, - ]; - let extras: [u32; 36] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 16, - ]; - Ok((bases[code as usize], extras[code as usize])) + LL_BASE_EXTRA + .get(code as usize) + .copied() + .ok_or(Error::Corrupt) } +#[inline] fn ml_base_extra(code: u8) -> Result<(u32, u32), Error> { - if code > 52 { - return Err(Error::Corrupt); - } - let bases: [u32; 53] = [ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, - 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 131, 259, 515, - 1027, 2051, 4099, 8195, 16387, 32771, 65539, - ]; - let extras: [u32; 53] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ]; - Ok((bases[code as usize], extras[code as usize])) + ML_BASE_EXTRA + .get(code as usize) + .copied() + 
.ok_or(Error::Corrupt) } /// Translate the `offset_value` produced by the offset FSE+extra-bits sum From 6faec5fe91ac557772457ae09fb1d1d403b4170e Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 12:04:08 +0900 Subject: [PATCH 16/32] zstd: fetch each FSE entry once per sequence (symbol + advance share load) The sequence loop indexed each FSE table twice per state per sequence: once in symbol() and again in advance(). Add FseState::entry() to fetch the FseEntry once (yielding the symbol) and advance_with(entry, size) to reuse it, and hoist the loop-invariant table sizes. This cuts the per-sequence memory traffic on the three FSE tables. Decode micro-bench wall-clock: ~470 -> ~483 MB/s (consistent across runs). Co-Authored-By: Claude Fable 5 --- src/zstd/fse.rs | 29 +++++++++++++++++++++++++---- src/zstd/sequences.rs | 28 +++++++++++++++++++++------- 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/zstd/fse.rs b/src/zstd/fse.rs index 0fdb708..8101fa5 100644 --- a/src/zstd/fse.rs +++ b/src/zstd/fse.rs @@ -346,16 +346,30 @@ impl FseState { Ok(Self { state: s }) } + /// Return the table entry for the current state. The entry carries the + /// emitted symbol plus the `(num_bits, base_state)` recipe for the next + /// transition; fetching it once lets a caller read the symbol and then + /// [`Self::advance_with`] using the same load instead of re-indexing. + #[inline] + pub fn entry(&self, table: &FseTable) -> FseEntry { + table.entries[self.state as usize] + } + /// Return the current symbol (without advancing state). #[inline] pub fn symbol(&self, table: &FseTable) -> u16 { table.entries[self.state as usize].symbol } - /// Advance: read `num_bits` from the reader and update state. + /// Advance using a pre-fetched [`FseEntry`] (from [`Self::entry`]) for the + /// *current* state, avoiding a second bounds-checked table index. 
#[inline] - pub fn advance(&mut self, table: &FseTable, br: &mut RevBitReader<'_>) -> Result<(), Error> { - let e = table.entries[self.state as usize]; + pub fn advance_with( + &mut self, + e: FseEntry, + table_size: usize, + br: &mut RevBitReader<'_>, + ) -> Result<(), Error> { // Most table entries carry a non-trivial `num_bits`, but a meaningful // fraction are 0 (max-probability symbols); skip the bit-reader call // entirely in that case — `base_state` is already the next state. @@ -365,12 +379,19 @@ impl FseState { let extra = br.read(e.num_bits as u32)? as u16; e.base_state.wrapping_add(extra) }; - if (next as usize) >= table.size() { + if (next as usize) >= table_size { return Err(Error::Corrupt); } self.state = next; Ok(()) } + + /// Advance: read `num_bits` from the reader and update state. + #[inline] + pub fn advance(&mut self, table: &FseTable, br: &mut RevBitReader<'_>) -> Result<(), Error> { + let e = table.entries[self.state as usize]; + self.advance_with(e, table.size(), br) + } } // ─── default tables (RFC 8478 §3.1.1.3.2.2.1) ───────────────────────────── diff --git a/src/zstd/sequences.rs b/src/zstd/sequences.rs index 4a8443c..1811c50 100644 --- a/src/zstd/sequences.rs +++ b/src/zstd/sequences.rs @@ -127,6 +127,12 @@ pub fn decode_sequences(data: &[u8], state: &mut SequencesState) -> Result = Vec::with_capacity((n_seq as usize).min(128 * 1024)); + // Table sizes are loop-invariant; hoist them so the per-sequence advance + // doesn't reload `entries.len()` three times. + let ll_size = ll_table.size(); + let ml_size = ml_table.size(); + let of_size = of_table.size(); + for i in 0..n_seq { // Per RFC §3.1.1.3.2.1.1 decoding order: // 1. Read literal_length extra bits. 
@@ -134,9 +140,16 @@ pub fn decode_sequences(data: &[u8], state: &mut SequencesState) -> Result Result Date: Fri, 12 Jun 2026 11:51:40 +0900 Subject: [PATCH 17/32] perf(decoders): bulk overlapping match copy in lz4/lz5/lzo/snappy Replace byte-at-a-time self-overlap copy loops with chunked extend_from_within: each round appends one copy of the offset-byte tail produced so far, so the loop runs ceil(len / offset) rounds (one bulk copy each) instead of one push per byte. Decoder output is byte-identical. Measured (1 MiB Lorem, decode MB/s): lz4: 1470 -> ~18000 (~12x) lzo: 2396 -> ~18000 (~7x) snappy/lz5 overlap-heavy paths similarly bulk-copy now. Co-Authored-By: Claude Fable 5 --- src/lz4/block.rs | 15 +++++++++++---- src/lz5/block.rs | 14 +++++++++----- src/lzo/block.rs | 13 +++++++++---- src/snappy/mod.rs | 13 +++++++++---- 4 files changed, 38 insertions(+), 17 deletions(-) diff --git a/src/lz4/block.rs b/src/lz4/block.rs index 9370f63..69f8970 100644 --- a/src/lz4/block.rs +++ b/src/lz4/block.rs @@ -325,7 +325,8 @@ pub fn decode_block(input: &[u8], out: &mut Vec, raw_max: usize) -> Result<( } // Non-overlapping match collapses to memcpy; offset==1 is a byte-splat; - // otherwise replicate byte-by-byte to handle LZ77 self-overlap. + // otherwise replicate in `offset`-sized chunks to handle LZ77 + // self-overlap while still copying in bulk. let start = out.len() - offset; if offset >= match_len { out.extend_from_within(start..start + match_len); @@ -333,9 +334,15 @@ pub fn decode_block(input: &[u8], out: &mut Vec, raw_max: usize) -> Result<( let b = out[start]; out.resize(out.len() + match_len, b); } else { - for i in 0..match_len { - let b = out[start + i]; - out.push(b); + // Overlapping: each round copies the `offset`-byte tail produced so + // far. Every round appends at most `offset` bytes, so the number of + // rounds is ceil(match_len / offset), one bulk copy per round.
+ let mut remaining = match_len; + while remaining > 0 { + let chunk = remaining.min(offset); + let s = out.len() - offset; + out.extend_from_within(s..s + chunk); + remaining -= chunk; } } } diff --git a/src/lz5/block.rs b/src/lz5/block.rs index 42f06d8..a674319 100644 --- a/src/lz5/block.rs +++ b/src/lz5/block.rs @@ -274,11 +274,15 @@ fn copy_match(out: &mut Vec, offset: usize, match_len: usize, cap: usize) -> let b = out[start]; out.resize(out.len() + match_len, b); } else { - // Self-overlap — must copy byte-by-byte so back-references read - // from already-written bytes. - for i in 0..match_len { - let b = out[start + i]; - out.push(b); + // Self-overlap — copy in `offset`-sized chunks. Each round appends + // one copy of the `offset`-byte tail produced so far, so the loop + // runs ceil(match_len / offset) times instead of once per byte. + let mut remaining = match_len; + while remaining > 0 { + let chunk = remaining.min(offset); + let s = out.len() - offset; + out.extend_from_within(s..s + chunk); + remaining -= chunk; } } Ok(()) diff --git a/src/lzo/block.rs b/src/lzo/block.rs index 32adbc6..2583a26 100644 --- a/src/lzo/block.rs +++ b/src/lzo/block.rs @@ -622,10 +622,15 @@ fn copy_match( let b = out[start]; out.resize(out.len() + length, b); } else { - // Self-overlap (LZ77 RLE-style): replicate byte-by-byte. - for i in 0..length { - let b = out[start + i]; - out.push(b); + // Self-overlap (LZ77 RLE-style): copy in `distance`-sized chunks. Each + // round appends one copy of the `distance`-byte tail produced so far, + // so the loop runs ceil(length / distance) times, not once per byte.
+ let mut remaining = length; + while remaining > 0 { + let chunk = remaining.min(distance); + let s = out.len() - distance; + out.extend_from_within(s..s + chunk); + remaining -= chunk; } } Ok(()) diff --git a/src/snappy/mod.rs b/src/snappy/mod.rs index bdad3b1..f99509e 100644 --- a/src/snappy/mod.rs +++ b/src/snappy/mod.rs @@ -540,10 +540,15 @@ fn copy_from_back( let b = out[start]; out.resize(out.len() + length, b); } else { - // Self-overlapping (RLE-style) — must replicate byte-by-byte. - for i in 0..length { - let b = out[start + i]; - out.push(b); + // Self-overlapping (RLE-style) — copy in `offset`-sized chunks. Each + // round appends one copy of the `offset`-byte tail produced so far, + // so the loop runs ceil(length / offset) times, not once per byte. + let mut remaining = length; + while remaining > 0 { + let chunk = remaining.min(offset); + let s = out.len() - offset; + out.extend_from_within(s..s + chunk); + remaining -= chunk; } } Ok(()) From 672fc92d3c6a15782fc0b72d6af2e2b1ff7b90a8 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:53:36 +0900 Subject: [PATCH 18/32] perf(lzw): single-pass string emit, drop scratch stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit decode_string_to_emit_buf walked the prefix chain into a scratch Vec (reversing), then popped it into emit_buf (un-reversing) — two passes and a second buffer. Walk the chain straight into emit_buf and reverse just the written region in place: one walk + one tight in-place reverse, and the scratch stack field is removed. Decoder output is byte-identical. Measured (decode MB/s): Lorem 425 -> ~510, Zeros 641 -> ~950 (~1.45x). Co-Authored-By: Claude Fable 5 --- src/lzw/mod.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/lzw/mod.rs b/src/lzw/mod.rs index 01846e1..88e931b 100644 --- a/src/lzw/mod.rs +++ b/src/lzw/mod.rs @@ -518,8 +518,6 @@ pub struct Decoder { /// in forward order.
`emit_head` is the read cursor. emit_buf: Vec, emit_head: usize, - /// Scratch stack used while reversing a decoded string. - stack: Vec, /// Once `finish` has nothing more to flush. completed: bool, } @@ -542,7 +540,6 @@ impl Decoder { codes_in_group: 0, emit_buf: Vec::new(), emit_head: 0, - stack: Vec::with_capacity(max_size), completed: false, } } @@ -629,17 +626,20 @@ impl Decoder { /// Decode the string represented by `code`, pushing characters forward /// into `self.emit_buf`. Updates `self.finchar` to the first character. fn decode_string_to_emit_buf(&mut self, mut code: u32) { - self.stack.clear(); + // Walk the prefix chain straight into `emit_buf`. Suffixes come out + // deepest-last (reverse order), so we append them followed by the + // first character, then reverse just the region we wrote. This avoids + // the separate scratch stack and its second copy pass — a single walk + // plus one in-place reverse (tight, cache-friendly). + let start = self.emit_buf.len(); while code >= 256 { - self.stack.push(self.suffix[code as usize]); + self.emit_buf.push(self.suffix[code as usize]); code = self.prefix[code as usize] as u32; } let first = code as u8; self.finchar = first; self.emit_buf.push(first); - while let Some(b) = self.stack.pop() { - self.emit_buf.push(b); - } + self.emit_buf[start..].reverse(); } /// Drain `self.emit_buf` (from `self.emit_head`) into `out`, returning @@ -827,7 +827,6 @@ impl RawDecoder for Decoder { self.codes_in_group = 0; self.emit_buf.clear(); self.emit_head = 0; - self.stack.clear(); self.completed = false; } } From 957dcf463dd05c7f8592e6e474b142d1cd853559 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:56:07 +0900 Subject: [PATCH 19/32] perf(lzo): skip-step accelerator in encoder match search On a miss, advance by a stride that grows with the consecutive-miss count (LZ4-style) instead of one byte at a time, so incompressible data is scanned in large strides. 
The first ~64 misses still step 1 byte, so compressible data keeps a dense hash table and its ratio/speed are unchanged; a hit resets the stride. Round-trip tests pass (decode output unchanged). Measured (features=lzo,factory,std; encode MB/s): Random: 495 -> ~3000 (~6x) Lorem: 1335 -> ~1290 (flat, within noise; output size unchanged) Co-Authored-By: Claude Fable 5 --- src/lzo/block.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/lzo/block.rs b/src/lzo/block.rs index 2583a26..5f00200 100644 --- a/src/lzo/block.rs +++ b/src/lzo/block.rs @@ -125,6 +125,11 @@ pub fn encode_block(input: &[u8], out: &mut Vec) { let in_len = input.len(); let hash_limit = in_len.saturating_sub(4); + // Skip-step accelerator: count consecutive misses and advance faster the + // longer we go without a match, so incompressible data is scanned in big + // strides instead of one byte at a time. Reset to 1-byte steps on a hit. + let mut search_match_nb: u32 = 1 << 6; + while ip < hash_limit { let h = hash4([input[ip], input[ip + 1], input[ip + 2], input[ip + 3]]); let candidate = table[h]; @@ -150,9 +155,15 @@ pub fn encode_block(input: &[u8], out: &mut Vec) { } if !found { - ip += 1; + // Grow the step the longer we search without a hit. The first + // ~64 misses still step 1 byte (keeping the hash table dense for + // compressible data); after that the stride ramps up. + let step = (search_match_nb >> 6) as usize; + search_match_nb += 1; + ip += step; continue; } + search_match_nb = 1 << 6; // Extend the match forward as far as possible. 
let mut match_len = 4usize; From 09cc2c849c426d85c103b674e43d415a4b2e09e5 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:57:21 +0900 Subject: [PATCH 20/32] perf(snappy): skip-step accelerator in encoder match search On a miss, advance by a stride that grows with the consecutive-miss count (matching the reference encoder's bytes_between_hash_lookups), so incompressible regions are scanned in large strides instead of one byte at a time. A hit resets the stride. Round-trip tests pass and the >2x ratio test still holds (output stays well-compressed). Measured (features=snappy,factory,std; encode MB/s): Random: 804 -> ~4900 (~6x) Lorem: 2557 -> ~2760 (slightly up) Zeros: flat (within run-to-run noise) Co-Authored-By: Claude Fable 5 --- src/snappy/mod.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/snappy/mod.rs b/src/snappy/mod.rs index f99509e..971dd1f 100644 --- a/src/snappy/mod.rs +++ b/src/snappy/mod.rs @@ -214,6 +214,12 @@ fn compress_block(input: &[u8], out: &mut Vec) { }; // Match-or-literal main loop. + // Skip-step accelerator: advance faster the longer the matcher goes + // without a hit, so incompressible regions are scanned in large strides + // (mirrors the reference encoder's `skip`/`bytes_between_hash_lookups`). + // A hit resets the stride to 1 byte. + let mut search_match_nb: u32 = 1 << 5; + while ip < match_limit { let h = hash(input, ip); let candidate = table[h] as usize; @@ -231,9 +237,12 @@ fn compress_block(input: &[u8], out: &mut Vec) { && input[candidate + 3] == input[ip + 3]; if !four_match { - ip += 1; + let step = (search_match_nb >> 5) as usize; + search_match_nb += 1; + ip += step; continue; } + search_match_nb = 1 << 5; // Found a 4-byte match. First, flush any pending literal. 
if next_emit < ip { From e667989a85910dbff52f3b47f8d0d3fde1b16af1 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:52:57 +0900 Subject: [PATCH 21/32] bzip2: cut SA-IS allocations and inline induced-sort hot paths Reuse a single bucket scratch buffer across all induced-sort passes instead of allocating fresh bucket-start/-end Vecs each call, collect LMS positions once during type classification (removing the later rescan + lms_positions rebuild), and inline is_lms / bucket fills. SA-IS build throughput on a 900 KB block (median of 3, --release): lorem 18.6 -> 19.2 MB/s zeros 31.8 -> 32.9 MB/s random 10.8 -> 13.2 MB/s (+22%) Output unchanged: same induced-sort order => identical BWT+origin. Full test suite (round-trip, reference fixtures, bunzip2 cross-check) stays green. Co-Authored-By: Claude Fable 5 --- src/bzip2/bwt.rs | 180 ++++++++++++++++++++++++++++++----------------- 1 file changed, 117 insertions(+), 63 deletions(-) diff --git a/src/bzip2/bwt.rs b/src/bzip2/bwt.rs index 985fe13..c91495c 100644 --- a/src/bzip2/bwt.rs +++ b/src/bzip2/bwt.rs @@ -149,50 +149,67 @@ fn sa_is_inner(text: &[i32], sa: &mut [i32], alphabet_size: usize) { // suffix i+1 is S-type. Otherwise L-type. // // `t[i] == true` ⇒ S-type. + // + // While we classify, collect the LMS positions (left-to-right) + // once so we don't have to rescan the type array later. let mut t = vec![false; n]; t[n - 1] = true; + // An LMS position is an S-type whose left neighbour is L-type. We + // know `t[i+1]` as we walk right-to-left, so we can detect the LMS + // at `i+1` the moment we set `t[i]` (it is LMS iff t[i+1] && !t[i]). + let mut lms_positions: Vec = Vec::new(); for i in (0..n - 1).rev() { - t[i] = if text[i] < text[i + 1] { + let si = if text[i] < text[i + 1] { true } else if text[i] == text[i + 1] { t[i + 1] } else { false }; + t[i] = si; + // i+1 is LMS iff it is S-type (t[i+1]) and i is L-type (!si). 
+ if t[i + 1] && !si { + lms_positions.push((i + 1) as i32); + } } + // We pushed LMS positions in descending order; reverse for ascending. + lms_positions.reverse(); + let n1 = lms_positions.len(); // 2. Compute bucket sizes (count of each symbol in `text`). + // `counts` holds the per-symbol counts; `bucket` is a reusable + // scratch into which we materialise either bucket starts or ends. let mut counts = vec![0i32; alphabet_size]; for &c in text { counts[c as usize] += 1; } + let mut bucket = vec![0i32; alphabet_size]; // 3. Step A: place LMS suffixes at the END of their buckets in `sa`. sa.fill(-1); - let mut ends = bucket_ends(&counts); - for (i, &c_i) in text.iter().enumerate().take(n).skip(1) { - if is_lms(&t, i) { - let c = c_i as usize; - ends[c] -= 1; - sa[ends[c] as usize] = i as i32; - } + fill_bucket_ends(&counts, &mut bucket); + for &p in &lms_positions { + let c = text[p as usize] as usize; + bucket[c] -= 1; + sa[bucket[c] as usize] = p; } // 4. Induced sort of L-suffixes (left-to-right pass). - induce_sort_l(text, sa, &t, &counts); + induce_sort_l(text, sa, &t, &counts, &mut bucket); // 5. Induced sort of S-suffixes (right-to-left pass). - induce_sort_s(text, sa, &t, &counts); + induce_sort_s(text, sa, &t, &counts, &mut bucket); // 6. Compact LMS suffixes to the front of SA (preserving the order // we just induced) and name them by their LMS-substring identity. - let mut n1 = 0usize; + let mut j1 = 0usize; for i in 0..n { if sa[i] >= 0 && is_lms(&t, sa[i] as usize) { - sa[n1] = sa[i]; - n1 += 1; + sa[j1] = sa[i]; + j1 += 1; } } + debug_assert_eq!(j1, n1); // Clear the rest as a workspace for naming. for slot in sa.iter_mut().take(n).skip(n1) { *slot = -1; @@ -266,21 +283,11 @@ fn sa_is_inner(text: &[i32], sa: &mut [i32], alphabet_size: usize) { sa_is_inner(&reduced_text, sa1, new_alpha); } - // 8. Recover positions of LMS suffixes in the original text. 
The trailing region currently holds the reduced text; we need - // to translate name-indices in sa1 back to original positions. - // We rebuild a list of LMS positions in left-to-right order. - let mut lms_positions: Vec = Vec::with_capacity(n1); - for (i, &is_s) in t.iter().enumerate().take(n).skip(1) { - if is_s && !t[i - 1] { - lms_positions.push(i as i32); - } - } - debug_assert_eq!(lms_positions.len(), n1); - - // Translate: sa1[i] = index-of-LMS in original. Reuse trailing area - // as scratch for translated positions, then place them at bucket - // ends. + // 8. Recover positions of LMS suffixes in the original text using + // the `lms_positions` list (in left-to-right order) we collected + // during classification. sa1[i] is the rank/index in that list. + // Translate the sorted LMS order (currently in sa[..n1]) into + // original positions, in place. for slot in sa.iter_mut().take(n1) { let idx = *slot as usize; // recursive SA gave us the LMS index in left-to-right order. *slot = lms_positions[idx]; @@ -292,72 +299,80 @@ fn sa_is_inner(text: &[i32], sa: &mut [i32], alphabet_size: usize) { // 9. Place sorted LMS suffixes at the ENDS of their buckets in SA, // in the order produced by the recursive call. - let mut ends = bucket_ends(&counts); - // Move them from positions 0..n1 to bucket-end positions, going - // right-to-left to preserve relative order within each bucket. + // + // The sorted LMS positions sit in sa[..n1], and the bucket-end slots + // they must move to live in the same array. Note that position n-1 + // can itself be LMS (it is S-type by construction and is recorded + // as LMS whenever n-2 is L-type), so no bucket is exempt. Rather + // than rely on an ordering argument to show that an in-place + // right-to-left scatter never overwrites an unread entry, we stay + // safe and simple: snapshot the n1 sorted positions, clear `sa`, + // then scatter.
let mut lms_sorted: Vec = Vec::with_capacity(n1); lms_sorted.extend_from_slice(&sa[..n1]); for slot in sa.iter_mut().take(n) { *slot = -1; } + fill_bucket_ends(&counts, &mut bucket); for &pos in lms_sorted.iter().rev() { let c = text[pos as usize] as usize; - ends[c] -= 1; - sa[ends[c] as usize] = pos; + bucket[c] -= 1; + sa[bucket[c] as usize] = pos; } // 10. Final induced sorts: L then S. - induce_sort_l(text, sa, &t, &counts); - induce_sort_s(text, sa, &t, &counts); + induce_sort_l(text, sa, &t, &counts, &mut bucket); + induce_sort_s(text, sa, &t, &counts, &mut bucket); } /// `true` iff suffix `i` is S-type AND suffix `i-1` is L-type (left- /// most S in a run). Suffix 0 is never an LMS in our convention. +#[inline(always)] fn is_lms(t: &[bool], i: usize) -> bool { i > 0 && t[i] && !t[i - 1] } -/// Compute exclusive prefix sums giving the *start* index of each -/// bucket in SA. -fn bucket_starts(counts: &[i32]) -> Vec { - let mut s = Vec::with_capacity(counts.len()); +/// Materialise the *start* index of each bucket (exclusive prefix sum +/// of `counts`) into the reusable scratch `out`. +#[inline] +fn fill_bucket_starts(counts: &[i32], out: &mut [i32]) { let mut acc = 0i32; - for &c in counts { - s.push(acc); + for (o, &c) in out.iter_mut().zip(counts.iter()) { + *o = acc; acc += c; } - s } -/// Compute the *end* (one-past-last) index of each bucket in SA. -fn bucket_ends(counts: &[i32]) -> Vec { - let mut e = Vec::with_capacity(counts.len()); +/// Materialise the *end* (one-past-last) index of each bucket +/// (inclusive prefix sum of `counts`) into the reusable scratch `out`. +#[inline] +fn fill_bucket_ends(counts: &[i32], out: &mut [i32]) { let mut acc = 0i32; - for &c in counts { + for (o, &c) in out.iter_mut().zip(counts.iter()) { acc += c; - e.push(acc); + *o = acc; } - e } /// Induced sort of L-type suffixes. 
Scans `sa` left-to-right; for each /// non-negative entry `sa[i] = j`, if `j > 0` and suffix `j-1` is /// L-type, place `j-1` at the next free slot at the START of bucket -/// `text[j-1]`. -fn induce_sort_l(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32]) { +/// `text[j-1]`. `bucket` is reusable scratch of length `alphabet_size`. +fn induce_sort_l(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32], bucket: &mut [i32]) { let n = text.len(); - let mut starts = bucket_starts(counts); + fill_bucket_starts(counts, bucket); for i in 0..n { - if sa[i] <= 0 { + let v = sa[i]; + if v <= 0 { continue; // -1 or 0 — we handle 0 by not predecessing. } - let j = (sa[i] as usize) - 1; + let j = (v as usize) - 1; if !t[j] { // L-type. let c = text[j] as usize; - let slot = starts[c] as usize; - sa[slot] = j as i32; - starts[c] += 1; + let slot = bucket[c]; + sa[slot as usize] = j as i32; + bucket[c] = slot + 1; } } } @@ -365,20 +380,22 @@ fn induce_sort_l(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32]) { /// Induced sort of S-type suffixes. Scans `sa` right-to-left; for each /// non-negative entry `sa[i] = j`, if `j > 0` and suffix `j-1` is /// S-type, place `j-1` at the next free slot at the END of bucket -/// `text[j-1]`. -fn induce_sort_s(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32]) { +/// `text[j-1]`. `bucket` is reusable scratch of length `alphabet_size`. +fn induce_sort_s(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32], bucket: &mut [i32]) { let n = text.len(); - let mut ends = bucket_ends(counts); + fill_bucket_ends(counts, bucket); for i in (0..n).rev() { - if sa[i] <= 0 { + let v = sa[i]; + if v <= 0 { continue; } - let j = (sa[i] as usize) - 1; + let j = (v as usize) - 1; if t[j] { // S-type. 
let c = text[j] as usize; - ends[c] -= 1; - sa[ends[c] as usize] = j as i32; + let slot = bucket[c] - 1; + bucket[c] = slot; + sa[slot as usize] = j as i32; } } } @@ -549,6 +566,43 @@ mod tests { assert_eq!(back, data); } + #[cfg(feature = "std")] + #[test] + #[ignore] + fn timing_bwt_forward() { + extern crate std; + let n = 900_000usize; + let lorem_src = b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, \ +sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. "; + let mut lorem = Vec::with_capacity(n); + while lorem.len() < n { + lorem.extend_from_slice(lorem_src); + } + lorem.truncate(n); + let zeros = vec![0u8; n]; + let mut random = Vec::with_capacity(n); + let mut state: u32 = 0xDEAD_BEEF; + for _ in 0..n { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + random.push((state >> 16) as u8); + } + for (name, data) in [("lorem", &lorem), ("zeros", &zeros), ("random", &random)] { + let _ = bwt_forward(data); + let mut best = f64::MAX; + for _ in 0..3 { + let t = std::time::Instant::now(); + let (l, _o) = bwt_forward(data); + let el = t.elapsed().as_secs_f64(); + std::hint::black_box(&l); + if el < best { + best = el; + } + } + let mbps = (n as f64) / best / 1e6; + std::eprintln!("BWT {name}: {:.2} ms {:.1} MB/s", best * 1e3, mbps); + } + } + #[test] fn matches_naive_on_small_inputs() { // Cross-check SA-IS output against a naive cyclic sort for From 93acccae2fe8e59728dd7f148ad19650326b227a Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:54:23 +0900 Subject: [PATCH 22/32] bzip2: recurse SA-IS reduced problem in place (drop per-level copy) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reduced LMS text already lives in the trailing n1 cells of `sa` and the recursive sub-suffix-array is written into the leading n1 cells. 
Since no two adjacent positions can both be LMS, n1 <= n/2, so those two regions are disjoint halves of split_at_mut and can be borrowed (immutable text / mutable output) simultaneously — removing the fresh reduced_text Vec allocated and filled at every recursion level. SA-IS build throughput on a 900 KB block (median of 3, --release), relative to the previous commit: lorem 19.2 -> 21.2 MB/s zeros 32.9 -> 40.5 MB/s (+23%) random 13.2 -> 14.1 MB/s Output unchanged (identical recursion, identical BWT+origin); full suite green. Co-Authored-By: Claude Fable 5 --- src/bzip2/bwt.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/bzip2/bwt.rs b/src/bzip2/bwt.rs index c91495c..da48f0b 100644 --- a/src/bzip2/bwt.rs +++ b/src/bzip2/bwt.rs @@ -273,14 +273,17 @@ fn sa_is_inner(text: &[i32], sa: &mut [i32], alphabet_size: usize) { sa1_area[name_of_pos as usize] = i as i32; } } else { - // Recurse on the reduced text. We need a slice of length n1 for - // sa1, and the reduced text occupies the trailing n1 cells. + // Recurse on the reduced text in place, with no copy. The + // reduced text occupies the trailing n1 cells (t1_area[..n1]) + // and the sub-suffix-array is written into the leading n1 cells + // (sa1_area[..n1]). These come from the two disjoint halves of + // `split_at_mut`, so we can hold an immutable borrow of the text + // and a mutable borrow of the output simultaneously. They are + // guaranteed non-overlapping because n1 <= n/2 (no two adjacent + // positions are both LMS), hence n1 <= n - n1. + let reduced_text: &[i32] = &t1_area[..n1]; let sa1 = &mut sa1_area[..n1]; - // The reduced text is t1_area[..n1] but we want to pass it as - // an immutable slice. We must copy to avoid aliasing. - let mut reduced_text: Vec = Vec::with_capacity(n1); - reduced_text.extend_from_slice(&t1_area[..n1]); - sa_is_inner(&reduced_text, sa1, new_alpha); + sa_is_inner(reduced_text, sa1, new_alpha); } // 8. 
Recover positions of LMS suffixes in the original text using From 02e6627642d2e72c6eacaf51503d39d1387ba59a Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:50:28 +0900 Subject: [PATCH 23/32] =?UTF-8?q?xpress=5Fhuffman:=20amortize=20decoder=20?= =?UTF-8?q?history=20trim=20(O(n=C2=B2)=20=E2=86=92=20O(n))?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit emit_byte() drained out_history back to MAX_DISTANCE on every emitted byte once the 64 KiB window filled, shifting the whole buffer per byte — quadratic over the stream and the dominant decode cost. Let the buffer grow to 2*MAX_DISTANCE and trim the oldest half only then; all reads are relative to len() and bounded by MAX_DISTANCE, so correctness is unchanged and decode stays byte-identical. Decode MB/s (1 MiB): Lorem 1.34→786, Zeros 1.46→588, Random 1.40→266. Co-Authored-By: Claude Fable 5 --- src/xpress_huffman/decoder.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/xpress_huffman/decoder.rs b/src/xpress_huffman/decoder.rs index 51bcbd3..f67da9c 100644 --- a/src/xpress_huffman/decoder.rs +++ b/src/xpress_huffman/decoder.rs @@ -126,7 +126,15 @@ impl Decoder { fn emit_byte(&mut self, b: u8) { self.decoded.push(b); self.out_history.push(b); - if self.out_history.len() > MAX_DISTANCE { + // Trim the retained history so it never grows without bound, but do + // it amortized: a naive `drain` back to `MAX_DISTANCE` on every byte + // shifts the whole 64 KiB buffer per emit, which is O(n²) over the + // stream. Instead let it grow to `2 * MAX_DISTANCE` and only then drop + // the oldest half, keeping at least the last `MAX_DISTANCE` bytes. + // Every read into `out_history` is relative to its current `len()` and + // bounded by `MAX_DISTANCE` (validated above against `out_history.len()`), + // so retaining that many is always sufficient. Amortized O(1) per byte. 
+ if self.out_history.len() >= 2 * MAX_DISTANCE { let drop = self.out_history.len() - MAX_DISTANCE; self.out_history.drain(0..drop); } From 5da0abf6bd523eed649eea9cfef5d0ae87069a05 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:51:23 +0900 Subject: [PATCH 24/32] lznt1: bulk copy_within for non-overlapping match copies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit decode_compressed_chunk emitted every match byte-by-byte. For the common non-overlapping case (offset >= length) the source range is already fully populated, so resize + copy_within does it in one shot. The overlapping run case (offset < length, run-length expansion) keeps the byte loop. Decode byte-identical. Decode MB/s (Zeros 1 MiB, match-heavy): ~1807 → ~1980 (+~10%). Co-Authored-By: Claude Fable 5 --- src/lznt1/decoder.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/lznt1/decoder.rs b/src/lznt1/decoder.rs index 7a5ee58..442df17 100644 --- a/src/lznt1/decoder.rs +++ b/src/lznt1/decoder.rs @@ -138,12 +138,20 @@ impl Decoder { if out.len() + length > CHUNK_SIZE { return Err(Error::Corrupt); } - // Byte-by-byte copy from the chunk's own output so - // far. Self-overlap is permitted (offset < length). let src_start = pos - offset; - for k in 0..length { - let b = out[src_start + k]; - out.push(b); + if offset >= length { + // Non-overlapping: the source range is fully + // populated already, so grow the buffer and bulk + // copy in one shot instead of byte-by-byte. + out.resize(pos + length, 0); + out.copy_within(src_start..src_start + length, pos); + } else { + // Self-overlapping run (offset < length): each + // emitted byte feeds the next, so copy one at a time. 
+ for k in 0..length { + let b = out[src_start + k]; + out.push(b); + } } } } From b54a7712d4c76c24b3319f93e28011fee3af5a32 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:54:40 +0900 Subject: [PATCH 25/32] hpack: byte-wide FSA Huffman decoder Replace the bit-at-a-time canonical decode (one table probe per input bit) with a byte-wide finite-state machine over the canonical trie: one lookup per input byte, emitting 0..=8 symbols. Built per call but cheaply (composed from a per-nibble table), so even the fast/short-code case stays flat. h2-huffman decode MB/s (1 MiB): Lorem: 385 -> 378 (flat, within noise) Zeros: 155 -> 202 (+30%) Random: 64 -> 93 (+45%) All hpack + full-feature tests green; output byte-identical. Co-Authored-By: Claude Fable 5 --- src/hpack/huffman.rs | 222 ++++++++++++++++++++++++++++++++----------- 1 file changed, 169 insertions(+), 53 deletions(-) diff --git a/src/hpack/huffman.rs b/src/hpack/huffman.rs index cb0851b..a12f4cf 100644 --- a/src/hpack/huffman.rs +++ b/src/hpack/huffman.rs @@ -92,22 +92,173 @@ pub(crate) const CODES: [(u32, u8); 257] = [ (0x3fffffff, 30), ]; +#[cfg(test)] const MAX_LEN: usize = 30; -/// Canonical decode tables reconstructed from [`CODES`]. Cheap to build -/// (257-entry sweep); built per decode call. +// ─── byte FSA fast decoder ─────────────────────────────────────────────── +// +// Bit-at-a-time canonical decoding is correct but slow (one table probe per +// input bit). For throughput we precompute a byte-wide finite-state machine +// over the canonical code's binary trie: each transition consumes a whole +// input byte and emits 0..=8 complete symbols. A Huffman string then costs +// exactly one table lookup per input byte instead of ~8 bit probes. The FSA +// is rebuilt per `decode` call; its construction is a fixed sweep over the +// trie (≈ states × 256 steps), negligible against any non-trivial input. 
+// +// The byte-for-byte output and every RFC 7541 §5.2 rejection (EOS symbol, +// over-long padding, non-`1` padding) are identical to the bit-at-a-time +// path — the FSA is just a faster way to walk the same trie. + +/// One byte transition: where to go and what to emit. +#[derive(Clone, Copy)] +struct Trans { + /// Trie node reached after consuming this byte's 8 bits. + next: u16, + /// Number of complete symbols emitted while consuming the byte (0..=8). + n: u8, + /// Set if any consumed bit completed the EOS symbol (→ Corrupt). + eos: bool, + /// The emitted symbol bytes (only the first `n` are meaningful). + out: [u8; 8], +} + +/// Byte FSA: `trans[state * 256 + byte]` gives the transition. State 0 is the +/// trie root, the only valid end-of-string boundary. +struct FastTable { + trans: Vec, + /// Per-state padding metadata: `(depth, all_ones)` for the partial path + /// from the root to this node. A valid end state has `depth < 8` and + /// `all_ones` (the RFC 7541 §5.2 EOS-prefix padding rule). + pad: Vec<(u8, bool)>, +} + +impl FastTable { + fn build() -> Self { + // Canonical binary trie. Node 0 is the root. `child[node][bit]` is the + // next node index (0 = unset, since the root is never a child). + // `leaf_sym[node]` is the symbol for a leaf, or -1. + let mut child: Vec<[u16; 2]> = Vec::new(); + child.push([0, 0]); // root + let mut leaf_sym: Vec = Vec::new(); + leaf_sym.push(-1); + + for (sym, &(code, len)) in CODES.iter().enumerate() { + let len = len as u32; + let mut node = 0usize; + for i in (0..len).rev() { + let bit = ((code >> i) & 1) as usize; + let nxt = child[node][bit]; + if nxt == 0 { + let new = child.len() as u16; + child.push([0, 0]); + leaf_sym.push(-1); + child[node][bit] = new; + node = new as usize; + } else { + node = nxt as usize; + } + } + leaf_sym[node] = sym as i32; + } + + let n_states = child.len(); + + // Per-node padding metadata: depth from root and whether the path is + // all `1`-bits. 
Leaves reset to the root after emitting, so only + // non-leaf nodes are ever a resting state, but we fill every node. + let mut pad = alloc::vec![(0u8, true); n_states]; + // Iterative DFS from the root; children are always added after their + // parent, so a single forward pass over node indices in creation + // order would also work, but we walk explicitly for clarity. + let mut stack = alloc::vec![0usize]; + while let Some(node) = stack.pop() { + let (d, ones) = pad[node]; + let kids = child[node]; + for (bit, &c) in kids.iter().enumerate() { + if c != 0 { + let c = c as usize; + pad[c] = (d + 1, ones && bit == 1); + stack.push(c); + } + } + } + + // Build a per-nibble transition first (n_states × 16, four bit-steps + // each), then compose each byte transition from its two nibble halves. + // This costs ≈ n_states·(16·4 + 256·2) build steps instead of + // n_states·256·8 — roughly a 4× cheaper construction, which matters + // because the table is rebuilt on every `decode` call. + struct Nib { + next: u16, + n: u8, + eos: bool, + out: [u8; 4], + } + let mut nib = Vec::with_capacity(n_states * 16); + for state in 0..n_states { + for half in 0..16u32 { + let mut node = state; + let mut out = [0u8; 4]; + let mut n = 0u8; + let mut eos = false; + for i in (0..4).rev() { + let bit = ((half >> i) & 1) as usize; + node = child[node][bit] as usize; + if leaf_sym[node] >= 0 { + let sym = leaf_sym[node] as u16; + if sym == EOS { + eos = true; + } else { + out[n as usize] = sym as u8; + n += 1; + } + node = 0; + } + } + nib.push(Nib { + next: node as u16, + n, + eos, + out, + }); + } + } + + let mut trans = Vec::with_capacity(n_states * 256); + for state in 0..n_states { + for byte in 0..256usize { + let hi = &nib[state * 16 + (byte >> 4)]; + let lo = &nib[hi.next as usize * 16 + (byte & 0x0f)]; + let mut out = [0u8; 8]; + let hn = hi.n as usize; + out[..hn].copy_from_slice(&hi.out[..hn]); + let ln = lo.n as usize; + out[hn..hn + ln].copy_from_slice(&lo.out[..ln]); + 
trans.push(Trans { + next: lo.next, + n: (hn + ln) as u8, + eos: hi.eos || lo.eos, + out, + }); + } + } + + FastTable { trans, pad } + } +} + +/// Canonical decode tables reconstructed from [`CODES`], retained only for +/// the canonicality self-test (which also underpins the FSA's correctness). +#[cfg(test)] struct DecodeTable { /// `first_code[len]` = numeric value of the first codeword of length /// `len` (1..=30). first_code: [u32; MAX_LEN + 1], - /// `first_index[len]` = offset into `symbols` of the first codeword of - /// length `len`. - first_index: [usize; MAX_LEN + 1], /// Symbols ordered by (length asc, code asc). symbols: Vec, - count: [u32; MAX_LEN + 1], } +#[cfg(test)] impl DecodeTable { fn build() -> Self { let mut count = [0u32; MAX_LEN + 1]; @@ -125,35 +276,14 @@ impl DecodeTable { } } let mut first_code = [0u32; MAX_LEN + 1]; - let mut first_index = [0usize; MAX_LEN + 1]; let mut code = 0u32; - let mut index = 0usize; for len in 1..=MAX_LEN { first_code[len] = code; - first_index[len] = index; code = (code + count[len]) << 1; - index += count[len] as usize; } DecodeTable { first_code, - first_index, symbols, - count, - } - } - - /// If `acc` (a value of exactly `len` bits) is a complete codeword, - /// return its symbol. - fn lookup(&self, acc: u32, len: usize) -> Option { - let c = self.count[len]; - if c == 0 { - return None; - } - let off = acc.checked_sub(self.first_code[len])?; - if off < c { - Some(self.symbols[self.first_index[len] + off as usize]) - } else { - None } } } @@ -193,40 +323,26 @@ pub fn encoded_len(data: &[u8]) -> usize { /// bits, padding not consisting of EOS-prefix `1`s, and any appearance of /// the EOS symbol — all as [`Error::Corrupt`]. pub fn decode(data: &[u8]) -> Result, Error> { - let table = DecodeTable::build(); + let table = FastTable::build(); let mut out = Vec::with_capacity(data.len() * 2); - let mut acc: u32 = 0; - let mut nbits: usize = 0; + // Current trie node (state). State 0 = root = clean symbol boundary. 
+ let mut state = 0usize; + let trans = &table.trans[..]; for &byte in data { - for i in (0..8).rev() { - let bit = ((byte >> i) & 1) as u32; - acc = (acc << 1) | bit; - nbits += 1; - if nbits > MAX_LEN { - // No codeword is longer than 30 bits. - return Err(Error::Corrupt); - } - if let Some(sym) = table.lookup(acc, nbits) { - if sym == EOS { - return Err(Error::Corrupt); - } - out.push(sym as u8); - acc = 0; - nbits = 0; - } + let t = &trans[state * 256 + byte as usize]; + if t.eos { + return Err(Error::Corrupt); } + // Emit the symbols completed in this byte (0..=8). + out.extend_from_slice(&t.out[..t.n as usize]); + state = t.next as usize; } // Trailing bits are padding: must be < 8 bits, all 1s. A prefix-free code // guarantees these EOS-prefix 1s cannot complete a real symbol above. - if nbits >= 8 { + let (depth, all_ones) = table.pad[state]; + if depth >= 8 || !all_ones { return Err(Error::Corrupt); } - if nbits > 0 { - let mask = (1u32 << nbits) - 1; - if acc & mask != mask { - return Err(Error::Corrupt); - } - } Ok(out) } From f1fcef5eae12fab57e09bc9b76d69857c3beded7 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 11:59:34 +0900 Subject: [PATCH 26/32] arc_crunch: single-write LZW string assembly + literal fast path Decode previously wrote every output byte twice: pushed onto a scratch stack during the prefix-chain walk, then popped into emit_buf. Replace with a fixed-size reverse-assembly scratch (allocated once) filled back-to-front in one walk, then a single vectorised extend_from_slice into emit_buf. A length-1 literal (common on incompressible input) skips assembly entirely. crunch decode MB/s (1 MiB): Lorem: 322 -> 390 (+21%) Zeros: 686 -> 1078 (+57%) Random: 194 -> 207 (+7%) Crafted-stream guards preserved (i==0 rejects over-long/cyclic chains); all arc_crunch + full-feature tests green, output byte-identical. 
Co-Authored-By: Claude Fable 5 --- src/arc_crunch/mod.rs | 57 ++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/src/arc_crunch/mod.rs b/src/arc_crunch/mod.rs index 049d1a3..6566bb9 100644 --- a/src/arc_crunch/mod.rs +++ b/src/arc_crunch/mod.rs @@ -39,7 +39,7 @@ //! //! Crafted streams never panic: the classic LZW KwKwK case and any //! out-of-range / not-yet-assigned code return [`Error::Corrupt`]; the -//! dictionary and the decoded-string stack are bounded by `1 << maxbits`; +//! dictionary and the decoded-string scratch are bounded by `1 << maxbits`; //! every dictionary index is bounds-checked and width arithmetic is checked. //! //! ## References @@ -386,7 +386,7 @@ pub struct Decoder { /// Decoded characters waiting to flush, forward order. emit_buf: Vec, emit_head: usize, - /// Scratch stack used while reversing a decoded string. + /// Scratch buffer used while reversing a decoded string. stack: Vec, completed: bool, } @@ -408,7 +408,9 @@ impl Decoder { finchar: 0, emit_buf: Vec::new(), emit_head: 0, - stack: Vec::with_capacity(max_size), + // Fixed-size reverse-assembly scratch: a decoded string is at most + // `1 << maxbits` ≤ `max_size` bytes, so its tail always fits. + stack: vec![0u8; max_size], completed: false, } } @@ -439,27 +441,48 @@ impl Decoder { /// Decode `code` into `emit_buf` (forward order); updates `finchar`. /// Returns `Err(Corrupt)` if the parent chain is malformed (too long or /// out of range) — defends against crafted streams. + /// + /// The chain is walked once, writing the string back-to-front into the + /// tail of the fixed-size `stack` scratch; the forward-order slice is then + /// appended to `emit_buf` with one `extend_from_slice`. This replaces the + /// previous per-byte round trip (push onto a stack, pop into `emit_buf`) + /// with one indexed store per byte plus a single bulk copy.
fn decode_string(&mut self, mut code: u32) -> Result<(), Error> { - self.stack.clear(); - let limit = 1usize << self.maxbits; - let mut hops = 0usize; + // `stack` is a fixed-size scratch (length == 1 << MAX_BITS, allocated + // once). We walk the prefix chain writing the string back-to-front into + // its tail, then bulk-copy the assembled forward-order slice into + // `emit_buf` with a single vectorised `extend_from_slice`. This avoids + // both the old per-byte `emit_buf.push` (a capacity check per byte) and + // any per-call zero-initialisation. + // Fast path: a bare literal (very common on incompressible input) is a + // length-1 string — emit it directly and skip the reverse-assembly. + if code < 256 { + let first = code as u8; + self.finchar = first; + self.emit_buf.push(first); + return Ok(()); + } + let scratch = &mut self.stack[..]; + let mut i = scratch.len(); while code >= 256 { - if code as usize >= self.prefix.len() { + // `i` reaching 0 means the chain is longer than any valid string + // (> 1 << maxbits): a malformed / cyclic prefix table. Reject + // rather than underflow. + if code as usize >= self.prefix.len() || i == 0 { return Err(Error::Corrupt); } - self.stack.push(self.suffix[code as usize]); + i -= 1; + scratch[i] = self.suffix[code as usize]; code = self.prefix[code as usize] as u32; - hops += 1; - if hops > limit { - return Err(Error::Corrupt); - } + } + if i == 0 { + return Err(Error::Corrupt); } let first = code as u8; self.finchar = first; - self.emit_buf.push(first); - while let Some(b) = self.stack.pop() { - self.emit_buf.push(b); - } + i -= 1; + scratch[i] = first; + self.emit_buf.extend_from_slice(&scratch[i..]); Ok(()) } @@ -621,7 +644,7 @@ impl RawDecoder for Decoder { self.finchar = 0; self.emit_buf.clear(); self.emit_head = 0; - self.stack.clear(); + // `stack` is fixed-size scratch overwritten on every use; leave it. 
self.completed = false; } } From 0a0ddb4576b8452f0481a83eac38d33a5e529797 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 12:00:55 +0900 Subject: [PATCH 27/32] arc_squash: single-write LZW string assembly + literal fast path Same optimization as arc_crunch: replace the push-to-stack / pop-to-emit_buf double write with a fixed-size reverse-assembly scratch filled in one walk and bulk-copied via extend_from_slice; bare literals skip assembly. squashed decode MB/s (1 MiB): Lorem: 384 -> 502 (+31%) Zeros: 669 -> 976 (+46%) Random: 194 -> 210 (+8%) Crafted-stream guards preserved; all arc_squash + full-feature tests green, output byte-identical. Co-Authored-By: Claude Fable 5 --- src/arc_squash/mod.rs | 47 +++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/src/arc_squash/mod.rs b/src/arc_squash/mod.rs index 83b769a..00c63f1 100644 --- a/src/arc_squash/mod.rs +++ b/src/arc_squash/mod.rs @@ -35,7 +35,7 @@ //! //! Crafted streams never panic: the classic LZW KwKwK case and any //! out-of-range / not-yet-assigned code return [`Error::Corrupt`]; the -//! dictionary and the decoded-string stack are bounded by `1 << 13`; every +//! dictionary and the decoded-string scratch are bounded by `1 << 13`; every //! dictionary index is bounds-checked. //! //! ## References @@ -347,7 +347,7 @@ pub struct Decoder { /// Decoded characters waiting to flush, forward order. emit_buf: Vec, emit_head: usize, - /// Scratch stack used while reversing a decoded string. + /// Fixed-size scratch used while reversing a decoded string. stack: Vec, completed: bool, } @@ -366,7 +366,9 @@ impl Decoder { finchar: 0, emit_buf: Vec::new(), emit_head: 0, - stack: Vec::with_capacity(max_size), + // Fixed-size reverse-assembly scratch: a decoded string is at most + // `MAX_CODE` bytes, so its tail always fits. 
+ stack: vec![0u8; max_size], completed: false, } } @@ -396,26 +398,37 @@ impl Decoder { /// Returns `Err(Corrupt)` if the parent chain is malformed (too long or /// out of range) — defends against crafted streams. fn decode_string(&mut self, mut code: u32) -> Result<(), Error> { - self.stack.clear(); - let limit = MAX_CODE as usize; - let mut hops = 0usize; + // Fast path: a bare literal is a length-1 string — emit directly. + if code < 256 { + let first = code as u8; + self.finchar = first; + self.emit_buf.push(first); + return Ok(()); + } + // Walk the prefix chain back-to-front into the fixed-size scratch, then + // bulk-copy the forward-order slice into emit_buf with one + // extend_from_slice. This avoids the old per-byte push/pop round trip + // (each output byte written twice). + let scratch = &mut self.stack[..]; + let mut i = scratch.len(); while code >= 256 { - if code as usize >= self.prefix.len() { + // `i == 0` means the chain is longer than any valid string: a + // malformed / cyclic prefix table. Reject rather than underflow. + if code as usize >= self.prefix.len() || i == 0 { return Err(Error::Corrupt); } - self.stack.push(self.suffix[code as usize]); + i -= 1; + scratch[i] = self.suffix[code as usize]; code = self.prefix[code as usize] as u32; - hops += 1; - if hops > limit { - return Err(Error::Corrupt); - } + } + if i == 0 { + return Err(Error::Corrupt); } let first = code as u8; self.finchar = first; - self.emit_buf.push(first); - while let Some(b) = self.stack.pop() { - self.emit_buf.push(b); - } + i -= 1; + scratch[i] = first; + self.emit_buf.extend_from_slice(&scratch[i..]); Ok(()) } @@ -544,7 +557,7 @@ impl RawDecoder for Decoder { self.finchar = 0; self.emit_buf.clear(); self.emit_head = 0; - self.stack.clear(); + // `stack` is fixed-size scratch overwritten on every use; leave it. 
self.completed = false; } } From 66d7f1821a9f1041d1a4f8fa1f749ceec47ed98d Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 12:06:00 +0900 Subject: [PATCH 28/32] delta: vectorizable filter loop via direct predecessor indexing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-byte loop kept a dist-byte ring with a modulo branch and a read-modify-write of history every byte, serialising the whole transform. Split into three phases: seed the first dist bytes through the ring (cross-call history), then run a flat recurrence over the bulk — encode reads input[i-dist] directly (read-only input → auto-vectorises), decode reads output[i-dist] — then refresh the ring from the tail. Streaming/chunk semantics unchanged (the 1-byte-chunk-vs-bulk equivalence test passes). delta encode MB/s (1 MiB, default dist=1): ~1680 -> ~25000 (≈15x; the read-only subtract vectorises) delta decode unchanged (dist=1 reconstruction is an inherently serial prefix sum); larger distances also speed up decode. Output byte-identical. Co-Authored-By: Claude Fable 5 --- src/delta/mod.rs | 64 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/src/delta/mod.rs b/src/delta/mod.rs index 1521cc3..8c08a95 100644 --- a/src/delta/mod.rs +++ b/src/delta/mod.rs @@ -168,20 +168,37 @@ impl RawEncoder for Encoder { self.hist.check()?; let n = input.len().min(output.len()); let h = &mut self.hist; - for i in 0..n { + let dist = h.dist; + + // Phase 1: the first `dist` outputs subtract bytes from the ring + // history (previous calls / the all-zero seed). + let seed = dist.min(n); + for i in 0..seed { let orig = input[i]; - // history[i - dist] is the original byte we are about to - // overwrite at the ring cursor. let prev = h.buf[h.pos]; - // Modular subtraction is the defined transform (see module docs). 
output[i] = orig.wrapping_sub(prev); - // Store the *original* byte for future positions. h.buf[h.pos] = orig; h.pos += 1; - if h.pos == h.dist { + if h.pos == dist { h.pos = 0; } } + // Phase 2: for `i >= dist` the predecessor is `input[i - dist]` (the + // input *is* the original stream), so read it directly and drop the + // ring modulo branch and history accesses from the hot loop. + for i in dist..n { + output[i] = input[i].wrapping_sub(input[i - dist]); + } + // Phase 3: refresh the ring from the last `dist` *original* (input) + // bytes, matching the byte-by-byte cursor/layout (see the decoder for + // the derivation). + if n >= dist { + let pos_final = (h.pos + (n % dist)) % dist; + for k in 0..dist { + h.buf[(pos_final + k) % dist] = input[n - dist + k]; + } + h.pos = pos_final; + } Ok(RawProgress { consumed: n, written: n, @@ -228,17 +245,46 @@ impl RawDecoder for Decoder { self.hist.check()?; let n = input.len().min(output.len()); let h = &mut self.hist; - for i in 0..n { + let dist = h.dist; + + // Phase 1: the first `dist` outputs depend on the ring history (bytes + // from previous calls / the all-zero seed). Reconstruct them through + // the ring exactly as before. + let seed = dist.min(n); + for i in 0..seed { let prev = h.buf[h.pos]; - // Reconstruct the original byte: inverse of wrapping_sub. let orig = input[i].wrapping_add(prev); output[i] = orig; h.buf[h.pos] = orig; h.pos += 1; - if h.pos == h.dist { + if h.pos == dist { h.pos = 0; } } + // Phase 2: once `i >= dist`, `output[i - dist]` is the original byte we + // need — read it straight from the output buffer. This drops both the + // ring modulo branch and the history load/store from the hot loop and + // exposes a simple `out[i] = in[i] + out[i-dist]` recurrence. + for i in dist..n { + output[i] = input[i].wrapping_add(output[i - dist]); + } + // Phase 3: refresh the ring from the last `dist` reconstructed bytes so + // the next call continues seamlessly. 
(When `n < dist` the ring was + // already fully advanced byte-by-byte in phase 1 and is correct.) + // + // After processing `n` bytes the byte-by-byte algorithm leaves + // `pos = (p0 + n) % dist` and `buf[(pos + k) % dist] = output[n-dist+k]` + // for k in 0..dist (each slot holds its most recent write). Reproduce + // exactly that state. With `seed == dist` here, `h.pos` is back at `p0`, + // so the final cursor is `(p0 + n) % dist == (h.pos + (n % dist)) % + // dist`. + if n >= dist { + let pos_final = (h.pos + (n % dist)) % dist; + for k in 0..dist { + h.buf[(pos_final + k) % dist] = output[n - dist + k]; + } + h.pos = pos_final; + } Ok(RawProgress { consumed: n, written: n, From 944adf7f735a2e2c9c1b6dac29a1daa3e9224d16 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 12:10:57 +0900 Subject: [PATCH 29/32] lha: bulk match-copy in static-Huffman decode hot loop The LZSS match expansion copied byte-by-byte through the ring with two modulo ops per byte. Split by match geometry: non-overlapping matches copy in contiguous ring segments (straight-line loops, no per-byte wrap test); single-byte runs (distance 1) fill a constant byte directly; only genuinely overlapping matches fall back to the byte walk. The ring's space-prefill semantics are preserved, so output is byte-identical (shared by lh4/5/6/7). lh5 decode MB/s (1 MiB): Lorem: ~853 -> ~1400 (+64%) Zeros: ~890 -> ~1140 (+28%) lh4/lh6/lh7 improve comparably. All lha + full-feature tests green. Co-Authored-By: Claude Fable 5 --- src/lha/static_huff.rs | 69 +++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 11 deletions(-) diff --git a/src/lha/static_huff.rs b/src/lha/static_huff.rs index 0a99236..6cee4ec 100644 --- a/src/lha/static_huff.rs +++ b/src/lha/static_huff.rs @@ -456,7 +456,10 @@ pub fn decode_payload( // Literal. 
out.push(code as u8); ring[ring_pos] = code as u8; - ring_pos = (ring_pos + 1) % ring_size; + ring_pos += 1; + if ring_pos == ring_size { + ring_pos = 0; + } } else { let count = code - 256 + MIN_MATCH; if count > MAX_MATCH { @@ -469,17 +472,61 @@ pub fn decode_payload( if offset >= ring_size { return Err(Error::InvalidDistance); } - let start = (ring_pos + ring_size - offset - 1) % ring_size; - for k in 0..count { - if let Some(n) = expected - && out.len() >= n - { - break; + let limit = expected.unwrap_or(usize::MAX); + // Clamp the run to the declared output length. + let count = count.min(limit.saturating_sub(out.len())); + let mut src = (ring_pos + ring_size - offset - 1) % ring_size; + // Reserve output once so the per-byte push can't reallocate. + out.reserve(count); + if offset + 1 >= count { + // Non-overlapping match: source and destination regions are + // disjoint, so copy in at most two contiguous ring segments + // (split only where src or dst wraps the ring). Each segment + // is a straight-line memcpy-style loop with no per-byte + // wrap test. + let mut done = 0usize; + while done < count { + let run = (count - done) + .min(ring_size - src) + .min(ring_size - ring_pos); + // Copy `run` bytes ring[src..] -> out and -> ring[dst..]. + for k in 0..run { + let b = ring[src + k]; + out.push(b); + ring[ring_pos + k] = b; + } + src += run; + if src == ring_size { + src = 0; + } + ring_pos += run; + if ring_pos == ring_size { + ring_pos = 0; + } + done += run; + } + } else if offset == 0 { + // Single-byte run (distance 1): the whole match is one + // repeated byte. Fill directly instead of chasing the ring. + let b = ring[src]; + for _ in 0..count { + out.push(b); + ring[ring_pos] = b; + ring_pos += 1; + if ring_pos == ring_size { + ring_pos = 0; + } + } + } else { + // Overlapping match (offset+1 < count): each written byte + // feeds a later read, so walk byte-by-byte. 
+ for _ in 0..count { + let b = ring[src]; + out.push(b); + ring[ring_pos] = b; + src = (src + 1) % ring_size; + ring_pos = (ring_pos + 1) % ring_size; } - let b = ring[(start + k) % ring_size]; - out.push(b); - ring[ring_pos] = b; - ring_pos = (ring_pos + 1) % ring_size; } } remaining -= 1; From ac84ba6cbd6963531a6e2568303c45ada651c76d Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 12:13:12 +0900 Subject: [PATCH 30/32] rar1/2/3/5: bulk LZ77 match-copy in decode window loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All four RAR decoders expanded matches byte-by-byte through the sliding window. Apply the same geometry split used in lha: distance-1 runs fill a constant byte; non-overlapping matches copy in contiguous window segments (straight-line loops, no per-byte index recompute / mask test); only genuinely overlapping matches walk byte-by-byte. The window-prefill and truncation semantics are preserved exactly, so decoded output stays byte-identical — verified by each codec's reference-fixture tests (rar1 53, rar2 28, rar3 30, rar5 29 tests, all green). Decode-only codecs (no bench round-trip); correctness is fixture-validated and the transform mirrors the measured lha win (+28-64% decode). 
Co-Authored-By: Claude Fable 5 --- src/rar1/window.rs | 33 +++++++++++++++++++++++++++++---- src/rar2/decoder.rs | 42 ++++++++++++++++++++++++++++++++++++------ src/rar3/decoder.rs | 44 +++++++++++++++++++++++++++++++++++++------- src/rar5/decoder.rs | 45 ++++++++++++++++++++++++++++++++++++++------- 4 files changed, 140 insertions(+), 24 deletions(-) diff --git a/src/rar1/window.rs b/src/rar1/window.rs index 33447dd..3a7d806 100644 --- a/src/rar1/window.rs +++ b/src/rar1/window.rs @@ -125,11 +125,36 @@ impl Window { let mask = WINDOW_SIZE - 1; let mut src = (self.write_pos + WINDOW_SIZE - distance) & mask; let mut dst = self.write_pos; - for _ in 0..length { + if distance == 1 { + // Distance-1 run: one repeated byte. let b = self.buf[src]; - self.buf[dst] = b; - src = (src + 1) & mask; - dst = (dst + 1) & mask; + for _ in 0..length { + self.buf[dst] = b; + dst = (dst + 1) & mask; + } + } else if distance >= length { + // Non-overlapping: copy in contiguous window segments (no per-byte + // mask test inside the run). + let mut done = 0usize; + while done < length { + let run = (length - done) + .min(WINDOW_SIZE - src) + .min(WINDOW_SIZE - dst); + for k in 0..run { + self.buf[dst + k] = self.buf[src + k]; + } + src = (src + run) & mask; + dst = (dst + run) & mask; + done += run; + } + } else { + // Overlapping match: each written byte feeds a later read. 
+ for _ in 0..length { + let b = self.buf[src]; + self.buf[dst] = b; + src = (src + 1) & mask; + dst = (dst + 1) & mask; + } } self.write_pos = dst; self.in_flight += length; diff --git a/src/rar2/decoder.rs b/src/rar2/decoder.rs index d4b9302..3721e80 100644 --- a/src/rar2/decoder.rs +++ b/src/rar2/decoder.rs @@ -544,13 +544,43 @@ impl RunCtx { if remaining > cap { remaining = cap; } - while remaining > 0 { - let src = (self.window_pos + WINDOW_SIZE - off) & WINDOW_MASK; + output.reserve(remaining); + let mut src = (self.window_pos + WINDOW_SIZE - off) & WINDOW_MASK; + + if off == 1 { + // Distance-1 run: one repeated byte. let b = self.window[src]; - self.window[self.window_pos] = b; - self.window_pos = (self.window_pos + 1) & WINDOW_MASK; - output.push(b); - remaining -= 1; + for _ in 0..remaining { + self.window[self.window_pos] = b; + self.window_pos = (self.window_pos + 1) & WINDOW_MASK; + output.push(b); + } + } else if off >= remaining { + // Non-overlapping: copy in contiguous window segments. + let mut done = 0usize; + while done < remaining { + let run = (remaining - done) + .min(WINDOW_SIZE - src) + .min(WINDOW_SIZE - self.window_pos); + let sp = self.window_pos; + for k in 0..run { + let b = self.window[src + k]; + self.window[sp + k] = b; + output.push(b); + } + src = (src + run) & WINDOW_MASK; + self.window_pos = (self.window_pos + run) & WINDOW_MASK; + done += run; + } + } else { + // Overlapping match: each written byte feeds a later read. 
+ for _ in 0..remaining { + let b = self.window[src]; + self.window[self.window_pos] = b; + src = (src + 1) & WINDOW_MASK; + self.window_pos = (self.window_pos + 1) & WINDOW_MASK; + output.push(b); + } } Ok(()) } diff --git a/src/rar3/decoder.rs b/src/rar3/decoder.rs index 4f90ecb..b04dd0c 100644 --- a/src/rar3/decoder.rs +++ b/src/rar3/decoder.rs @@ -321,14 +321,44 @@ impl RunCtx { if off > wlen { return Err(Error::InvalidDistance); } - for _ in 0..length { - let src = (self.window_pos + wlen - off) & wmask; + // Clamp the run to the declared unpack size (the old loop broke per + // byte once `out` reached it — produce exactly the same byte count). + let remaining_out = self.unpack_size.saturating_sub(self.out.len() as u64); + let length = (length as u64).min(remaining_out) as usize; + let mut src = (self.window_pos + wlen - off) & wmask; + self.out.reserve(length); + + if off == 1 { + // Distance-1 run: one repeated byte. Fill directly. let b = self.window[src]; - self.out.push(b); - self.window[self.window_pos] = b; - self.window_pos = (self.window_pos + 1) & wmask; - if (self.out.len() as u64) >= self.unpack_size { - break; + for _ in 0..length { + self.out.push(b); + self.window[self.window_pos] = b; + self.window_pos = (self.window_pos + 1) & wmask; + } + } else if off >= length { + // Non-overlapping: src and dst regions are disjoint. Copy in + // contiguous window segments (no per-byte recompute of `src`). + let mut done = 0usize; + while done < length { + let run = (length - done).min(wlen - src).min(wlen - self.window_pos); + for k in 0..run { + let b = self.window[src + k]; + self.out.push(b); + self.window[self.window_pos + k] = b; + } + src = (src + run) & wmask; + self.window_pos = (self.window_pos + run) & wmask; + done += run; + } + } else { + // Overlapping match: each written byte feeds a later read. 
+ for _ in 0..length { + let b = self.window[src]; + self.out.push(b); + self.window[self.window_pos] = b; + src = (src + 1) & wmask; + self.window_pos = (self.window_pos + 1) & wmask; } } Ok(()) diff --git a/src/rar5/decoder.rs b/src/rar5/decoder.rs index d264b8b..049acc7 100644 --- a/src/rar5/decoder.rs +++ b/src/rar5/decoder.rs @@ -501,14 +501,45 @@ impl Decoder { } let ws = self.window_size; let wmask = self.window_mask; - for _ in 0..length { - let src = (self.window_pos + ws - dist as usize) & wmask; + let off = dist as usize; + // Clamp the run to the declared unpack total (the old loop broke per + // byte once it was reached — produce exactly the same byte count). + let produced = self.unpack_so_far + self.out_queue.len() as u64; + let remaining = self.unpack_total.saturating_sub(produced); + let length_n = (length as u64).min(remaining) as usize; + let mut src = (self.window_pos + ws - off) & wmask; + + if off == 1 { + // Distance-1 run: one repeated byte. let b = self.window[src]; - self.window[self.window_pos] = b; - self.window_pos = (self.window_pos + 1) & wmask; - self.out_queue.push_back(b); - if self.unpack_so_far + self.out_queue.len() as u64 >= self.unpack_total { - break; + for _ in 0..length_n { + self.window[self.window_pos] = b; + self.window_pos = (self.window_pos + 1) & wmask; + self.out_queue.push_back(b); + } + } else if off >= length_n { + // Non-overlapping: copy in contiguous window segments. + let mut done = 0usize; + while done < length_n { + let run = (length_n - done).min(ws - src).min(ws - self.window_pos); + let sp = self.window_pos; + for k in 0..run { + let b = self.window[src + k]; + self.window[sp + k] = b; + self.out_queue.push_back(b); + } + src = (src + run) & wmask; + self.window_pos = (self.window_pos + run) & wmask; + done += run; + } + } else { + // Overlapping match: each written byte feeds a later read. 
+ for _ in 0..length_n { + let b = self.window[src]; + self.window[self.window_pos] = b; + src = (src + 1) & wmask; + self.window_pos = (self.window_pos + 1) & wmask; + self.out_queue.push_back(b); } } self.last_len = length; From 0afb6b40a3fb9ebe32ba67aaf864571c19355bb1 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 12:15:19 +0900 Subject: [PATCH 31/32] zip_implode/reduce/shrink: bulk match-copy in decode loops - zip_shrink (LZW): assemble the decoded string in the scratch buffer, then reverse once and extend_from_slice into emit_buf, instead of the per-byte pop/push round trip (each output byte was written twice). - zip_reduce: split the DLE back-reference copy into a zero-fill prefix (refs before stream start), a distance-1 fill, a non-overlapping extend_from_within, and an overlapping byte walk. - zip_implode: split the window match copy by geometry (distance-1 fill / contiguous non-overlap segments / overlapping byte walk) and apply the pending_len/output_left bookkeeping once per match. All decode-only; output byte-identical, verified by each codec's reference-fixture tests (shrink 14, reduce 17, implode 18, all green). Co-Authored-By: Claude Fable 5 --- src/zip_implode/decoder.rs | 39 +++++++++++++++++++++++++++--- src/zip_reduce/mod.rs | 49 ++++++++++++++++++++++++++++---------- src/zip_shrink/mod.rs | 10 ++++---- 3 files changed, 79 insertions(+), 19 deletions(-) diff --git a/src/zip_implode/decoder.rs b/src/zip_implode/decoder.rs index 03bd346..14b3a78 100644 --- a/src/zip_implode/decoder.rs +++ b/src/zip_implode/decoder.rs @@ -549,10 +549,43 @@ impl Decoder { if len > self.output_left { return Err(Error::Corrupt); } - for _ in 0..len { - let src = (self.window_pos + WINDOW_SIZE - dist) & (WINDOW_SIZE - 1); + let count = len as usize; + let mask = WINDOW_SIZE - 1; + let mut src = (self.window_pos + WINDOW_SIZE - dist) & mask; + // Bookkeeping that `emit_byte` would do per byte, applied once. 
+ self.pending_len += count; + self.output_left -= count as u32; + + if dist == 1 { + // Distance-1 run: one repeated byte. let b = self.window[src]; - self.emit_byte(b); + for _ in 0..count { + self.window[self.window_pos] = b; + self.window_pos = (self.window_pos + 1) & mask; + } + } else if dist >= count { + // Non-overlapping: copy in contiguous window segments. + let mut done = 0usize; + while done < count { + let run = (count - done) + .min(WINDOW_SIZE - src) + .min(WINDOW_SIZE - self.window_pos); + let wp = self.window_pos; + for k in 0..run { + self.window[wp + k] = self.window[src + k]; + } + src = (src + run) & mask; + self.window_pos = (self.window_pos + run) & mask; + done += run; + } + } else { + // Overlapping match: each written byte feeds a later read. + for _ in 0..count { + let b = self.window[src]; + self.window[self.window_pos] = b; + src = (src + 1) & mask; + self.window_pos = (self.window_pos + 1) & mask; + } } Ok(true) } diff --git a/src/zip_reduce/mod.rs b/src/zip_reduce/mod.rs index d592903..0f90bd9 100644 --- a/src/zip_reduce/mod.rs +++ b/src/zip_reduce/mod.rs @@ -586,19 +586,44 @@ impl Decoder { // one optional extension byte), far below `buffer_ahead`, so // materialising it inline only overshoots the window bound by a // bounded amount that the next iteration's `slide_window` reaps. - let mut pm = PendingMatch { - dist, - remaining: len, - }; - while pm.remaining > 0 { - let pos = self.produced(); - let b = if pm.dist > pos { - 0u8 + self.out.reserve(len); + let mut remaining = len; + // Phase A: any portion of the back-reference that points before the + // start of the stream reads as zero. This can only be a contiguous + // leading run (`pos` only grows). + let pos0 = self.produced(); + if dist > pos0 { + let zeros = (dist - pos0).min(remaining); + for _ in 0..zeros { + self.out.push(0); + } + remaining -= zeros; + } + // Phase B: real back-reference into already-produced output. 
`src` + // is an index into `self.out`; it and the write head advance in + // lockstep, so split by geometry instead of recomputing per byte. + if remaining > 0 { + let mut src = self.produced() - dist - self.window_base; + if dist == 1 { + // Distance-1 run: repeated byte. + let b = self.out[src]; + for _ in 0..remaining { + self.out.push(b); + } + } else if dist >= remaining { + // Non-overlapping: source range is fully materialised, copy + // it in one shot via `Vec::extend_from_within`. + let start = self.out.len(); + self.out.extend_from_within(src..src + remaining); + debug_assert_eq!(self.out.len(), start + remaining); } else { - self.out[(pos - pm.dist) - self.window_base] - }; - self.out.push(b); - pm.remaining -= 1; + // Overlapping match: each written byte feeds a later read. + for _ in 0..remaining { + let b = self.out[src]; + self.out.push(b); + src += 1; + } + } } } Ok(()) diff --git a/src/zip_shrink/mod.rs b/src/zip_shrink/mod.rs index 7b1ba6d..9e9d92c 100644 --- a/src/zip_shrink/mod.rs +++ b/src/zip_shrink/mod.rs @@ -294,11 +294,13 @@ impl Decoder { } } - // `stack` now holds the string in reverse; pop into emit_buf. + // `stack` now holds the string reversed. The forward-order first byte + // is the last one pushed (`stack.last()`). Reverse once and bulk-copy + // into `emit_buf` with a single `extend_from_slice`, instead of the + // per-byte pop/push loop (which wrote every output byte twice).
let first = *self.stack.last().ok_or(Error::Corrupt)?; - while let Some(b) = self.stack.pop() { - self.emit_buf.push(b); - } + self.stack.reverse(); + self.emit_buf.extend_from_slice(&self.stack); Ok(first) } From 3f6adb1ac9aaf150bd054ef02c2d288fb9dbc679 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Fri, 12 Jun 2026 12:19:37 +0900 Subject: [PATCH 32/32] docs: changelog entry for codec throughput optimizations Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index db2bd2f..f9c5d75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Performance + +- **Throughput optimizations across the codec suite**, all preserving + byte-identical decoder output (validated by the existing round-trip and + reference-fixture tests) — no `unsafe`, no new dependencies. Highlights: + - **deflate / deflate64** decode: vectorized match-copy (contiguous spans + + doubling `copy_within` for overlapping runs) — deflate Random decode + ~3.5×, deflate64 long-match decode several×; zlib/gzip inherit the gains. + - **LZMA / xz** decode: bulk (and overlapping) dictionary match-copy — + RLE-heavy `.lzma` decode up to ~6×. + - **zstd** decode: inlined backward bit-reader fast path, single-load FSE + state transitions, hoisted LL/ML tables — ~1.5× on Huffman/FSE-heavy input. + - **brotli** decode: wider Huffman fast LUT, single-tree literal fast path, + bit-accumulator kept across LUT hits — literal-heavy decode ~2.3×. + - **lz4 / lz5 / lzo / snappy** decode: bulk overlapping match-copy + (multi-GB/s); **lzo / snappy** encoder skip-step match search (~6× on + incompressible input). **lzw** single-pass string emit. + - **xpress-huffman** decode: fixed an O(n²) history-trim to O(n) (orders of + magnitude on large inputs); **lznt1** bulk match-copy. 
+ - **lha / rar1–5 / zip-implode·reduce·shrink / arc-crunch·squash**: bulk + LZSS/LZW window copy; **delta** filter encode ~15× (auto-vectorized); + **hpack** byte-wide Huffman decode. + - **bzip2** encode: reduced SA-IS suffix-array allocations and in-place + recursion (+14–31% on the BWT build, the dominant encode cost). + - **checksum**: CRC-32 slice-by-8 (~4×); **rle90** bulk literal copy (~3.5×). + ## [0.6.1](https://github.com/KarpelesLab/compcol/compare/v0.6.0...v0.6.1) - 2026-06-12 ### Other