Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
e5f65cc
checksum: CRC-32 slice-by-8 (642 -> 2525 MB/s, 3.9x)
MagicalTux Jun 12, 2026
5027f1d
rle90: bulk-copy literal runs in decoder (~1268 -> ~4600 MB/s, 3.5x)
MagicalTux Jun 12, 2026
7a1e935
deflate: vectorize decoder match-copy incl. overlapping runs
MagicalTux Jun 12, 2026
5d62b83
deflate: replace per-literal modulo with a wrap branch in emit_byte
MagicalTux Jun 12, 2026
ccebe8a
deflate64: vectorize decoder match-copy incl. overlapping runs
MagicalTux Jun 12, 2026
fa2ba85
lzma2: bulk match-copy in decode_chunk (xz/lzma2 decode)
MagicalTux Jun 12, 2026
0fff46f
lzma2: bulk overlapping match-copy in decode_chunk (xz/lzma2 decode)
MagicalTux Jun 12, 2026
c31955e
lzma: bulk overlapping match-copy in decoder drain loops (.lzma decode)
MagicalTux Jun 12, 2026
521e601
brotli: keep bit accumulator across Huffman LUT hits
MagicalTux Jun 12, 2026
fd7d8c1
brotli: skip literal context lookup when there is a single tree
MagicalTux Jun 12, 2026
58d02b6
brotli: widen Huffman fast-path LUT from 9 to 11 bits
MagicalTux Jun 12, 2026
d83d66d
zstd: faster Huffman literal decode via peek/consume
MagicalTux Jun 12, 2026
79bdb94
zstd: skip zero-bit reads and inline FSE state transitions
MagicalTux Jun 12, 2026
56e4fd0
zstd: inline RevBitReader::read fast path, split wide reads out of line
MagicalTux Jun 12, 2026
bc17156
zstd: hoist LL/ML base+extra tables to module-level const
MagicalTux Jun 12, 2026
6faec5f
zstd: fetch each FSE entry once per sequence (symbol + advance share …
MagicalTux Jun 12, 2026
b2b1ebf
perf(decoders): bulk overlapping match copy in lz4/lz5/lzo/snappy
MagicalTux Jun 12, 2026
672fc92
perf(lzw): single-pass string emit, drop scratch stack
MagicalTux Jun 12, 2026
957dcf4
perf(lzo): skip-step accelerator in encoder match search
MagicalTux Jun 12, 2026
09cc2c8
perf(snappy): skip-step accelerator in encoder match search
MagicalTux Jun 12, 2026
e667989
bzip2: cut SA-IS allocations and inline induced-sort hot paths
MagicalTux Jun 12, 2026
93accca
bzip2: recurse SA-IS reduced problem in place (drop per-level copy)
MagicalTux Jun 12, 2026
02e6627
xpress_huffman: amortize decoder history trim (O(n²) → O(n))
MagicalTux Jun 12, 2026
5da0abf
lznt1: bulk copy_within for non-overlapping match copies
MagicalTux Jun 12, 2026
b54a771
hpack: byte-wide FSA Huffman decoder
MagicalTux Jun 12, 2026
f1fcef5
arc_crunch: single-write LZW string assembly + literal fast path
MagicalTux Jun 12, 2026
0a0ddb4
arc_squash: single-write LZW string assembly + literal fast path
MagicalTux Jun 12, 2026
66d7f18
delta: vectorizable filter loop via direct predecessor indexing
MagicalTux Jun 12, 2026
944adf7
lha: bulk match-copy in static-Huffman decode hot loop
MagicalTux Jun 12, 2026
ac84ba6
rar1/2/3/5: bulk LZ77 match-copy in decode window loops
MagicalTux Jun 12, 2026
0afb6b4
zip_implode/reduce/shrink: bulk match-copy in decode loops
MagicalTux Jun 12, 2026
3f6adb1
docs: changelog entry for codec throughput optimizations
MagicalTux Jun 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Performance

- **Throughput optimizations across the codec suite**, all preserving
byte-identical decoder output (validated by the existing round-trip and
reference-fixture tests) — no `unsafe`, no new dependencies. Highlights:
- **deflate / deflate64** decode: vectorized match-copy (contiguous spans +
doubling `copy_within` for overlapping runs) — deflate Random decode
~3.5×, deflate64 long-match decode several×; zlib/gzip inherit the gains.
- **LZMA / xz** decode: bulk (and overlapping) dictionary match-copy —
RLE-heavy `.lzma` decode up to ~6×.
- **zstd** decode: inlined backward bit-reader fast path, single-load FSE
state transitions, hoisted LL/ML tables — ~1.5× on Huffman/FSE-heavy input.
- **brotli** decode: wider Huffman fast LUT, single-tree literal fast path,
bit-accumulator kept across LUT hits — literal-heavy decode ~2.3×.
- **lz4 / lz5 / lzo / snappy** decode: bulk overlapping match-copy
(multi-GB/s); **lzo / snappy** encoder skip-step match search (~6× on
incompressible input). **lzw** single-pass string emit.
- **xpress-huffman** decode: reduced the history trim from O(n²) to O(n)
(orders of magnitude faster on large inputs); **lznt1** bulk match-copy.
- **lha / rar1–5 / zip-implode·reduce·shrink / arc-crunch·squash**: bulk
LZSS/LZW window copy; **delta** filter encode ~15× (auto-vectorized);
**hpack** byte-wide Huffman decode.
- **bzip2** encode: reduced SA-IS suffix-array allocations and in-place
recursion (+14–31% on the BWT build, the dominant encode cost).
- **checksum**: CRC-32 slice-by-8 (~4×); **rle90** bulk literal copy (~3.5×).

## [0.6.1](https://github.com/KarpelesLab/compcol/compare/v0.6.0...v0.6.1) - 2026-06-12

### Other
Expand Down
57 changes: 40 additions & 17 deletions src/arc_crunch/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
//!
//! Crafted streams never panic: the classic LZW KwKwK case and any
//! out-of-range / not-yet-assigned code return [`Error::Corrupt`]; the
//! dictionary and the decoded-string stack are bounded by `1 << maxbits`;
//! dictionary and the decoded-string scratch are bounded by `1 << maxbits`;
//! every dictionary index is bounds-checked and width arithmetic is checked.
//!
//! ## References
Expand Down Expand Up @@ -386,7 +386,7 @@ pub struct Decoder {
/// Decoded characters waiting to flush, forward order.
emit_buf: Vec<u8>,
emit_head: usize,
/// Scratch stack used while reversing a decoded string.
/// Scratch buffer used while reversing a decoded string.
stack: Vec<u8>,
completed: bool,
}
Expand All @@ -408,7 +408,9 @@ impl Decoder {
finchar: 0,
emit_buf: Vec::new(),
emit_head: 0,
stack: Vec::with_capacity(max_size),
// Fixed-size reverse-assembly scratch: a decoded string is at most
// `1 << maxbits` ≤ `max_size` bytes, so its tail always fits.
stack: vec![0u8; max_size],
completed: false,
}
}
Expand Down Expand Up @@ -439,27 +441,48 @@ impl Decoder {
/// Decode `code` into `emit_buf` (forward order); updates `finchar`.
/// Returns `Err(Corrupt)` if the parent chain is malformed (too long or
/// out of range) — defends against crafted streams.
///
/// The chain is walked once, writing the string back-to-front into the
/// tail of the fixed-size `stack` scratch; the assembled forward-order
/// slice is then appended to `emit_buf` with a single
/// `extend_from_slice`. This replaces the previous per-byte push/pop
/// round trip with one bulk copy.
fn decode_string(&mut self, mut code: u32) -> Result<(), Error> {
self.stack.clear();
let limit = 1usize << self.maxbits;
let mut hops = 0usize;
// `stack` is a fixed-size scratch (length == 1 << MAX_BITS, allocated
// once). We walk the prefix chain writing the string back-to-front into
// its tail, then bulk-copy the assembled forward-order slice into
// `emit_buf` with a single vectorised `extend_from_slice`. This avoids
// both the old per-byte `emit_buf.push` (a capacity check per byte) and
// any per-call zero-initialisation.
// Fast path: a bare literal (very common on incompressible input) is a
// length-1 string — emit it directly and skip the reverse-assembly.
if code < 256 {
let first = code as u8;
self.finchar = first;
self.emit_buf.push(first);
return Ok(());
}
let scratch = &mut self.stack[..];
let mut i = scratch.len();
while code >= 256 {
if code as usize >= self.prefix.len() {
// `i` reaching 0 means the chain is longer than any valid string
// (> 1 << maxbits): a malformed / cyclic prefix table. Reject
// rather than underflow.
if code as usize >= self.prefix.len() || i == 0 {
return Err(Error::Corrupt);
}
self.stack.push(self.suffix[code as usize]);
i -= 1;
scratch[i] = self.suffix[code as usize];
code = self.prefix[code as usize] as u32;
hops += 1;
if hops > limit {
return Err(Error::Corrupt);
}
}
if i == 0 {
return Err(Error::Corrupt);
}
let first = code as u8;
self.finchar = first;
self.emit_buf.push(first);
while let Some(b) = self.stack.pop() {
self.emit_buf.push(b);
}
i -= 1;
scratch[i] = first;
self.emit_buf.extend_from_slice(&scratch[i..]);
Ok(())
}

Expand Down Expand Up @@ -621,7 +644,7 @@ impl RawDecoder for Decoder {
self.finchar = 0;
self.emit_buf.clear();
self.emit_head = 0;
self.stack.clear();
// `stack` is fixed-size scratch overwritten on every use; leave it.
self.completed = false;
}
}
47 changes: 30 additions & 17 deletions src/arc_squash/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
//!
//! Crafted streams never panic: the classic LZW KwKwK case and any
//! out-of-range / not-yet-assigned code return [`Error::Corrupt`]; the
//! dictionary and the decoded-string stack are bounded by `1 << 13`; every
//! dictionary and the decoded-string scratch are bounded by `1 << 13`; every
//! dictionary index is bounds-checked.
//!
//! ## References
Expand Down Expand Up @@ -347,7 +347,7 @@ pub struct Decoder {
/// Decoded characters waiting to flush, forward order.
emit_buf: Vec<u8>,
emit_head: usize,
/// Scratch stack used while reversing a decoded string.
/// Fixed-size scratch used while reversing a decoded string.
stack: Vec<u8>,
completed: bool,
}
Expand All @@ -366,7 +366,9 @@ impl Decoder {
finchar: 0,
emit_buf: Vec::new(),
emit_head: 0,
stack: Vec::with_capacity(max_size),
// Fixed-size reverse-assembly scratch: a decoded string is at most
// `MAX_CODE` bytes, so its tail always fits.
stack: vec![0u8; max_size],
completed: false,
}
}
Expand Down Expand Up @@ -396,26 +398,37 @@ impl Decoder {
/// Returns `Err(Corrupt)` if the parent chain is malformed (too long or
/// out of range) — defends against crafted streams.
fn decode_string(&mut self, mut code: u32) -> Result<(), Error> {
self.stack.clear();
let limit = MAX_CODE as usize;
let mut hops = 0usize;
// Fast path: a bare literal is a length-1 string — emit directly.
if code < 256 {
let first = code as u8;
self.finchar = first;
self.emit_buf.push(first);
return Ok(());
}
// Walk the prefix chain back-to-front into the fixed-size scratch, then
// bulk-copy the forward-order slice into emit_buf with one
// extend_from_slice. This avoids the old per-byte push/pop round trip
// (each output byte written twice).
let scratch = &mut self.stack[..];
let mut i = scratch.len();
while code >= 256 {
if code as usize >= self.prefix.len() {
// `i == 0` means the chain is longer than any valid string: a
// malformed / cyclic prefix table. Reject rather than underflow.
if code as usize >= self.prefix.len() || i == 0 {
return Err(Error::Corrupt);
}
self.stack.push(self.suffix[code as usize]);
i -= 1;
scratch[i] = self.suffix[code as usize];
code = self.prefix[code as usize] as u32;
hops += 1;
if hops > limit {
return Err(Error::Corrupt);
}
}
if i == 0 {
return Err(Error::Corrupt);
}
let first = code as u8;
self.finchar = first;
self.emit_buf.push(first);
while let Some(b) = self.stack.pop() {
self.emit_buf.push(b);
}
i -= 1;
scratch[i] = first;
self.emit_buf.extend_from_slice(&scratch[i..]);
Ok(())
}

Expand Down Expand Up @@ -544,7 +557,7 @@ impl RawDecoder for Decoder {
self.finchar = 0;
self.emit_buf.clear();
self.emit_head = 0;
self.stack.clear();
// `stack` is fixed-size scratch overwritten on every use; leave it.
self.completed = false;
}
}
55 changes: 48 additions & 7 deletions src/brotli/huffman.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ use crate::error::Error;

/// Primary-LUT width for the fast-path symbol lookup. Codes of length
/// ≤ `PRIMARY_BITS` resolve in O(1); longer codes fall back to the
/// per-bit walk.
const PRIMARY_BITS: u32 = 9;
/// per-bit walk. Brotli codes cap at length 15; an 11-bit table resolves
/// the vast majority of literal/distance symbols in one indexed load
/// (2048 u32 = 8 KiB per tree) while still fitting comfortably in L1.
const PRIMARY_BITS: u32 = 11;
const PRIMARY_SIZE: usize = 1 << PRIMARY_BITS;

/// Packed (symbol, length) entry in the primary LUT. The low 16 bits hold
Expand Down Expand Up @@ -219,13 +221,17 @@ impl HuffmanDecoder {
let max = self.max_length as u32;

// Fast path: peek PRIMARY_BITS bits, index the LUT, advance the
// bit position by the actual code length.
if br.remaining() >= PRIMARY_BITS as usize {
let idx = br.peek_bits(PRIMARY_BITS) as usize;
let entry = self.lut[idx];
// bit position by the actual code length. `peek_lut_bits` refills
// and returns however many bits (up to PRIMARY_BITS) are buffered;
// when the full window is available we resolve in O(1) and consume
// only the matched code length, keeping the rest of the
// accumulator intact for the next symbol.
let (peeked, avail) = br.peek_lut_bits(PRIMARY_BITS);
if avail >= PRIMARY_BITS {
let entry = self.lut[peeked as usize];
let len = entry >> LUT_LEN_SHIFT;
if len > 0 {
br.set_position(br.position() + len as usize);
br.consume(len);
return Ok(entry & LUT_SYM_MASK);
}
// Long code (> PRIMARY_BITS) -- fall through to the slow path.
Expand Down Expand Up @@ -307,6 +313,18 @@ impl<'a> BitSource<'a> {
self.nbits = 0;
}

/// Drop the `n` lowest buffered bits from the accumulator, advancing the
/// logical read position by `n` bits.
///
/// The caller must guarantee `n <= self.nbits`, e.g. by having just peeked
/// at least `n` bits. In contrast to `set_position`, the bits that remain
/// in `acc` stay buffered, so the hot Huffman fast path does not have to
/// refill after every decoded symbol.
#[inline]
pub(crate) fn consume(&mut self, n: u32) {
    // Precondition only checked in debug builds; see the doc comment.
    debug_assert!(n <= self.nbits);
    self.nbits -= n;
    self.acc >>= n;
}

/// Remaining bits available (still in `data` plus held in `acc`).
#[allow(dead_code)]
pub(crate) fn remaining(&self) -> usize {
Expand Down Expand Up @@ -364,6 +382,7 @@ impl<'a> BitSource<'a> {
/// Peek `n` bits (0 < n ≤ 32) without advancing. Caller must
/// guarantee `n <= remaining()`. Refills the internal accumulator if
/// fewer than `n` bits are buffered.
#[allow(dead_code)]
pub(crate) fn peek_bits(&mut self, n: u32) -> u32 {
debug_assert!(n > 0 && n <= 32);
debug_assert!(n as usize <= self.remaining());
Expand All @@ -378,6 +397,28 @@ impl<'a> BitSource<'a> {
}
}

/// Non-advancing peek of up to `n` bits (1..=32) for the Huffman LUT fast
/// path.
///
/// If fewer than `n` bits are buffered, `refill` is called once. The
/// result is `(bits, available)`, where `available = min(nbits, n)` and
/// `bits` carries the low `available` bits of the accumulator, LSB-first.
/// A return with `available < n` tells the caller to fall back to the
/// per-bit slow path. This method contains no assertion on a short tail
/// (unlike `peek_bits`), so it may be called when the stream is nearly
/// drained.
#[inline]
pub(crate) fn peek_lut_bits(&mut self, n: u32) -> (u32, u32) {
    if self.nbits < n {
        self.refill();
    }
    let avail = self.nbits.min(n);
    let bits = match avail {
        // Nothing buffered: report an empty window.
        0 => 0,
        // Partial window: keep only the low `avail` bits.
        1..=31 => (self.acc & ((1u64 << avail) - 1)) as u32,
        // 32 bits or more: truncation to u32 is the mask.
        _ => self.acc as u32,
    };
    (bits, avail)
}

/// Read `n` bits (0..=32) as a little-endian integer.
pub(crate) fn read_bits(&mut self, n: u32) -> Result<u32, Error> {
debug_assert!(n <= 32);
Expand Down
52 changes: 40 additions & 12 deletions src/brotli/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2157,6 +2157,11 @@ impl Decoder {
htree_d.push(Self::read_prefix_code(src, num_dist_codes)?);
}

// When there is a single literal Huffman tree the context map is
// all zeroes, so literal decoding can skip the per-byte context
// lookup entirely (the tree index is constant 0).
let single_literal_tree = ntreesl == 1;

// ─── decoding loop ───
let mut emitted: u32 = 0;
let mut block_type_l: u32 = 0;
Expand Down Expand Up @@ -2232,20 +2237,43 @@ impl Decoder {
let copy_len = COPY_BASE[copy_code as usize] + copy_extra;

// Emit `insert_len` literals.
for _ in 0..insert_len {
if emitted >= mlen {
return Err(Error::Corrupt);
if single_literal_tree {
// Single literal Huffman tree: the context map is all
// zeroes, so the per-byte context computation and the
// `cmapl` lookup are dead work — the tree index is always
// 0. (Block-type switches still drive `block_len_l`, but
// they never change which tree we use here.)
let tree = &htree_l[0];
for _ in 0..insert_len {
if emitted >= mlen {
return Err(Error::Corrupt);
}
maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l);
block_len_l -= 1;
let sym = tree.decode(src)?;
if sym > 255 {
return Err(Error::Corrupt);
}
self.emit_literal(sym as u8);
emitted += 1;
}
maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l);
block_len_l -= 1;
let cid = context::literal_context(cmodes[block_type_l as usize], self.p1, self.p2);
let tree_idx = cmapl[(64 * block_type_l + cid as u32) as usize] as usize;
let sym = htree_l[tree_idx].decode(src)?;
if sym > 255 {
return Err(Error::Corrupt);
} else {
for _ in 0..insert_len {
if emitted >= mlen {
return Err(Error::Corrupt);
}
maybe_switch!(block_len_l, block_type_l, prev_block_type_l, group_l);
block_len_l -= 1;
let cid =
context::literal_context(cmodes[block_type_l as usize], self.p1, self.p2);
let tree_idx = cmapl[(64 * block_type_l + cid as u32) as usize] as usize;
let sym = htree_l[tree_idx].decode(src)?;
if sym > 255 {
return Err(Error::Corrupt);
}
self.emit_literal(sym as u8);
emitted += 1;
}
self.emit_literal(sym as u8);
emitted += 1;
}

if emitted >= mlen {
Expand Down
Loading
Loading