Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
streaming XXH64 implementation; the decompressed output is hashed and checked
against the 4-byte frame trailer, reporting `ChecksumMismatch` on corruption.

### Changed

- *(brotli)* much faster encode on low-redundancy input. The literal-context
histogram clustering was O(contexts³ · 256) — it rescanned every cluster pair
and recomputed each cluster's cost from scratch on every merge — which blew up
on dense histograms (e.g. random/incompressible data: ~37k instructions per
byte). It now caches per-cluster costs and the pairwise-delta matrix and
updates only the merged cluster each round. The merge sequence, and therefore
the compressed output, is byte-for-byte identical; encode of incompressible
input is ~8× faster.
- *(zstd)* faster encode, especially on low-match input, with equal-or-better
ratio. The match finder's hash table was a fixed 64 Ki buckets over an up-to
8 MiB window (load factor in the hundreds), so every probe walked a full chain
of useless far links; it is now sized to the window. The per-block match index
is also built incrementally — the chains persist across blocks instead of
re-indexing all of history every block (which was O(history) per block, i.e.
quadratic over a stream). Output is unchanged on single-block inputs and
equal-or-smaller on multi-block inputs (no ratio regression observed).

### Fixed

- *(decoder bridge)* a decoder that buffers a whole block internally (notably
Expand Down
54 changes: 45 additions & 9 deletions src/brotli/encoder_ctx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,22 +159,49 @@ pub(crate) fn cluster(
}
}

// Agglomerative clustering. The naive form recomputes every pair's merge
// delta — including each cluster's own `histogram_bits` — on every iteration,
// which is O(active³ · 256) and blows up on dense histograms (e.g. random
// input, where every context spans all 256 symbols). Instead cache each
// cluster's self-cost and the pairwise deltas, keyed by stable cluster id,
// and after each merge recompute only the merged cluster's row. The merge
// sequence — and therefore the resulting model and compressed output — is
// byte-for-byte identical to the naive version; only redundant work is cut.
let mut self_bits = alloc::vec![0u64; NUM_CONTEXTS];
for &c in &active {
self_bits[c] = histogram_bits(&histograms[c], totals[c]);
}
// `delta[ci][cj]` for `ci < cj`; valid only for currently-active pairs.
let mut delta = alloc::vec![alloc::vec![0i64; NUM_CONTEXTS]; NUM_CONTEXTS];
let pair_delta = |ci: usize, cj: usize, sb: &[u64], hs: &[[u32; 256]], ts: &[u32]| -> i64 {
let bm = merged_bits(&hs[ci], ts[ci], &hs[cj], ts[cj]);
bm as i64 - sb[ci] as i64 - sb[cj] as i64 - HEADER_COST_BITS as i64
};
for ai in 0..active.len() {
for aj in (ai + 1)..active.len() {
let (ci, cj) = (active[ai], active[aj]);
delta[ci][cj] = pair_delta(ci, cj, &self_bits, &histograms, &totals);
}
}

while active.len() > 1 {
let force = active.len() > max_trees;
let mut best_i = 0usize;
let mut best_j = 0usize;
let mut best_delta: i64 = i64::MAX;
// Same scan order and strict `<` tie-break as the naive loop, so the
// chosen pair is identical — but now a cheap matrix lookup, not a
// 256-symbol recomputation.
for ai in 0..active.len() {
for aj in (ai + 1)..active.len() {
let ci = active[ai];
let cj = active[aj];
let bi = histogram_bits(&histograms[ci], totals[ci]);
let bj = histogram_bits(&histograms[cj], totals[cj]);
let bm = merged_bits(&histograms[ci], totals[ci], &histograms[cj], totals[cj]);
// Merging trades a header allowance against extra data bits.
let delta = bm as i64 - bi as i64 - bj as i64 - HEADER_COST_BITS as i64;
if delta < best_delta {
best_delta = delta;
let (ci, cj) = (active[ai], active[aj]);
let d = if ci < cj {
delta[ci][cj]
} else {
delta[cj][ci]
};
if d < best_delta {
best_delta = d;
best_i = ai;
best_j = aj;
}
Expand All @@ -197,6 +224,15 @@ pub(crate) fn cluster(
}
}
active.swap_remove(best_j);
// Only the merged cluster `ci`'s costs changed; refresh its self-cost
// and its delta against every other surviving cluster.
self_bits[ci] = histogram_bits(&histograms[ci], totals[ci]);
for &ck in &active {
if ck != ci {
let (lo, hi) = if ci < ck { (ci, ck) } else { (ck, ci) };
delta[lo][hi] = pair_delta(lo, hi, &self_bits, &histograms, &totals);
}
}
}

// Compress cluster ids to a dense 0..num_trees range.
Expand Down
24 changes: 16 additions & 8 deletions src/zstd/encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,14 +307,22 @@ impl Encoder {
let buffer = buffer.as_slice();
let buf_len = buffer.len();

// Rebuild the chains for this buffer and pre-index only the retained
// history (`[0, start)`). Each parser then splices in the *current
// block's* positions lazily as it advances, so the hash chains never
// contain positions ahead of the probe — the standard LZ invariant that
// keeps match finding correct and the depth budget meaningful. Indexing
// history up front is what enables cross-block back-references.
self.matcher.resize_for(buf_len);
for i in 0..start.min(buf_len.saturating_sub(3)) {
// Pre-index the retained history (`[0, start)`) so cross-block
// back-references are findable; each parser then splices in the
// *current block's* positions lazily as it advances, preserving the LZ
// invariant that the chains never contain positions ahead of the probe.
//
// The chains persist across blocks (the history prefix is byte-stable
// until the window is trimmed), so we only index the positions not
// already indexed by earlier blocks — `[inserted_upto, start)`. The old
// code re-indexed all of history every block, which is O(history) per
// block and quadratic over a stream; this makes it amortised O(input).
// `prepare_incremental` keeps the existing chains (rebuilding only on a
// head-size change); window trims call `resize_for`, which resets the
// high-water so the next block re-indexes from scratch.
self.matcher.prepare_incremental(buf_len);
let index_to = start.min(buf_len.saturating_sub(3));
for i in self.matcher.inserted_upto()..index_to {
self.matcher.insert(buffer, i);
}

Expand Down
103 changes: 84 additions & 19 deletions src/zstd/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
//! - `Match { length, distance }` returned by value, with `MIN_MATCH = 3`
//! (zstd's minimum) and a generous `MAX_MATCH` cap.

use alloc::boxed::Box;

/// Minimum match length the matcher will report (RFC 8478 §3.1.1.3.2 implies
/// a hard minimum of 3 via the match-length base table).
pub const MIN_MATCH: usize = 3;
Expand All @@ -30,9 +28,15 @@ pub const MIN_MATCH: usize = 3;
/// periodicity at distance ~445 bytes): each long match amortises the
/// per-sequence FSE-table cost across thousands more output bytes.
pub const MAX_MATCH: usize = 65535;
/// Hash table size (must be a power of two).
const HASH_BITS: u32 = 15;
const HASH_SIZE: usize = 1 << HASH_BITS;
/// Minimum hash-table size (power of two). The table is sized to the indexed
/// buffer at construction / `resize_for` time and floored here for tiny inputs.
const HASH_MIN_BITS: u32 = 15;
/// Upper bound on the hash table (4 Mi buckets = 16 MiB). The matcher indexes
/// up to an 8 MiB history; a fixed small table would give that window a load
/// factor in the hundreds, so on low-match input every probe walked the full
/// `max_chain` of useless far-distance links. Sizing the table to the buffer
/// keeps chains short (the same reason liblzma sizes its hash to the dict).
const HASH_MAX_BITS: u32 = 22;
/// "Empty" marker in the hash table.
const NIL: u32 = u32::MAX;

Expand All @@ -46,31 +50,86 @@ pub struct Match {

/// Per-block matcher state.
pub struct MatchFinder {
head: Box<[u32; HASH_SIZE]>,
head: Vec<u32>,
/// Right-shift applied to the 32-bit hash to land in `head`; `32 - log2(len)`.
head_shift: u32,
/// Linked-list chain `prev[pos]` = position of the previous occurrence of
/// the same 4-byte prefix.
prev: Vec<u32>,
/// Number of leading positions already spliced into the chains. The chains
/// persist across blocks (the buffer prefix is byte-stable until the window
/// is trimmed), so each block only needs to insert positions `>= this`
/// rather than re-indexing all of history — turning the per-block O(history)
/// rebuild (quadratic over a stream) into amortised O(input).
inserted_upto: usize,
}

use alloc::vec;
use alloc::vec::Vec;

/// Hash function over four bytes. A multiplicative hash with a prime
/// multiplier gives reasonable distribution and is cheap to compute.
/// Full-width multiplicative hash over four bytes. The caller takes the top
/// `head` bits via `head_shift`; the high bits of a golden-ratio multiply are
/// the well-distributed ones.
#[inline]
fn hash4(b: &[u8]) -> u32 {
let v = (b[0] as u32) | ((b[1] as u32) << 8) | ((b[2] as u32) << 16) | ((b[3] as u32) << 24);
// 0x9E3779B1 = golden-ratio multiplier; high bits are the well-distributed ones.
v.wrapping_mul(0x9E37_79B1) >> (32 - HASH_BITS)
v.wrapping_mul(0x9E37_79B1)
}

/// Computes the hash-head geometry for indexing `buffer_len` bytes.
///
/// Returns `(head_len, head_shift)`:
/// - `head_len` is `buffer_len` rounded up to a power of two, with the
///   exponent limited to the range `HASH_MIN_BITS..=HASH_MAX_BITS`;
/// - `head_shift` is `32 - log2(head_len)`, i.e. the right-shift that keeps
///   the top `log2(head_len)` bits of a 32-bit hash.
fn head_params(buffer_len: usize) -> (usize, u32) {
    // Exponent of the smallest power of two that is >= `buffer_len`.
    let ceil_log2 = buffer_len.next_power_of_two().trailing_zeros();
    // Floor for tiny inputs, ceiling for very large windows.
    let bits = ceil_log2.max(HASH_MIN_BITS).min(HASH_MAX_BITS);
    let head_len = 1usize << bits;
    let head_shift = 32 - bits;
    (head_len, head_shift)
}

impl MatchFinder {
pub fn new(buffer_len: usize) -> Self {
let (head_len, head_shift) = head_params(buffer_len);
Self {
head: Box::new([NIL; HASH_SIZE]),
head: vec![NIL; head_len],
head_shift,
prev: vec![NIL; buffer_len.max(1)],
inserted_upto: 0,
}
}

/// Returns the high-water mark of positions spliced into the chains.
///
/// `insert` raises this value to `pos + 1` whenever it records a position at
/// or beyond the current mark. `resize_for` resets it to 0, and so does
/// `prepare_incremental` when it rebuilds the head table. The encoder uses it
/// as the starting point for pre-indexing history, so positions below it are
/// treated as already indexed.
#[inline]
pub fn inserted_upto(&self) -> usize {
self.inserted_upto
}

/// Get the matcher ready for a buffer of `buffer_len` bytes while keeping the
/// chains that earlier blocks already built for the unchanged prefix.
///
/// Two things happen:
/// 1. The per-position `prev` array is grown (never shrunk) to cover
///    `buffer_len`; existing entries are kept and new slots start as `NIL`.
/// 2. If the head-table size wanted for `buffer_len` differs from the current
///    one, the head table is rebuilt empty at the new size and
///    `inserted_upto` is reset to 0, so the caller re-indexes the prefix for
///    that round. Otherwise the head table and high-water mark are untouched.
///
/// Use [`resize_for`](Self::resize_for) instead when the window is trimmed
/// and absolute positions shift.
pub fn prepare_incremental(&mut self, buffer_len: usize) {
    // Inside this branch `buffer_len` exceeds the current length, so it is
    // at least 1 and can be used as the new length directly.
    if buffer_len > self.prev.len() {
        self.prev.resize(buffer_len, NIL);
    }

    let (wanted_len, wanted_shift) = head_params(buffer_len);
    if wanted_len == self.head.len() {
        // Same geometry: the existing chains remain valid.
        return;
    }

    // Bucket indices depend on the shift, so a size change invalidates every
    // head entry; start from an empty table and re-index from position 0.
    self.head.clear();
    self.head.resize(wanted_len, NIL);
    self.head_shift = wanted_shift;
    self.inserted_upto = 0;
}

/// Maps the four bytes at the start of `b` to their slot in `head`.
///
/// Takes the full-width `hash4` value and keeps only its top bits by shifting
/// right by `head_shift`.
#[inline]
fn bucket(&self, b: &[u8]) -> usize {
    let full_hash = hash4(b);
    let slot = full_hash >> self.head_shift;
    slot as usize
}

/// Forget every position recorded so far. The buffer length stays the
/// same. Not currently called — the encoder uses `prepare_incremental` for
/// each new block and [`MatchFinder::resize_for`] after a window trim — but
/// kept for completeness / future tuning.
Expand All @@ -89,20 +148,26 @@ impl MatchFinder {
pub fn resize_for(&mut self, buffer_len: usize) {
self.prev.clear();
self.prev.resize(buffer_len.max(1), NIL);
for h in self.head.iter_mut() {
*h = NIL;
}
let (head_len, head_shift) = head_params(buffer_len);
self.head_shift = head_shift;
self.head.clear();
self.head.resize(head_len, NIL);
self.inserted_upto = 0;
}

/// Record `buffer[pos..pos+4]`.
/// Record `buffer[pos..pos+4]`. Positions must be inserted in increasing
/// order (the standard LZ invariant); `inserted_upto` tracks the high-water
/// so later blocks can skip what is already indexed.
pub fn insert(&mut self, buffer: &[u8], pos: usize) {
if pos + 4 > buffer.len() {
return;
}
let h = hash4(&buffer[pos..pos + 4]) as usize;
// Safety: head is fixed size HASH_SIZE, h < HASH_SIZE.
let h = self.bucket(&buffer[pos..pos + 4]);
self.prev[pos] = self.head[h];
self.head[h] = pos as u32;
if pos + 1 > self.inserted_upto {
self.inserted_upto = pos + 1;
}
}

/// Find the longest match for `buffer[pos..]` against any earlier
Expand All @@ -126,7 +191,7 @@ impl MatchFinder {
// Can't compute the 4-byte hash; just fail (rare; near end of buf).
return None;
}
let h = hash4(&buffer[pos..pos + 4]) as usize;
let h = self.bucket(&buffer[pos..pos + 4]);
let max_dist = window.min(pos);
let max_len = MAX_MATCH.min(buffer.len() - pos);
if max_len < MIN_MATCH {
Expand Down Expand Up @@ -225,7 +290,7 @@ impl MatchFinder {
if pos + MIN_MATCH > buffer.len() || pos + 4 > buffer.len() {
return;
}
let h = hash4(&buffer[pos..pos + 4]) as usize;
let h = self.bucket(&buffer[pos..pos + 4]);
let max_dist = window.min(pos);
let max_len = MAX_MATCH.min(buffer.len() - pos);
if max_len < MIN_MATCH {
Expand Down
Loading