diff --git a/CHANGELOG.md b/CHANGELOG.md index e708c01..0e4f2bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 streaming XXH64 implementation; the decompressed output is hashed and checked against the 4-byte frame trailer, reporting `ChecksumMismatch` on corruption. +### Changed + +- *(brotli)* much faster encode on low-redundancy input. The literal-context + histogram clustering was O(contexts³ · 256) — it rescanned every cluster pair + and recomputed each cluster's cost from scratch on every merge — which blew up + on dense histograms (e.g. random/incompressible data: ~37k instructions per + byte). It now caches per-cluster costs and the pairwise-delta matrix and + updates only the merged cluster each round. The merge sequence, and therefore + the compressed output, is byte-for-byte identical; encode of incompressible + input is ~8× faster. +- *(zstd)* faster encode, especially on low-match input, with equal-or-better + ratio. The match finder's hash table was a fixed 64 Ki buckets over an up-to + 8 MiB window (load factor in the hundreds), so every probe walked a full chain + of useless far links; it is now sized to the window. The per-block match index + is also built incrementally — the chains persist across blocks instead of + re-indexing all of history every block (which was O(history) per block, i.e. + quadratic over a stream). Output is unchanged on single-block inputs and + equal-or-smaller on multi-block inputs (no ratio regression observed). + ### Fixed - *(decoder bridge)* a decoder that buffers a whole block internally (notably diff --git a/src/brotli/encoder_ctx.rs b/src/brotli/encoder_ctx.rs index 3cc806f..4b69eb1 100644 --- a/src/brotli/encoder_ctx.rs +++ b/src/brotli/encoder_ctx.rs @@ -159,22 +159,49 @@ pub(crate) fn cluster( } } + // Agglomerative clustering. 
The naive form recomputes every pair's merge + // delta — including each cluster's own `histogram_bits` — on every iteration, + // which is O(active³ · 256) and blows up on dense histograms (e.g. random + // input, where every context spans all 256 symbols). Instead cache each + // cluster's self-cost and the pairwise deltas, keyed by stable cluster id, + // and after each merge recompute only the merged cluster's row. The merge + // sequence — and therefore the resulting model and compressed output — is + // byte-for-byte identical to the naive version; only redundant work is cut. + let mut self_bits = alloc::vec![0u64; NUM_CONTEXTS]; + for &c in &active { + self_bits[c] = histogram_bits(&histograms[c], totals[c]); + } + // `delta[ci][cj]` for `ci < cj`; valid only for currently-active pairs. + let mut delta = alloc::vec![alloc::vec![0i64; NUM_CONTEXTS]; NUM_CONTEXTS]; + let pair_delta = |ci: usize, cj: usize, sb: &[u64], hs: &[[u32; 256]], ts: &[u32]| -> i64 { + let bm = merged_bits(&hs[ci], ts[ci], &hs[cj], ts[cj]); + bm as i64 - sb[ci] as i64 - sb[cj] as i64 - HEADER_COST_BITS as i64 + }; + for ai in 0..active.len() { + for aj in (ai + 1)..active.len() { + let (ci, cj) = (active[ai], active[aj]); + delta[ci][cj] = pair_delta(ci, cj, &self_bits, &histograms, &totals); + } + } + while active.len() > 1 { let force = active.len() > max_trees; let mut best_i = 0usize; let mut best_j = 0usize; let mut best_delta: i64 = i64::MAX; + // Same scan order and strict `<` tie-break as the naive loop, so the + // chosen pair is identical — but now a cheap matrix lookup, not a + // 256-symbol recomputation. for ai in 0..active.len() { for aj in (ai + 1)..active.len() { - let ci = active[ai]; - let cj = active[aj]; - let bi = histogram_bits(&histograms[ci], totals[ci]); - let bj = histogram_bits(&histograms[cj], totals[cj]); - let bm = merged_bits(&histograms[ci], totals[ci], &histograms[cj], totals[cj]); - // Merging trades a header allowance against extra data bits. 
- let delta = bm as i64 - bi as i64 - bj as i64 - HEADER_COST_BITS as i64; - if delta < best_delta { - best_delta = delta; + let (ci, cj) = (active[ai], active[aj]); + let d = if ci < cj { + delta[ci][cj] + } else { + delta[cj][ci] + }; + if d < best_delta { + best_delta = d; best_i = ai; best_j = aj; } @@ -197,6 +224,15 @@ pub(crate) fn cluster( } } active.swap_remove(best_j); + // Only the merged cluster `ci`'s costs changed; refresh its self-cost + // and its delta against every other surviving cluster. + self_bits[ci] = histogram_bits(&histograms[ci], totals[ci]); + for &ck in &active { + if ck != ci { + let (lo, hi) = if ci < ck { (ci, ck) } else { (ck, ci) }; + delta[lo][hi] = pair_delta(lo, hi, &self_bits, &histograms, &totals); + } + } } // Compress cluster ids to a dense 0..num_trees range. diff --git a/src/zstd/encoder.rs b/src/zstd/encoder.rs index 9d8e27e..1c9ce1d 100644 --- a/src/zstd/encoder.rs +++ b/src/zstd/encoder.rs @@ -307,14 +307,22 @@ impl Encoder { let buffer = buffer.as_slice(); let buf_len = buffer.len(); - // Rebuild the chains for this buffer and pre-index only the retained - // history (`[0, start)`). Each parser then splices in the *current - // block's* positions lazily as it advances, so the hash chains never - // contain positions ahead of the probe — the standard LZ invariant that - // keeps match finding correct and the depth budget meaningful. Indexing - // history up front is what enables cross-block back-references. - self.matcher.resize_for(buf_len); - for i in 0..start.min(buf_len.saturating_sub(3)) { + // Pre-index the retained history (`[0, start)`) so cross-block + // back-references are findable; each parser then splices in the + // *current block's* positions lazily as it advances, preserving the LZ + // invariant that the chains never contain positions ahead of the probe. 
+ // + // The chains persist across blocks (the history prefix is byte-stable + // until the window is trimmed), so we only index the positions not + // already indexed by earlier blocks — `[inserted_upto, start)`. The old + // code re-indexed all of history every block, which is O(history) per + // block and quadratic over a stream; this makes it amortised O(input). + // `prepare_incremental` keeps the existing chains (rebuilding only on a + // head-size change); window trims call `resize_for`, which resets the + // high-water so the next block re-indexes from scratch. + self.matcher.prepare_incremental(buf_len); + let index_to = start.min(buf_len.saturating_sub(3)); + for i in self.matcher.inserted_upto()..index_to { self.matcher.insert(buffer, i); } diff --git a/src/zstd/matcher.rs b/src/zstd/matcher.rs index d31fd1c..7861d54 100644 --- a/src/zstd/matcher.rs +++ b/src/zstd/matcher.rs @@ -16,8 +16,6 @@ //! - `Match { length, distance }` returned by value, with `MIN_MATCH = 3` //! (zstd's minimum) and a generous `MAX_MATCH` cap. -use alloc::boxed::Box; - /// Minimum match length the matcher will report (RFC 8478 §3.1.1.3.2 implies /// a hard minimum of 3 via the match-length base table). pub const MIN_MATCH: usize = 3; @@ -30,9 +28,15 @@ pub const MIN_MATCH: usize = 3; /// periodicity at distance ~445 bytes): each long match amortises the /// per-sequence FSE-table cost across thousands more output bytes. pub const MAX_MATCH: usize = 65535; -/// Hash table size (must be a power of two). -const HASH_BITS: u32 = 15; -const HASH_SIZE: usize = 1 << HASH_BITS; +/// Minimum hash-table size (power of two). The table is sized to the indexed +/// buffer at construction / `resize_for` time and floored here for tiny inputs. +const HASH_MIN_BITS: u32 = 15; +/// Upper bound on the hash table (4 Mi buckets = 16 MiB). 
The matcher indexes +/// up to an 8 MiB history; a fixed small table would give that window a load +/// factor in the hundreds, so on low-match input every probe walked the full +/// `max_chain` of useless far-distance links. Sizing the table to the buffer +/// keeps chains short (the same reason liblzma sizes its hash to the dict). +const HASH_MAX_BITS: u32 = 22; /// "Empty" marker in the hash table. const NIL: u32 = u32::MAX; @@ -46,31 +50,86 @@ pub struct Match { /// Per-block matcher state. pub struct MatchFinder { - head: Box<[u32; HASH_SIZE]>, + head: Vec<u32>, + /// Right-shift applied to the 32-bit hash to land in `head`; `32 - log2(len)`. + head_shift: u32, /// Linked-list chain `prev[pos]` = position of the previous occurrence of /// the same 4-byte prefix. prev: Vec<u32>, + /// Number of leading positions already spliced into the chains. The chains + /// persist across blocks (the buffer prefix is byte-stable until the window + /// is trimmed), so each block only needs to insert positions `>= this` + /// rather than re-indexing all of history — turning the per-block O(history) + /// rebuild (quadratic over a stream) into amortised O(input). + inserted_upto: usize, } use alloc::vec; use alloc::vec::Vec; -/// Hash function over four bytes. A multiplicative hash with a prime -/// multiplier gives reasonable distribution and is cheap to compute. +/// Full-width multiplicative hash over four bytes. The caller takes the top +/// `head` bits via `head_shift`; the high bits of a golden-ratio multiply are +/// the well-distributed ones. +#[inline] fn hash4(b: &[u8]) -> u32 { let v = (b[0] as u32) | ((b[1] as u32) << 8) | ((b[2] as u32) << 16) | ((b[3] as u32) << 24); - // 0x9E3779B1 = golden-ratio multiplier; high bits are the well-distributed ones.
- v.wrapping_mul(0x9E37_79B1) >> (32 - HASH_BITS) + v.wrapping_mul(0x9E37_79B1) +} + +/// `(head_len, head_shift)` for a buffer of `buffer_len` bytes: the table is the +/// buffer size rounded up to a power of two, clamped to `[HASH_MIN_BITS, +/// HASH_MAX_BITS]`, so the average chain length stays O(1). +fn head_params(buffer_len: usize) -> (usize, u32) { + let bits = buffer_len + .next_power_of_two() + .trailing_zeros() + .clamp(HASH_MIN_BITS, HASH_MAX_BITS); + (1usize << bits, 32 - bits) } impl MatchFinder { pub fn new(buffer_len: usize) -> Self { + let (head_len, head_shift) = head_params(buffer_len); Self { - head: Box::new([NIL; HASH_SIZE]), + head: vec![NIL; head_len], + head_shift, prev: vec![NIL; buffer_len.max(1)], + inserted_upto: 0, } } + /// How many leading positions are already in the chains. + #[inline] + pub fn inserted_upto(&self) -> usize { + self.inserted_upto + } + + /// Prepare to index a buffer of `buffer_len` bytes *incrementally*, keeping + /// the chains built for the byte-stable prefix from earlier blocks. Grows + /// the per-position array (preserving entries) and only rebuilds the head + /// table when the ideal size changes (a power-of-two growth, O(log input) + /// times total) — a rebuild resets `inserted_upto` so the caller re-indexes + /// the prefix that round. Use [`resize_for`](Self::resize_for) instead when + /// the window is trimmed and absolute positions shift. + pub fn prepare_incremental(&mut self, buffer_len: usize) { + if self.prev.len() < buffer_len { + self.prev.resize(buffer_len.max(1), NIL); + } + let (head_len, head_shift) = head_params(buffer_len); + if head_len != self.head.len() { + self.head.clear(); + self.head.resize(head_len, NIL); + self.head_shift = head_shift; + self.inserted_upto = 0; + } + } + + /// Bucket index for the 4 bytes at `b`. + #[inline] + fn bucket(&self, b: &[u8]) -> usize { + (hash4(b) >> self.head_shift) as usize + } + /// Forget every position recorded so far. 
The buffer length stays the /// same. Not currently called — [`MatchFinder::resize_for`] is used on /// each new block — but kept for completeness / future tuning. @@ -89,20 +148,26 @@ impl MatchFinder { pub fn resize_for(&mut self, buffer_len: usize) { self.prev.clear(); self.prev.resize(buffer_len.max(1), NIL); - for h in self.head.iter_mut() { - *h = NIL; - } + let (head_len, head_shift) = head_params(buffer_len); + self.head_shift = head_shift; + self.head.clear(); + self.head.resize(head_len, NIL); + self.inserted_upto = 0; } - /// Record `buffer[pos..pos+4]`. + /// Record `buffer[pos..pos+4]`. Positions must be inserted in increasing + /// order (the standard LZ invariant); `inserted_upto` tracks the high-water + /// so later blocks can skip what is already indexed. pub fn insert(&mut self, buffer: &[u8], pos: usize) { if pos + 4 > buffer.len() { return; } - let h = hash4(&buffer[pos..pos + 4]) as usize; - // Safety: head is fixed size HASH_SIZE, h < HASH_SIZE. + let h = self.bucket(&buffer[pos..pos + 4]); self.prev[pos] = self.head[h]; self.head[h] = pos as u32; + if pos + 1 > self.inserted_upto { + self.inserted_upto = pos + 1; + } } /// Find the longest match for `buffer[pos..]` against any earlier @@ -126,7 +191,7 @@ impl MatchFinder { // Can't compute the 4-byte hash; just fail (rare; near end of buf). return None; } - let h = hash4(&buffer[pos..pos + 4]) as usize; + let h = self.bucket(&buffer[pos..pos + 4]); let max_dist = window.min(pos); let max_len = MAX_MATCH.min(buffer.len() - pos); if max_len < MIN_MATCH { @@ -225,7 +290,7 @@ impl MatchFinder { if pos + MIN_MATCH > buffer.len() || pos + 4 > buffer.len() { return; } - let h = hash4(&buffer[pos..pos + 4]) as usize; + let h = self.bucket(&buffer[pos..pos + 4]); let max_dist = window.min(pos); let max_len = MAX_MATCH.min(buffer.len() - pos); if max_len < MIN_MATCH {