diff --git a/CHANGELOG.md b/CHANGELOG.md index 99a793e..be7b6fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Performance + +- **Round 2 of encoder ratio + codec speed work** (encoder-only for ratio; + decoders unchanged and every format still decodes byte-for-byte with its + reference tool). Ratios on a 2.9 MB real-source corpus, our max level vs the + reference's max (`ours/ref`, lower is better): + - **xz / lzma2**: 1.51 → **1.10** — the LZMA2 chunk encoder now keeps the LZ + dictionary **continuous across chunks** (emits `0xC0` continue-dict control + bytes after the first `0xE0`, with a single match-finder spanning the whole + input) instead of resetting every 64 KiB. Closes nearly all of the gap to + the `.lzma` path (1.07). Also fixes the raw-LZMA2 decoder to feed + uncompressed (stored) chunks into the dictionary. + - **zstd**: 1.40 → **1.04 vs `zstd -19`** at max level (now beats `zstd -12`) + — cross-block matching over a retained sliding window (≤8 MiB, within the + advertised window) plus a two-pass, statistics-driven optimal parse + (btultra2-style repricing) and repeat-offset-aware DP pricing. + - **lz4**: 1.18 → **1.02** (frame, `-l 12` beats `lz4 -9`) — implemented LZ4 + frame **block-linked** mode so matches reference up to 64 KiB of prior + blocks' output, not just the current block. +- **Standalone-codec encode throughput** (output byte-identical): + - **bwt** encode ~3× faster — replaced the prefix-doubling rotation sort with + linear-time SA-IS suffix-array construction. + - **mtf** encode ~2.3× faster (single-pass scan-and-shift); **rangecoder** + encode/decode ~+15% (tightened hot loops). + + Note: the xz/lzma2 and zstd encoders now buffer the whole input to drive a + single continuous match-finder (same memory profile the `.lzma` path already + had); higher levels trade encode time for ratio, decode speed is unchanged. + ## [0.6.3](https://github.com/KarpelesLab/compcol/compare/v0.6.2...v0.6.3) - 2026-06-15 ### Added diff --git a/src/bwt/mod.rs b/src/bwt/mod.rs index 2a51789..243d841 100644 --- a/src/bwt/mod.rs +++ b/src/bwt/mod.rs @@ -36,15 +36,19 @@ //! //! ## Forward transform //! -//! For each block we build the order of its cyclic rotations with a -//! prefix-doubling (Manber–Myers) sort — `O(n log n)` ranking rounds, each an -//! `O(n)` counting sort on the `(rank[i], rank[i + k])` pairs. From the sorted -//! rotation order `sa` (where `sa[r]` is the starting offset of the rotation -//! ranked `r`) the BWT last column is `L[r] = block[(sa[r] + n - 1) mod n]`, -//! and the primary index is the rank of the rotation that starts at offset 0. +//! For each block we build the order of its cyclic rotations by suffix-sorting +//! the doubled block `T + T + $` (sentinel `$` smaller than any byte) with +//! **SA-IS** (Suffix Array by Induced Sorting; Nong, Zhang & Chan, 2009), a +//! linear-time `O(n)` construction. From the sorted rotation order `sa` (where +//! `sa[r]` is the starting offset of the rotation ranked `r`) the BWT last +//! column is `L[r] = block[(sa[r] + n - 1) mod n]`, and the primary index is +//! the rank of the rotation that starts at offset 0. //! -//! The prefix-doubling sort is `O(n log n)` and handles the pathological cases -//! (all-equal bytes, long repeats) without degrading to `O(n² log n)`. +//! SA-IS is `O(n)` and handles the pathological cases (all-equal bytes, long +//! repeats) without degrading. A small KMP-based tie-break pass reorders any +//! run of *equal* cyclic rotations (only possible for fully periodic blocks) +//! by ascending offset, so the emitted `(L, primary)` pair is identical to +//! that of a stable cyclic-rotation sort. //! //! ## Inverse transform //! @@ -64,8 +68,9 @@ //! ## Licensing //! //! Clean-room from the published BWT algorithm description (Burrows & Wheeler, -//! 1994) and the textbook prefix-doubling rotation sort. No code was copied -//! from `src/bzip2/` or any third-party source. +//! 1994) and the published SA-IS suffix-array construction (Nong, Zhang & +//! Chan, 2009). Implemented independently within this module; no code was +//! copied from `src/bzip2/` or any third-party source. #![cfg_attr(docsrs, doc(cfg(feature = "bwt")))] diff --git a/src/bwt/transform.rs b/src/bwt/transform.rs index 7d9d205..8ace78b 100644 --- a/src/bwt/transform.rs +++ b/src/bwt/transform.rs @@ -45,114 +45,354 @@ pub(super) fn forward(block: &[u8]) -> (Vec, usize) { /// Sort the `n` cyclic rotations of `block` and return their starting offsets /// in sorted order (`sa[r]` = offset of the rotation ranked `r`). /// -/// Uses prefix doubling (Manber–Myers): start with rank = byte value, then -/// repeatedly refine ranks by the pair `(rank[i], rank[(i + k) mod n])` for -/// doubling `k`, until every rotation has a distinct rank (or `k >= n`). Each -/// round is an `O(n)` counting sort over the rank pairs, and there are -/// `O(log n)` rounds, so the whole thing is `O(n log n)`. +/// Sorting cyclic rotations is reduced to an ordinary suffix sort over the +/// **doubled** block `T + T` followed by a sentinel strictly smaller than any +/// byte. Suffix `i` of `T + T + $` for `0 <= i < n` begins with the i-th +/// cyclic rotation of `T` (its first `n` characters are exactly that +/// rotation), so the suffix-sort order restricted to those `n` positions is a +/// valid cyclic-rotation order. The suffix array itself is built with **SA-IS** +/// (Suffix Array by Induced Sorting; Nong, Zhang & Chan, IEEE TC 2009), a +/// linear-time `O(n)` algorithm — several times faster than the previous +/// `O(n log n)` prefix-doubling sort. +/// +/// For periodic `T` two rotations can be equal; any consistent tie-break +/// yields a valid BWT (the emitted `L` bytes are identical for tied rotations +/// anyway), and the inverse transform recovers the original regardless. fn sort_rotations(block: &[u8]) -> Vec { let n = block.len(); debug_assert!(n >= 1); - // Order array: the rotation offsets, to be permuted into sorted order. - let mut order: Vec = (0..n as u32).collect(); - if n == 1 { - return order; + return vec![0u32]; } - // `rank[i]` = current rank of the rotation starting at offset `i`. - // - // Initialise from the byte at each offset, but *densely*: map each - // distinct byte value to its 0-based order among the values that actually - // occur. This keeps every rank in `0..n` from the very first round, which - // is what the counting sort's bucket sizing (`n_keys = n`) relies on — - // raw byte values would reach 255 and overflow the buckets on small - // blocks. - let mut present = [false; 256]; + // Doubled text `T + T + sentinel`. Bytes map to 1..=256, sentinel to 0, + // giving an unsigned alphabet of size 257. `n` is bounded by the BWT + // block size (<= 64 MiB), so `2 * n + 1` stays well within `i32`. + let mut text: Vec = Vec::with_capacity(2 * n + 1); + for &b in block { + text.push(b as i32 + 1); + } for &b in block { - present[b as usize] = true; - } - let mut byte_rank = [0u32; 256]; - let mut next_rank = 0u32; - for (v, &seen) in present.iter().enumerate() { - if seen { - byte_rank[v] = next_rank; - next_rank += 1; + text.push(b as i32 + 1); + } + text.push(0); // sentinel + + let sa = sa_is(&text, 257); + + // Keep only suffixes that start in the first half (`< n`): those are the + // `n` cyclic rotations of `T`. The sentinel-only suffix sorts first and is + // skipped, as are the second-half suffixes (each dominated by the matching + // first-half rotation). + let mut order: Vec = Vec::with_capacity(n); + for &s in sa.iter() { + let s = s as usize; + if s < n { + order.push(s as u32); } } - let mut rank: Vec = block.iter().map(|&b| byte_rank[b as usize]).collect(); - // Scratch buffers reused across rounds. - let mut new_rank: Vec = vec![0; n]; - let mut tmp_order: Vec = vec![0; n]; - - let mut k = 1usize; - loop { - // Sort `order` by the key (rank[i], rank[(i + k) mod n]) using a - // stable two-pass counting sort (LSD radix on the two rank fields). - // Pass 1: by the second key, rank[(i + k) mod n]. - counting_sort_by(&order, &mut tmp_order, n, |i| { - rank[(i as usize + k) % n] as usize - }); - // Pass 2: by the first key, rank[i]. Stable, so ties keep the - // second-key order established above. - counting_sort_by(&tmp_order, &mut order, n, |i| rank[i as usize] as usize); - - // Recompute ranks from the freshly sorted order. Two adjacent - // rotations share a rank iff both key components are equal. - new_rank[order[0] as usize] = 0; - let mut r = 0u32; - for w in 1..n { - let prev = order[w - 1] as usize; - let cur = order[w] as usize; - let prev_key = (rank[prev], rank[(prev + k) % n]); - let cur_key = (rank[cur], rank[(cur + k) % n]); - if cur_key != prev_key { - r += 1; + debug_assert_eq!(order.len(), n); + + // Tie-break normalization. For periodic blocks several cyclic rotations + // can be *equal*; the doubled-suffix order breaks those ties by the + // distance to the sentinel (favouring larger offsets), whereas a stable + // rotation sort favours the smaller offset. Equal rotations share an + // identical last column, so reordering within a tied run never changes + // `L` — but it can change which row is the primary index. To keep the BWT + // output byte-identical to a stable cyclic sort, reorder each maximal run + // of equal rotations by ascending starting offset. + // + // Equal rotations are contiguous in `order`. We need the smallest period + // `p` of the block: rotation `a` equals rotation `b` iff `p` divides + // `b - a`. With `p` known, equality of adjacent rotations is an O(1) test + // and the whole normalization is O(n). + let period = smallest_period(block); + if period < n { + let mut i = 0usize; + while i < n { + // Extend a run while consecutive rotations are equal. With the + // block fully periodic, rotations are equal iff their starting + // offsets are congruent modulo `period`, an O(1) test. + let base = order[i] as usize % period; + let mut j = i + 1; + while j < n && order[j] as usize % period == base { + j += 1; + } + if j - i > 1 { + order[i..j].sort_unstable(); } - new_rank[cur] = r; + i = j; } - rank.copy_from_slice(&new_rank); + } + order +} - // All rotations distinct → fully sorted. - if r as usize == n - 1 { - break; +/// Smallest period `p` (1 <= p <= n) such that `block[k] == block[k + p]` for +/// all valid `k` *and* `p` divides `n` — i.e. `block` is `n/p` repetitions of +/// its length-`p` prefix. Returns `n` when the block is not periodic. Computed +/// in O(n) with the Knuth–Morris–Pratt failure function. +fn smallest_period(block: &[u8]) -> usize { + let n = block.len(); + if n <= 1 { + return n; + } + // KMP prefix-function (longest proper border length). + let mut fail = vec![0usize; n]; + let mut k = 0usize; + for i in 1..n { + while k > 0 && block[i] != block[k] { + k = fail[k - 1]; } - // Doubling. Once k >= n the second key spans the whole rotation, so - // one more round (already done above) suffices; guard anyway. - k <<= 1; - if k >= n { - break; + if block[i] == block[k] { + k += 1; } + fail[i] = k; } - order + let p = n - fail[n - 1]; + if n.is_multiple_of(p) { p } else { n } } -/// Stable counting sort of `src` (a permutation of rotation offsets) into -/// `dst`, keyed by `key(offset)` which must return a value in `0..n_keys`. -/// -/// `n_keys` is an upper bound on the key range; we use `n` (the block length) -/// since every rank lies in `0..n`. -fn counting_sort_by(src: &[u32], dst: &mut [u32], n_keys: usize, key: F) -where - F: Fn(u32) -> usize, -{ - let mut counts = vec![0usize; n_keys + 1]; - for &i in src { - counts[key(i)] += 1; - } - // Prefix sums → starting offset for each key bucket. - let mut acc = 0usize; - for c in counts.iter_mut() { - let cur = *c; - *c = acc; - acc += cur; - } - // Scatter, preserving input order within a bucket (stability). - for &i in src { - let slot = key(i); - dst[counts[slot]] = i; - counts[slot] += 1; +// ─── SA-IS suffix array construction ────────────────────────────────────── + +/// Build the suffix array of `text` over an integer alphabet whose largest +/// symbol is `< alphabet_size`. The text MUST end with a unique sentinel +/// strictly smaller than every other symbol (we use 0; real bytes are shifted +/// into `1..=256`). Returns an array of length `text.len()`, where `sa[i]` is +/// the start index of the i-th smallest suffix. Linear time, linear space, +/// pure safe Rust. +fn sa_is(text: &[i32], alphabet_size: usize) -> Vec { + let n = text.len(); + let mut sa = vec![-1i32; n]; + sa_is_inner(text, &mut sa, alphabet_size); + sa +} + +/// SA-IS core. Writes the suffix array into `sa` (length must equal +/// `text.len()`). +fn sa_is_inner(text: &[i32], sa: &mut [i32], alphabet_size: usize) { + let n = text.len(); + debug_assert_eq!(sa.len(), n); + + if n == 0 { + return; + } + if n == 1 { + sa[0] = 0; + return; + } + if n == 2 { + if text[0] < text[1] { + sa[0] = 0; + sa[1] = 1; + } else { + sa[0] = 1; + sa[1] = 0; + } + return; + } + + // 1. Classify each suffix as S-type (`t[i] == true`) or L-type. The last + // suffix (sentinel only) is S-type. Suffix i is S-type iff + // text[i] < text[i+1], or text[i] == text[i+1] and i+1 is S-type. + // Collect LMS positions (an S-type with an L-type left neighbour) while + // classifying. + let mut t = vec![false; n]; + t[n - 1] = true; + let mut lms_positions: Vec = Vec::new(); + for i in (0..n - 1).rev() { + let si = match text[i].cmp(&text[i + 1]) { + core::cmp::Ordering::Less => true, + core::cmp::Ordering::Equal => t[i + 1], + core::cmp::Ordering::Greater => false, + }; + t[i] = si; + if t[i + 1] && !si { + lms_positions.push((i + 1) as i32); + } + } + lms_positions.reverse(); + let n1 = lms_positions.len(); + + // 2. Bucket sizes (per-symbol counts in `text`). + let mut counts = vec![0i32; alphabet_size]; + for &c in text { + counts[c as usize] += 1; + } + let mut bucket = vec![0i32; alphabet_size]; + + // 3. Place LMS suffixes at the END of their buckets. + sa.fill(-1); + fill_bucket_ends(&counts, &mut bucket); + for &p in &lms_positions { + let c = text[p as usize] as usize; + bucket[c] -= 1; + sa[bucket[c] as usize] = p; + } + + // 4-5. Induced sort: L-suffixes (left-to-right), then S-suffixes + // (right-to-left). + induce_sort_l(text, sa, &t, &counts, &mut bucket); + induce_sort_s(text, sa, &t, &counts, &mut bucket); + + // 6. Compact the induced LMS suffixes to the front of `sa`, then name + // each by its LMS-substring identity. + let mut j1 = 0usize; + for i in 0..n { + if sa[i] >= 0 && is_lms(&t, sa[i] as usize) { + sa[j1] = sa[i]; + j1 += 1; + } + } + debug_assert_eq!(j1, n1); + for slot in sa.iter_mut().take(n).skip(n1) { + *slot = -1; + } + + let mut name: i32 = 0; + let mut prev: i32 = -1; + for i in 0..n1 { + let pos = sa[i] as usize; + let mut diff = false; + if prev == -1 { + diff = true; + } else { + let p = prev as usize; + let mut d = 0usize; + loop { + if pos + d >= n || p + d >= n { + diff = true; + break; + } + if text[pos + d] != text[p + d] || t[pos + d] != t[p + d] { + diff = true; + break; + } + if d > 0 && (is_lms(&t, pos + d) || is_lms(&t, p + d)) { + if is_lms(&t, pos + d) != is_lms(&t, p + d) { + diff = true; + } + break; + } + d += 1; + } + } + if diff { + name += 1; + prev = pos as i32; + } + sa[n1 + pos / 2] = name - 1; + } + let mut j = n - 1; + for i in (n1..n).rev() { + if sa[i] >= 0 { + sa[j] = sa[i]; + j -= 1; + } + } + + // 7. Solve the reduced problem (recursively if names collide). + let new_alpha = (name as usize) + 1; + let (sa1_area, t1_area) = sa.split_at_mut(n - n1); + if (name as usize) == n1 { + for (i, &name_of_pos) in t1_area.iter().enumerate() { + sa1_area[name_of_pos as usize] = i as i32; + } + } else { + let reduced_text: &[i32] = &t1_area[..n1]; + let sa1 = &mut sa1_area[..n1]; + sa_is_inner(reduced_text, sa1, new_alpha); + } + + // 8. Map the sorted reduced suffixes back to original LMS positions. + for slot in sa.iter_mut().take(n1) { + let idx = *slot as usize; + *slot = lms_positions[idx]; + } + for slot in sa.iter_mut().take(n).skip(n1) { + *slot = -1; + } + + // 9. Re-place the now-sorted LMS suffixes at bucket ends, then 10. final + // induced sorts. Snapshot the sorted LMS positions first so the + // scatter cannot clobber a not-yet-read entry. + let mut lms_sorted: Vec = Vec::with_capacity(n1); + lms_sorted.extend_from_slice(&sa[..n1]); + for slot in sa.iter_mut().take(n) { + *slot = -1; + } + fill_bucket_ends(&counts, &mut bucket); + for &pos in lms_sorted.iter().rev() { + let c = text[pos as usize] as usize; + bucket[c] -= 1; + sa[bucket[c] as usize] = pos; + } + + induce_sort_l(text, sa, &t, &counts, &mut bucket); + induce_sort_s(text, sa, &t, &counts, &mut bucket); +} + +/// `true` iff suffix `i` is S-type AND suffix `i-1` is L-type (left-most S in +/// a run). Suffix 0 is never an LMS in our convention. +#[inline(always)] +fn is_lms(t: &[bool], i: usize) -> bool { + i > 0 && t[i] && !t[i - 1] +} + +/// Materialise each bucket *start* (exclusive prefix sum of `counts`). +#[inline] +fn fill_bucket_starts(counts: &[i32], out: &mut [i32]) { + let mut acc = 0i32; + for (o, &c) in out.iter_mut().zip(counts.iter()) { + *o = acc; + acc += c; + } +} + +/// Materialise each bucket *end* (inclusive prefix sum of `counts`). +#[inline] +fn fill_bucket_ends(counts: &[i32], out: &mut [i32]) { + let mut acc = 0i32; + for (o, &c) in out.iter_mut().zip(counts.iter()) { + acc += c; + *o = acc; + } +} + +/// Induced sort of L-type suffixes (left-to-right scan over `sa`). +fn induce_sort_l(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32], bucket: &mut [i32]) { + let n = text.len(); + fill_bucket_starts(counts, bucket); + for i in 0..n { + let v = sa[i]; + if v <= 0 { + continue; + } + let j = (v as usize) - 1; + if !t[j] { + let c = text[j] as usize; + let slot = bucket[c]; + sa[slot as usize] = j as i32; + bucket[c] = slot + 1; + } + } +} + +/// Induced sort of S-type suffixes (right-to-left scan over `sa`). +fn induce_sort_s(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32], bucket: &mut [i32]) { + let n = text.len(); + fill_bucket_ends(counts, bucket); + for i in (0..n).rev() { + let v = sa[i]; + if v <= 0 { + continue; + } + let j = (v as usize) - 1; + if t[j] { + let c = text[j] as usize; + let slot = bucket[c] - 1; + bucket[c] = slot; + sa[slot as usize] = j as i32; + } } } diff --git a/src/lz4/block.rs b/src/lz4/block.rs index 7125eb2..124c4a7 100644 --- a/src/lz4/block.rs +++ b/src/lz4/block.rs @@ -230,15 +230,46 @@ pub fn encode_block(input: &[u8], out: &mut Vec) { /// The emitted bitstream is byte-for-byte a valid LZ4 block in every case — /// only the parse changes, so the reference `lz4` decoder reads it unchanged. pub fn encode_block_level(input: &[u8], out: &mut Vec, level: u8) { - if level < HC_LEVEL_THRESHOLD { - encode_block(input, out); + encode_block_level_dict(&[], input, out, level); +} + +/// Encode `input` as a single LZ4 block, allowing back-references into a +/// `dict` prefix (the tail of previously emitted output, up to 64 KiB). +/// +/// This is the LZ4 frame **linked-block** match finder: matches in the +/// current block may point back into `dict`, giving a sliding window that +/// crosses block boundaries. The emitted bitstream is an ordinary LZ4 block — +/// the offsets simply reach further back than the block's own length, which +/// the reference decoder resolves against the previously decoded output. +/// +/// `dict` should be at most `MAX_DISTANCE + 1` bytes (the caller carries a +/// 64 KiB sliding window so this holds); any byte of `dict` further than +/// `MAX_DISTANCE` from a match start is simply never referenced. Passing an +/// empty `dict` reproduces [`encode_block_level`] exactly. +/// +/// The end-of-block rules (last 5 bytes literal, last match starts ≥ 12 bytes +/// before the block end) apply to the **current block** boundary only — the +/// dict prefix never affects them. +pub fn encode_block_level_dict(dict: &[u8], input: &[u8], out: &mut Vec, level: u8) { + if dict.is_empty() { + if level < HC_LEVEL_THRESHOLD { + encode_block(input, out); + } else if level < OPT_LEVEL_THRESHOLD { + encode_block_hc_dict(&[], input, out, level); + } else { + encode_block_optimal_dict(&[], input, out, level); + } return; } + // With a dictionary we always use a match finder that understands the + // prefix. The fast greedy path would ignore the dict, so low levels fall + // through to the HC finder (at its shallowest depth) to still cross block + // boundaries while staying cheap. if level < OPT_LEVEL_THRESHOLD { - encode_block_hc(input, out, level); - return; + encode_block_hc_dict(dict, input, out, level); + } else { + encode_block_optimal_dict(dict, input, out, level); } - encode_block_optimal(input, out, level); } /// Map a compression level to a hash-chain search depth (`nb_attempts`). @@ -361,22 +392,49 @@ fn hc_resolve( /// candidate start it walks the chain up to `nb_attempts` links and keeps the /// longest match inside the 64 KiB window. A one-step lazy heuristic defers a /// match when the next position offers a strictly longer one. -fn encode_block_hc(input: &[u8], out: &mut Vec, level: u8) { +/// +/// `dict` is an optional prefix of previously emitted output that matches may +/// reference (LZ4 linked-block mode). Internally `dict` and `input` are parsed +/// over one combined buffer; only positions inside `input` may *start* a +/// sequence, but their back-references may reach into the `dict` region. +fn encode_block_hc_dict(dict: &[u8], input: &[u8], out: &mut Vec, level: u8) { out.clear(); if input.is_empty() { return; } + // The current block is too small to host any match under the end-of-block + // rules; emit it as literals regardless of the dictionary. if input.len() < MFLIMIT + 1 { emit_last_literals(input, out); return; } + let dict_len = dict.len(); + // Combined view: dict bytes occupy [0, dict_len), the current block occupies + // [dict_len, n). Matches can point anywhere behind a start position (subject + // to MAX_DISTANCE), so a match starting in the block can reach into dict. + // One contiguous allocation lets the existing finder address dict and block + // uniformly with plain offsets. + let buf: Vec; + let input: &[u8] = if dict_len == 0 { + input + } else { + let mut v = Vec::with_capacity(dict_len + input.len()); + v.extend_from_slice(dict); + v.extend_from_slice(input); + buf = v; + &buf + }; + let n = input.len(); let nb_attempts = nb_attempts_for_level(level); let mut head = alloc::vec![HASH_EMPTY; HC_HASH_TABLE_SIZE]; let mut chain = alloc::vec![HASH_EMPTY; n]; + // End-of-block limits are measured against the combined length `n`; since + // the current block sits at the tail, this keeps the last 5 / 12 bytes of + // the *block* literal exactly as the single-block path does. let match_limit = n - MFLIMIT; // last position a match may start at let hash_limit = n - MIN_MATCH - LAST_LITERALS; // last hashable position let forward_limit = n - LAST_LITERALS; // last 5 bytes stay literal @@ -386,8 +444,9 @@ fn encode_block_hc(input: &[u8], out: &mut Vec, level: u8) { // each position is inserted exactly once and the chain stays strictly // ordered newest-first. let mut inserted_through: usize = 0; - let mut anchor: usize = 0; - let mut ip: usize = 0; + // Sequences may only start inside the current block. + let mut anchor: usize = dict_len; + let mut ip: usize = dict_len; // Insert all hashable positions in [inserted_through, up_to). macro_rules! insert_up_to { @@ -400,6 +459,9 @@ fn encode_block_hc(input: &[u8], out: &mut Vec, level: u8) { }}; } + // Seed the chain with every dict position so block matches can find them. + insert_up_to!(dict_len); + while ip <= match_limit && ip <= hash_limit { // Ensure positions up to and including `ip` are in the chain. insert_up_to!(ip + 1); @@ -528,7 +590,11 @@ struct OptStep { /// the hash-chain finder (sequence overhead + the literal run it terminates). /// Backtracking recovers the cheapest path, which is then emitted with the /// shared sequence emitter — so the bitstream stays a valid LZ4 block. -fn encode_block_optimal(input: &[u8], out: &mut Vec, level: u8) { +/// +/// `dict` is an optional prefix of previously emitted output that matches may +/// reference (LZ4 linked-block mode). The DP only traverses positions inside +/// the current block; matches found may point back into the `dict` region. +fn encode_block_optimal_dict(dict: &[u8], input: &[u8], out: &mut Vec, level: u8) { out.clear(); if input.is_empty() { return; @@ -538,6 +604,20 @@ fn encode_block_optimal(input: &[u8], out: &mut Vec, level: u8) { return; } + let dict_len = dict.len(); + // Combined buffer: dict at [0, dict_len), block at [dict_len, n). See + // `encode_block_hc_dict` for the rationale. + let buf: Vec; + let input: &[u8] = if dict_len == 0 { + input + } else { + let mut v = Vec::with_capacity(dict_len + input.len()); + v.extend_from_slice(dict); + v.extend_from_slice(input); + buf = v; + &buf + }; + let n = input.len(); let nb_attempts = nb_attempts_for_level(level); @@ -562,7 +642,9 @@ fn encode_block_optimal(input: &[u8], out: &mut Vec, level: u8) { }; n + 1 ]; - price[0] = 0; + // The DP origin is the start of the current block, not byte 0: the dict + // region is never traversed, only referenced. + price[dict_len] = 0; // Insert all positions up to `up_to` (exclusive) that are hashable. let mut inserted_through = 0usize; @@ -576,7 +658,10 @@ fn encode_block_optimal(input: &[u8], out: &mut Vec, level: u8) { }}; } - let mut i = 0usize; + // Seed the chain with every dict position so block matches can find them. + insert_up_to!(dict_len); + + let mut i = dict_len; while i < n { if price[i] == usize::MAX { i += 1; @@ -666,10 +751,11 @@ fn encode_block_optimal(input: &[u8], out: &mut Vec, level: u8) { i += 1; } - // Backtrack from n to 0, collecting the path edges in reverse. + // Backtrack from n to the block origin, collecting the path edges in + // reverse. let mut path: Vec = Vec::new(); let mut pos = n; - while pos > 0 { + while pos > dict_len { let s = step[pos]; if s.match_pos == usize::MAX { // Literal edge: step back one byte. Collapse a contiguous literal @@ -683,7 +769,7 @@ fn encode_block_optimal(input: &[u8], out: &mut Vec, level: u8) { path.reverse(); // Replay forward, emitting literals then each match. - let mut anchor = 0usize; + let mut anchor = dict_len; for s in &path { let match_start = { // The match's start position is the end-of-literal-run point. We diff --git a/src/lz4/frame.rs b/src/lz4/frame.rs index 6dfaa30..c3d0b19 100644 --- a/src/lz4/frame.rs +++ b/src/lz4/frame.rs @@ -34,10 +34,15 @@ //! When `block_independence = false` (the LZ4 CLI default and ours), the //! decoder must keep up to 64 KiB of the previously-decoded payload //! available so the next block's back-references can address it. Our -//! encoder respects this in the same direction: the matcher only ever -//! looks within the current block (i.e. it produces blocks that decode -//! correctly under either mode), but the decoder honours linked-block -//! semantics on input from any conforming encoder. +//! encoder produces such references: it carries a sliding 64 KiB window of +//! the most recently emitted raw output and offers it to the block match +//! finder as a dictionary (see [`block::encode_block_level_dict`]), so a +//! match in one block can reference bytes that were emitted by previous +//! blocks. This is what gives linked mode its ratio edge over independent +//! blocks, whose match window is just the block's own ≤ 64 KiB. +//! +//! When `block_independence = true`, the window is never populated and each +//! block is compressed in isolation, decoding correctly on its own. use alloc::vec::Vec; @@ -362,6 +367,10 @@ pub struct Encoder { /// xxHash32 over the entire raw content. Only used when /// `cfg.content_checksum`. content_hash: XxHash32, + /// Sliding 64 KiB window of the most recently emitted *raw* output, used + /// as a back-reference dictionary for the next block in linked-block mode + /// (`cfg.block_independence == false`). Empty in independent mode. + window: Vec, } impl Encoder { @@ -373,6 +382,11 @@ impl Encoder { /// Construct with an explicit config. pub fn with_config(cfg: EncoderConfig) -> Self { let bs = cfg.block_max_size.bytes(); + let window_cap = if cfg.block_independence { + 0 + } else { + WINDOW_SIZE + }; let mut enc = Self { cfg, block_size: bs, @@ -381,11 +395,32 @@ impl Encoder { staged_idx: 0, phase: EncPhase::Header, content_hash: XxHash32::new(), + window: Vec::with_capacity(window_cap), }; enc.build_header(); enc } + /// Append `bytes` (a block's raw payload) to the sliding back-reference + /// window, keeping only the trailing [`WINDOW_SIZE`] bytes. No-op in + /// independent mode. + fn push_window(&mut self, bytes: &[u8]) { + if self.cfg.block_independence { + return; + } + if bytes.len() >= WINDOW_SIZE { + self.window.clear(); + self.window + .extend_from_slice(&bytes[bytes.len() - WINDOW_SIZE..]); + return; + } + let combined = self.window.len() + bytes.len(); + if combined > WINDOW_SIZE { + self.window.drain(..combined - WINDOW_SIZE); + } + self.window.extend_from_slice(bytes); + } + /// Stage the frame header (magic + FLG + BD + HC) in `staged`. fn build_header(&mut self) { self.staged.clear(); @@ -426,9 +461,20 @@ impl Encoder { self.content_hash.update(&self.raw); } - // Compress into a scratch buffer. + // Compress into a scratch buffer. In linked-block mode the previous + // blocks' trailing output (`self.window`, ≤ 64 KiB) is offered as a + // back-reference dictionary so matches can cross the block boundary. let mut compressed = Vec::with_capacity(block::compress_bound(self.raw.len())); - block::encode_block_level(&self.raw, &mut compressed, self.cfg.level); + block::encode_block_level_dict(&self.window, &self.raw, &mut compressed, self.cfg.level); + + // Slide the window forward with this block's raw payload, regardless of + // whether we end up emitting it compressed or raw — the decoder's + // window tracks decoded output either way. + if !self.cfg.block_independence { + let raw = core::mem::take(&mut self.raw); + self.push_window(&raw); + self.raw = raw; + } self.staged.clear(); // Choose the smaller of compressed / raw. The LZ4 Frame spec @@ -626,6 +672,7 @@ impl RawEncoder for Encoder { self.staged.clear(); self.staged_idx = 0; self.content_hash = XxHash32::new(); + self.window.clear(); self.phase = EncPhase::Header; self.build_header(); } diff --git a/src/lzma2/mod.rs b/src/lzma2/mod.rs index 44d06cc..742a8e3 100644 --- a/src/lzma2/mod.rs +++ b/src/lzma2/mod.rs @@ -181,7 +181,7 @@ fn resolve_dict_size(cfg: &DecoderConfig) -> Result { // ─── encoder ────────────────────────────────────────────────────────────── -use crate::lzma2_internal::lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma_chunk}; +use crate::lzma2_internal::lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma2_stream}; /// Dictionary size (in bytes) the encoder advertises to the LZMA chunk /// coder as the match-distance ceiling. Fixed at 4 MiB — the [`crate::xz`] @@ -278,37 +278,41 @@ impl Encoder { } } - /// Stage one LZMA2 chunk for `data` (`1..=ENC_CHUNK_MAX` bytes), choosing - /// a compressed chunk when it shrinks the data and an uncompressed - /// fallback otherwise. - fn stage_chunk(&mut self, data: &[u8]) { - debug_assert!(!data.is_empty() && data.len() <= ENC_CHUNK_MAX); - let compressed = encode_lzma_chunk(data, ENC_DICT_SIZE, self.params); - // A compressed chunk is only worth emitting when its range-coded body - // is both smaller than the input and fits the 16-bit (+1) comp-size - // field. Otherwise the uncompressed chunk is strictly smaller. - let use_compressed = - !compressed.is_empty() && compressed.len() <= 65_536 && compressed.len() < data.len(); - if use_compressed { - self.stage_compressed_chunk(data, &compressed); - } else { - self.stage_uncompressed_chunk(data); + /// Stage the whole stream's LZMA2 chunks from the fully-buffered `input`. + /// + /// The chunks come from a single continuous match-finder + /// ([`encode_lzma2_stream`]): the first chunk resets the dictionary + /// (`0xE0` compressed / `0x01` uncompressed) and every later chunk + /// continues it (`0xC0` compressed / `0x02` uncompressed), so cross-chunk + /// matches (up to the 4 MiB dictionary) are found. The caller appends the + /// `0x00` end marker afterwards. + fn stage_payload(&mut self, input: &[u8]) { + let chunks = encode_lzma2_stream(input, ENC_DICT_SIZE, self.params); + let mut pos = 0usize; + for chunk in chunks { + let data = &input[pos..pos + chunk.uncomp_len]; + match chunk.body { + Some(ref body) => self.stage_compressed_chunk(data, body, chunk.reset_dict), + None => self.stage_uncompressed_chunk(data, chunk.reset_dict), + } + pos += chunk.uncomp_len; } } - /// Stage a full-reset compressed chunk: control `0xE0` (compressed, with - /// dict, props, and state all reset; top 5 bits of `uncomp_size-1`), a + /// Stage a compressed chunk: control `0xE0` (dict reset) for the first + /// chunk or `0xC0` (dictionary continues) otherwise — both carry the top 5 + /// bits of `uncomp_size-1` and set bit 6 (props present). Followed by a /// 2-byte `uncomp_size-1` BE remainder, a 2-byte `comp_size-1` BE, the - /// 1-byte LZMA props (present because we full-reset), then the - /// range-coded body. - fn stage_compressed_chunk(&mut self, data: &[u8], compressed: &[u8]) { + /// 1-byte LZMA props, then the range-coded body. + fn stage_compressed_chunk(&mut self, data: &[u8], compressed: &[u8], reset_dict: bool) { debug_assert!(!data.is_empty() && data.len() <= ENC_CHUNK_MAX); debug_assert!(!compressed.is_empty() && compressed.len() <= 65_536); let uncomp_m1 = (data.len() - 1) as u32; // 0..=65535 with our cap // Top 5 bits of (uncomp_size - 1) live in the control byte; with a - // 65_536 cap they are always zero, yielding exactly 0xE0. - let control: u8 = 0xE0 | ((uncomp_m1 >> 16) & 0x1F) as u8; + // 65_536 cap they are always zero, yielding exactly 0xE0 / 0xC0. + let base: u8 = if reset_dict { 0xE0 } else { 0xC0 }; + let control: u8 = base | ((uncomp_m1 >> 16) & 0x1F) as u8; let comp_m1 = (compressed.len() - 1) as u16; self.pending.reserve(6 + compressed.len()); @@ -322,13 +326,15 @@ impl Encoder { self.pending_idx = 0; } - /// Stage an uncompressed chunk: control `0x01` (dict reset), 2-byte - /// `size-1` BE, then the raw bytes. - fn stage_uncompressed_chunk(&mut self, data: &[u8]) { + /// Stage an uncompressed chunk: control `0x01` (dict reset) for the first + /// chunk or `0x02` (dictionary continues) otherwise, a 2-byte `size-1` BE, + /// then the raw bytes. + fn stage_uncompressed_chunk(&mut self, data: &[u8], reset_dict: bool) { debug_assert!(!data.is_empty() && data.len() <= ENC_CHUNK_MAX); + let control: u8 = if reset_dict { 0x01 } else { 0x02 }; let size_m1 = (data.len() - 1) as u16; self.pending.reserve(3 + data.len()); - self.pending.push(0x01); + self.pending.push(control); self.pending.push((size_m1 >> 8) as u8); self.pending.push((size_m1 & 0xFF) as u8); self.pending.extend_from_slice(data); @@ -344,23 +350,19 @@ impl RawEncoder for Encoder { loop { match self.phase { EncPhase::Body => { - while consumed < input.len() && self.in_buf.len() < ENC_CHUNK_MAX { - let take = (ENC_CHUNK_MAX - self.in_buf.len()).min(input.len() - consumed); - self.in_buf - .extend_from_slice(&input[consumed..consumed + take]); - consumed += take; - } - if self.in_buf.len() == ENC_CHUNK_MAX { - let data = core::mem::take(&mut self.in_buf); - self.stage_chunk(&data); - self.phase = EncPhase::DrainPending; - } else { - return Ok(RawProgress { - consumed, - written, - done: false, - }); + // Buffer the entire input so the chunk stream can be + // produced by a single continuous match-finder at `finish` + // — the dictionary then spans every chunk instead of + // resetting per chunk. No mid-stream flush. + if consumed < input.len() { + self.in_buf.extend_from_slice(&input[consumed..]); + consumed = input.len(); } + return Ok(RawProgress { + consumed, + written, + done: false, + }); } EncPhase::DrainPending => { if self.drain_pending(output, &mut written) { @@ -411,9 +413,9 @@ impl RawEncoder for Encoder { } if !self.in_buf.is_empty() { let data = core::mem::take(&mut self.in_buf); - self.stage_chunk(&data); - // Stay in `Finishing`; the loop drains this chunk then - // re-checks the (now empty) buffer. + self.stage_payload(&data); + // Stay in `Finishing`; the loop drains the staged + // chunks then re-checks the (now empty) buffer. } else { // Buffer empty and any staged chunk drained: emit the // single 0x00 end marker. @@ -546,6 +548,24 @@ impl Decoder { e } + /// Feed raw uncompressed-chunk bytes into the LZMA2 dictionary so a later + /// dictionary-continuing compressed chunk (`0xC0`) can reference them. + /// Lazily creates `lzma_core` (canonical default props, sized to the + /// configured dictionary) when the stream opens with an uncompressed chunk. + fn feed_uncompressed_dict(&mut self, src: &[u8]) { + if self.lzma_core.is_none() { + let props = Lzma2Props::parse(LZMA2_PROPS_BYTE).unwrap_or(Lzma2Props { + lc: 3, + lp: 0, + pb: 2, + }); + self.lzma_core = Some(Box::new(LzmaCore::new(props, self.dict_size))); + } + if let Some(core) = self.lzma_core.as_mut() { + core.append_literals(src); + } + } + /// Pull bytes from `input` (advancing `consumed`) into `scratch` until it /// holds `scratch_want` bytes. Returns true once full. fn fill_scratch(&mut self, input: &[u8], consumed: &mut usize) -> bool { @@ -640,11 +660,13 @@ impl RawDecoder for Decoder { let src = &input[consumed..consumed + take]; output[written..written + take].copy_from_slice(src); // Feed the bytes into the LZ window so a later - // compressed chunk (without a dict reset) can - // back-reference them. - if let Some(core) = self.lzma_core.as_mut() { - core.append_literals(src); - } + // dictionary-continuing compressed chunk (`0xC0`) can + // back-reference them. Lazily create the core (canonical + // default props — irrelevant for plain dict population, + // and replaced by the next compressed chunk's reset + // bits) when none exists yet, e.g. when the stream opens + // with an uncompressed chunk. + self.feed_uncompressed_dict(src); self.chunk_remaining -= take; consumed += take; written += take; diff --git a/src/lzma2_internal/lzma2_encoder.rs b/src/lzma2_internal/lzma2_encoder.rs index bd3589b..23f19e9 100644 --- a/src/lzma2_internal/lzma2_encoder.rs +++ b/src/lzma2_internal/lzma2_encoder.rs @@ -586,6 +586,36 @@ impl LzmaEncCore { (self.output_pos as u32) & self.pos_mask } + /// Reset the LZMA *state* for a new continued chunk: re-initialise every + /// probability table, the LZMA state, the four rep distances, and a fresh + /// range coder — but deliberately keep `output_pos` running so the + /// `pos_state` derivation stays continuous across chunks (matching the + /// decoder, whose `output_pos` only resets on a dictionary reset, not a + /// state reset). The LZ history (hash chain) is owned outside the core and + /// is likewise preserved, so matches in this chunk can reference data from + /// earlier chunks. This is the encoder counterpart of the LZMA2 `0xC0` + /// "reset state + new props, dictionary continues" control byte. + fn reset_state_keep_pos(&mut self) { + self.is_match.fill(PROB_INIT); + self.is_rep.fill(PROB_INIT); + self.is_rep0.fill(PROB_INIT); + self.is_rep1.fill(PROB_INIT); + self.is_rep2.fill(PROB_INIT); + self.is_rep0_long.fill(PROB_INIT); + self.dist_slot.fill(PROB_INIT); + self.dist_special.fill(PROB_INIT); + self.dist_align.fill(PROB_INIT); + self.lit.fill(PROB_INIT); + self.len_coder = LengthCoderEnc::new(); + self.rep_len_coder = LengthCoderEnc::new(); + self.state = 0; + self.rep0 = 0; + self.rep1 = 0; + self.rep2 = 0; + self.rep3 = 0; + self.rc = RangeEncoder::new(); + } + fn encode_literal_full(&mut self, byte: u8, prev_byte: u8, match_byte: Option) { let lp_state = ((self.output_pos as u32) & self.lit_pos_mask) << self.lc; let prev_high = (prev_byte as u32) >> (8 - self.lc); @@ -1212,6 +1242,11 @@ fn literal_price_at( /// /// `params` is the level-derived match-finder + parser tuning; see /// [`EncoderParams::from_level`]. +/// +/// Single-chunk helper retained for the LZMA2 unit tests (which exercise the +/// chunk codec directly); the production encoders use [`encode_lzma2_stream`], +/// which keeps one continuous match-finder across chunks. +#[cfg_attr(not(test), allow(dead_code))] pub(crate) fn encode_lzma_chunk(input: &[u8], dict_size: u32, params: EncoderParams) -> Vec { if params.opt_window == 0 { return encode_chunk_body(input, dict_size, params, false); @@ -1229,8 +1264,140 @@ pub(crate) fn encode_lzma_chunk(input: &[u8], dict_size: u32, params: EncoderPar } } +/// One framed LZMA2 chunk produced by [`encode_lzma2_stream`]. +/// +/// The caller (xz Block payload or raw LZMA2 stream framing) turns this into +/// the on-wire chunk header + body. `uncomp_len` is the number of input bytes +/// the chunk covers; for a compressed chunk `body` is the range-coded payload, +/// for an uncompressed chunk `body` is empty and the caller copies the +/// `uncomp_len` raw input bytes verbatim. +pub(crate) struct Lzma2Chunk { + /// Number of uncompressed input bytes this chunk represents. + pub uncomp_len: usize, + /// `true` when this is the first chunk of the stream and therefore must + /// reset the dictionary (`0xE0` compressed / `0x01` uncompressed); `false` + /// for every later chunk, which continues the dictionary (`0xC0` + /// compressed / `0x02` uncompressed). + pub reset_dict: bool, + /// `Some(range-coded body)` for a compressed chunk; `None` for an + /// uncompressed-fallback chunk (the caller copies the raw input slice). + pub body: Option>, +} + +/// Largest uncompressed payload a single compressed chunk may carry. The +/// compressed-chunk uncompressed-size field is 21 bits (+1 ⇒ 2 MiB), but we +/// cap at 64 KiB so the *compressed* size always fits the 16-bit (+1) comp +/// field and the chunk header shape stays uniform with the uncompressed +/// fallback. The dictionary still spans the whole input regardless of this +/// slice size — only the output framing is chunked. +const STREAM_CHUNK_UNCOMP_MAX: usize = 65_536; + +/// Encode the *entire* `input` as a continuous LZMA2 chunk stream, returning +/// the framed chunks in order. +/// +/// Unlike calling [`encode_lzma_chunk`] once per slice, this keeps a single LZ +/// match-finder (and a continuous `output_pos`) across all chunks: the first +/// chunk resets the dictionary, every later chunk *continues* it, so a match +/// in a later chunk can reference data from any earlier chunk up to +/// `dict_size`. This is what closes the ratio gap with the `.lzma` (single +/// continuous stream) path. The range coder and probability model still reset +/// per chunk (state reset) — only the dictionary/history is continuous. +/// +/// The caller is responsible for the `0x00` end marker and any container +/// framing (xz Block/Index, raw-stream terminator). +pub(crate) fn encode_lzma2_stream( + input: &[u8], + dict_size: u32, + params: EncoderParams, +) -> Vec { + let mut chunks = Vec::new(); + if input.is_empty() { + return chunks; + } + + let mut core = LzmaEncCore::new(); + // One hash chain over the whole input: the LZ history is never cleared at + // a chunk boundary, so cross-chunk matches are found naturally. + let mut hc = HashChain::new(input.len()); + + let mut pos = 0usize; + let mut first = true; + while pos < input.len() { + let chunk_end = (pos + STREAM_CHUNK_UNCOMP_MAX).min(input.len()); + let uncomp_len = chunk_end - pos; + + // Range-code this slice. State/probabilities/range coder are reset for + // the chunk; `output_pos` and the hash chain continue. + let body = + encode_stream_chunk_body(&mut core, &mut hc, input, pos, chunk_end, dict_size, params); + + // Compressed only pays off when the body both shrinks the slice and + // fits the 16-bit (+1) compressed-size field; otherwise the + // uncompressed fallback is strictly smaller. + let use_compressed = !body.is_empty() && body.len() <= 65_536 && body.len() < uncomp_len; + + if use_compressed { + chunks.push(Lzma2Chunk { + uncomp_len, + reset_dict: first, + body: Some(body), + }); + } else { + // Uncompressed fallback. The range-coded attempt mutated `core` + // (output_pos advanced, hash chain populated for this slice), which + // is exactly the state we want for continuing the dictionary into + // later chunks — the bytes are already in the LZ history and + // `output_pos` already advanced by `uncomp_len`. So nothing extra + // to do here; just frame an uncompressed chunk. + chunks.push(Lzma2Chunk { + uncomp_len, + reset_dict: first, + body: None, + }); + } + + pos = chunk_end; + first = false; + } + + chunks +} + +/// Range-code `input[pos_start..pos_end]` through `core`/`hc`, performing a +/// per-chunk state reset first (keeping `output_pos` and the LZ history). +/// Returns the flushed range-coded body. +/// +/// Uses the optimal parse when the level enables it (`opt_window != 0`) and the +/// greedy parse otherwise — the same selection [`encode_lzma_chunk`] makes per +/// level. We do not additionally run both parses and keep the smaller body (as +/// the single-chunk path does): doing so would require snapshotting/replaying +/// the shared cross-chunk LZ history, and the optimal parse only loses to +/// greedy on pathological tiny inputs, which the uncompressed-chunk fallback +/// already covers for whole chunks. +#[allow(clippy::too_many_arguments)] +fn encode_stream_chunk_body( + core: &mut LzmaEncCore, + hc: &mut HashChain, + input: &[u8], + pos_start: usize, + pos_end: usize, + dict_size: u32, + params: EncoderParams, +) -> Vec { + core.reset_state_keep_pos(); + if params.opt_window == 0 { + encode_greedy(core, hc, input, pos_start, pos_end, dict_size, params); + } else { + encode_optimal(core, hc, input, pos_start, pos_end, dict_size, params); + } + core.rc.flush(); + core.rc.out.clone() +} + /// Encode one chunk body (range-coded packets + 5-byte flush, no EOS marker) -/// using the greedy or optimal parse. +/// using the greedy or optimal parse. Only reachable from the test-only +/// [`encode_lzma_chunk`]. +#[cfg_attr(not(test), allow(dead_code))] fn encode_chunk_body( input: &[u8], dict_size: u32, @@ -1241,9 +1408,9 @@ fn encode_chunk_body( let mut hc = HashChain::new(input.len()); if optimal { - encode_optimal(&mut core, &mut hc, input, dict_size, params); + encode_optimal(&mut core, &mut hc, input, 0, input.len(), dict_size, params); } else { - encode_greedy(&mut core, &mut hc, input, dict_size, params); + encode_greedy(&mut core, &mut hc, input, 0, input.len(), dict_size, params); } // Flush the range coder. NO EOS marker — LZMA2 frames the uncompressed @@ -1254,23 +1421,36 @@ fn encode_chunk_body( /// Greedy/lazy parse — used by the lowest levels where speed matters most and /// the optimal-parse overhead isn't worth it. +/// +/// Encodes `input[pos_start..pos_end]`. Match finding may reference any earlier +/// position in `input` (the LZ history is the whole buffer up to `pos`), but +/// emitted match/rep lengths are clamped so the parse stops exactly at +/// `pos_end` — this lets a continuous encoder slice the output into chunks +/// without ever crossing a chunk's uncompressed-size boundary. fn encode_greedy( core: &mut LzmaEncCore, hc: &mut HashChain, input: &[u8], + pos_start: usize, + pos_end: usize, dict_size: u32, params: EncoderParams, ) { - let mut pos = 0usize; - while pos < input.len() { + let mut pos = pos_start; + while pos < pos_end { + // Bytes left until this chunk's boundary; emitted lengths never exceed + // it so the chunk ends exactly at `pos_end`. + let cap = (pos_end - pos) as u32; let rep_lens = [ - rep_match_len(input, pos, core.rep0), - rep_match_len(input, pos, core.rep1), - rep_match_len(input, pos, core.rep2), - rep_match_len(input, pos, core.rep3), + rep_match_len(input, pos, core.rep0).min(cap), + rep_match_len(input, pos, core.rep1).min(cap), + rep_match_len(input, pos, core.rep2).min(cap), + rep_match_len(input, pos, core.rep3).min(cap), ]; - let new_match = hc.find_longest(input, pos, dict_size, params); + let new_match = hc + .find_longest(input, pos, dict_size, params) + .map(|(l, d)| (l.min(cap), d)); let best_rep_len = rep_lens.iter().copied().max().unwrap_or(0); let best_rep_idx = rep_lens @@ -1320,10 +1500,16 @@ fn encode_greedy( /// Cost-based optimal parse: forward DP over a look-ahead window, committing /// the cheapest path through the optimum buffer, then replaying decisions /// through the real (probability-updating) emit functions. +/// +/// Encodes `input[pos_start..pos_end]`; matches still reference the whole LZ +/// history before `pos`, but the parse never advances past `pos_end`, so the +/// chunk ends exactly there. fn encode_optimal( core: &mut LzmaEncCore, hc: &mut HashChain, input: &[u8], + pos_start: usize, + pos_end: usize, dict_size: u32, params: EncoderParams, ) { @@ -1331,17 +1517,18 @@ fn encode_optimal( let window = params.opt_window as usize; let mut opt = Optimizer::new(window); - let mut pos = 0usize; + let mut pos = pos_start; // Refresh the price snapshot once per committed window. Prices drift as // the model adapts; refreshing each window keeps them close to the live // model without recomputing per byte. - while pos < input.len() { + while pos < pos_end { let snap = core.price_snapshot(&prob_prices); let parsed = parse_window( core, hc, input, pos, + pos_end, dict_size, params, window, @@ -1368,6 +1555,7 @@ fn parse_window( hc: &HashChain, input: &[u8], start: usize, + pos_end: usize, dict_size: u32, params: EncoderParams, window: usize, @@ -1375,7 +1563,9 @@ fn parse_window( snap: &PriceSnapshot, opt: &mut Optimizer, ) -> usize { - let avail = input.len() - start; + // `avail` is bounded by the chunk boundary, not the end of input, so the + // DP never produces a decision that would carry past `pos_end`. + let avail = pos_end - start; let limit = window.min(avail); // Initialize node 0 with the encoder's current live state. diff --git a/src/mtf/mod.rs b/src/mtf/mod.rs index d1f322e..465d793 100644 --- a/src/mtf/mod.rs +++ b/src/mtf/mod.rs @@ -106,16 +106,31 @@ impl Table { } /// Encode one byte: return its current rank and move it to the front. + /// + /// Combines the find-rank scan and the move-to-front shift into a single + /// pass: as we walk the table looking for `b` we slide each scanned entry + /// back one slot (carrying the previous value forward), so each element is + /// touched exactly once. This replaces the previous scan-then-`copy_within` + /// (two passes over the prefix) and is what makes the encode hot loop fast + /// on the small-rank-dominated streams MTF is built for. fn encode_byte(&mut self, b: u8) -> u8 { - // Linear scan of 256 entries — `b` is guaranteed present (the table - // is always a permutation of all byte values), so the loop always - // finds it. - let mut rank = 0usize; - while self.table[rank] != b { + // `b` is always present (the table is a permutation of all 256 byte + // values), so the loop always terminates before running off the end. + let mut carry = self.table[0]; + if carry == b { + return 0; + } + self.table[0] = b; + let mut rank = 1usize; + loop { + let next = self.table[rank]; + self.table[rank] = carry; + if next == b { + return rank as u8; + } + carry = next; rank += 1; } - self.move_to_front(rank); - rank as u8 } /// Decode one byte: return the symbol at `rank` and move it to the front. diff --git a/src/rangecoder/mod.rs b/src/rangecoder/mod.rs index 5a161bf..dd1c7ff 100644 --- a/src/rangecoder/mod.rs +++ b/src/rangecoder/mod.rs @@ -127,14 +127,13 @@ impl Model { } } -#[inline] -fn adapt(prob: &mut u16, bit: u32) { - if bit == 0 { - *prob += ((1u16 << PROB_BITS) - *prob) >> MOVE_BITS; - } else { - *prob -= *prob >> MOVE_BITS; - } -} +// The probability counter update (`adapt`) is inlined directly into the +// `encode_bytes` / `decode_bytes` hot loops: +// +// * bit 0: `prob += (2048 - prob) >> MOVE_BITS` +// * bit 1: `prob -= prob >> MOVE_BITS` +// +// Both directions run the identical update so the models stay in lock-step. // ─── encoder ────────────────────────────────────────────────────────────── @@ -180,16 +179,11 @@ impl Encoder { // Take the input out so we can borrow `self.out` mutably while // reading the bytes; swap a placeholder in to avoid cloning. let input = core::mem::take(&mut self.input); - for &byte in &input { - // Bit-tree walk, MSB first. - let mut node = 1usize; - for i in (0..8).rev() { - let bit = ((byte >> i) & 1) as u32; - let prob = &mut model.probs[node]; - rc.encode_bit(&mut self.out, prob, bit); - node = (node << 1) | (bit as usize); - } - } + // A range-coded payload is never larger than the input by more than a + // few flush bytes; reserve up front so the per-byte `push`es in the + // renormalization loop don't re-check capacity / reallocate. + self.out.reserve(input.len() + 16); + rc.encode_bytes(&mut self.out, &mut model.probs, &input); rc.flush(&mut self.out); self.input = input; } @@ -263,30 +257,54 @@ impl RangeEncoder { } } + /// Encode every byte of `input` as an 8-bit bit-tree walk against the + /// shared `probs` model. Hoists `range`/`low` into locals so they live in + /// registers across the inner loop and indexes `probs` with `node & 0xFF` + /// (the walk only reads nodes `1..=255`, so the mask is a no-op that lets + /// the compiler drop the bounds check on the fixed 256-entry array). #[inline] - fn encode_bit(&mut self, out: &mut Vec, prob: &mut u16, bit: u32) { - // bound = (range >> 11) * prob — the size of the "bit 0" subrange. - let bound = (self.range >> PROB_BITS) * (*prob as u32); - if bit == 0 { - self.range = bound; - } else { - self.low += bound as u64; - self.range -= bound; - } - adapt(prob, bit); - while self.range < TOP { - self.range <<= 8; - self.shift_low(out); + fn encode_bytes(&mut self, out: &mut Vec, probs: &mut [u16; TREE_NODES], input: &[u8]) { + let mut range = self.range; + let mut low = self.low; + for &byte in input { + let mut node = 1usize; + // MSB-first: unrolled over the 8 bits of `byte`. + let mut b = byte; + for _ in 0..8 { + let bit = (b >> 7) as u32; // top bit + b <<= 1; + let slot = node & (TREE_NODES - 1); + let p = probs[slot]; + let bound = (range >> PROB_BITS) * (p as u32); + if bit == 0 { + range = bound; + probs[slot] = p + (((1u16 << PROB_BITS) - p) >> MOVE_BITS); + } else { + low += bound as u64; + range -= bound; + probs[slot] = p - (p >> MOVE_BITS); + } + node = (node << 1) | (bit as usize); + while range < TOP { + range <<= 8; + self.shift_low(out, &mut low); + } + } } + self.range = range; + self.low = low; } + /// Renormalize one byte out of the `low` accumulator. `low` is passed by + /// reference so the encode loop can keep it in a register rather than + /// bouncing through `self` on every renormalization. #[inline] - fn shift_low(&mut self, out: &mut Vec) { + fn shift_low(&mut self, out: &mut Vec, low: &mut u64) { // If the top byte of low is not 0xFF (or a carry is pending), // flush the cached byte plus any staged 0xFF run, adjusted by the // carry bit (low >> 32). - if self.low < 0xFF00_0000 || self.low > 0xFFFF_FFFF { - let carry = (self.low >> 32) as u8; + if *low < 0xFF00_0000 || *low > 0xFFFF_FFFF { + let carry = (*low >> 32) as u8; let mut temp = self.cache; loop { out.push(temp.wrapping_add(carry)); @@ -296,18 +314,20 @@ impl RangeEncoder { break; } } - self.cache = (self.low >> 24) as u8; + self.cache = (*low >> 24) as u8; } self.cache_size += 1; - self.low = (self.low << 8) & 0xFFFF_FFFF; + *low = (*low << 8) & 0xFFFF_FFFF; } fn flush(&mut self, out: &mut Vec) { // Drain the 32-bit low accumulator: 5 shift_low calls move every // byte (plus the cache) out to the stream. + let mut low = self.low; for _ in 0..5 { - self.shift_low(out); + self.shift_low(out, &mut low); } + self.low = low; } } @@ -376,17 +396,7 @@ impl Decoder { let result = (|| { let mut rc = RangeDecoder::new(&payload[8..])?; let mut model = Model::new(); - for _ in 0..out_len { - let mut node = 1usize; - for _ in 0..8 { - let prob = &mut model.probs[node]; - let bit = rc.decode_bit(prob)?; - node = (node << 1) | (bit as usize); - } - // node now holds 256 + byte. - self.out.push((node & 0xFF) as u8); - } - Ok(()) + rc.decode_bytes(&mut model.probs, out_len, &mut self.out) })(); self.input = payload; result @@ -495,27 +505,68 @@ impl<'a> RangeDecoder<'a> { } } - #[inline] - fn decode_bit(&mut self, prob: &mut u16) -> Result { - let bound = (self.range >> PROB_BITS) * (*prob as u32); - let bit; - if self.code < bound { - self.range = bound; - bit = 0; - } else { - self.code -= bound; - self.range -= bound; - bit = 1; - } - adapt(prob, bit); - while self.range < TOP { - self.range <<= 8; - self.code = (self.code << 8) | self.next_byte() as u32; - } - if self.overran { - return Err(Error::UnexpectedEnd); + /// Decode `out_len` bytes into `out`, each an 8-bit bit-tree walk against + /// `probs`. Mirrors [`RangeEncoder::encode_bytes`]: hoists `range`/`code` + /// /`pos` into locals so they stay in registers across the inner loop, and + /// indexes `probs` with `node & 0xFF` (the walk only reads nodes + /// `1..=255`, a no-op mask that elides the bounds check). The over-read + /// flag is consulted once per byte — a complete stream never sets it, and + /// a truncated one is reported as soon as the first short byte is decoded. + fn decode_bytes( + &mut self, + probs: &mut [u16; TREE_NODES], + out_len: usize, + out: &mut Vec, + ) -> Result<(), Error> { + let mut range = self.range; + let mut code = self.code; + let mut pos = self.pos; + let payload = self.payload; + out.reserve(out_len); + for _ in 0..out_len { + let mut node = 1usize; + for _ in 0..8 { + let slot = node & (TREE_NODES - 1); + let p = probs[slot]; + let bound = (range >> PROB_BITS) * (p as u32); + let bit; + if code < bound { + range = bound; + probs[slot] = p + (((1u16 << PROB_BITS) - p) >> MOVE_BITS); + bit = 0usize; + } else { + code -= bound; + range -= bound; + probs[slot] = p - (p >> MOVE_BITS); + bit = 1usize; + } + node = (node << 1) | bit; + while range < TOP { + range <<= 8; + // Inline of next_byte with the hoisted `pos`. + let b = match payload.get(pos) { + Some(&b) => b, + None => { + self.overran = true; + 0 + } + }; + pos += 1; + code = (code << 8) | b as u32; + } + } + if self.overran { + self.range = range; + self.code = code; + self.pos = pos; + return Err(Error::UnexpectedEnd); + } + out.push((node & 0xFF) as u8); } - Ok(bit) + self.range = range; + self.code = code; + self.pos = pos; + Ok(()) } } diff --git a/src/xz/mod.rs b/src/xz/mod.rs index f014f40..0bc2574 100644 --- a/src/xz/mod.rs +++ b/src/xz/mod.rs @@ -58,7 +58,7 @@ use crate::traits::{Algorithm, RawDecoder, RawEncoder, RawProgress}; // decoder exposed under the `lzma2` feature without pulling in this xz // container. See `lib.rs`. use crate::lzma2_internal::lzma2_decoder::{Lzma2Props, LzmaCore, lzma2_dict_size}; -use crate::lzma2_internal::lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma_chunk}; +use crate::lzma2_internal::lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma2_stream}; // ─── constants ───────────────────────────────────────────────────────────── @@ -453,55 +453,51 @@ impl Encoder { } } - /// Stage an LZMA2 chunk for emission, choosing between a compressed - /// chunk and an uncompressed fallback. + /// Stage the whole block's LZMA2 payload from the fully-buffered input. /// - /// We always use the "full reset" form of the chunk's control byte - /// (`0xE0` for compressed, `0x01` for uncompressed), so every chunk is - /// independently decodable. This is slightly less efficient than - /// reusing state across chunks but keeps the chunk header shape - /// uniform and matches xz-utils' default emission pattern when each - /// chunk straddles a dictionary reset. + /// The LZMA2 chunks are produced by a single continuous match-finder + /// ([`encode_lzma2_stream`]): the first chunk resets the dictionary + /// (`0xE0` compressed / `0x01` uncompressed) and every later chunk + /// *continues* it (`0xC0` compressed / `0x02` uncompressed), so a match in + /// a later chunk can reference data from any earlier chunk (up to the + /// dictionary size). The range coder still resets per chunk, but the + /// dictionary/history is continuous — this is what brings the ratio in + /// line with the single-stream `.lzma` path. /// - /// `data.len()` must be in `1..=LZMA2_CHUNK_MAX`. The compressed-chunk - /// path additionally requires the compressed payload to fit in - /// 65_536 bytes (the 16-bit + 1 size field); if it overflows or the - /// compressor's output isn't smaller than the input, we fall back to - /// emitting the uncompressed chunk. - fn stage_chunk(&mut self, data: &[u8]) { - debug_assert!(!data.is_empty() && data.len() <= LZMA2_CHUNK_MAX); - // Try LZMA-compressed first. Compressed size is bounded by both - // the chunk's 16-bit size field (max 65_536) and our heuristic: - // if the compressor expanded the data, we'd save nothing by - // emitting a compressed chunk — uncompressed is smaller. - let compressed = encode_lzma_chunk(data, LZMA2_DICT_SIZE, self.enc_params); - let use_compressed = - !compressed.is_empty() && compressed.len() <= 65_536 && compressed.len() < data.len(); - - if use_compressed { - self.stage_compressed_chunk(data, &compressed); - } else { - self.stage_uncompressed_chunk(data); + /// `input` is the entire block (`uncompressed_total` bytes). Each framed + /// chunk is appended to `pending`; the caller drains it before the block + /// trailer. + fn stage_payload(&mut self, input: &[u8]) { + let chunks = encode_lzma2_stream(input, LZMA2_DICT_SIZE, self.enc_params); + let mut pos = 0usize; + for chunk in chunks { + let data = &input[pos..pos + chunk.uncomp_len]; + match chunk.body { + Some(ref body) => self.stage_compressed_chunk(data, body, chunk.reset_dict), + None => self.stage_uncompressed_chunk(data, chunk.reset_dict), + } + pos += chunk.uncomp_len; } } - /// Stage a compressed LZMA2 chunk: control byte `0xE0`-style header - /// with the uncompressed-size's top 5 bits embedded into bits 0..=4 - /// of the control byte, followed by two big-endian uncompressed-size - /// continuation bytes, two big-endian compressed-size bytes, a - /// 1-byte LZMA properties byte (because we full-reset every chunk), - /// and the range-coded payload. - fn stage_compressed_chunk(&mut self, data: &[u8], compressed: &[u8]) { + /// Stage one compressed LZMA2 chunk: a control byte (`0xE0` for the first + /// dict-resetting chunk, `0xC0` for a dict-continuing chunk — both carry + /// the top 5 bits of `uncomp_size-1` in bits 0..=4), two big-endian + /// `uncomp_size-1` continuation bytes, two big-endian `comp_size-1` bytes, + /// a 1-byte LZMA properties byte (bit 6 of both control values is set, so + /// properties are always present), then the range-coded payload. + fn stage_compressed_chunk(&mut self, data: &[u8], compressed: &[u8], reset_dict: bool) { debug_assert!(!data.is_empty() && data.len() <= LZMA2_CHUNK_MAX); debug_assert!(!compressed.is_empty() && compressed.len() <= 65_536); let uncomp_size_minus_1 = (data.len() - 1) as u32; // 0..=65535 let uncomp_top = ((uncomp_size_minus_1 >> 16) & 0x1F) as u8; // 0 here - // Control byte: `111x_xxxx` = compressed + dict reset + props reset - // + state reset (full reset). The low 5 bits carry the top 5 bits - // of (uncomp_size - 1). With our 65_536 cap those top 5 bits are - // always zero, yielding the exact value 0xE0. - let control: u8 = 0xE0 | uncomp_top; + // Control base: `111x_xxxx` (0xE0) = compressed + dict reset + props + // reset + state reset; `110x_xxxx` (0xC0) = compressed + props reset + + // state reset, dictionary CONTINUES. The low 5 bits carry the top 5 + // bits of (uncomp_size - 1); with our 65_536 cap those are zero. + let base: u8 = if reset_dict { 0xE0 } else { 0xC0 }; + let control: u8 = base | uncomp_top; let uncomp_mid = ((uncomp_size_minus_1 >> 8) & 0xFF) as u8; let uncomp_lo = (uncomp_size_minus_1 & 0xFF) as u8; @@ -524,11 +520,12 @@ impl Encoder { self.compressed_payload_bytes += (header_len + compressed.len()) as u64; } - /// Stage an LZMA2 uncompressed chunk (control byte 0x01 — dict reset, - /// for the simple case where compression wouldn't help). - fn stage_uncompressed_chunk(&mut self, data: &[u8]) { + /// Stage an LZMA2 uncompressed chunk: control `0x01` (dict reset) for the + /// first chunk, `0x02` (no reset, dictionary continues) otherwise. Used as + /// the fallback when compression would expand a chunk. + fn stage_uncompressed_chunk(&mut self, data: &[u8], reset_dict: bool) { debug_assert!(!data.is_empty() && data.len() <= LZMA2_CHUNK_MAX); - let control: u8 = 0x01; // full dict reset; safe for any position + let control: u8 = if reset_dict { 0x01 } else { 0x02 }; let size_minus_1 = (data.len() - 1) as u16; self.pending.reserve(3 + data.len()); self.pending.push(control); @@ -605,29 +602,20 @@ impl RawEncoder for Encoder { } } EncPhase::Body => { - // Consume input into the buffer. - while consumed < input.len() && self.in_buf.len() < LZMA2_CHUNK_MAX { - let take = - (LZMA2_CHUNK_MAX - self.in_buf.len()).min(input.len() - consumed); - self.in_buf - .extend_from_slice(&input[consumed..consumed + take]); - consumed += take; - } - if self.in_buf.len() == LZMA2_CHUNK_MAX { - // Flush a full chunk. - let data = core::mem::take(&mut self.in_buf); - self.check.update(&data); - self.uncompressed_total += data.len() as u64; - self.stage_chunk(&data); - self.phase = EncPhase::DrainPending; - } else { - // Need more input; come back via another call. - return Ok(RawProgress { - consumed, - written, - done: false, - }); + // Buffer the entire block's input so the LZMA2 payload can + // be produced by a single continuous match-finder at + // `finish` (the dictionary spans the whole input rather + // than resetting per chunk). We never flush mid-stream; + // chunking happens only at the output-framing level. + if consumed < input.len() { + self.in_buf.extend_from_slice(&input[consumed..]); + consumed = input.len(); } + return Ok(RawProgress { + consumed, + written, + done: false, + }); } EncPhase::DrainPending => { if self.drain_pending(output, &mut written) { @@ -697,11 +685,13 @@ impl RawEncoder for Encoder { } EncPhase::Body => { if !self.in_buf.is_empty() { - // Flush a partial chunk. + // Produce the whole block's LZMA2 payload at once from + // the fully-buffered input, using a single continuous + // match-finder so the dictionary spans every chunk. let data = core::mem::take(&mut self.in_buf); self.check.update(&data); self.uncompressed_total += data.len() as u64; - self.stage_chunk(&data); + self.stage_payload(&data); self.phase = EncPhase::DrainPending; } else { // No pending input. Move to block trailer. @@ -945,6 +935,28 @@ impl Decoder { } } + /// Feed raw uncompressed-chunk bytes into the LZMA2 dictionary so a later + /// dictionary-continuing compressed chunk can reference them. Creates the + /// `lzma_core` on first use (canonical default props; the next compressed + /// chunk's reset bits replace them) sized to the block's dictionary. + fn feed_uncompressed_dict(&mut self, src: &[u8]) { + if self.lzma_core.is_none() { + // Props are irrelevant for plain dict population; use the canonical + // default (lc=3, lp=0, pb=2). `parse` cannot fail for this byte. + let props = Lzma2Props::parse(LZMA2_PROPS_BYTE).unwrap_or(Lzma2Props { + lc: 3, + lp: 0, + pb: 2, + }); + let dict_size = (self.lzma2_dict_size as usize).clamp(4096, 128 * 1024 * 1024); + self.lzma_core = Some(Box::new(LzmaCore::new(props, dict_size))); + self.last_props = LZMA2_PROPS_BYTE; + } + if let Some(core) = self.lzma_core.as_mut() { + core.append_literals(src); + } + } + fn poison(&mut self, e: Error) -> Error { self.poisoned = true; e @@ -1432,6 +1444,15 @@ impl RawDecoder for Decoder { let src = &input[consumed..consumed + take]; output[written..written + take].copy_from_slice(src); self.check.update(src); + // Feed the raw bytes into the LZ dictionary so a later + // dictionary-continuing compressed chunk (`0xC0`) can + // back-reference them. Lazily create the core (with the + // block's dict size and the canonical default props — + // props don't affect plain dict population, and the + // next compressed chunk resets them) when none exists + // yet, e.g. when the block opens with an uncompressed + // chunk. + self.feed_uncompressed_dict(src); self.block_compressed_seen += take as u64; self.block_uncompressed_seen += take as u64; self.chunk_remaining -= take; diff --git a/src/zstd/encoder.rs b/src/zstd/encoder.rs index 10719cb..9d8e27e 100644 --- a/src/zstd/encoder.rs +++ b/src/zstd/encoder.rs @@ -99,6 +99,9 @@ impl Default for EncoderConfig { /// walk deeper chains and accept longer matches before bailing out. #[derive(Debug, Clone, Copy)] pub(crate) struct LevelParams { + /// Clamped compression level (1..=22). Retained so the parser can gate + /// behaviour (e.g. the statistics-driven repricing passes) on it. + pub level: u8, /// Maximum number of hash-chain links the match finder walks per probe. pub max_chain: usize, /// Length at which the match finder stops looking for a longer candidate. @@ -117,12 +120,20 @@ pub(crate) struct LevelParams { /// Lowest level at which the optimal parser is used. const OPTIMAL_LEVEL: u8 = 13; -/// Per-position hash-chain depth cap for the optimal parser. The DP visits -/// every position, so an uncapped chain (up to 16384 at level 22) makes each -/// block quadratic; this bound keeps encode time reasonable while preserving -/// nearly all of the ratio (the DP's win comes from length/repeat pricing, -/// not from exhaustive chain walks). -const OPTIMAL_MAX_CHAIN: usize = 4096; +/// Per-position hash-chain depth cap for the optimal parser, by level. The DP +/// visits every position, so an uncapped chain (up to 16384 at level 22) makes +/// each block quadratic; this bound keeps encode time in the single-digit +/// seconds while preserving nearly all of the ratio (the DP's win comes mostly +/// from cross-block matching and length/repeat pricing, not from exhaustive +/// chain walks — beyond ~256 links the marginal ratio is < 1%). +fn optimal_chain_cap(level: u8) -> usize { + match level { + 0..=15 => 128, + 16..=19 => 256, + // 20..=22: spend more chain budget for the top presets. + _ => 384, + } +} impl LevelParams { /// Clamp `level` to `1..=22` and expand to match-finder tuning. The @@ -163,6 +174,7 @@ impl LevelParams { _ => (16384, super::matcher::MAX_MATCH), }; Self { + level, max_chain, nice_match, lazy_search, @@ -171,16 +183,31 @@ impl LevelParams { } } +/// Window the encoder retains as match context across blocks. The frame +/// advertises a 16 MiB window (`WD = 0x70`), so back-references up to this many +/// bytes are spec-legal and `zstd -d` will accept them. We cap the *retained* +/// history at 8 MiB to bound encoder memory and the per-block re-index cost; +/// real inputs rarely benefit from references beyond a few MiB and the cap +/// keeps total work near-linear. +const MATCH_WINDOW: usize = 8 * 1024 * 1024; + /// Streaming Zstandard encoder. pub struct Encoder { state: State, + /// Match context retained from already-emitted blocks (the sliding window). + /// Back-references in the current block may point anywhere in here, giving + /// cross-block matching up to [`MATCH_WINDOW`] bytes. Trimmed from the front + /// once it exceeds the window. + history: Vec, /// Input buffer pending block emission. pending: Vec, /// Output bytes ready to drain into the caller's buffer. out_buf: Vec, /// Cursor into `out_buf`. out_idx: usize, - /// Reusable matcher. + /// Reusable matcher. Re-indexed per block over `history ++ pending` so that + /// back-references in the current block can reach into earlier blocks + /// (cross-block matching within the advertised window). matcher: MatchFinder, /// Have we written the frame header yet? header_written: bool, @@ -219,6 +246,7 @@ impl Encoder { pub fn with_config(config: EncoderConfig) -> Self { Self { state: State::Accepting, + history: Vec::new(), pending: Vec::with_capacity(BLOCK_SIZE), out_buf: Vec::new(), out_idx: 0, @@ -266,156 +294,147 @@ impl Encoder { // Too small to bother — the framing overhead eats any savings. return None; } - let buffer = self.pending.as_slice(); - self.matcher.resize_for(buffer.len()); - - // Run LZ77 with repeat-offset awareness. We track a per-block ring - // copy of `prev_offsets` and rewrite each emitted match's offset - // through `assign_offset` so equal distances collapse to codes 1..=3. - // - // Two strategies depending on level: - // - level ≤ 3: greedy. Take the best match at the current position. - // - level ≥ 4: lazy. After finding a match at pos, also probe at - // pos+1; if it gives a meaningfully longer match, emit a literal - // and use that one instead. - // - // Independent of level, we always check the three repeat offsets at - // each position first — a repeat-offset match costs 1 bit in the - // offset stream vs. ~log2(distance) bits for a fresh offset, so even - // short repeats are cheap wins. - let lazy = self.params.lazy_search; + + // Cross-block matching: build a buffer of retained history followed by + // the current block, and compress only the `[start, end)` tail. + // Back-references in this block may point anywhere in `history`, up to + // the advertised 16 MiB window — `zstd -d` accepts them. `history` is + // already trimmed to `MATCH_WINDOW`, so every distance is in range. + let start = self.history.len(); + let mut buffer: Vec = Vec::with_capacity(start + self.pending.len()); + buffer.extend_from_slice(&self.history); + buffer.extend_from_slice(&self.pending); + let buffer = buffer.as_slice(); let buf_len = buffer.len(); + + // Rebuild the chains for this buffer and pre-index only the retained + // history (`[0, start)`). Each parser then splices in the *current + // block's* positions lazily as it advances, so the hash chains never + // contain positions ahead of the probe — the standard LZ invariant that + // keeps match finding correct and the depth budget meaningful. Indexing + // history up front is what enables cross-block back-references. + self.matcher.resize_for(buf_len); + for i in 0..start.min(buf_len.saturating_sub(3)) { + self.matcher.insert(buffer, i); + } + + let lazy = self.params.lazy_search; let max_chain = self.params.max_chain; let nice_match = self.params.nice_match; - // High levels: price-based optimal parse over the whole block. The DP - // probes a match candidate at every input position, so we cap the - // per-position chain depth to keep the per-block cost bounded — the DP - // recovers most of the ratio from trying lengths and repeat offsets - // rather than from exhaustive chain walks. - if self.params.optimal { - let opt_chain = max_chain.min(OPTIMAL_MAX_CHAIN); - let (sequences, new_offsets) = optimal_parse( - &mut self.matcher, - buffer, - self.prev_offsets, - opt_chain, + // High levels: price-based optimal parse over the block. The DP probes a + // match candidate at every input position, so we cap the per-position + // chain depth to keep the per-block cost bounded. + let body = if self.params.optimal { + let opt_chain = max_chain.min(optimal_chain_cap(self.params.level)); + // Statistics-driven repricing at high levels (btultra2-style). + let reprice_passes = if self.params.level >= REPRICE_LEVEL { + 2 + } else { + 0 + }; + let cfg = ParseConfig { + start, + init_offsets: self.prev_offsets, + max_chain: opt_chain, nice_match, - ); + }; + let (sequences, new_offsets) = + optimal_parse_2pass(&mut self.matcher, buffer, &cfg, reprice_passes); if sequences.is_empty() { return None; } - return finish_compressed_block( + finish_compressed_block( buffer, + start, &sequences, new_offsets, self.prev_huff_lengths.as_ref(), ) - .map(|(body, new_lengths, committed_offsets)| { - self.prev_offsets = committed_offsets; - if let Some(lengths) = new_lengths { - self.prev_huff_lengths = Some(lengths); + } else { + // Greedy / lazy parse with repeat-offset awareness. Positions in the + // current block are spliced into the chain lazily up to `pos`. + let mut sequences: Vec = Vec::new(); + let mut lit_start: usize = start; + let mut pos: usize = start; + let mut block_offsets = self.prev_offsets; + let mut next_insert = start; + + while pos + MIN_MATCH < buf_len { + while next_insert <= pos { + self.matcher.insert(buffer, next_insert); + next_insert += 1; } - body - }); - } - let mut sequences: Vec = Vec::new(); - let mut lit_start: usize = 0; - let mut pos: usize = 0; - let mut block_offsets = self.prev_offsets; - - // Invariant: positions in [0, next_insert) have already been spliced - // into the matcher's hash chain. We advance `next_insert` lazily. - let mut next_insert: usize = 0; - while pos + MIN_MATCH < buf_len { - // Make sure `pos` is in the chain. - while next_insert <= pos { - self.matcher.insert(buffer, next_insert); - next_insert += 1; - } - - // Step 1: best-match selection at `pos`. - let (m_dist, m_len, m_is_rep1) = best_at( - &self.matcher, - buffer, - pos, - &block_offsets, - max_chain, - nice_match, - ); - - if m_len == 0 { - pos += 1; - continue; - } + let (m_dist, m_len, m_is_rep1) = best_at( + &self.matcher, + buffer, + pos, + &block_offsets, + max_chain, + nice_match, + ); + + if m_len == 0 { + pos += 1; + continue; + } - // Step 2 (lazy only): probe pos+1 for a meaningfully better match. - // "Meaningfully better" = strictly longer by at least 1 byte when - // the current isn't already long. We skip the probe when the - // current match is already at least `nice_match` — there's no - // plausible win at that point. - let (best_pos, best_dist, best_len) = - if lazy && m_len < nice_match && pos + 1 + MIN_MATCH < buf_len { - // Insert pos+1 into the chain so its hash bucket includes it. - while next_insert <= pos + 1 { - self.matcher.insert(buffer, next_insert); - next_insert += 1; - } - let (n_dist, n_len, _) = best_at( - &self.matcher, - buffer, - pos + 1, - &block_offsets, - max_chain, - nice_match, - ); - // Score: prefer longer-match. A repeat-offset hit at pos - // saves bits in the offset stream — bias slightly in its - // favour by requiring the lazy match to beat by ≥2. - let margin = if m_is_rep1 { 2 } else { 1 }; - if n_len >= m_len + margin { - (pos + 1, n_dist, n_len) + let (best_pos, best_dist, best_len) = + if lazy && m_len < nice_match && pos + 1 + MIN_MATCH < buf_len { + while next_insert <= pos + 1 { + self.matcher.insert(buffer, next_insert); + next_insert += 1; + } + let (n_dist, n_len, _) = best_at( + &self.matcher, + buffer, + pos + 1, + &block_offsets, + max_chain, + nice_match, + ); + let margin = if m_is_rep1 { 2 } else { 1 }; + if n_len >= m_len + margin { + (pos + 1, n_dist, n_len) + } else { + (pos, m_dist, m_len) + } } else { (pos, m_dist, m_len) - } - } else { - (pos, m_dist, m_len) - }; - - // Emit the literals run up to `best_pos`, then the chosen match. - let literal_run = best_pos - lit_start; - let offset_value = - assign_offset(best_dist as u32, literal_run as u32, &mut block_offsets); - sequences.push(Seq { - literal_length: literal_run as u32, - match_length: best_len as u32, - offset_value, - }); - // Splice the interior positions of the match into the chain so - // later positions can match against them. We only insert - // positions that aren't already in. - let match_end = best_pos + best_len; - while next_insert < match_end { - self.matcher.insert(buffer, next_insert); - next_insert += 1; + }; + + let literal_run = best_pos - lit_start; + let offset_value = + assign_offset(best_dist as u32, literal_run as u32, &mut block_offsets); + sequences.push(Seq { + literal_length: literal_run as u32, + match_length: best_len as u32, + offset_value, + }); + let match_end = best_pos + best_len; + while next_insert < match_end { + self.matcher.insert(buffer, next_insert); + next_insert += 1; + } + pos = match_end; + lit_start = pos; } - pos = match_end; - lit_start = pos; - } - let _ = lit_start; - if sequences.is_empty() { - return None; - } + let _ = lit_start; + if sequences.is_empty() { + return None; + } + finish_compressed_block( + buffer, + start, + &sequences, + block_offsets, + self.prev_huff_lengths.as_ref(), + ) + }; - finish_compressed_block( - buffer, - &sequences, - block_offsets, - self.prev_huff_lengths.as_ref(), - ) - .map(|(body, new_lengths, committed_offsets)| { + body.map(|(body, new_lengths, committed_offsets)| { self.prev_offsets = committed_offsets; if let Some(lengths) = new_lengths { self.prev_huff_lengths = Some(lengths); @@ -433,16 +452,20 @@ impl Encoder { /// `self.pending` (which `buffer` borrows) against `&mut self`. fn finish_compressed_block( buffer: &[u8], + start: usize, sequences: &[Seq], block_offsets: [u32; 3], prev_huff_lengths: Option<&HuffLengths>, ) -> Option<(Vec, Option, [u32; 3])> { - // Reconstruct all literal bytes by replaying the sequences: each sequence - // emits `literal_length` literals from the cursor, then skips - // `match_length` matched bytes. Trailing bytes after the last sequence are - // literals too. - let mut all_literals: Vec = Vec::with_capacity(buffer.len()); - let mut cursor = 0usize; + // Reconstruct the current block's literal bytes by replaying the sequences: + // each sequence emits `literal_length` literals from the cursor, then skips + // `match_length` matched bytes. The cursor starts at `start` (the first byte + // of this block within `buffer`; earlier bytes are retained history that + // matches reference but that this block does not re-emit). Trailing bytes + // after the last sequence are literals too. + let block_len = buffer.len() - start; + let mut all_literals: Vec = Vec::with_capacity(block_len); + let mut cursor = start; for s in sequences { let ll = s.literal_length as usize; all_literals.extend_from_slice(&buffer[cursor..cursor + ll]); @@ -454,7 +477,7 @@ fn finish_compressed_block( let seq_section = build_sequences_section(sequences); let total = lit_section.len() + seq_section.len(); - if total >= buffer.len() { + if total >= block_len { return None; // Not worth compressing. } @@ -642,7 +665,7 @@ impl Encoder { let body_size = self.pending.len() as u32; Self::push_block_header(&mut self.out_buf, body_size, 1, last); self.out_buf.push(self.pending[0]); - self.pending.clear(); + self.roll_history_from_pending(); return; } if let Some(body) = self.try_compress_block() { @@ -654,7 +677,27 @@ impl Encoder { Self::append_raw_block(&mut self.out_buf, &pending_snapshot, last); self.pending = pending_snapshot; } - self.pending.clear(); + self.roll_history_from_pending(); + } + + /// Move the just-emitted `pending` bytes into the cross-block match window + /// (`history`) and trim the window to [`MATCH_WINDOW`]. Every emitted block — + /// RLE, compressed, or raw — extends the decoder's window, so back-references + /// in later blocks may reference these bytes. Keeping `history` ≤ the window + /// guarantees every offset we emit stays within the advertised window. + fn roll_history_from_pending(&mut self) { + if !self.pending.is_empty() { + self.history.extend_from_slice(&self.pending); + self.pending.clear(); + } + if self.history.len() > MATCH_WINDOW { + let drop = self.history.len() - MATCH_WINDOW; + self.history.drain(0..drop); + // Front of the buffer shifted, so every absolute index recorded in + // the matcher is now stale. Drop the chains; the next block rebuilds + // them. Trimming only happens past the 8 MiB window, so this is rare. + self.matcher.resize_for(self.history.len().max(1)); + } } /// Copy as much of `out_buf[out_idx..]` into `output[*written..]` as fits. @@ -679,37 +722,160 @@ impl Encoder { // ─── price-based optimal parser ─────────────────────────────────────────── -/// Estimated bit cost of a literal byte (~Huffman-coded text/code literal). -/// Only the literal-vs-match trade-off depends on it, not correctness. -const LIT_PRICE: u32 = 9; - -/// Estimated bit cost of the offset part of a match: the FSE offset code plus -/// its extra bits, with a distance matching one of the active repeat offsets -/// priced near-free (repeats emit a tiny FSE code and NO offset extra bits). -fn offset_price(distance: u32, reps: &[u32; 3], ll: u32) -> u32 { - let is_rep = if ll > 0 { - distance == reps[0] || distance == reps[1] || distance == reps[2] +/// Fixed-point fractional-bit scale. Prices are carried as `bits << PRICE_SHIFT` +/// so the DP can weigh sub-bit differences (an FSE code can cost a fractional +/// number of bits). 8 fractional bits is ample precision and keeps every block +/// price well inside `u32` (128 KiB × ~16 bits × 256 ≈ 2^31). +const PRICE_SHIFT: u32 = 8; +const PRICE_ONE: u32 = 1 << PRICE_SHIFT; + +/// `round(log2(x) * 256)` for `x ≥ 1`, via the integer part plus a mantissa +/// interpolation. Converts FSE normalized counts into per-code bit prices +/// without floating point (the crate is `no_std`). +fn log2_fp(x: u32) -> u32 { + if x <= 1 { + return 0; + } + let i = 31 - x.leading_zeros(); // floor(log2(x)) + // Fractional part: the top PRICE_SHIFT mantissa bits below the leading 1. + let frac = if i >= PRICE_SHIFT { + (x >> (i - PRICE_SHIFT)) & (PRICE_ONE - 1) } else { - distance == reps[1] || distance == reps[2] || (reps[0] > 1 && distance == reps[0] - 1) + (x << (PRICE_SHIFT - i)) & (PRICE_ONE - 1) }; - if is_rep { - return 4; + // Linear approximation log2(1 + f) ≈ f over the mantissa (max error + // < 0.09 bit) — plenty accurate for pricing decisions. + i * PRICE_ONE + frac +} + +/// Per-code fractional-bit price model for one FSE-coded symbol stream. +/// +/// Derived from the *actual* normalized counts the encoder will emit (after a +/// first parse), so the DP prices each LL/ML/OF code at its real FSE cost +/// instead of a flat constant. The cost of a symbol whose normalized count is +/// `c` under accuracy log `al` is `al - log2(c)` bits (a state covering more of +/// the table reads fewer bits) — the standard FSE entropy estimate zstd's +/// optimal parser uses. +struct CodePrices { + /// `bits << PRICE_SHIFT` per code index. Empty → always use `fallback`. + cost: Vec, + /// Price for a code absent from `cost` (or with count 0). + fallback: u32, +} + +impl CodePrices { + /// Build from a normalized FSE count table (`counts[c]` may be -1 for the + /// "less than one" probability rows, which read `accuracy_log` bits). + fn from_counts(counts: &[i16], accuracy_log: u8) -> Self { + let al_fp = (accuracy_log as u32) * PRICE_ONE; + let mut cost = Vec::with_capacity(counts.len()); + for &c in counts { + let bits = if c <= 0 { + // count -1 (and the degenerate 0) → one state, reads `al` bits. + al_fp + } else { + al_fp.saturating_sub(log2_fp(c as u32)) + }; + cost.push(bits); + } + Self { + cost, + fallback: al_fp, + } + } + + #[inline] + fn code_cost(&self, code: u8) -> u32 { + self.cost + .get(code as usize) + .copied() + .unwrap_or(self.fallback) + } +} + +/// Full fractional-bit price model used by the optimal parser. +struct PriceModel { + ll: CodePrices, + ml: CodePrices, + of: CodePrices, + /// Average literal-byte cost (`bits << PRICE_SHIFT`) under the block's + /// Huffman tree. A single average is enough for the literal-vs-match + /// trade-off; per-byte pricing barely moves the parse. + lit: u32, +} + +impl PriceModel { + /// Heuristic model for the first pass (no statistics yet). Mirrors the + /// original flat constants so pass 1 behaves like the previous encoder. + fn heuristic() -> Self { + Self { + ll: CodePrices { + cost: Vec::new(), + fallback: 6 * PRICE_ONE, + }, + ml: CodePrices { + cost: Vec::new(), + fallback: 6 * PRICE_ONE, + }, + of: CodePrices { + cost: Vec::new(), + fallback: 5 * PRICE_ONE, + }, + lit: 9 * PRICE_ONE, + } + } + + /// Price the offset part of a match: FSE code cost + extra bits. A distance + /// hitting one of the active repeat offsets uses code 0..=2 (cheap, no extra + /// bits); a fresh offset uses code `log2(distance+3)` with that many extras. + #[inline] + fn offset_price(&self, distance: u32, reps: &[u32; 3], ll: u32) -> u32 { + if let Some(code) = rep_match_code(distance, reps, ll) { + return self.of.code_cost(code); + } + let val = distance + 3; + let code = 31 - val.leading_zeros(); + self.of.code_cost(code as u8) + code * PRICE_ONE + } + + /// Price the literal-length + match-length FSE codes plus their extra bits. + #[inline] + fn ll_ml_price(&self, literal_length: u32, match_length: u32) -> u32 { + let (lc, lb, _) = ll_code(literal_length); + let (mc, mb, _) = ml_code(match_length); + self.ll.code_cost(lc) + self.ml.code_cost(mc) + (lb + mb) * PRICE_ONE + } + + #[inline] + fn lit_run_price(&self, run: u32) -> u32 { + self.lit.saturating_mul(run) } - // Fresh offset: `code` extra bits (the literal low bits of the distance) - // plus the FSE-coded offset code itself (~5 bits amortised). The FSE code - // adapts to the block, so it is NOT another `log2(D)` — charging that would - // double-count and push the DP away from good long-distance matches. - let val = distance + 3; - let code = 31 - val.leading_zeros(); - code + 5 } -/// Estimated bit cost of the literal-length / match-length FSE codes plus -/// their extra bits for a sequence with the given run/length. -fn ll_ml_price(literal_length: u32, match_length: u32) -> u32 { - let (_lc, lb, _lv) = ll_code(literal_length); - let (_mc, mb, _mv) = ml_code(match_length); - 10 + lb + mb +/// If `distance` matches an active repeat offset, return the FSE *offset code* +/// it encodes to (0, 1, or 2 → offset_value 1, 2, 3). Mirrors the aliasing +/// rules in [`assign_offset`]: the `literal_length == 0` case shifts the codes. +#[inline] +fn rep_match_code(distance: u32, reps: &[u32; 3], ll: u32) -> Option { + if ll > 0 { + if distance == reps[0] { + Some(0) + } else if distance == reps[1] { + Some(1) + } else if distance == reps[2] { + Some(2) + } else { + None + } + } else if distance == reps[1] { + Some(0) + } else if distance == reps[2] { + Some(1) + } else if reps[0] > 1 && distance == reps[0] - 1 { + Some(2) + } else { + None + } } /// Update the repeat-offset ring after a match (mirrors `assign_offset`'s @@ -733,31 +899,58 @@ fn advance_reps(distance: u32, literal_length: u32, reps: &[u32; 3]) -> [u32; 3] /// greedy/lazy parsing comes from (their offset extra bits dominate output). /// /// Returns the chosen sequences (in order) and the final repeat-offset ring. -fn optimal_parse( - matcher: &mut MatchFinder, - buffer: &[u8], +/// +/// Invariant block-level inputs (where the block starts, the incoming repeat +/// ring, and the match-finder knobs) are grouped in [`ParseConfig`] so the same +/// settings flow through every repricing pass without a long argument list. +struct ParseConfig { + /// First byte of the block within `buffer`; earlier bytes are retained + /// history that matches may reference but the block does not re-emit. + start: usize, + /// Repeat-offset ring entering the block (carried across blocks). init_offsets: [u32; 3], + /// Hash-chain depth cap and "good enough" match length for the finder. max_chain: usize, nice_match: usize, +} + +fn optimal_parse( + matcher: &mut MatchFinder, + buffer: &[u8], + cfg: &ParseConfig, + model: &PriceModel, + block_indexed: bool, ) -> (Vec, [u32; 3]) { + let ParseConfig { + start, + init_offsets, + max_chain, + nice_match, + } = *cfg; let n = buffer.len(); - if n < MIN_MATCH + 1 { + if n < start + MIN_MATCH + 1 { return (Vec::new(), init_offsets); } - // Insert every hashable position up front so chain walks see the whole - // block (back-references only look earlier, so insertion order within the - // block doesn't affect correctness). - matcher.resize_for(n); - for i in 0..n.saturating_sub(3) { - matcher.insert(buffer, i); - } + // The caller has already indexed the retained history (`[0, start)`). On the + // first parse pass we splice in this block's positions lazily as the DP + // advances, so the chains never contain positions ahead of the probe (the + // standard LZ invariant). Repricing passes reuse the now-complete index. + let hash_end = n.saturating_sub(3); + let mut next_insert = start; + // DP arrays are block-relative: index `j` maps to absolute position + // `start + j`, so memory stays proportional to the block (not the window). + let m = n - start; const INF: u32 = u32::MAX; - let mut price: Vec = vec![INF; n + 1]; - // Back-pointer: (prev_pos, match_len, match_dist). match_len == 0 → literal. - let mut back: Vec<(u32, u32, u32)> = vec![(0, 0, 0); n + 1]; - let mut reps_at: Vec<[u32; 3]> = vec![init_offsets; n + 1]; + let mut price: Vec = vec![INF; m + 1]; + // Back-pointer: (prev_abs_pos, match_len, match_dist). match_len == 0 → literal. + let mut back: Vec<(u32, u32, u32)> = vec![(0, 0, 0); m + 1]; + let mut reps_at: Vec<[u32; 3]> = vec![init_offsets; m + 1]; + // Length of the literal run ending at each node on its cheapest path. Used + // to (a) price the literal-length FSE code at the following match and (b) + // resolve repeat-offset aliasing (the LL==0 case shifts the rep codes). + let mut runlen_at: Vec = vec![0; m + 1]; price[0] = 0; // Step length sparsely for long matches to bound DP work. Dense up to 128 @@ -774,46 +967,68 @@ fn optimal_parse( let mut cands: Vec = Vec::new(); - for i in 0..n { - let base = price[i]; + for j in 0..m { + let i = start + j; // absolute input position + + // Splice this position into the hash chain before it is probed (first + // pass only). Done unconditionally — even for unreachable nodes — so the + // chain stays complete and ordered; the match finder only ever looks at + // positions strictly before the probe. + if !block_indexed { + while next_insert <= i { + if next_insert < hash_end { + matcher.insert(buffer, next_insert); + } + next_insert += 1; + } + } + + let base = price[j]; if base == INF { continue; } - let cur_reps = reps_at[i]; - - // Option A: emit a literal. - let lit_cand = base.saturating_add(LIT_PRICE); - if lit_cand < price[i + 1] { - price[i + 1] = lit_cand; - back[i + 1] = (i as u32, 0, 0); - reps_at[i + 1] = cur_reps; + let cur_reps = reps_at[j]; + + // Option A: emit a literal. Carries the running literal-run length so + // the next match can price its literal-length FSE code correctly. + let lit_cand = base.saturating_add(model.lit_run_price(1)); + if lit_cand < price[j + 1] { + price[j + 1] = lit_cand; + back[j + 1] = (i as u32, 0, 0); + reps_at[j + 1] = cur_reps; + runlen_at[j + 1] = runlen_at[j] + 1; } if i + MIN_MATCH > n { continue; } - // Proxy literal-length for offset rep-aliasing: the common case is a - // sequence following some literals (LL>0, reps map to codes 1..=3). - let ll_proxy = 1u32; + // Actual literal-run length on the cheapest path to `i`. Drives both the + // repeat-offset aliasing (LL==0 shifts the rep codes) and the LL FSE + // code charged on the emitted sequence. + let ll = runlen_at[j]; + // ML code + ML extra bits + LL code + LL extra bits for this sequence. + // The literal *bytes* are already priced step-by-step above, so only the + // length codes are charged here. + let ll_ml = |l: u32| model.ll_ml_price(ll, l); // Option B1: repeat-offset matches at the three active distances. for &d in &cur_reps { if d == 0 || (d as usize) > i { continue; } - let m = matcher.check_repeat_offset(buffer, i, d as usize); - if m >= MIN_MATCH { - let max_l = m.min(n - i); - let off = offset_price(d, &cur_reps, ll_proxy); + let mm = matcher.check_repeat_offset(buffer, i, d as usize); + if mm >= MIN_MATCH { + let max_l = mm.min(n - i); + let off = model.offset_price(d, &cur_reps, ll); + let next_reps = advance_reps(d, ll, &cur_reps); let mut l = MIN_MATCH; while l <= max_l { - let cost = base - .saturating_add(off) - .saturating_add(ll_ml_price(0, l as u32)); - if cost < price[i + l] { - price[i + l] = cost; - back[i + l] = (i as u32, l as u32, d); - reps_at[i + l] = advance_reps(d, ll_proxy, &cur_reps); + let cost = base.saturating_add(off).saturating_add(ll_ml(l as u32)); + if cost < price[j + l] { + price[j + l] = cost; + back[j + l] = (i as u32, l as u32, d); + reps_at[j + l] = next_reps; + runlen_at[j + l] = 0; } if l == max_l { break; @@ -827,17 +1042,22 @@ fn optimal_parse( matcher.collect_matches(buffer, i, n, max_chain, nice_match, &mut cands); for c in &cands { let d = c.distance as u32; + // Skip candidates that coincide with a repeat offset — already + // priced (more cheaply) in B1. + if rep_match_code(d, &cur_reps, ll).is_some() { + continue; + } let max_l = c.length.min(n - i); - let off = offset_price(d, &cur_reps, ll_proxy); + let off = model.offset_price(d, &cur_reps, ll); + let next_reps = advance_reps(d, ll, &cur_reps); let mut l = MIN_MATCH; while l <= max_l { - let cost = base - .saturating_add(off) - .saturating_add(ll_ml_price(0, l as u32)); - if cost < price[i + l] { - price[i + l] = cost; - back[i + l] = (i as u32, l as u32, d); - reps_at[i + l] = advance_reps(d, ll_proxy, &cur_reps); + let cost = base.saturating_add(off).saturating_add(ll_ml(l as u32)); + if cost < price[j + l] { + price[j + l] = cost; + back[j + l] = (i as u32, l as u32, d); + reps_at[j + l] = next_reps; + runlen_at[j + l] = 0; } if l == max_l { break; @@ -849,11 +1069,11 @@ fn optimal_parse( // Backtrack to recover the chosen steps, then emit sequences forward. let mut steps: Vec<(u32, u32)> = Vec::new(); // (match_len, match_dist); 0 = literal - let mut i = n; - while i > 0 { - let (prev, mlen, mdist) = back[i]; + let mut j = m; + while j > 0 { + let (prev_abs, mlen, mdist) = back[j]; steps.push((mlen, mdist)); - i = prev as usize; + j = (prev_abs as usize) - start; } steps.reverse(); @@ -879,6 +1099,180 @@ fn optimal_parse( (sequences, block_offsets) } +/// Lowest level at which the optimal parser runs a second, statistics-driven +/// repricing pass over the block. The first pass uses a flat heuristic model; +/// the second rebuilds the price model from the actual LL/ML/OF code and +/// literal-byte statistics and reparses, so prices reflect the entropy coding +/// the block will actually use. Mirrors zstd's `btultra2` two-pass parse. +const REPRICE_LEVEL: u8 = 15; + +/// Build a statistics-driven [`PriceModel`] from a completed parse's sequences +/// and the block's literal bytes. The FSE code prices come from the same +/// normalized-count tables [`build_sequences_section`] / [`pick_table`] would +/// pick; the literal price is the Huffman entropy of the literal bytes. +fn build_price_model(buffer: &[u8], start: usize, sequences: &[Seq]) -> PriceModel { + // Histogram the LL/ML/OF codes exactly as the sequence section will. + let mut ll_hist = [0u32; 36]; + let mut ml_hist = [0u32; 53]; + let mut of_hist = [0u32; 32]; + for s in sequences { + let (lc, _, _) = ll_code(s.literal_length); + let (mc, _, _) = ml_code(s.match_length); + let (oc, _, _) = of_code(s.offset_value); + ll_hist[lc as usize] += 1; + ml_hist[mc as usize] += 1; + of_hist[(oc as usize).min(31)] += 1; + } + let n = sequences.len() as u32; + + let ll = code_prices_for(&ll_hist, &DEFAULT_LL_COUNTS, DEFAULT_LL_ACCURACY_LOG, 9, n); + let of = code_prices_for(&of_hist, &DEFAULT_OF_COUNTS, DEFAULT_OF_ACCURACY_LOG, 8, n); + let ml = code_prices_for(&ml_hist, &DEFAULT_ML_COUNTS, DEFAULT_ML_ACCURACY_LOG, 9, n); + + // Literal-byte price: average Huffman code length over the actual literal + // bytes of this block (the `[start, end)` tail). Reconstruct the literal + // stream from the sequences, build the same tree the literals section would, + // and average its code lengths. + let all_literals = reconstruct_literals(buffer, start, sequences); + let lit = literal_price(&all_literals); + + PriceModel { ll, ml, of, lit } +} + +/// Replay `sequences` over `buffer[start..]` to recover the block's literal +/// bytes (the bytes not covered by any match). Shared by the price model and +/// the cost predictor so they always agree with [`finish_compressed_block`]. +fn reconstruct_literals(buffer: &[u8], start: usize, sequences: &[Seq]) -> Vec { + let mut out: Vec = Vec::with_capacity(buffer.len() - start); + let mut cursor = start; + for s in sequences { + let ll = s.literal_length as usize; + out.extend_from_slice(&buffer[cursor..cursor + ll]); + cursor += ll + s.match_length as usize; + } + out.extend_from_slice(&buffer[cursor..]); + out +} + +/// Pick the normalized FSE counts the encoder would actually emit for a code +/// stream (custom if it beats predefined, else predefined) and turn them into a +/// [`CodePrices`]. `al_cap` bounds the custom accuracy log; `n` is the sequence +/// count (used to size the custom table the way [`pick_table`] does). +fn code_prices_for( + hist: &[u32], + default_counts: &[i16], + default_al: u8, + al_cap: u8, + n: u32, +) -> CodePrices { + if n == 0 { + return CodePrices::from_counts(default_counts, default_al); + } + // Mirror pick_table's accuracy-log selection for the custom table. + let mut al = al_cap; + while al > 5 && (1u32 << al) > n * 4 { + al -= 1; + } + if al < 5 { + al = 5; + } + if let Some(counts) = build_normalised_counts(hist, n, al) { + // Compare predicted bitstream cost the way pick_table does; only use the + // custom table's prices when it is the one that would be emitted. + let pred_custom = predict_fse_bits(&counts, hist, al); + let pred_default = predict_fse_bits(default_counts, hist, default_al); + let header = encode_fse_table_header(&counts, al); + let custom_bytes = (pred_custom / 8 + 1) as usize + header.len(); + let default_bytes = (pred_default / 8 + 1) as usize; + if custom_bytes + 2 < default_bytes { + return CodePrices::from_counts(&counts, al); + } + } + CodePrices::from_counts(default_counts, default_al) +} + +/// Average Huffman code length (as `bits << PRICE_SHIFT`) over `literals`, +/// using the same canonical tree the literals section builds. Falls back to a +/// flat 8 bits when the alphabet can't be Huffman-coded (tiny/degenerate input). +fn literal_price(literals: &[u8]) -> u32 { + if literals.is_empty() { + return 8 * PRICE_ONE; + } + let freq = histogram(literals); + if let Some(lengths) = build_huff_lengths(&freq) { + let mut total_bits: u64 = 0; + let mut total_syms: u64 = 0; + for (b, &f) in freq.iter().enumerate() { + if f == 0 { + continue; + } + let len = lengths[b] as u64; + total_bits += (f as u64) * len.max(1); + total_syms += f as u64; + } + // Average bits/symbol in fixed point. Clamp to ≥1 bit so the DP never + // treats literals as free. + if let Some(avg) = (total_bits * PRICE_ONE as u64).checked_div(total_syms) { + return (avg as u32).max(PRICE_ONE); + } + } + 8 * PRICE_ONE +} + +/// Drive the optimal parser: a first heuristic-priced pass, then (at high +/// levels) one or more statistics-driven repricing passes. Each repricing pass +/// rebuilds the price model from the previous pass's sequences and reparses; +/// the best (smallest predicted body) result wins. Returns the chosen sequences +/// and final repeat-offset ring. +fn optimal_parse_2pass( + matcher: &mut MatchFinder, + buffer: &[u8], + cfg: &ParseConfig, + reprice_passes: u32, +) -> (Vec, [u32; 3]) { + let start = cfg.start; + // Pass 1: flat heuristic prices. Splices the current block into the chains + // lazily (history was pre-indexed by the caller); later passes reuse them. + let mut model = PriceModel::heuristic(); + let (mut best_seqs, mut best_offsets) = optimal_parse(matcher, buffer, cfg, &model, false); + if best_seqs.is_empty() { + return (best_seqs, best_offsets); + } + let mut best_cost = predicted_block_cost(buffer, start, &best_seqs); + + // Repricing passes: rebuild the model from the current best parse and + // reparse. Keep iterating while it strictly improves the predicted cost. + for _ in 0..reprice_passes { + model = build_price_model(buffer, start, &best_seqs); + let (seqs, offsets) = optimal_parse( + matcher, buffer, cfg, &model, true, // chains already populated by pass 1 + ); + if seqs.is_empty() { + break; + } + let cost = predicted_block_cost(buffer, start, &seqs); + if cost < best_cost { + best_cost = cost; + best_seqs = seqs; + best_offsets = offsets; + } else { + break; // no further improvement + } + } + + (best_seqs, best_offsets) +} + +/// Predict the total compressed body size (in bytes) for a parse, used to pick +/// the better of two repricing passes. Sums the literals section and sequences +/// section the encoder would actually emit. Cheap relative to the parse itself. +fn predicted_block_cost(buffer: &[u8], start: usize, sequences: &[Seq]) -> usize { + let all_literals = reconstruct_literals(buffer, start, sequences); + let (lit_section, _) = build_literals_section(&all_literals, None); + let seq_section = build_sequences_section(sequences); + lit_section.len() + seq_section.len() +} + /// Find the best (distance, length) match at `pos`, mixing repeat-offset /// probes with a hash-chain search. /// @@ -1518,6 +1912,7 @@ impl RawEncoder for Encoder { fn raw_reset(&mut self) { self.state = State::Accepting; + self.history.clear(); self.pending.clear(); self.out_buf.clear(); self.out_idx = 0; diff --git a/tests/lz4_frame.rs b/tests/lz4_frame.rs index bc4eee4..b850efb 100644 --- a/tests/lz4_frame.rs +++ b/tests/lz4_frame.rs @@ -418,6 +418,75 @@ fn linked_blocks_back_reference_across_boundary() { round_trip_with(cfg, &v); } +/// Linked blocks must compress strictly better than independent blocks when +/// the payload's second block repeats data from the first block's tail — +/// because only linked mode can back-reference across the boundary. +#[test] +fn linked_beats_independent_across_boundary() { + // First 64 KiB: pseudo-random (so it is not itself very compressible). + // Second block: a verbatim copy of the first block's last stretch, which + // only a cross-block reference can exploit. + let mut first = Vec::with_capacity(64 * 1024); + let mut state: u32 = 0x1357_9BDF; + for _ in 0..(64 * 1024) { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + first.push((state >> 16) as u8); + } + let mut payload = first.clone(); + // Second block reuses the tail of the first block verbatim. + payload.extend_from_slice(&first[16 * 1024..]); + + let linked = EncoderConfig { + block_max_size: BlockMaxSize::Max64KB, + block_independence: false, + level: 9, + ..Default::default() + }; + let independent = EncoderConfig { + block_independence: true, + ..linked + }; + + let enc_linked = encode_with_cfg_chunked(linked, &payload, 8192, 8192); + let enc_indep = encode_with_cfg_chunked(independent, &payload, 8192, 8192); + + // Both must round-trip. + round_trip_with(linked, &payload); + round_trip_with(independent, &payload); + + assert!( + enc_linked.len() < enc_indep.len(), + "linked {} should be smaller than independent {}", + enc_linked.len(), + enc_indep.len() + ); +} + +/// Our linked-block (default) encoder output must decode byte-for-byte through +/// the system `lz4` tool across multiple blocks — the core cross-block +/// reference path. +#[test] +fn cross_tool_linked_multiblock_our_encode_system_decode() { + if !system_lz4_available() { + eprintln!("skipping cross-tool test: `lz4` not on PATH"); + return; + } + // ~200 KiB so it spans several 64 KiB blocks, with long-range repetition + // that the linked window exploits across boundaries. + let unit = b"the quick brown fox jumps over the lazy dog. ".repeat(64); + let mut payload = Vec::new(); + while payload.len() < 200 * 1024 { + payload.extend_from_slice(&unit); + } + let cfg = EncoderConfig { + level: 9, + ..Default::default() + }; + let encoded = encode_with_cfg_chunked(cfg, &payload, 4096, 4096); + let decoded = system_decompress(&encoded).expect("system lz4 -dc failed"); + assert_eq!(decoded, payload); +} + // ─── Cross-tool: system `lz4 -dc` ──────────────────────────────────────── fn system_lz4_available() -> bool {