diff --git a/CHANGELOG.md b/CHANGELOG.md
index 99a793e..be7b6fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Performance
+
+- **Round 2 of encoder ratio + codec speed work** (encoder-only for ratio;
+  decoders unchanged and every format still decodes byte-for-byte with its
+  reference tool). Ratios on a 2.9 MB real-source corpus, our max level vs the
+  reference's max (`ours/ref`, lower is better):
+  - **xz / lzma2**: 1.51 → **1.10** — the LZMA2 chunk encoder now keeps the LZ
+    dictionary **continuous across chunks** (emits `0xC0` continue-dict control
+    bytes after the first `0xE0`, with a single match-finder spanning the whole
+    input) instead of resetting every 64 KiB. Closes nearly all of the gap to
+    the `.lzma` path (1.07). Also fixes the raw-LZMA2 decoder to feed
+    uncompressed (stored) chunks into the dictionary.
+  - **zstd**: 1.40 → **1.04 vs `zstd -19`** at max level (now beats `zstd -12`)
+    — cross-block matching over a retained sliding window (≤8 MiB, within the
+    advertised window) plus a two-pass, statistics-driven optimal parse
+    (btultra2-style repricing) and repeat-offset-aware DP pricing.
+  - **lz4**: 1.18 → **1.02** (frame, `-l 12` beats `lz4 -9`) — implemented LZ4
+    frame **block-linked** mode so matches reference up to 64 KiB of prior
+    blocks' output, not just the current block.
+- **Standalone-codec encode throughput** (output byte-identical):
+  - **bwt** encode ~3× faster — replaced the prefix-doubling rotation sort with
+    linear-time SA-IS suffix-array construction.
+  - **mtf** encode ~2.3× faster (single-pass scan-and-shift); **rangecoder**
+    encode/decode ~+15% (tightened hot loops).
+
+  Note: the xz/lzma2 and zstd encoders now buffer the whole input to drive a
+  single continuous match-finder (same memory profile the `.lzma` path already
+  had); higher levels trade encode time for ratio, decode speed is unchanged.
+
 ## [0.6.3](https://github.com/KarpelesLab/compcol/compare/v0.6.2...v0.6.3) - 2026-06-15
 
 ### Added
diff --git a/src/bwt/mod.rs b/src/bwt/mod.rs
index 2a51789..243d841 100644
--- a/src/bwt/mod.rs
+++ b/src/bwt/mod.rs
@@ -36,15 +36,19 @@
 //!
 //! ## Forward transform
 //!
-//! For each block we build the order of its cyclic rotations with a
-//! prefix-doubling (Manber–Myers) sort — `O(n log n)` ranking rounds, each an
-//! `O(n)` counting sort on the `(rank[i], rank[i + k])` pairs. From the sorted
-//! rotation order `sa` (where `sa[r]` is the starting offset of the rotation
-//! ranked `r`) the BWT last column is `L[r] = block[(sa[r] + n - 1) mod n]`,
-//! and the primary index is the rank of the rotation that starts at offset 0.
+//! For each block we build the order of its cyclic rotations by suffix-sorting
+//! the doubled block `T + T + $` (sentinel `$` smaller than any byte) with
+//! **SA-IS** (Suffix Array by Induced Sorting; Nong, Zhang & Chan, 2009), a
+//! linear-time `O(n)` construction. From the sorted rotation order `sa` (where
+//! `sa[r]` is the starting offset of the rotation ranked `r`) the BWT last
+//! column is `L[r] = block[(sa[r] + n - 1) mod n]`, and the primary index is
+//! the rank of the rotation that starts at offset 0.
 //!
-//! The prefix-doubling sort is `O(n log n)` and handles the pathological cases
-//! (all-equal bytes, long repeats) without degrading to `O(n² log n)`.
+//! SA-IS is `O(n)` and handles the pathological cases (all-equal bytes, long
+//! repeats) without degrading. A small KMP-based tie-break pass reorders any
+//! run of *equal* cyclic rotations (only possible for fully periodic blocks)
+//! by ascending offset, so the emitted `(L, primary)` pair is identical to
+//! that of a stable cyclic-rotation sort.
 //!
 //! ## Inverse transform
 //!
@@ -64,8 +68,9 @@
 //! ## Licensing
 //!
 //! Clean-room from the published BWT algorithm description (Burrows & Wheeler,
-//! 1994) and the textbook prefix-doubling rotation sort. No code was copied
-//! from `src/bzip2/` or any third-party source.
+//! 1994) and the published SA-IS suffix-array construction (Nong, Zhang &
+//! Chan, 2009). Implemented independently within this module; no code was
+//! copied from `src/bzip2/` or any third-party source.
 
 #![cfg_attr(docsrs, doc(cfg(feature = "bwt")))]
 
diff --git a/src/bwt/transform.rs b/src/bwt/transform.rs
index 7d9d205..8ace78b 100644
--- a/src/bwt/transform.rs
+++ b/src/bwt/transform.rs
@@ -45,114 +45,354 @@ pub(super) fn forward(block: &[u8]) -> (Vec<u8>, usize) {
 /// Sort the `n` cyclic rotations of `block` and return their starting offsets
 /// in sorted order (`sa[r]` = offset of the rotation ranked `r`).
 ///
-/// Uses prefix doubling (Manber–Myers): start with rank = byte value, then
-/// repeatedly refine ranks by the pair `(rank[i], rank[(i + k) mod n])` for
-/// doubling `k`, until every rotation has a distinct rank (or `k >= n`). Each
-/// round is an `O(n)` counting sort over the rank pairs, and there are
-/// `O(log n)` rounds, so the whole thing is `O(n log n)`.
+/// Sorting cyclic rotations is reduced to an ordinary suffix sort over the
+/// **doubled** block `T + T` followed by a sentinel strictly smaller than any
+/// byte. Suffix `i` of `T + T + $` for `0 <= i < n` begins with the i-th
+/// cyclic rotation of `T` (its first `n` characters are exactly that
+/// rotation), so the suffix-sort order restricted to those `n` positions is a
+/// valid cyclic-rotation order. The suffix array itself is built with **SA-IS**
+/// (Suffix Array by Induced Sorting; Nong, Zhang & Chan, IEEE TC 2009), a
+/// linear-time `O(n)` algorithm — several times faster than the previous
+/// `O(n log n)` prefix-doubling sort.
+///
+/// For periodic `T` two rotations can be equal; any consistent tie-break
+/// yields a valid BWT (the emitted `L` bytes are identical for tied rotations
+/// anyway), and the inverse transform recovers the original regardless.
 fn sort_rotations(block: &[u8]) -> Vec<u32> {
     let n = block.len();
     debug_assert!(n >= 1);
 
-    // Order array: the rotation offsets, to be permuted into sorted order.
-    let mut order: Vec<u32> = (0..n as u32).collect();
-
     if n == 1 {
-        return order;
+        return vec![0u32];
     }
 
-    // `rank[i]` = current rank of the rotation starting at offset `i`.
-    //
-    // Initialise from the byte at each offset, but *densely*: map each
-    // distinct byte value to its 0-based order among the values that actually
-    // occur. This keeps every rank in `0..n` from the very first round, which
-    // is what the counting sort's bucket sizing (`n_keys = n`) relies on —
-    // raw byte values would reach 255 and overflow the buckets on small
-    // blocks.
-    let mut present = [false; 256];
+    // Doubled text `T + T + sentinel`. Bytes map to 1..=256, sentinel to 0,
+    // giving an unsigned alphabet of size 257. `n` is bounded by the BWT
+    // block size (<= 64 MiB), so `2 * n + 1` stays well within `i32`.
+    let mut text: Vec<i32> = Vec::with_capacity(2 * n + 1);
+    for &b in block {
+        text.push(b as i32 + 1);
+    }
     for &b in block {
-        present[b as usize] = true;
-    }
-    let mut byte_rank = [0u32; 256];
-    let mut next_rank = 0u32;
-    for (v, &seen) in present.iter().enumerate() {
-        if seen {
-            byte_rank[v] = next_rank;
-            next_rank += 1;
+        text.push(b as i32 + 1);
+    }
+    text.push(0); // sentinel
+
+    let sa = sa_is(&text, 257);
+
+    // Keep only suffixes that start in the first half (`< n`): those are the
+    // `n` cyclic rotations of `T`. The sentinel-only suffix sorts first and is
+    // skipped, as are the second-half suffixes (each dominated by the matching
+    // first-half rotation).
+    let mut order: Vec<u32> = Vec::with_capacity(n);
+    for &s in sa.iter() {
+        let s = s as usize;
+        if s < n {
+            order.push(s as u32);
         }
     }
-    let mut rank: Vec<u32> = block.iter().map(|&b| byte_rank[b as usize]).collect();
-    // Scratch buffers reused across rounds.
-    let mut new_rank: Vec<u32> = vec![0; n];
-    let mut tmp_order: Vec<u32> = vec![0; n];
-
-    let mut k = 1usize;
-    loop {
-        // Sort `order` by the key (rank[i], rank[(i + k) mod n]) using a
-        // stable two-pass counting sort (LSD radix on the two rank fields).
-        // Pass 1: by the second key, rank[(i + k) mod n].
-        counting_sort_by(&order, &mut tmp_order, n, |i| {
-            rank[(i as usize + k) % n] as usize
-        });
-        // Pass 2: by the first key, rank[i]. Stable, so ties keep the
-        // second-key order established above.
-        counting_sort_by(&tmp_order, &mut order, n, |i| rank[i as usize] as usize);
-
-        // Recompute ranks from the freshly sorted order. Two adjacent
-        // rotations share a rank iff both key components are equal.
-        new_rank[order[0] as usize] = 0;
-        let mut r = 0u32;
-        for w in 1..n {
-            let prev = order[w - 1] as usize;
-            let cur = order[w] as usize;
-            let prev_key = (rank[prev], rank[(prev + k) % n]);
-            let cur_key = (rank[cur], rank[(cur + k) % n]);
-            if cur_key != prev_key {
-                r += 1;
+    debug_assert_eq!(order.len(), n);
+
+    // Tie-break normalization. For periodic blocks several cyclic rotations
+    // can be *equal*; the doubled-suffix order breaks those ties by the
+    // distance to the sentinel (favouring larger offsets), whereas a stable
+    // rotation sort favours the smaller offset. Equal rotations share an
+    // identical last column, so reordering within a tied run never changes
+    // `L` — but it can change which row is the primary index. To keep the BWT
+    // output byte-identical to a stable cyclic sort, reorder each maximal run
+    // of equal rotations by ascending starting offset.
+    //
+    // Equal rotations are contiguous in `order`. We need the smallest period
+    // `p` of the block: rotation `a` equals rotation `b` iff `p` divides
+    // `b - a`. With `p` known, equality of adjacent rotations is an O(1) test
+    // and the whole normalization is O(n).
+    let period = smallest_period(block);
+    if period < n {
+        let mut i = 0usize;
+        while i < n {
+            // Extend a run while consecutive rotations are equal. With the
+            // block fully periodic, rotations are equal iff their starting
+            // offsets are congruent modulo `period`, an O(1) test.
+            let base = order[i] as usize % period;
+            let mut j = i + 1;
+            while j < n && order[j] as usize % period == base {
+                j += 1;
+            }
+            if j - i > 1 {
+                order[i..j].sort_unstable();
             }
-            new_rank[cur] = r;
+            i = j;
         }
-        rank.copy_from_slice(&new_rank);
+    }
+    order
+}
 
-        // All rotations distinct → fully sorted.
-        if r as usize == n - 1 {
-            break;
+/// Smallest period `p` (1 <= p <= n) such that `block[k] == block[k + p]` for
+/// all valid `k` *and* `p` divides `n` — i.e. `block` is `n/p` repetitions of
+/// its length-`p` prefix. Returns `n` when the block is not periodic. Computed
+/// in O(n) with the Knuth–Morris–Pratt failure function.
+fn smallest_period(block: &[u8]) -> usize {
+    let n = block.len();
+    if n <= 1 {
+        return n;
+    }
+    // KMP prefix-function (longest proper border length).
+    let mut fail = vec![0usize; n];
+    let mut k = 0usize;
+    for i in 1..n {
+        while k > 0 && block[i] != block[k] {
+            k = fail[k - 1];
         }
-        // Doubling. Once k >= n the second key spans the whole rotation, so
-        // one more round (already done above) suffices; guard anyway.
-        k <<= 1;
-        if k >= n {
-            break;
+        if block[i] == block[k] {
+            k += 1;
         }
+        fail[i] = k;
     }
-    order
+    let p = n - fail[n - 1];
+    if n.is_multiple_of(p) { p } else { n }
 }
 
-/// Stable counting sort of `src` (a permutation of rotation offsets) into
-/// `dst`, keyed by `key(offset)` which must return a value in `0..n_keys`.
-///
-/// `n_keys` is an upper bound on the key range; we use `n` (the block length)
-/// since every rank lies in `0..n`.
-fn counting_sort_by<F>(src: &[u32], dst: &mut [u32], n_keys: usize, key: F)
-where
-    F: Fn(u32) -> usize,
-{
-    let mut counts = vec![0usize; n_keys + 1];
-    for &i in src {
-        counts[key(i)] += 1;
-    }
-    // Prefix sums → starting offset for each key bucket.
-    let mut acc = 0usize;
-    for c in counts.iter_mut() {
-        let cur = *c;
-        *c = acc;
-        acc += cur;
-    }
-    // Scatter, preserving input order within a bucket (stability).
-    for &i in src {
-        let slot = key(i);
-        dst[counts[slot]] = i;
-        counts[slot] += 1;
+// ─── SA-IS suffix array construction ──────────────────────────────────────
+
+/// Build the suffix array of `text` over an integer alphabet whose largest
+/// symbol is `< alphabet_size`. The text MUST end with a unique sentinel
+/// strictly smaller than every other symbol (we use 0; real bytes are shifted
+/// into `1..=256`). Returns an array of length `text.len()`, where `sa[i]` is
+/// the start index of the i-th smallest suffix. Linear time, linear space,
+/// pure safe Rust.
+fn sa_is(text: &[i32], alphabet_size: usize) -> Vec<i32> {
+    let n = text.len();
+    let mut sa = vec![-1i32; n];
+    sa_is_inner(text, &mut sa, alphabet_size);
+    sa
+}
+
+/// SA-IS core. Writes the suffix array into `sa` (length must equal
+/// `text.len()`).
+fn sa_is_inner(text: &[i32], sa: &mut [i32], alphabet_size: usize) {
+    let n = text.len();
+    debug_assert_eq!(sa.len(), n);
+
+    if n == 0 {
+        return;
+    }
+    if n == 1 {
+        sa[0] = 0;
+        return;
+    }
+    if n == 2 {
+        if text[0] < text[1] {
+            sa[0] = 0;
+            sa[1] = 1;
+        } else {
+            sa[0] = 1;
+            sa[1] = 0;
+        }
+        return;
+    }
+
+    // 1. Classify each suffix as S-type (`t[i] == true`) or L-type. The last
+    //    suffix (sentinel only) is S-type. Suffix i is S-type iff
+    //    text[i] < text[i+1], or text[i] == text[i+1] and i+1 is S-type.
+    //    Collect LMS positions (an S-type with an L-type left neighbour) while
+    //    classifying.
+    let mut t = vec![false; n];
+    t[n - 1] = true;
+    let mut lms_positions: Vec<i32> = Vec::new();
+    for i in (0..n - 1).rev() {
+        let si = match text[i].cmp(&text[i + 1]) {
+            core::cmp::Ordering::Less => true,
+            core::cmp::Ordering::Equal => t[i + 1],
+            core::cmp::Ordering::Greater => false,
+        };
+        t[i] = si;
+        if t[i + 1] && !si {
+            lms_positions.push((i + 1) as i32);
+        }
+    }
+    lms_positions.reverse();
+    let n1 = lms_positions.len();
+
+    // 2. Bucket sizes (per-symbol counts in `text`).
+    let mut counts = vec![0i32; alphabet_size];
+    for &c in text {
+        counts[c as usize] += 1;
+    }
+    let mut bucket = vec![0i32; alphabet_size];
+
+    // 3. Place LMS suffixes at the END of their buckets.
+    sa.fill(-1);
+    fill_bucket_ends(&counts, &mut bucket);
+    for &p in &lms_positions {
+        let c = text[p as usize] as usize;
+        bucket[c] -= 1;
+        sa[bucket[c] as usize] = p;
+    }
+
+    // 4-5. Induced sort: L-suffixes (left-to-right), then S-suffixes
+    //      (right-to-left).
+    induce_sort_l(text, sa, &t, &counts, &mut bucket);
+    induce_sort_s(text, sa, &t, &counts, &mut bucket);
+
+    // 6. Compact the induced LMS suffixes to the front of `sa`, then name
+    //    each by its LMS-substring identity.
+    let mut j1 = 0usize;
+    for i in 0..n {
+        if sa[i] >= 0 && is_lms(&t, sa[i] as usize) {
+            sa[j1] = sa[i];
+            j1 += 1;
+        }
+    }
+    debug_assert_eq!(j1, n1);
+    for slot in sa.iter_mut().take(n).skip(n1) {
+        *slot = -1;
+    }
+
+    let mut name: i32 = 0;
+    let mut prev: i32 = -1;
+    for i in 0..n1 {
+        let pos = sa[i] as usize;
+        let mut diff = false;
+        if prev == -1 {
+            diff = true;
+        } else {
+            let p = prev as usize;
+            let mut d = 0usize;
+            loop {
+                if pos + d >= n || p + d >= n {
+                    diff = true;
+                    break;
+                }
+                if text[pos + d] != text[p + d] || t[pos + d] != t[p + d] {
+                    diff = true;
+                    break;
+                }
+                if d > 0 && (is_lms(&t, pos + d) || is_lms(&t, p + d)) {
+                    if is_lms(&t, pos + d) != is_lms(&t, p + d) {
+                        diff = true;
+                    }
+                    break;
+                }
+                d += 1;
+            }
+        }
+        if diff {
+            name += 1;
+            prev = pos as i32;
+        }
+        sa[n1 + pos / 2] = name - 1;
+    }
+    let mut j = n - 1;
+    for i in (n1..n).rev() {
+        if sa[i] >= 0 {
+            sa[j] = sa[i];
+            j -= 1;
+        }
+    }
+
+    // 7. Solve the reduced problem (recursively if names collide).
+    let new_alpha = (name as usize) + 1;
+    let (sa1_area, t1_area) = sa.split_at_mut(n - n1);
+    if (name as usize) == n1 {
+        for (i, &name_of_pos) in t1_area.iter().enumerate() {
+            sa1_area[name_of_pos as usize] = i as i32;
+        }
+    } else {
+        let reduced_text: &[i32] = &t1_area[..n1];
+        let sa1 = &mut sa1_area[..n1];
+        sa_is_inner(reduced_text, sa1, new_alpha);
+    }
+
+    // 8. Map the sorted reduced suffixes back to original LMS positions.
+    for slot in sa.iter_mut().take(n1) {
+        let idx = *slot as usize;
+        *slot = lms_positions[idx];
+    }
+    for slot in sa.iter_mut().take(n).skip(n1) {
+        *slot = -1;
+    }
+
+    // 9. Re-place the now-sorted LMS suffixes at bucket ends, then 10. final
+    //    induced sorts. Snapshot the sorted LMS positions first so the
+    //    scatter cannot clobber a not-yet-read entry.
+    let mut lms_sorted: Vec<i32> = Vec::with_capacity(n1);
+    lms_sorted.extend_from_slice(&sa[..n1]);
+    for slot in sa.iter_mut().take(n) {
+        *slot = -1;
+    }
+    fill_bucket_ends(&counts, &mut bucket);
+    for &pos in lms_sorted.iter().rev() {
+        let c = text[pos as usize] as usize;
+        bucket[c] -= 1;
+        sa[bucket[c] as usize] = pos;
+    }
+
+    induce_sort_l(text, sa, &t, &counts, &mut bucket);
+    induce_sort_s(text, sa, &t, &counts, &mut bucket);
+}
+
+/// `true` iff suffix `i` is S-type AND suffix `i-1` is L-type (left-most S in
+/// a run). Suffix 0 is never an LMS in our convention.
+#[inline(always)]
+fn is_lms(t: &[bool], i: usize) -> bool {
+    i > 0 && t[i] && !t[i - 1]
+}
+
+/// Materialise each bucket *start* (exclusive prefix sum of `counts`).
+#[inline]
+fn fill_bucket_starts(counts: &[i32], out: &mut [i32]) {
+    let mut acc = 0i32;
+    for (o, &c) in out.iter_mut().zip(counts.iter()) {
+        *o = acc;
+        acc += c;
+    }
+}
+
+/// Materialise each bucket *end* (inclusive prefix sum of `counts`).
+#[inline]
+fn fill_bucket_ends(counts: &[i32], out: &mut [i32]) {
+    let mut acc = 0i32;
+    for (o, &c) in out.iter_mut().zip(counts.iter()) {
+        acc += c;
+        *o = acc;
+    }
+}
+
+/// Induced sort of L-type suffixes (left-to-right scan over `sa`).
+fn induce_sort_l(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32], bucket: &mut [i32]) {
+    let n = text.len();
+    fill_bucket_starts(counts, bucket);
+    for i in 0..n {
+        let v = sa[i];
+        if v <= 0 {
+            continue;
+        }
+        let j = (v as usize) - 1;
+        if !t[j] {
+            let c = text[j] as usize;
+            let slot = bucket[c];
+            sa[slot as usize] = j as i32;
+            bucket[c] = slot + 1;
+        }
+    }
+}
+
+/// Induced sort of S-type suffixes (right-to-left scan over `sa`).
+fn induce_sort_s(text: &[i32], sa: &mut [i32], t: &[bool], counts: &[i32], bucket: &mut [i32]) {
+    let n = text.len();
+    fill_bucket_ends(counts, bucket);
+    for i in (0..n).rev() {
+        let v = sa[i];
+        if v <= 0 {
+            continue;
+        }
+        let j = (v as usize) - 1;
+        if t[j] {
+            let c = text[j] as usize;
+            let slot = bucket[c] - 1;
+            bucket[c] = slot;
+            sa[slot as usize] = j as i32;
+        }
     }
 }
 
diff --git a/src/lz4/block.rs b/src/lz4/block.rs
index 7125eb2..124c4a7 100644
--- a/src/lz4/block.rs
+++ b/src/lz4/block.rs
@@ -230,15 +230,46 @@ pub fn encode_block(input: &[u8], out: &mut Vec<u8>) {
 /// The emitted bitstream is byte-for-byte a valid LZ4 block in every case —
 /// only the parse changes, so the reference `lz4` decoder reads it unchanged.
 pub fn encode_block_level(input: &[u8], out: &mut Vec<u8>, level: u8) {
-    if level < HC_LEVEL_THRESHOLD {
-        encode_block(input, out);
+    encode_block_level_dict(&[], input, out, level);
+}
+
+/// Encode `input` as a single LZ4 block, allowing back-references into a
+/// `dict` prefix (the tail of previously emitted output, up to 64 KiB).
+///
+/// This is the LZ4 frame **linked-block** match finder: matches in the
+/// current block may point back into `dict`, giving a sliding window that
+/// crosses block boundaries. The emitted bitstream is an ordinary LZ4 block —
+/// the offsets simply reach further back than the block's own length, which
+/// the reference decoder resolves against the previously decoded output.
+///
+/// `dict` should be at most `MAX_DISTANCE + 1` bytes (the caller carries a
+/// 64 KiB sliding window so this holds); any byte of `dict` further than
+/// `MAX_DISTANCE` from a match start is simply never referenced. Passing an
+/// empty `dict` reproduces [`encode_block_level`] exactly.
+///
+/// The end-of-block rules (last 5 bytes literal, last match starts ≥ 12 bytes
+/// before the block end) apply to the **current block** boundary only — the
+/// dict prefix never affects them.
+pub fn encode_block_level_dict(dict: &[u8], input: &[u8], out: &mut Vec<u8>, level: u8) {
+    if dict.is_empty() {
+        if level < HC_LEVEL_THRESHOLD {
+            encode_block(input, out);
+        } else if level < OPT_LEVEL_THRESHOLD {
+            encode_block_hc_dict(&[], input, out, level);
+        } else {
+            encode_block_optimal_dict(&[], input, out, level);
+        }
         return;
     }
+    // With a dictionary we always use a match finder that understands the
+    // prefix. The fast greedy path would ignore the dict, so low levels fall
+    // through to the HC finder (at its shallowest depth) to still cross block
+    // boundaries while staying cheap.
     if level < OPT_LEVEL_THRESHOLD {
-        encode_block_hc(input, out, level);
-        return;
+        encode_block_hc_dict(dict, input, out, level);
+    } else {
+        encode_block_optimal_dict(dict, input, out, level);
     }
-    encode_block_optimal(input, out, level);
 }
 
 /// Map a compression level to a hash-chain search depth (`nb_attempts`).
@@ -361,22 +392,49 @@ fn hc_resolve(
 /// candidate start it walks the chain up to `nb_attempts` links and keeps the
 /// longest match inside the 64 KiB window. A one-step lazy heuristic defers a
 /// match when the next position offers a strictly longer one.
-fn encode_block_hc(input: &[u8], out: &mut Vec<u8>, level: u8) {
+///
+/// `dict` is an optional prefix of previously emitted output that matches may
+/// reference (LZ4 linked-block mode). Internally `dict` and `input` are parsed
+/// over one combined buffer; only positions inside `input` may *start* a
+/// sequence, but their back-references may reach into the `dict` region.
+fn encode_block_hc_dict(dict: &[u8], input: &[u8], out: &mut Vec<u8>, level: u8) {
     out.clear();
     if input.is_empty() {
         return;
     }
+    // The current block is too small to host any match under the end-of-block
+    // rules; emit it as literals regardless of the dictionary.
     if input.len() < MFLIMIT + 1 {
         emit_last_literals(input, out);
         return;
     }
 
+    let dict_len = dict.len();
+    // Combined view: dict bytes occupy [0, dict_len), the current block occupies
+    // [dict_len, n). Matches can point anywhere behind a start position (subject
+    // to MAX_DISTANCE), so a match starting in the block can reach into dict.
+    // One contiguous allocation lets the existing finder address dict and block
+    // uniformly with plain offsets.
+    let buf: Vec<u8>;
+    let input: &[u8] = if dict_len == 0 {
+        input
+    } else {
+        let mut v = Vec::with_capacity(dict_len + input.len());
+        v.extend_from_slice(dict);
+        v.extend_from_slice(input);
+        buf = v;
+        &buf
+    };
+
     let n = input.len();
     let nb_attempts = nb_attempts_for_level(level);
 
     let mut head = alloc::vec![HASH_EMPTY; HC_HASH_TABLE_SIZE];
     let mut chain = alloc::vec![HASH_EMPTY; n];
 
+    // End-of-block limits are measured against the combined length `n`; since
+    // the current block sits at the tail, this keeps the last 5 / 12 bytes of
+    // the *block* literal exactly as the single-block path does.
     let match_limit = n - MFLIMIT; // last position a match may start at
     let hash_limit = n - MIN_MATCH - LAST_LITERALS; // last hashable position
     let forward_limit = n - LAST_LITERALS; // last 5 bytes stay literal
@@ -386,8 +444,9 @@ fn encode_block_hc(input: &[u8], out: &mut Vec<u8>, level: u8) {
     // each position is inserted exactly once and the chain stays strictly
     // ordered newest-first.
     let mut inserted_through: usize = 0;
-    let mut anchor: usize = 0;
-    let mut ip: usize = 0;
+    // Sequences may only start inside the current block.
+    let mut anchor: usize = dict_len;
+    let mut ip: usize = dict_len;
 
     // Insert all hashable positions in [inserted_through, up_to).
     macro_rules! insert_up_to {
@@ -400,6 +459,9 @@ fn encode_block_hc(input: &[u8], out: &mut Vec<u8>, level: u8) {
         }};
     }
 
+    // Seed the chain with every dict position so block matches can find them.
+    insert_up_to!(dict_len);
+
     while ip <= match_limit && ip <= hash_limit {
         // Ensure positions up to and including `ip` are in the chain.
         insert_up_to!(ip + 1);
@@ -528,7 +590,11 @@ struct OptStep {
 /// the hash-chain finder (sequence overhead + the literal run it terminates).
 /// Backtracking recovers the cheapest path, which is then emitted with the
 /// shared sequence emitter — so the bitstream stays a valid LZ4 block.
-fn encode_block_optimal(input: &[u8], out: &mut Vec<u8>, level: u8) {
+///
+/// `dict` is an optional prefix of previously emitted output that matches may
+/// reference (LZ4 linked-block mode). The DP only traverses positions inside
+/// the current block; matches found may point back into the `dict` region.
+fn encode_block_optimal_dict(dict: &[u8], input: &[u8], out: &mut Vec<u8>, level: u8) {
     out.clear();
     if input.is_empty() {
         return;
@@ -538,6 +604,20 @@ fn encode_block_optimal(input: &[u8], out: &mut Vec<u8>, level: u8) {
         return;
     }
 
+    let dict_len = dict.len();
+    // Combined buffer: dict at [0, dict_len), block at [dict_len, n). See
+    // `encode_block_hc_dict` for the rationale.
+    let buf: Vec<u8>;
+    let input: &[u8] = if dict_len == 0 {
+        input
+    } else {
+        let mut v = Vec::with_capacity(dict_len + input.len());
+        v.extend_from_slice(dict);
+        v.extend_from_slice(input);
+        buf = v;
+        &buf
+    };
+
     let n = input.len();
     let nb_attempts = nb_attempts_for_level(level);
 
@@ -562,7 +642,9 @@ fn encode_block_optimal(input: &[u8], out: &mut Vec<u8>, level: u8) {
         };
         n + 1
     ];
-    price[0] = 0;
+    // The DP origin is the start of the current block, not byte 0: the dict
+    // region is never traversed, only referenced.
+    price[dict_len] = 0;
 
     // Insert all positions up to `up_to` (exclusive) that are hashable.
     let mut inserted_through = 0usize;
@@ -576,7 +658,10 @@ fn encode_block_optimal(input: &[u8], out: &mut Vec<u8>, level: u8) {
         }};
     }
 
-    let mut i = 0usize;
+    // Seed the chain with every dict position so block matches can find them.
+    insert_up_to!(dict_len);
+
+    let mut i = dict_len;
     while i < n {
         if price[i] == usize::MAX {
             i += 1;
@@ -666,10 +751,11 @@ fn encode_block_optimal(input: &[u8], out: &mut Vec<u8>, level: u8) {
         i += 1;
     }
 
-    // Backtrack from n to 0, collecting the path edges in reverse.
+    // Backtrack from n to the block origin, collecting the path edges in
+    // reverse.
     let mut path: Vec<OptStep> = Vec::new();
     let mut pos = n;
-    while pos > 0 {
+    while pos > dict_len {
         let s = step[pos];
         if s.match_pos == usize::MAX {
             // Literal edge: step back one byte. Collapse a contiguous literal
@@ -683,7 +769,7 @@ fn encode_block_optimal(input: &[u8], out: &mut Vec<u8>, level: u8) {
     path.reverse();
 
     // Replay forward, emitting literals then each match.
-    let mut anchor = 0usize;
+    let mut anchor = dict_len;
     for s in &path {
         let match_start = {
             // The match's start position is the end-of-literal-run point. We
diff --git a/src/lz4/frame.rs b/src/lz4/frame.rs
index 6dfaa30..c3d0b19 100644
--- a/src/lz4/frame.rs
+++ b/src/lz4/frame.rs
@@ -34,10 +34,15 @@
 //! When `block_independence = false` (the LZ4 CLI default and ours), the
 //! decoder must keep up to 64 KiB of the previously-decoded payload
 //! available so the next block's back-references can address it. Our
-//! encoder respects this in the same direction: the matcher only ever
-//! looks within the current block (i.e. it produces blocks that decode
-//! correctly under either mode), but the decoder honours linked-block
-//! semantics on input from any conforming encoder.
+//! encoder produces such references: it carries a sliding 64 KiB window of
+//! the most recently emitted raw output and offers it to the block match
+//! finder as a dictionary (see [`block::encode_block_level_dict`]), so a
+//! match in one block can reference bytes that were emitted by previous
+//! blocks. This is what gives linked mode its ratio edge over independent
+//! blocks, whose match window is just the block's own ≤ 64 KiB.
+//!
+//! When `block_independence = true`, the window is never populated and each
+//! block is compressed in isolation, decoding correctly on its own.
 
 use alloc::vec::Vec;
 
@@ -362,6 +367,10 @@ pub struct Encoder {
     /// xxHash32 over the entire raw content. Only used when
     /// `cfg.content_checksum`.
     content_hash: XxHash32,
+    /// Sliding 64 KiB window of the most recently emitted *raw* output, used
+    /// as a back-reference dictionary for the next block in linked-block mode
+    /// (`cfg.block_independence == false`). Empty in independent mode.
+    window: Vec<u8>,
 }
 
 impl Encoder {
@@ -373,6 +382,11 @@ impl Encoder {
     /// Construct with an explicit config.
     pub fn with_config(cfg: EncoderConfig) -> Self {
         let bs = cfg.block_max_size.bytes();
+        let window_cap = if cfg.block_independence {
+            0
+        } else {
+            WINDOW_SIZE
+        };
         let mut enc = Self {
             cfg,
             block_size: bs,
@@ -381,11 +395,32 @@ impl Encoder {
             staged_idx: 0,
             phase: EncPhase::Header,
             content_hash: XxHash32::new(),
+            window: Vec::with_capacity(window_cap),
         };
         enc.build_header();
         enc
     }
 
+    /// Append `bytes` (a block's raw payload) to the sliding back-reference
+    /// window, keeping only the trailing [`WINDOW_SIZE`] bytes. No-op in
+    /// independent mode.
+    fn push_window(&mut self, bytes: &[u8]) {
+        if self.cfg.block_independence {
+            return;
+        }
+        if bytes.len() >= WINDOW_SIZE {
+            self.window.clear();
+            self.window
+                .extend_from_slice(&bytes[bytes.len() - WINDOW_SIZE..]);
+            return;
+        }
+        let combined = self.window.len() + bytes.len();
+        if combined > WINDOW_SIZE {
+            self.window.drain(..combined - WINDOW_SIZE);
+        }
+        self.window.extend_from_slice(bytes);
+    }
+
     /// Stage the frame header (magic + FLG + BD + HC) in `staged`.
     fn build_header(&mut self) {
         self.staged.clear();
@@ -426,9 +461,20 @@ impl Encoder {
             self.content_hash.update(&self.raw);
         }
 
-        // Compress into a scratch buffer.
+        // Compress into a scratch buffer. In linked-block mode the previous
+        // blocks' trailing output (`self.window`, ≤ 64 KiB) is offered as a
+        // back-reference dictionary so matches can cross the block boundary.
         let mut compressed = Vec::with_capacity(block::compress_bound(self.raw.len()));
-        block::encode_block_level(&self.raw, &mut compressed, self.cfg.level);
+        block::encode_block_level_dict(&self.window, &self.raw, &mut compressed, self.cfg.level);
+
+        // Slide the window forward with this block's raw payload, regardless of
+        // whether we end up emitting it compressed or raw — the decoder's
+        // window tracks decoded output either way.
+        if !self.cfg.block_independence {
+            let raw = core::mem::take(&mut self.raw);
+            self.push_window(&raw);
+            self.raw = raw;
+        }
 
         self.staged.clear();
         // Choose the smaller of compressed / raw. The LZ4 Frame spec
@@ -626,6 +672,7 @@ impl RawEncoder for Encoder {
         self.staged.clear();
         self.staged_idx = 0;
         self.content_hash = XxHash32::new();
+        self.window.clear();
         self.phase = EncPhase::Header;
         self.build_header();
     }
diff --git a/src/lzma2/mod.rs b/src/lzma2/mod.rs
index 44d06cc..742a8e3 100644
--- a/src/lzma2/mod.rs
+++ b/src/lzma2/mod.rs
@@ -181,7 +181,7 @@ fn resolve_dict_size(cfg: &DecoderConfig) -> Result<usize, Error> {
 
 // ─── encoder ──────────────────────────────────────────────────────────────
 
-use crate::lzma2_internal::lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma_chunk};
+use crate::lzma2_internal::lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma2_stream};
 
 /// Dictionary size (in bytes) the encoder advertises to the LZMA chunk
 /// coder as the match-distance ceiling. Fixed at 4 MiB — the [`crate::xz`]
@@ -278,37 +278,41 @@ impl Encoder {
         }
     }
 
-    /// Stage one LZMA2 chunk for `data` (`1..=ENC_CHUNK_MAX` bytes), choosing
-    /// a compressed chunk when it shrinks the data and an uncompressed
-    /// fallback otherwise.
-    fn stage_chunk(&mut self, data: &[u8]) {
-        debug_assert!(!data.is_empty() && data.len() <= ENC_CHUNK_MAX);
-        let compressed = encode_lzma_chunk(data, ENC_DICT_SIZE, self.params);
-        // A compressed chunk is only worth emitting when its range-coded body
-        // is both smaller than the input and fits the 16-bit (+1) comp-size
-        // field. Otherwise the uncompressed chunk is strictly smaller.
-        let use_compressed =
-            !compressed.is_empty() && compressed.len() <= 65_536 && compressed.len() < data.len();
-        if use_compressed {
-            self.stage_compressed_chunk(data, &compressed);
-        } else {
-            self.stage_uncompressed_chunk(data);
+    /// Stage the whole stream's LZMA2 chunks from the fully-buffered `input`.
+    ///
+    /// The chunks come from a single continuous match-finder
+    /// ([`encode_lzma2_stream`]): the first chunk resets the dictionary
+    /// (`0xE0` compressed / `0x01` uncompressed) and every later chunk
+    /// continues it (`0xC0` compressed / `0x02` uncompressed), so cross-chunk
+    /// matches (up to the 4 MiB dictionary) are found. The caller appends the
+    /// `0x00` end marker afterwards.
+    fn stage_payload(&mut self, input: &[u8]) {
+        let chunks = encode_lzma2_stream(input, ENC_DICT_SIZE, self.params);
+        let mut pos = 0usize;
+        for chunk in chunks {
+            let data = &input[pos..pos + chunk.uncomp_len];
+            match chunk.body {
+                Some(ref body) => self.stage_compressed_chunk(data, body, chunk.reset_dict),
+                None => self.stage_uncompressed_chunk(data, chunk.reset_dict),
+            }
+            pos += chunk.uncomp_len;
         }
     }
 
-    /// Stage a full-reset compressed chunk: control `0xE0` (compressed, with
-    /// dict, props, and state all reset; top 5 bits of `uncomp_size-1`), a
+    /// Stage a compressed chunk: control `0xE0` (dict reset) for the first
+    /// chunk or `0xC0` (dictionary continues) otherwise — both carry the top 5
+    /// bits of `uncomp_size-1` and set bit 6 (props present). Followed by a
     /// 2-byte `uncomp_size-1` BE remainder, a 2-byte `comp_size-1` BE, the
-    /// 1-byte LZMA props (present because we full-reset), then the
-    /// range-coded body.
-    fn stage_compressed_chunk(&mut self, data: &[u8], compressed: &[u8]) {
+    /// 1-byte LZMA props, then the range-coded body.
+    fn stage_compressed_chunk(&mut self, data: &[u8], compressed: &[u8], reset_dict: bool) {
         debug_assert!(!data.is_empty() && data.len() <= ENC_CHUNK_MAX);
         debug_assert!(!compressed.is_empty() && compressed.len() <= 65_536);
 
         let uncomp_m1 = (data.len() - 1) as u32; // 0..=65535 with our cap
         // Top 5 bits of (uncomp_size - 1) live in the control byte; with a
-        // 65_536 cap they are always zero, yielding exactly 0xE0.
-        let control: u8 = 0xE0 | ((uncomp_m1 >> 16) & 0x1F) as u8;
+        // 65_536 cap they are always zero, yielding exactly 0xE0 / 0xC0.
+        let base: u8 = if reset_dict { 0xE0 } else { 0xC0 };
+        let control: u8 = base | ((uncomp_m1 >> 16) & 0x1F) as u8;
         let comp_m1 = (compressed.len() - 1) as u16;
 
         self.pending.reserve(6 + compressed.len());
@@ -322,13 +326,15 @@ impl Encoder {
         self.pending_idx = 0;
     }
 
-    /// Stage an uncompressed chunk: control `0x01` (dict reset), 2-byte
-    /// `size-1` BE, then the raw bytes.
-    fn stage_uncompressed_chunk(&mut self, data: &[u8]) {
+    /// Stage an uncompressed chunk: control `0x01` (dict reset) for the first
+    /// chunk or `0x02` (dictionary continues) otherwise, a 2-byte `size-1` BE,
+    /// then the raw bytes.
+    fn stage_uncompressed_chunk(&mut self, data: &[u8], reset_dict: bool) {
         debug_assert!(!data.is_empty() && data.len() <= ENC_CHUNK_MAX);
+        let control: u8 = if reset_dict { 0x01 } else { 0x02 };
         let size_m1 = (data.len() - 1) as u16;
         self.pending.reserve(3 + data.len());
-        self.pending.push(0x01);
+        self.pending.push(control);
         self.pending.push((size_m1 >> 8) as u8);
         self.pending.push((size_m1 & 0xFF) as u8);
         self.pending.extend_from_slice(data);
@@ -344,23 +350,19 @@ impl RawEncoder for Encoder {
         loop {
             match self.phase {
                 EncPhase::Body => {
-                    while consumed < input.len() && self.in_buf.len() < ENC_CHUNK_MAX {
-                        let take = (ENC_CHUNK_MAX - self.in_buf.len()).min(input.len() - consumed);
-                        self.in_buf
-                            .extend_from_slice(&input[consumed..consumed + take]);
-                        consumed += take;
-                    }
-                    if self.in_buf.len() == ENC_CHUNK_MAX {
-                        let data = core::mem::take(&mut self.in_buf);
-                        self.stage_chunk(&data);
-                        self.phase = EncPhase::DrainPending;
-                    } else {
-                        return Ok(RawProgress {
-                            consumed,
-                            written,
-                            done: false,
-                        });
+                    // Buffer the entire input so the chunk stream can be
+                    // produced by a single continuous match-finder at `finish`
+                    // — the dictionary then spans every chunk instead of
+                    // resetting per chunk. No mid-stream flush.
+                    if consumed < input.len() {
+                        self.in_buf.extend_from_slice(&input[consumed..]);
+                        consumed = input.len();
                     }
+                    return Ok(RawProgress {
+                        consumed,
+                        written,
+                        done: false,
+                    });
                 }
                 EncPhase::DrainPending => {
                     if self.drain_pending(output, &mut written) {
@@ -411,9 +413,9 @@ impl RawEncoder for Encoder {
                     }
                     if !self.in_buf.is_empty() {
                         let data = core::mem::take(&mut self.in_buf);
-                        self.stage_chunk(&data);
-                        // Stay in `Finishing`; the loop drains this chunk then
-                        // re-checks the (now empty) buffer.
+                        self.stage_payload(&data);
+                        // Stay in `Finishing`; the loop drains the staged
+                        // chunks then re-checks the (now empty) buffer.
                     } else {
                         // Buffer empty and any staged chunk drained: emit the
                         // single 0x00 end marker.
@@ -546,6 +548,24 @@ impl Decoder {
         e
     }
 
+    /// Feed raw uncompressed-chunk bytes into the LZMA2 dictionary so a later
+    /// dictionary-continuing compressed chunk (`0xC0`) can reference them.
+    /// Lazily creates `lzma_core` (canonical default props, sized to the
+    /// configured dictionary) when the stream opens with an uncompressed chunk.
+    fn feed_uncompressed_dict(&mut self, src: &[u8]) {
+        if self.lzma_core.is_none() {
+            let props = Lzma2Props::parse(LZMA2_PROPS_BYTE).unwrap_or(Lzma2Props {
+                lc: 3,
+                lp: 0,
+                pb: 2,
+            });
+            self.lzma_core = Some(Box::new(LzmaCore::new(props, self.dict_size)));
+        }
+        if let Some(core) = self.lzma_core.as_mut() {
+            core.append_literals(src);
+        }
+    }
+
     /// Pull bytes from `input` (advancing `consumed`) into `scratch` until it
     /// holds `scratch_want` bytes. Returns true once full.
     fn fill_scratch(&mut self, input: &[u8], consumed: &mut usize) -> bool {
@@ -640,11 +660,13 @@ impl RawDecoder for Decoder {
                         let src = &input[consumed..consumed + take];
                         output[written..written + take].copy_from_slice(src);
                         // Feed the bytes into the LZ window so a later
-                        // compressed chunk (without a dict reset) can
-                        // back-reference them.
-                        if let Some(core) = self.lzma_core.as_mut() {
-                            core.append_literals(src);
-                        }
+                        // dictionary-continuing compressed chunk (`0xC0`) can
+                        // back-reference them. Lazily create the core (canonical
+                        // default props — irrelevant for plain dict population,
+                        // and replaced by the next compressed chunk's reset
+                        // bits) when none exists yet, e.g. when the stream opens
+                        // with an uncompressed chunk.
+                        self.feed_uncompressed_dict(src);
                         self.chunk_remaining -= take;
                         consumed += take;
                         written += take;
diff --git a/src/lzma2_internal/lzma2_encoder.rs b/src/lzma2_internal/lzma2_encoder.rs
index bd3589b..23f19e9 100644
--- a/src/lzma2_internal/lzma2_encoder.rs
+++ b/src/lzma2_internal/lzma2_encoder.rs
@@ -586,6 +586,36 @@ impl LzmaEncCore {
         (self.output_pos as u32) & self.pos_mask
     }
 
+    /// Reset the LZMA *state* for a new continued chunk: re-initialise every
+    /// probability table, the LZMA state, the four rep distances, and a fresh
+    /// range coder — but deliberately keep `output_pos` running so the
+    /// `pos_state` derivation stays continuous across chunks (matching the
+    /// decoder, whose `output_pos` only resets on a dictionary reset, not a
+    /// state reset). The LZ history (hash chain) is owned outside the core and
+    /// is likewise preserved, so matches in this chunk can reference data from
+    /// earlier chunks. This is the encoder counterpart of the LZMA2 `0xC0`
+    /// "reset state + new props, dictionary continues" control byte.
+    fn reset_state_keep_pos(&mut self) {
+        self.is_match.fill(PROB_INIT);
+        self.is_rep.fill(PROB_INIT);
+        self.is_rep0.fill(PROB_INIT);
+        self.is_rep1.fill(PROB_INIT);
+        self.is_rep2.fill(PROB_INIT);
+        self.is_rep0_long.fill(PROB_INIT);
+        self.dist_slot.fill(PROB_INIT);
+        self.dist_special.fill(PROB_INIT);
+        self.dist_align.fill(PROB_INIT);
+        self.lit.fill(PROB_INIT);
+        self.len_coder = LengthCoderEnc::new();
+        self.rep_len_coder = LengthCoderEnc::new();
+        self.state = 0;
+        self.rep0 = 0;
+        self.rep1 = 0;
+        self.rep2 = 0;
+        self.rep3 = 0;
+        self.rc = RangeEncoder::new();
+    }
+
     fn encode_literal_full(&mut self, byte: u8, prev_byte: u8, match_byte: Option<u8>) {
         let lp_state = ((self.output_pos as u32) & self.lit_pos_mask) << self.lc;
         let prev_high = (prev_byte as u32) >> (8 - self.lc);
@@ -1212,6 +1242,11 @@ fn literal_price_at(
 ///
 /// `params` is the level-derived match-finder + parser tuning; see
 /// [`EncoderParams::from_level`].
+///
+/// Single-chunk helper retained for the LZMA2 unit tests (which exercise the
+/// chunk codec directly); the production encoders use [`encode_lzma2_stream`],
+/// which keeps one continuous match-finder across chunks.
+#[cfg_attr(not(test), allow(dead_code))]
 pub(crate) fn encode_lzma_chunk(input: &[u8], dict_size: u32, params: EncoderParams) -> Vec<u8> {
     if params.opt_window == 0 {
         return encode_chunk_body(input, dict_size, params, false);
@@ -1229,8 +1264,140 @@ pub(crate) fn encode_lzma_chunk(input: &[u8], dict_size: u32, params: EncoderPar
     }
 }
 
+/// One framed LZMA2 chunk produced by [`encode_lzma2_stream`].
+///
+/// The caller (xz Block payload or raw LZMA2 stream framing) turns this into
+/// the on-wire chunk header + body. `uncomp_len` is the number of input bytes
+/// the chunk covers; for a compressed chunk `body` is the range-coded payload,
+/// for an uncompressed chunk `body` is empty and the caller copies the
+/// `uncomp_len` raw input bytes verbatim.
+pub(crate) struct Lzma2Chunk {
+    /// Number of uncompressed input bytes this chunk represents.
+    pub uncomp_len: usize,
+    /// `true` when this is the first chunk of the stream and therefore must
+    /// reset the dictionary (`0xE0` compressed / `0x01` uncompressed); `false`
+    /// for every later chunk, which continues the dictionary (`0xC0`
+    /// compressed / `0x02` uncompressed).
+    pub reset_dict: bool,
+    /// `Some(range-coded body)` for a compressed chunk; `None` for an
+    /// uncompressed-fallback chunk (the caller copies the raw input slice).
+    pub body: Option<Vec<u8>>,
+}
+
+/// Largest uncompressed payload a single compressed chunk may carry. The
+/// compressed-chunk uncompressed-size field is 21 bits (+1 ⇒ 2 MiB), but we
+/// cap at 64 KiB so the *compressed* size always fits the 16-bit (+1) comp
+/// field and the chunk header shape stays uniform with the uncompressed
+/// fallback. The dictionary still spans the whole input regardless of this
+/// slice size — only the output framing is chunked.
+const STREAM_CHUNK_UNCOMP_MAX: usize = 65_536;
+
+/// Encode the *entire* `input` as a continuous LZMA2 chunk stream, returning
+/// the framed chunks in order.
+///
+/// Unlike calling [`encode_lzma_chunk`] once per slice, this keeps a single LZ
+/// match-finder (and a continuous `output_pos`) across all chunks: the first
+/// chunk resets the dictionary, every later chunk *continues* it, so a match
+/// in a later chunk can reference data from any earlier chunk up to
+/// `dict_size`. This is what closes the ratio gap with the `.lzma` (single
+/// continuous stream) path. The range coder and probability model still reset
+/// per chunk (state reset) — only the dictionary/history is continuous.
+///
+/// The caller is responsible for the `0x00` end marker and any container
+/// framing (xz Block/Index, raw-stream terminator).
+pub(crate) fn encode_lzma2_stream(
+    input: &[u8],
+    dict_size: u32,
+    params: EncoderParams,
+) -> Vec<Lzma2Chunk> {
+    let mut chunks = Vec::new();
+    if input.is_empty() {
+        return chunks;
+    }
+
+    let mut core = LzmaEncCore::new();
+    // One hash chain over the whole input: the LZ history is never cleared at
+    // a chunk boundary, so cross-chunk matches are found naturally.
+    let mut hc = HashChain::new(input.len());
+
+    let mut pos = 0usize;
+    let mut first = true;
+    while pos < input.len() {
+        let chunk_end = (pos + STREAM_CHUNK_UNCOMP_MAX).min(input.len());
+        let uncomp_len = chunk_end - pos;
+
+        // Range-code this slice. State/probabilities/range coder are reset for
+        // the chunk; `output_pos` and the hash chain continue.
+        let body =
+            encode_stream_chunk_body(&mut core, &mut hc, input, pos, chunk_end, dict_size, params);
+
+        // Compressed only pays off when the body both shrinks the slice and
+        // fits the 16-bit (+1) compressed-size field; otherwise the
+        // uncompressed fallback is strictly smaller.
+        let use_compressed = !body.is_empty() && body.len() <= 65_536 && body.len() < uncomp_len;
+
+        if use_compressed {
+            chunks.push(Lzma2Chunk {
+                uncomp_len,
+                reset_dict: first,
+                body: Some(body),
+            });
+        } else {
+            // Uncompressed fallback. The range-coded attempt mutated `core`
+            // (output_pos advanced, hash chain populated for this slice), which
+            // is exactly the state we want for continuing the dictionary into
+            // later chunks — the bytes are already in the LZ history and
+            // `output_pos` already advanced by `uncomp_len`. So nothing extra
+            // to do here; just frame an uncompressed chunk.
+            chunks.push(Lzma2Chunk {
+                uncomp_len,
+                reset_dict: first,
+                body: None,
+            });
+        }
+
+        pos = chunk_end;
+        first = false;
+    }
+
+    chunks
+}
+
+/// Range-code `input[pos_start..pos_end]` through `core`/`hc`, performing a
+/// per-chunk state reset first (keeping `output_pos` and the LZ history).
+/// Returns the flushed range-coded body.
+///
+/// Uses the optimal parse when the level enables it (`opt_window != 0`) and the
+/// greedy parse otherwise — the same selection [`encode_lzma_chunk`] makes per
+/// level. We do not additionally run both parses and keep the smaller body (as
+/// the single-chunk path does): doing so would require snapshotting/replaying
+/// the shared cross-chunk LZ history, and the optimal parse only loses to
+/// greedy on pathological tiny inputs, which the uncompressed-chunk fallback
+/// already covers for whole chunks.
+#[allow(clippy::too_many_arguments)]
+fn encode_stream_chunk_body(
+    core: &mut LzmaEncCore,
+    hc: &mut HashChain,
+    input: &[u8],
+    pos_start: usize,
+    pos_end: usize,
+    dict_size: u32,
+    params: EncoderParams,
+) -> Vec<u8> {
+    core.reset_state_keep_pos();
+    if params.opt_window == 0 {
+        encode_greedy(core, hc, input, pos_start, pos_end, dict_size, params);
+    } else {
+        encode_optimal(core, hc, input, pos_start, pos_end, dict_size, params);
+    }
+    core.rc.flush();
+    core.rc.out.clone()
+}
+
 /// Encode one chunk body (range-coded packets + 5-byte flush, no EOS marker)
-/// using the greedy or optimal parse.
+/// using the greedy or optimal parse. Only reachable from the test-only
+/// [`encode_lzma_chunk`].
+#[cfg_attr(not(test), allow(dead_code))]
 fn encode_chunk_body(
     input: &[u8],
     dict_size: u32,
@@ -1241,9 +1408,9 @@ fn encode_chunk_body(
     let mut hc = HashChain::new(input.len());
 
     if optimal {
-        encode_optimal(&mut core, &mut hc, input, dict_size, params);
+        encode_optimal(&mut core, &mut hc, input, 0, input.len(), dict_size, params);
     } else {
-        encode_greedy(&mut core, &mut hc, input, dict_size, params);
+        encode_greedy(&mut core, &mut hc, input, 0, input.len(), dict_size, params);
     }
 
     // Flush the range coder. NO EOS marker — LZMA2 frames the uncompressed
@@ -1254,23 +1421,36 @@ fn encode_chunk_body(
 
 /// Greedy/lazy parse — used by the lowest levels where speed matters most and
 /// the optimal-parse overhead isn't worth it.
+///
+/// Encodes `input[pos_start..pos_end]`. Match finding may reference any earlier
+/// position in `input` (the LZ history is the whole buffer up to `pos`), but
+/// emitted match/rep lengths are clamped so the parse stops exactly at
+/// `pos_end` — this lets a continuous encoder slice the output into chunks
+/// without ever crossing a chunk's uncompressed-size boundary.
 fn encode_greedy(
     core: &mut LzmaEncCore,
     hc: &mut HashChain,
     input: &[u8],
+    pos_start: usize,
+    pos_end: usize,
     dict_size: u32,
     params: EncoderParams,
 ) {
-    let mut pos = 0usize;
-    while pos < input.len() {
+    let mut pos = pos_start;
+    while pos < pos_end {
+        // Bytes left until this chunk's boundary; emitted lengths never exceed
+        // it so the chunk ends exactly at `pos_end`.
+        let cap = (pos_end - pos) as u32;
         let rep_lens = [
-            rep_match_len(input, pos, core.rep0),
-            rep_match_len(input, pos, core.rep1),
-            rep_match_len(input, pos, core.rep2),
-            rep_match_len(input, pos, core.rep3),
+            rep_match_len(input, pos, core.rep0).min(cap),
+            rep_match_len(input, pos, core.rep1).min(cap),
+            rep_match_len(input, pos, core.rep2).min(cap),
+            rep_match_len(input, pos, core.rep3).min(cap),
         ];
 
-        let new_match = hc.find_longest(input, pos, dict_size, params);
+        let new_match = hc
+            .find_longest(input, pos, dict_size, params)
+            .map(|(l, d)| (l.min(cap), d));
 
         let best_rep_len = rep_lens.iter().copied().max().unwrap_or(0);
         let best_rep_idx = rep_lens
@@ -1320,10 +1500,16 @@ fn encode_greedy(
 /// Cost-based optimal parse: forward DP over a look-ahead window, committing
 /// the cheapest path through the optimum buffer, then replaying decisions
 /// through the real (probability-updating) emit functions.
+///
+/// Encodes `input[pos_start..pos_end]`; matches still reference the whole LZ
+/// history before `pos`, but the parse never advances past `pos_end`, so the
+/// chunk ends exactly there.
 fn encode_optimal(
     core: &mut LzmaEncCore,
     hc: &mut HashChain,
     input: &[u8],
+    pos_start: usize,
+    pos_end: usize,
     dict_size: u32,
     params: EncoderParams,
 ) {
@@ -1331,17 +1517,18 @@ fn encode_optimal(
     let window = params.opt_window as usize;
     let mut opt = Optimizer::new(window);
 
-    let mut pos = 0usize;
+    let mut pos = pos_start;
     // Refresh the price snapshot once per committed window. Prices drift as
     // the model adapts; refreshing each window keeps them close to the live
     // model without recomputing per byte.
-    while pos < input.len() {
+    while pos < pos_end {
         let snap = core.price_snapshot(&prob_prices);
         let parsed = parse_window(
             core,
             hc,
             input,
             pos,
+            pos_end,
             dict_size,
             params,
             window,
@@ -1368,6 +1555,7 @@ fn parse_window(
     hc: &HashChain,
     input: &[u8],
     start: usize,
+    pos_end: usize,
     dict_size: u32,
     params: EncoderParams,
     window: usize,
@@ -1375,7 +1563,9 @@ fn parse_window(
     snap: &PriceSnapshot,
     opt: &mut Optimizer,
 ) -> usize {
-    let avail = input.len() - start;
+    // `avail` is bounded by the chunk boundary, not the end of input, so the
+    // DP never produces a decision that would carry past `pos_end`.
+    let avail = pos_end - start;
     let limit = window.min(avail);
 
     // Initialize node 0 with the encoder's current live state.
diff --git a/src/mtf/mod.rs b/src/mtf/mod.rs
index d1f322e..465d793 100644
--- a/src/mtf/mod.rs
+++ b/src/mtf/mod.rs
@@ -106,16 +106,31 @@ impl Table {
     }
 
     /// Encode one byte: return its current rank and move it to the front.
+    ///
+    /// Combines the find-rank scan and the move-to-front shift into a single
+    /// pass: as we walk the table looking for `b` we slide each scanned entry
+    /// back one slot (carrying the previous value forward), so each element is
+    /// touched exactly once. This replaces the previous scan-then-`copy_within`
+    /// (two passes over the prefix) and is what makes the encode hot loop fast
+    /// on the small-rank-dominated streams MTF is built for.
     fn encode_byte(&mut self, b: u8) -> u8 {
-        // Linear scan of 256 entries — `b` is guaranteed present (the table
-        // is always a permutation of all byte values), so the loop always
-        // finds it.
-        let mut rank = 0usize;
-        while self.table[rank] != b {
+        // `b` is always present (the table is a permutation of all 256 byte
+        // values), so the loop always terminates before running off the end.
+        let mut carry = self.table[0];
+        if carry == b {
+            return 0;
+        }
+        self.table[0] = b;
+        let mut rank = 1usize;
+        loop {
+            let next = self.table[rank];
+            self.table[rank] = carry;
+            if next == b {
+                return rank as u8;
+            }
+            carry = next;
             rank += 1;
         }
-        self.move_to_front(rank);
-        rank as u8
     }
 
     /// Decode one byte: return the symbol at `rank` and move it to the front.
diff --git a/src/rangecoder/mod.rs b/src/rangecoder/mod.rs
index 5a161bf..dd1c7ff 100644
--- a/src/rangecoder/mod.rs
+++ b/src/rangecoder/mod.rs
@@ -127,14 +127,13 @@ impl Model {
     }
 }
 
-#[inline]
-fn adapt(prob: &mut u16, bit: u32) {
-    if bit == 0 {
-        *prob += ((1u16 << PROB_BITS) - *prob) >> MOVE_BITS;
-    } else {
-        *prob -= *prob >> MOVE_BITS;
-    }
-}
+// The probability counter update (`adapt`) is inlined directly into the
+// `encode_bytes` / `decode_bytes` hot loops:
+//
+// * bit 0: `prob += (2048 - prob) >> MOVE_BITS`
+// * bit 1: `prob -= prob >> MOVE_BITS`
+//
+// Both directions run the identical update so the models stay in lock-step.
 
 // ─── encoder ──────────────────────────────────────────────────────────────
 
@@ -180,16 +179,11 @@ impl Encoder {
         // Take the input out so we can borrow `self.out` mutably while
         // reading the bytes; swap a placeholder in to avoid cloning.
         let input = core::mem::take(&mut self.input);
-        for &byte in &input {
-            // Bit-tree walk, MSB first.
-            let mut node = 1usize;
-            for i in (0..8).rev() {
-                let bit = ((byte >> i) & 1) as u32;
-                let prob = &mut model.probs[node];
-                rc.encode_bit(&mut self.out, prob, bit);
-                node = (node << 1) | (bit as usize);
-            }
-        }
+        // A range-coded payload is never larger than the input by more than a
+        // few flush bytes; reserve up front so the per-byte `push`es in the
+        // renormalization loop don't re-check capacity / reallocate.
+        self.out.reserve(input.len() + 16);
+        rc.encode_bytes(&mut self.out, &mut model.probs, &input);
         rc.flush(&mut self.out);
         self.input = input;
     }
@@ -263,30 +257,54 @@ impl RangeEncoder {
         }
     }
 
+    /// Encode every byte of `input` as an 8-bit bit-tree walk against the
+    /// shared `probs` model. Hoists `range`/`low` into locals so they live in
+    /// registers across the inner loop and indexes `probs` with `node & 0xFF`
+    /// (the walk only reads nodes `1..=255`, so the mask is a no-op that lets
+    /// the compiler drop the bounds check on the fixed 256-entry array).
     #[inline]
-    fn encode_bit(&mut self, out: &mut Vec<u8>, prob: &mut u16, bit: u32) {
-        // bound = (range >> 11) * prob  — the size of the "bit 0" subrange.
-        let bound = (self.range >> PROB_BITS) * (*prob as u32);
-        if bit == 0 {
-            self.range = bound;
-        } else {
-            self.low += bound as u64;
-            self.range -= bound;
-        }
-        adapt(prob, bit);
-        while self.range < TOP {
-            self.range <<= 8;
-            self.shift_low(out);
+    fn encode_bytes(&mut self, out: &mut Vec<u8>, probs: &mut [u16; TREE_NODES], input: &[u8]) {
+        let mut range = self.range;
+        let mut low = self.low;
+        for &byte in input {
+            let mut node = 1usize;
+            // MSB-first: unrolled over the 8 bits of `byte`.
+            let mut b = byte;
+            for _ in 0..8 {
+                let bit = (b >> 7) as u32; // top bit
+                b <<= 1;
+                let slot = node & (TREE_NODES - 1);
+                let p = probs[slot];
+                let bound = (range >> PROB_BITS) * (p as u32);
+                if bit == 0 {
+                    range = bound;
+                    probs[slot] = p + (((1u16 << PROB_BITS) - p) >> MOVE_BITS);
+                } else {
+                    low += bound as u64;
+                    range -= bound;
+                    probs[slot] = p - (p >> MOVE_BITS);
+                }
+                node = (node << 1) | (bit as usize);
+                while range < TOP {
+                    range <<= 8;
+                    self.shift_low(out, &mut low);
+                }
+            }
         }
+        self.range = range;
+        self.low = low;
     }
 
+    /// Renormalize one byte out of the `low` accumulator. `low` is passed by
+    /// reference so the encode loop can keep it in a register rather than
+    /// bouncing through `self` on every renormalization.
     #[inline]
-    fn shift_low(&mut self, out: &mut Vec<u8>) {
+    fn shift_low(&mut self, out: &mut Vec<u8>, low: &mut u64) {
         // If the top byte of low is not 0xFF (or a carry is pending),
         // flush the cached byte plus any staged 0xFF run, adjusted by the
         // carry bit (low >> 32).
-        if self.low < 0xFF00_0000 || self.low > 0xFFFF_FFFF {
-            let carry = (self.low >> 32) as u8;
+        if *low < 0xFF00_0000 || *low > 0xFFFF_FFFF {
+            let carry = (*low >> 32) as u8;
             let mut temp = self.cache;
             loop {
                 out.push(temp.wrapping_add(carry));
@@ -296,18 +314,20 @@ impl RangeEncoder {
                     break;
                 }
             }
-            self.cache = (self.low >> 24) as u8;
+            self.cache = (*low >> 24) as u8;
         }
         self.cache_size += 1;
-        self.low = (self.low << 8) & 0xFFFF_FFFF;
+        *low = (*low << 8) & 0xFFFF_FFFF;
     }
 
     fn flush(&mut self, out: &mut Vec<u8>) {
         // Drain the 32-bit low accumulator: 5 shift_low calls move every
         // byte (plus the cache) out to the stream.
+        let mut low = self.low;
         for _ in 0..5 {
-            self.shift_low(out);
+            self.shift_low(out, &mut low);
         }
+        self.low = low;
     }
 }
 
@@ -376,17 +396,7 @@ impl Decoder {
         let result = (|| {
             let mut rc = RangeDecoder::new(&payload[8..])?;
             let mut model = Model::new();
-            for _ in 0..out_len {
-                let mut node = 1usize;
-                for _ in 0..8 {
-                    let prob = &mut model.probs[node];
-                    let bit = rc.decode_bit(prob)?;
-                    node = (node << 1) | (bit as usize);
-                }
-                // node now holds 256 + byte.
-                self.out.push((node & 0xFF) as u8);
-            }
-            Ok(())
+            rc.decode_bytes(&mut model.probs, out_len, &mut self.out)
         })();
         self.input = payload;
         result
@@ -495,27 +505,68 @@ impl<'a> RangeDecoder<'a> {
         }
     }
 
-    #[inline]
-    fn decode_bit(&mut self, prob: &mut u16) -> Result<u32, Error> {
-        let bound = (self.range >> PROB_BITS) * (*prob as u32);
-        let bit;
-        if self.code < bound {
-            self.range = bound;
-            bit = 0;
-        } else {
-            self.code -= bound;
-            self.range -= bound;
-            bit = 1;
-        }
-        adapt(prob, bit);
-        while self.range < TOP {
-            self.range <<= 8;
-            self.code = (self.code << 8) | self.next_byte() as u32;
-        }
-        if self.overran {
-            return Err(Error::UnexpectedEnd);
+    /// Decode `out_len` bytes into `out`, each an 8-bit bit-tree walk against
+    /// `probs`. Mirrors [`RangeEncoder::encode_bytes`]: hoists `range`/`code`
+    /// /`pos` into locals so they stay in registers across the inner loop, and
+    /// indexes `probs` with `node & 0xFF` (the walk only reads nodes
+    /// `1..=255`, a no-op mask that elides the bounds check). The over-read
+    /// flag is consulted once per byte — a complete stream never sets it, and
+    /// a truncated one is reported as soon as the first short byte is decoded.
+    fn decode_bytes(
+        &mut self,
+        probs: &mut [u16; TREE_NODES],
+        out_len: usize,
+        out: &mut Vec<u8>,
+    ) -> Result<(), Error> {
+        let mut range = self.range;
+        let mut code = self.code;
+        let mut pos = self.pos;
+        let payload = self.payload;
+        out.reserve(out_len);
+        for _ in 0..out_len {
+            let mut node = 1usize;
+            for _ in 0..8 {
+                let slot = node & (TREE_NODES - 1);
+                let p = probs[slot];
+                let bound = (range >> PROB_BITS) * (p as u32);
+                let bit;
+                if code < bound {
+                    range = bound;
+                    probs[slot] = p + (((1u16 << PROB_BITS) - p) >> MOVE_BITS);
+                    bit = 0usize;
+                } else {
+                    code -= bound;
+                    range -= bound;
+                    probs[slot] = p - (p >> MOVE_BITS);
+                    bit = 1usize;
+                }
+                node = (node << 1) | bit;
+                while range < TOP {
+                    range <<= 8;
+                    // Inline of next_byte with the hoisted `pos`.
+                    let b = match payload.get(pos) {
+                        Some(&b) => b,
+                        None => {
+                            self.overran = true;
+                            0
+                        }
+                    };
+                    pos += 1;
+                    code = (code << 8) | b as u32;
+                }
+            }
+            if self.overran {
+                self.range = range;
+                self.code = code;
+                self.pos = pos;
+                return Err(Error::UnexpectedEnd);
+            }
+            out.push((node & 0xFF) as u8);
         }
-        Ok(bit)
+        self.range = range;
+        self.code = code;
+        self.pos = pos;
+        Ok(())
     }
 }
 
diff --git a/src/xz/mod.rs b/src/xz/mod.rs
index f014f40..0bc2574 100644
--- a/src/xz/mod.rs
+++ b/src/xz/mod.rs
@@ -58,7 +58,7 @@ use crate::traits::{Algorithm, RawDecoder, RawEncoder, RawProgress};
 // decoder exposed under the `lzma2` feature without pulling in this xz
 // container. See `lib.rs`.
 use crate::lzma2_internal::lzma2_decoder::{Lzma2Props, LzmaCore, lzma2_dict_size};
-use crate::lzma2_internal::lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma_chunk};
+use crate::lzma2_internal::lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma2_stream};
 
 // ─── constants ─────────────────────────────────────────────────────────────
 
@@ -453,55 +453,51 @@ impl Encoder {
         }
     }
 
-    /// Stage an LZMA2 chunk for emission, choosing between a compressed
-    /// chunk and an uncompressed fallback.
+    /// Stage the whole block's LZMA2 payload from the fully-buffered input.
     ///
-    /// We always use the "full reset" form of the chunk's control byte
-    /// (`0xE0` for compressed, `0x01` for uncompressed), so every chunk is
-    /// independently decodable. This is slightly less efficient than
-    /// reusing state across chunks but keeps the chunk header shape
-    /// uniform and matches xz-utils' default emission pattern when each
-    /// chunk straddles a dictionary reset.
+    /// The LZMA2 chunks are produced by a single continuous match-finder
+    /// ([`encode_lzma2_stream`]): the first chunk resets the dictionary
+    /// (`0xE0` compressed / `0x01` uncompressed) and every later chunk
+    /// *continues* it (`0xC0` compressed / `0x02` uncompressed), so a match in
+    /// a later chunk can reference data from any earlier chunk (up to the
+    /// dictionary size). The range coder still resets per chunk, but the
+    /// dictionary/history is continuous — this is what brings the ratio in
+    /// line with the single-stream `.lzma` path.
     ///
-    /// `data.len()` must be in `1..=LZMA2_CHUNK_MAX`. The compressed-chunk
-    /// path additionally requires the compressed payload to fit in
-    /// 65_536 bytes (the 16-bit + 1 size field); if it overflows or the
-    /// compressor's output isn't smaller than the input, we fall back to
-    /// emitting the uncompressed chunk.
-    fn stage_chunk(&mut self, data: &[u8]) {
-        debug_assert!(!data.is_empty() && data.len() <= LZMA2_CHUNK_MAX);
-        // Try LZMA-compressed first. Compressed size is bounded by both
-        // the chunk's 16-bit size field (max 65_536) and our heuristic:
-        // if the compressor expanded the data, we'd save nothing by
-        // emitting a compressed chunk — uncompressed is smaller.
-        let compressed = encode_lzma_chunk(data, LZMA2_DICT_SIZE, self.enc_params);
-        let use_compressed =
-            !compressed.is_empty() && compressed.len() <= 65_536 && compressed.len() < data.len();
-
-        if use_compressed {
-            self.stage_compressed_chunk(data, &compressed);
-        } else {
-            self.stage_uncompressed_chunk(data);
+    /// `input` is the entire block (`uncompressed_total` bytes). Each framed
+    /// chunk is appended to `pending`; the caller drains it before the block
+    /// trailer.
+    fn stage_payload(&mut self, input: &[u8]) {
+        let chunks = encode_lzma2_stream(input, LZMA2_DICT_SIZE, self.enc_params);
+        let mut pos = 0usize;
+        for chunk in chunks {
+            let data = &input[pos..pos + chunk.uncomp_len];
+            match chunk.body {
+                Some(ref body) => self.stage_compressed_chunk(data, body, chunk.reset_dict),
+                None => self.stage_uncompressed_chunk(data, chunk.reset_dict),
+            }
+            pos += chunk.uncomp_len;
         }
     }
 
-    /// Stage a compressed LZMA2 chunk: control byte `0xE0`-style header
-    /// with the uncompressed-size's top 5 bits embedded into bits 0..=4
-    /// of the control byte, followed by two big-endian uncompressed-size
-    /// continuation bytes, two big-endian compressed-size bytes, a
-    /// 1-byte LZMA properties byte (because we full-reset every chunk),
-    /// and the range-coded payload.
-    fn stage_compressed_chunk(&mut self, data: &[u8], compressed: &[u8]) {
+    /// Stage one compressed LZMA2 chunk: a control byte (`0xE0` for the first
+    /// dict-resetting chunk, `0xC0` for a dict-continuing chunk — both carry
+    /// the top 5 bits of `uncomp_size-1` in bits 0..=4), two big-endian
+    /// `uncomp_size-1` continuation bytes, two big-endian `comp_size-1` bytes,
+    /// a 1-byte LZMA properties byte (bit 6 of both control values is set, so
+    /// properties are always present), then the range-coded payload.
+    fn stage_compressed_chunk(&mut self, data: &[u8], compressed: &[u8], reset_dict: bool) {
         debug_assert!(!data.is_empty() && data.len() <= LZMA2_CHUNK_MAX);
         debug_assert!(!compressed.is_empty() && compressed.len() <= 65_536);
 
         let uncomp_size_minus_1 = (data.len() - 1) as u32; // 0..=65535
         let uncomp_top = ((uncomp_size_minus_1 >> 16) & 0x1F) as u8; // 0 here
-        // Control byte: `111x_xxxx` = compressed + dict reset + props reset
-        // + state reset (full reset). The low 5 bits carry the top 5 bits
-        // of (uncomp_size - 1). With our 65_536 cap those top 5 bits are
-        // always zero, yielding the exact value 0xE0.
-        let control: u8 = 0xE0 | uncomp_top;
+        // Control base: `111x_xxxx` (0xE0) = compressed + dict reset + props
+        // reset + state reset; `110x_xxxx` (0xC0) = compressed + props reset +
+        // state reset, dictionary CONTINUES. The low 5 bits carry the top 5
+        // bits of (uncomp_size - 1); with our 65_536 cap those are zero.
+        let base: u8 = if reset_dict { 0xE0 } else { 0xC0 };
+        let control: u8 = base | uncomp_top;
 
         let uncomp_mid = ((uncomp_size_minus_1 >> 8) & 0xFF) as u8;
         let uncomp_lo = (uncomp_size_minus_1 & 0xFF) as u8;
@@ -524,11 +520,12 @@ impl Encoder {
         self.compressed_payload_bytes += (header_len + compressed.len()) as u64;
     }
 
-    /// Stage an LZMA2 uncompressed chunk (control byte 0x01 — dict reset,
-    /// for the simple case where compression wouldn't help).
-    fn stage_uncompressed_chunk(&mut self, data: &[u8]) {
+    /// Stage an LZMA2 uncompressed chunk: control `0x01` (dict reset) for the
+    /// first chunk, `0x02` (no reset, dictionary continues) otherwise. Used as
+    /// the fallback when compression would expand a chunk.
+    fn stage_uncompressed_chunk(&mut self, data: &[u8], reset_dict: bool) {
         debug_assert!(!data.is_empty() && data.len() <= LZMA2_CHUNK_MAX);
-        let control: u8 = 0x01; // full dict reset; safe for any position
+        let control: u8 = if reset_dict { 0x01 } else { 0x02 };
         let size_minus_1 = (data.len() - 1) as u16;
         self.pending.reserve(3 + data.len());
         self.pending.push(control);
@@ -605,29 +602,20 @@ impl RawEncoder for Encoder {
                     }
                 }
                 EncPhase::Body => {
-                    // Consume input into the buffer.
-                    while consumed < input.len() && self.in_buf.len() < LZMA2_CHUNK_MAX {
-                        let take =
-                            (LZMA2_CHUNK_MAX - self.in_buf.len()).min(input.len() - consumed);
-                        self.in_buf
-                            .extend_from_slice(&input[consumed..consumed + take]);
-                        consumed += take;
-                    }
-                    if self.in_buf.len() == LZMA2_CHUNK_MAX {
-                        // Flush a full chunk.
-                        let data = core::mem::take(&mut self.in_buf);
-                        self.check.update(&data);
-                        self.uncompressed_total += data.len() as u64;
-                        self.stage_chunk(&data);
-                        self.phase = EncPhase::DrainPending;
-                    } else {
-                        // Need more input; come back via another call.
-                        return Ok(RawProgress {
-                            consumed,
-                            written,
-                            done: false,
-                        });
+                    // Buffer the entire block's input so the LZMA2 payload can
+                    // be produced by a single continuous match-finder at
+                    // `finish` (the dictionary spans the whole input rather
+                    // than resetting per chunk). We never flush mid-stream;
+                    // chunking happens only at the output-framing level.
+                    if consumed < input.len() {
+                        self.in_buf.extend_from_slice(&input[consumed..]);
+                        consumed = input.len();
                     }
+                    return Ok(RawProgress {
+                        consumed,
+                        written,
+                        done: false,
+                    });
                 }
                 EncPhase::DrainPending => {
                     if self.drain_pending(output, &mut written) {
@@ -697,11 +685,13 @@ impl RawEncoder for Encoder {
                 }
                 EncPhase::Body => {
                     if !self.in_buf.is_empty() {
-                        // Flush a partial chunk.
+                        // Produce the whole block's LZMA2 payload at once from
+                        // the fully-buffered input, using a single continuous
+                        // match-finder so the dictionary spans every chunk.
                         let data = core::mem::take(&mut self.in_buf);
                         self.check.update(&data);
                         self.uncompressed_total += data.len() as u64;
-                        self.stage_chunk(&data);
+                        self.stage_payload(&data);
                         self.phase = EncPhase::DrainPending;
                     } else {
                         // No pending input. Move to block trailer.
@@ -945,6 +935,28 @@ impl Decoder {
         }
     }
 
+    /// Feed raw uncompressed-chunk bytes into the LZMA2 dictionary so a later
+    /// dictionary-continuing compressed chunk can reference them. Creates the
+    /// `lzma_core` on first use (canonical default props; the next compressed
+    /// chunk's reset bits replace them) sized to the block's dictionary.
+    fn feed_uncompressed_dict(&mut self, src: &[u8]) {
+        if self.lzma_core.is_none() {
+            // Props are irrelevant for plain dict population; use the canonical
+            // default (lc=3, lp=0, pb=2). `parse` cannot fail for this byte.
+            let props = Lzma2Props::parse(LZMA2_PROPS_BYTE).unwrap_or(Lzma2Props {
+                lc: 3,
+                lp: 0,
+                pb: 2,
+            });
+            let dict_size = (self.lzma2_dict_size as usize).clamp(4096, 128 * 1024 * 1024);
+            self.lzma_core = Some(Box::new(LzmaCore::new(props, dict_size)));
+            self.last_props = LZMA2_PROPS_BYTE;
+        }
+        if let Some(core) = self.lzma_core.as_mut() {
+            core.append_literals(src);
+        }
+    }
+
     fn poison(&mut self, e: Error) -> Error {
         self.poisoned = true;
         e
@@ -1432,6 +1444,15 @@ impl RawDecoder for Decoder {
                         let src = &input[consumed..consumed + take];
                         output[written..written + take].copy_from_slice(src);
                         self.check.update(src);
+                        // Feed the raw bytes into the LZ dictionary so a later
+                        // dictionary-continuing compressed chunk (`0xC0`) can
+                        // back-reference them. Lazily create the core (with the
+                        // block's dict size and the canonical default props —
+                        // props don't affect plain dict population, and the
+                        // next compressed chunk resets them) when none exists
+                        // yet, e.g. when the block opens with an uncompressed
+                        // chunk.
+                        self.feed_uncompressed_dict(src);
                         self.block_compressed_seen += take as u64;
                         self.block_uncompressed_seen += take as u64;
                         self.chunk_remaining -= take;
diff --git a/src/zstd/encoder.rs b/src/zstd/encoder.rs
index 10719cb..9d8e27e 100644
--- a/src/zstd/encoder.rs
+++ b/src/zstd/encoder.rs
@@ -99,6 +99,9 @@ impl Default for EncoderConfig {
 /// walk deeper chains and accept longer matches before bailing out.
 #[derive(Debug, Clone, Copy)]
 pub(crate) struct LevelParams {
+    /// Clamped compression level (1..=22). Retained so the parser can gate
+    /// behaviour (e.g. the statistics-driven repricing passes) on it.
+    pub level: u8,
     /// Maximum number of hash-chain links the match finder walks per probe.
     pub max_chain: usize,
     /// Length at which the match finder stops looking for a longer candidate.
@@ -117,12 +120,20 @@ pub(crate) struct LevelParams {
 /// Lowest level at which the optimal parser is used.
 const OPTIMAL_LEVEL: u8 = 13;
 
-/// Per-position hash-chain depth cap for the optimal parser. The DP visits
-/// every position, so an uncapped chain (up to 16384 at level 22) makes each
-/// block quadratic; this bound keeps encode time reasonable while preserving
-/// nearly all of the ratio (the DP's win comes from length/repeat pricing,
-/// not from exhaustive chain walks).
-const OPTIMAL_MAX_CHAIN: usize = 4096;
+/// Per-position hash-chain depth cap for the optimal parser, by level. The DP
+/// visits every position, so an uncapped chain (up to 16384 at level 22) makes
+/// each block quadratic; this bound keeps encode time in the single-digit
+/// seconds while preserving nearly all of the ratio (the DP's win comes mostly
+/// from cross-block matching and length/repeat pricing, not from exhaustive
+/// chain walks — beyond ~256 links the marginal ratio is < 1%).
+fn optimal_chain_cap(level: u8) -> usize {
+    match level {
+        0..=15 => 128,
+        16..=19 => 256,
+        // 20..=22: spend more chain budget for the top presets.
+        _ => 384,
+    }
+}
 
 impl LevelParams {
     /// Clamp `level` to `1..=22` and expand to match-finder tuning. The
@@ -163,6 +174,7 @@ impl LevelParams {
             _ => (16384, super::matcher::MAX_MATCH),
         };
         Self {
+            level,
             max_chain,
             nice_match,
             lazy_search,
@@ -171,16 +183,31 @@ impl LevelParams {
     }
 }
 
+/// Window the encoder retains as match context across blocks. The frame
+/// advertises a 16 MiB window (`WD = 0x70`), so back-references up to this many
+/// bytes are spec-legal and `zstd -d` will accept them. We cap the *retained*
+/// history at 8 MiB to bound encoder memory and the per-block re-index cost;
+/// real inputs rarely benefit from references beyond a few MiB and the cap
+/// keeps total work near-linear.
+const MATCH_WINDOW: usize = 8 * 1024 * 1024;
+
 /// Streaming Zstandard encoder.
 pub struct Encoder {
     state: State,
+    /// Match context retained from already-emitted blocks (the sliding window).
+    /// Back-references in the current block may point anywhere in here, giving
+    /// cross-block matching up to [`MATCH_WINDOW`] bytes. Trimmed from the front
+    /// once it exceeds the window.
+    history: Vec<u8>,
     /// Input buffer pending block emission.
     pending: Vec<u8>,
     /// Output bytes ready to drain into the caller's buffer.
     out_buf: Vec<u8>,
     /// Cursor into `out_buf`.
     out_idx: usize,
-    /// Reusable matcher.
+    /// Reusable matcher. Re-indexed per block over `history ++ pending` so that
+    /// back-references in the current block can reach into earlier blocks
+    /// (cross-block matching within the advertised window).
     matcher: MatchFinder,
     /// Have we written the frame header yet?
     header_written: bool,
@@ -219,6 +246,7 @@ impl Encoder {
     pub fn with_config(config: EncoderConfig) -> Self {
         Self {
             state: State::Accepting,
+            history: Vec::new(),
             pending: Vec::with_capacity(BLOCK_SIZE),
             out_buf: Vec::new(),
             out_idx: 0,
@@ -266,156 +294,147 @@ impl Encoder {
             // Too small to bother — the framing overhead eats any savings.
             return None;
         }
-        let buffer = self.pending.as_slice();
-        self.matcher.resize_for(buffer.len());
-
-        // Run LZ77 with repeat-offset awareness. We track a per-block ring
-        // copy of `prev_offsets` and rewrite each emitted match's offset
-        // through `assign_offset` so equal distances collapse to codes 1..=3.
-        //
-        // Two strategies depending on level:
-        //   - level ≤ 3: greedy. Take the best match at the current position.
-        //   - level ≥ 4: lazy. After finding a match at pos, also probe at
-        //     pos+1; if it gives a meaningfully longer match, emit a literal
-        //     and use that one instead.
-        //
-        // Independent of level, we always check the three repeat offsets at
-        // each position first — a repeat-offset match costs 1 bit in the
-        // offset stream vs. ~log2(distance) bits for a fresh offset, so even
-        // short repeats are cheap wins.
-        let lazy = self.params.lazy_search;
+
+        // Cross-block matching: build a buffer of retained history followed by
+        // the current block, and compress only the `[start, end)` tail.
+        // Back-references in this block may point anywhere in `history`, up to
+        // the advertised 16 MiB window — `zstd -d` accepts them. `history` is
+        // already trimmed to `MATCH_WINDOW`, so every distance is in range.
+        let start = self.history.len();
+        let mut buffer: Vec<u8> = Vec::with_capacity(start + self.pending.len());
+        buffer.extend_from_slice(&self.history);
+        buffer.extend_from_slice(&self.pending);
+        let buffer = buffer.as_slice();
         let buf_len = buffer.len();
+
+        // Rebuild the chains for this buffer and pre-index only the retained
+        // history (`[0, start)`). Each parser then splices in the *current
+        // block's* positions lazily as it advances, so the hash chains never
+        // contain positions ahead of the probe — the standard LZ invariant that
+        // keeps match finding correct and the depth budget meaningful. Indexing
+        // history up front is what enables cross-block back-references.
+        self.matcher.resize_for(buf_len);
+        for i in 0..start.min(buf_len.saturating_sub(3)) {
+            self.matcher.insert(buffer, i);
+        }
+
+        let lazy = self.params.lazy_search;
         let max_chain = self.params.max_chain;
         let nice_match = self.params.nice_match;
 
-        // High levels: price-based optimal parse over the whole block. The DP
-        // probes a match candidate at every input position, so we cap the
-        // per-position chain depth to keep the per-block cost bounded — the DP
-        // recovers most of the ratio from trying lengths and repeat offsets
-        // rather than from exhaustive chain walks.
-        if self.params.optimal {
-            let opt_chain = max_chain.min(OPTIMAL_MAX_CHAIN);
-            let (sequences, new_offsets) = optimal_parse(
-                &mut self.matcher,
-                buffer,
-                self.prev_offsets,
-                opt_chain,
+        // High levels: price-based optimal parse over the block. The DP probes a
+        // match candidate at every input position, so we cap the per-position
+        // chain depth to keep the per-block cost bounded.
+        let body = if self.params.optimal {
+            let opt_chain = max_chain.min(optimal_chain_cap(self.params.level));
+            // Statistics-driven repricing at high levels (btultra2-style).
+            let reprice_passes = if self.params.level >= REPRICE_LEVEL {
+                2
+            } else {
+                0
+            };
+            let cfg = ParseConfig {
+                start,
+                init_offsets: self.prev_offsets,
+                max_chain: opt_chain,
                 nice_match,
-            );
+            };
+            let (sequences, new_offsets) =
+                optimal_parse_2pass(&mut self.matcher, buffer, &cfg, reprice_passes);
             if sequences.is_empty() {
                 return None;
             }
-            return finish_compressed_block(
+            finish_compressed_block(
                 buffer,
+                start,
                 &sequences,
                 new_offsets,
                 self.prev_huff_lengths.as_ref(),
             )
-            .map(|(body, new_lengths, committed_offsets)| {
-                self.prev_offsets = committed_offsets;
-                if let Some(lengths) = new_lengths {
-                    self.prev_huff_lengths = Some(lengths);
+        } else {
+            // Greedy / lazy parse with repeat-offset awareness. Positions in the
+            // current block are spliced into the chain lazily up to `pos`.
+            let mut sequences: Vec<Seq> = Vec::new();
+            let mut lit_start: usize = start;
+            let mut pos: usize = start;
+            let mut block_offsets = self.prev_offsets;
+            let mut next_insert = start;
+
+            while pos + MIN_MATCH < buf_len {
+                while next_insert <= pos {
+                    self.matcher.insert(buffer, next_insert);
+                    next_insert += 1;
                 }
-                body
-            });
-        }
 
-        let mut sequences: Vec<Seq> = Vec::new();
-        let mut lit_start: usize = 0;
-        let mut pos: usize = 0;
-        let mut block_offsets = self.prev_offsets;
-
-        // Invariant: positions in [0, next_insert) have already been spliced
-        // into the matcher's hash chain. We advance `next_insert` lazily.
-        let mut next_insert: usize = 0;
-        while pos + MIN_MATCH < buf_len {
-            // Make sure `pos` is in the chain.
-            while next_insert <= pos {
-                self.matcher.insert(buffer, next_insert);
-                next_insert += 1;
-            }
-
-            // Step 1: best-match selection at `pos`.
-            let (m_dist, m_len, m_is_rep1) = best_at(
-                &self.matcher,
-                buffer,
-                pos,
-                &block_offsets,
-                max_chain,
-                nice_match,
-            );
-
-            if m_len == 0 {
-                pos += 1;
-                continue;
-            }
+                let (m_dist, m_len, m_is_rep1) = best_at(
+                    &self.matcher,
+                    buffer,
+                    pos,
+                    &block_offsets,
+                    max_chain,
+                    nice_match,
+                );
+
+                if m_len == 0 {
+                    pos += 1;
+                    continue;
+                }
 
-            // Step 2 (lazy only): probe pos+1 for a meaningfully better match.
-            // "Meaningfully better" = strictly longer by at least 1 byte when
-            // the current isn't already long. We skip the probe when the
-            // current match is already at least `nice_match` — there's no
-            // plausible win at that point.
-            let (best_pos, best_dist, best_len) =
-                if lazy && m_len < nice_match && pos + 1 + MIN_MATCH < buf_len {
-                    // Insert pos+1 into the chain so its hash bucket includes it.
-                    while next_insert <= pos + 1 {
-                        self.matcher.insert(buffer, next_insert);
-                        next_insert += 1;
-                    }
-                    let (n_dist, n_len, _) = best_at(
-                        &self.matcher,
-                        buffer,
-                        pos + 1,
-                        &block_offsets,
-                        max_chain,
-                        nice_match,
-                    );
-                    // Score: prefer longer-match. A repeat-offset hit at pos
-                    // saves bits in the offset stream — bias slightly in its
-                    // favour by requiring the lazy match to beat by ≥2.
-                    let margin = if m_is_rep1 { 2 } else { 1 };
-                    if n_len >= m_len + margin {
-                        (pos + 1, n_dist, n_len)
+                let (best_pos, best_dist, best_len) =
+                    if lazy && m_len < nice_match && pos + 1 + MIN_MATCH < buf_len {
+                        while next_insert <= pos + 1 {
+                            self.matcher.insert(buffer, next_insert);
+                            next_insert += 1;
+                        }
+                        let (n_dist, n_len, _) = best_at(
+                            &self.matcher,
+                            buffer,
+                            pos + 1,
+                            &block_offsets,
+                            max_chain,
+                            nice_match,
+                        );
+                        let margin = if m_is_rep1 { 2 } else { 1 };
+                        if n_len >= m_len + margin {
+                            (pos + 1, n_dist, n_len)
+                        } else {
+                            (pos, m_dist, m_len)
+                        }
                     } else {
                         (pos, m_dist, m_len)
-                    }
-                } else {
-                    (pos, m_dist, m_len)
-                };
-
-            // Emit the literals run up to `best_pos`, then the chosen match.
-            let literal_run = best_pos - lit_start;
-            let offset_value =
-                assign_offset(best_dist as u32, literal_run as u32, &mut block_offsets);
-            sequences.push(Seq {
-                literal_length: literal_run as u32,
-                match_length: best_len as u32,
-                offset_value,
-            });
-            // Splice the interior positions of the match into the chain so
-            // later positions can match against them. We only insert
-            // positions that aren't already in.
-            let match_end = best_pos + best_len;
-            while next_insert < match_end {
-                self.matcher.insert(buffer, next_insert);
-                next_insert += 1;
+                    };
+
+                let literal_run = best_pos - lit_start;
+                let offset_value =
+                    assign_offset(best_dist as u32, literal_run as u32, &mut block_offsets);
+                sequences.push(Seq {
+                    literal_length: literal_run as u32,
+                    match_length: best_len as u32,
+                    offset_value,
+                });
+                let match_end = best_pos + best_len;
+                while next_insert < match_end {
+                    self.matcher.insert(buffer, next_insert);
+                    next_insert += 1;
+                }
+                pos = match_end;
+                lit_start = pos;
             }
-            pos = match_end;
-            lit_start = pos;
-        }
 
-        let _ = lit_start;
-        if sequences.is_empty() {
-            return None;
-        }
+            let _ = lit_start;
+            if sequences.is_empty() {
+                return None;
+            }
+            finish_compressed_block(
+                buffer,
+                start,
+                &sequences,
+                block_offsets,
+                self.prev_huff_lengths.as_ref(),
+            )
+        };
 
-        finish_compressed_block(
-            buffer,
-            &sequences,
-            block_offsets,
-            self.prev_huff_lengths.as_ref(),
-        )
-        .map(|(body, new_lengths, committed_offsets)| {
+        body.map(|(body, new_lengths, committed_offsets)| {
             self.prev_offsets = committed_offsets;
             if let Some(lengths) = new_lengths {
                 self.prev_huff_lengths = Some(lengths);
@@ -433,16 +452,20 @@ impl Encoder {
 /// `self.pending` (which `buffer` borrows) against `&mut self`.
 fn finish_compressed_block(
     buffer: &[u8],
+    start: usize,
     sequences: &[Seq],
     block_offsets: [u32; 3],
     prev_huff_lengths: Option<&HuffLengths>,
 ) -> Option<(Vec<u8>, Option<HuffLengths>, [u32; 3])> {
-    // Reconstruct all literal bytes by replaying the sequences: each sequence
-    // emits `literal_length` literals from the cursor, then skips
-    // `match_length` matched bytes. Trailing bytes after the last sequence are
-    // literals too.
-    let mut all_literals: Vec<u8> = Vec::with_capacity(buffer.len());
-    let mut cursor = 0usize;
+    // Reconstruct the current block's literal bytes by replaying the sequences:
+    // each sequence emits `literal_length` literals from the cursor, then skips
+    // `match_length` matched bytes. The cursor starts at `start` (the first byte
+    // of this block within `buffer`; earlier bytes are retained history that
+    // matches reference but that this block does not re-emit). Trailing bytes
+    // after the last sequence are literals too.
+    let block_len = buffer.len() - start;
+    let mut all_literals: Vec<u8> = Vec::with_capacity(block_len);
+    let mut cursor = start;
     for s in sequences {
         let ll = s.literal_length as usize;
         all_literals.extend_from_slice(&buffer[cursor..cursor + ll]);
@@ -454,7 +477,7 @@ fn finish_compressed_block(
     let seq_section = build_sequences_section(sequences);
 
     let total = lit_section.len() + seq_section.len();
-    if total >= buffer.len() {
+    if total >= block_len {
         return None; // Not worth compressing.
     }
 
@@ -642,7 +665,7 @@ impl Encoder {
             let body_size = self.pending.len() as u32;
             Self::push_block_header(&mut self.out_buf, body_size, 1, last);
             self.out_buf.push(self.pending[0]);
-            self.pending.clear();
+            self.roll_history_from_pending();
             return;
         }
         if let Some(body) = self.try_compress_block() {
@@ -654,7 +677,27 @@ impl Encoder {
             Self::append_raw_block(&mut self.out_buf, &pending_snapshot, last);
             self.pending = pending_snapshot;
         }
-        self.pending.clear();
+        self.roll_history_from_pending();
+    }
+
+    /// Move the just-emitted `pending` bytes into the cross-block match window
+    /// (`history`) and trim the window to [`MATCH_WINDOW`]. Every emitted block —
+    /// RLE, compressed, or raw — extends the decoder's window, so back-references
+    /// in later blocks may reference these bytes. Keeping `history` ≤ the window
+    /// guarantees every offset we emit stays within the advertised window.
+    fn roll_history_from_pending(&mut self) {
+        if !self.pending.is_empty() {
+            self.history.extend_from_slice(&self.pending);
+            self.pending.clear();
+        }
+        if self.history.len() > MATCH_WINDOW {
+            let drop = self.history.len() - MATCH_WINDOW;
+            self.history.drain(0..drop);
+            // Front of the buffer shifted, so every absolute index recorded in
+            // the matcher is now stale. Drop the chains; the next block rebuilds
+            // them. Trimming only happens past the 8 MiB window, so this is rare.
+            self.matcher.resize_for(self.history.len().max(1));
+        }
     }
 
     /// Copy as much of `out_buf[out_idx..]` into `output[*written..]` as fits.
@@ -679,37 +722,160 @@ impl Encoder {
 
 // ─── price-based optimal parser ───────────────────────────────────────────
 
-/// Estimated bit cost of a literal byte (~Huffman-coded text/code literal).
-/// Only the literal-vs-match trade-off depends on it, not correctness.
-const LIT_PRICE: u32 = 9;
-
-/// Estimated bit cost of the offset part of a match: the FSE offset code plus
-/// its extra bits, with a distance matching one of the active repeat offsets
-/// priced near-free (repeats emit a tiny FSE code and NO offset extra bits).
-fn offset_price(distance: u32, reps: &[u32; 3], ll: u32) -> u32 {
-    let is_rep = if ll > 0 {
-        distance == reps[0] || distance == reps[1] || distance == reps[2]
+/// Fixed-point fractional-bit scale. Prices are carried as `bits << PRICE_SHIFT`
+/// so the DP can weigh sub-bit differences (an FSE code can cost a fractional
+/// number of bits). 8 fractional bits is ample precision and keeps every block
+/// price well inside `u32` (128 KiB × ~16 bits × 256 ≈ 2^31).
+const PRICE_SHIFT: u32 = 8;
+const PRICE_ONE: u32 = 1 << PRICE_SHIFT;
+
+/// `round(log2(x) * 256)` for `x ≥ 1`, via the integer part plus a mantissa
+/// interpolation. Converts FSE normalized counts into per-code bit prices
+/// without floating point (the crate is `no_std`).
+fn log2_fp(x: u32) -> u32 {
+    if x <= 1 {
+        return 0;
+    }
+    let i = 31 - x.leading_zeros(); // floor(log2(x))
+    // Fractional part: the top PRICE_SHIFT mantissa bits below the leading 1.
+    let frac = if i >= PRICE_SHIFT {
+        (x >> (i - PRICE_SHIFT)) & (PRICE_ONE - 1)
     } else {
-        distance == reps[1] || distance == reps[2] || (reps[0] > 1 && distance == reps[0] - 1)
+        (x << (PRICE_SHIFT - i)) & (PRICE_ONE - 1)
     };
-    if is_rep {
-        return 4;
+    // Linear approximation log2(1 + f) ≈ f over the mantissa (max error
+    // < 0.09 bit) — plenty accurate for pricing decisions.
+    i * PRICE_ONE + frac
+}
+
+/// Per-code fractional-bit price model for one FSE-coded symbol stream.
+///
+/// Derived from the *actual* normalized counts the encoder will emit (after a
+/// first parse), so the DP prices each LL/ML/OF code at its real FSE cost
+/// instead of a flat constant. The cost of a symbol whose normalized count is
+/// `c` under accuracy log `al` is `al - log2(c)` bits (a state covering more of
+/// the table reads fewer bits) — the standard FSE entropy estimate zstd's
+/// optimal parser uses.
+struct CodePrices {
+    /// `bits << PRICE_SHIFT` per code index. Empty → always use `fallback`.
+    cost: Vec<u32>,
+    /// Price for a code absent from `cost` (or with count 0).
+    fallback: u32,
+}
+
+impl CodePrices {
+    /// Build from a normalized FSE count table (`counts[c]` may be -1 for the
+    /// "less than one" probability rows, which read `accuracy_log` bits).
+    fn from_counts(counts: &[i16], accuracy_log: u8) -> Self {
+        let al_fp = (accuracy_log as u32) * PRICE_ONE;
+        let mut cost = Vec::with_capacity(counts.len());
+        for &c in counts {
+            let bits = if c <= 0 {
+                // count -1 (and the degenerate 0) → one state, reads `al` bits.
+                al_fp
+            } else {
+                al_fp.saturating_sub(log2_fp(c as u32))
+            };
+            cost.push(bits);
+        }
+        Self {
+            cost,
+            fallback: al_fp,
+        }
+    }
+
+    #[inline]
+    fn code_cost(&self, code: u8) -> u32 {
+        self.cost
+            .get(code as usize)
+            .copied()
+            .unwrap_or(self.fallback)
+    }
+}
+
+/// Full fractional-bit price model used by the optimal parser.
+struct PriceModel {
+    ll: CodePrices,
+    ml: CodePrices,
+    of: CodePrices,
+    /// Average literal-byte cost (`bits << PRICE_SHIFT`) under the block's
+    /// Huffman tree. A single average is enough for the literal-vs-match
+    /// trade-off; per-byte pricing barely moves the parse.
+    lit: u32,
+}
+
+impl PriceModel {
+    /// Heuristic model for the first pass (no statistics yet). Mirrors the
+    /// original flat constants so pass 1 behaves like the previous encoder.
+    fn heuristic() -> Self {
+        Self {
+            ll: CodePrices {
+                cost: Vec::new(),
+                fallback: 6 * PRICE_ONE,
+            },
+            ml: CodePrices {
+                cost: Vec::new(),
+                fallback: 6 * PRICE_ONE,
+            },
+            of: CodePrices {
+                cost: Vec::new(),
+                fallback: 5 * PRICE_ONE,
+            },
+            lit: 9 * PRICE_ONE,
+        }
+    }
+
+    /// Price the offset part of a match: FSE code cost + extra bits. A distance
+    /// hitting one of the active repeat offsets uses code 0..=2 (cheap, no extra
+    /// bits); a fresh offset uses code `log2(distance+3)` with that many extras.
+    #[inline]
+    fn offset_price(&self, distance: u32, reps: &[u32; 3], ll: u32) -> u32 {
+        if let Some(code) = rep_match_code(distance, reps, ll) {
+            return self.of.code_cost(code);
+        }
+        let val = distance + 3;
+        let code = 31 - val.leading_zeros();
+        self.of.code_cost(code as u8) + code * PRICE_ONE
+    }
+
+    /// Price the literal-length + match-length FSE codes plus their extra bits.
+    #[inline]
+    fn ll_ml_price(&self, literal_length: u32, match_length: u32) -> u32 {
+        let (lc, lb, _) = ll_code(literal_length);
+        let (mc, mb, _) = ml_code(match_length);
+        self.ll.code_cost(lc) + self.ml.code_cost(mc) + (lb + mb) * PRICE_ONE
+    }
+
+    #[inline]
+    fn lit_run_price(&self, run: u32) -> u32 {
+        self.lit.saturating_mul(run)
     }
-    // Fresh offset: `code` extra bits (the literal low bits of the distance)
-    // plus the FSE-coded offset code itself (~5 bits amortised). The FSE code
-    // adapts to the block, so it is NOT another `log2(D)` — charging that would
-    // double-count and push the DP away from good long-distance matches.
-    let val = distance + 3;
-    let code = 31 - val.leading_zeros();
-    code + 5
 }
 
-/// Estimated bit cost of the literal-length / match-length FSE codes plus
-/// their extra bits for a sequence with the given run/length.
-fn ll_ml_price(literal_length: u32, match_length: u32) -> u32 {
-    let (_lc, lb, _lv) = ll_code(literal_length);
-    let (_mc, mb, _mv) = ml_code(match_length);
-    10 + lb + mb
+/// If `distance` matches an active repeat offset, return the FSE *offset code*
+/// it encodes to (0, 1, or 2 → offset_value 1, 2, 3). Mirrors the aliasing
+/// rules in [`assign_offset`]: the `literal_length == 0` case shifts the codes.
+#[inline]
+fn rep_match_code(distance: u32, reps: &[u32; 3], ll: u32) -> Option<u8> {
+    if ll > 0 {
+        if distance == reps[0] {
+            Some(0)
+        } else if distance == reps[1] {
+            Some(1)
+        } else if distance == reps[2] {
+            Some(2)
+        } else {
+            None
+        }
+    } else if distance == reps[1] {
+        Some(0)
+    } else if distance == reps[2] {
+        Some(1)
+    } else if reps[0] > 1 && distance == reps[0] - 1 {
+        Some(2)
+    } else {
+        None
+    }
 }
 
 /// Update the repeat-offset ring after a match (mirrors `assign_offset`'s
@@ -733,31 +899,58 @@ fn advance_reps(distance: u32, literal_length: u32, reps: &[u32; 3]) -> [u32; 3]
 /// greedy/lazy parsing comes from (their offset extra bits dominate output).
 ///
 /// Returns the chosen sequences (in order) and the final repeat-offset ring.
-fn optimal_parse(
-    matcher: &mut MatchFinder,
-    buffer: &[u8],
+///
+/// Invariant block-level inputs (where the block starts, the incoming repeat
+/// ring, and the match-finder knobs) are grouped in [`ParseConfig`] so the same
+/// settings flow through every repricing pass without a long argument list.
+struct ParseConfig {
+    /// First byte of the block within `buffer`; earlier bytes are retained
+    /// history that matches may reference but the block does not re-emit.
+    start: usize,
+    /// Repeat-offset ring entering the block (carried across blocks).
     init_offsets: [u32; 3],
+    /// Hash-chain depth cap and "good enough" match length for the finder.
     max_chain: usize,
     nice_match: usize,
+}
+
+fn optimal_parse(
+    matcher: &mut MatchFinder,
+    buffer: &[u8],
+    cfg: &ParseConfig,
+    model: &PriceModel,
+    block_indexed: bool,
 ) -> (Vec<Seq>, [u32; 3]) {
+    let ParseConfig {
+        start,
+        init_offsets,
+        max_chain,
+        nice_match,
+    } = *cfg;
     let n = buffer.len();
-    if n < MIN_MATCH + 1 {
+    if n < start + MIN_MATCH + 1 {
         return (Vec::new(), init_offsets);
     }
 
-    // Insert every hashable position up front so chain walks see the whole
-    // block (back-references only look earlier, so insertion order within the
-    // block doesn't affect correctness).
-    matcher.resize_for(n);
-    for i in 0..n.saturating_sub(3) {
-        matcher.insert(buffer, i);
-    }
+    // The caller has already indexed the retained history (`[0, start)`). On the
+    // first parse pass we splice in this block's positions lazily as the DP
+    // advances, so the chains never contain positions ahead of the probe (the
+    // standard LZ invariant). Repricing passes reuse the now-complete index.
+    let hash_end = n.saturating_sub(3);
+    let mut next_insert = start;
 
+    // DP arrays are block-relative: index `j` maps to absolute position
+    // `start + j`, so memory stays proportional to the block (not the window).
+    let m = n - start;
     const INF: u32 = u32::MAX;
-    let mut price: Vec<u32> = vec![INF; n + 1];
-    // Back-pointer: (prev_pos, match_len, match_dist). match_len == 0 → literal.
-    let mut back: Vec<(u32, u32, u32)> = vec![(0, 0, 0); n + 1];
-    let mut reps_at: Vec<[u32; 3]> = vec![init_offsets; n + 1];
+    let mut price: Vec<u32> = vec![INF; m + 1];
+    // Back-pointer: (prev_abs_pos, match_len, match_dist). match_len == 0 → literal.
+    let mut back: Vec<(u32, u32, u32)> = vec![(0, 0, 0); m + 1];
+    let mut reps_at: Vec<[u32; 3]> = vec![init_offsets; m + 1];
+    // Length of the literal run ending at each node on its cheapest path. Used
+    // to (a) price the literal-length FSE code at the following match and (b)
+    // resolve repeat-offset aliasing (the LL==0 case shifts the rep codes).
+    let mut runlen_at: Vec<u32> = vec![0; m + 1];
     price[0] = 0;
 
     // Step length sparsely for long matches to bound DP work. Dense up to 128
@@ -774,46 +967,68 @@ fn optimal_parse(
 
     let mut cands: Vec<crate::zstd::matcher::Match> = Vec::new();
 
-    for i in 0..n {
-        let base = price[i];
+    for j in 0..m {
+        let i = start + j; // absolute input position
+
+        // Splice this position into the hash chain before it is probed (first
+        // pass only). Done unconditionally — even for unreachable nodes — so the
+        // chain stays complete and ordered; the match finder only ever looks at
+        // positions strictly before the probe.
+        if !block_indexed {
+            while next_insert <= i {
+                if next_insert < hash_end {
+                    matcher.insert(buffer, next_insert);
+                }
+                next_insert += 1;
+            }
+        }
+
+        let base = price[j];
         if base == INF {
             continue;
         }
-        let cur_reps = reps_at[i];
-
-        // Option A: emit a literal.
-        let lit_cand = base.saturating_add(LIT_PRICE);
-        if lit_cand < price[i + 1] {
-            price[i + 1] = lit_cand;
-            back[i + 1] = (i as u32, 0, 0);
-            reps_at[i + 1] = cur_reps;
+        let cur_reps = reps_at[j];
+
+        // Option A: emit a literal. Carries the running literal-run length so
+        // the next match can price its literal-length FSE code correctly.
+        let lit_cand = base.saturating_add(model.lit_run_price(1));
+        if lit_cand < price[j + 1] {
+            price[j + 1] = lit_cand;
+            back[j + 1] = (i as u32, 0, 0);
+            reps_at[j + 1] = cur_reps;
+            runlen_at[j + 1] = runlen_at[j] + 1;
         }
 
         if i + MIN_MATCH > n {
             continue;
         }
-        // Proxy literal-length for offset rep-aliasing: the common case is a
-        // sequence following some literals (LL>0, reps map to codes 1..=3).
-        let ll_proxy = 1u32;
+        // Actual literal-run length on the cheapest path to `i`. Drives both the
+        // repeat-offset aliasing (LL==0 shifts the rep codes) and the LL FSE
+        // code charged on the emitted sequence.
+        let ll = runlen_at[j];
+        // ML code + ML extra bits + LL code + LL extra bits for this sequence.
+        // The literal *bytes* are already priced step-by-step above, so only the
+        // length codes are charged here.
+        let ll_ml = |l: u32| model.ll_ml_price(ll, l);
 
         // Option B1: repeat-offset matches at the three active distances.
         for &d in &cur_reps {
             if d == 0 || (d as usize) > i {
                 continue;
             }
-            let m = matcher.check_repeat_offset(buffer, i, d as usize);
-            if m >= MIN_MATCH {
-                let max_l = m.min(n - i);
-                let off = offset_price(d, &cur_reps, ll_proxy);
+            let mm = matcher.check_repeat_offset(buffer, i, d as usize);
+            if mm >= MIN_MATCH {
+                let max_l = mm.min(n - i);
+                let off = model.offset_price(d, &cur_reps, ll);
+                let next_reps = advance_reps(d, ll, &cur_reps);
                 let mut l = MIN_MATCH;
                 while l <= max_l {
-                    let cost = base
-                        .saturating_add(off)
-                        .saturating_add(ll_ml_price(0, l as u32));
-                    if cost < price[i + l] {
-                        price[i + l] = cost;
-                        back[i + l] = (i as u32, l as u32, d);
-                        reps_at[i + l] = advance_reps(d, ll_proxy, &cur_reps);
+                    let cost = base.saturating_add(off).saturating_add(ll_ml(l as u32));
+                    if cost < price[j + l] {
+                        price[j + l] = cost;
+                        back[j + l] = (i as u32, l as u32, d);
+                        reps_at[j + l] = next_reps;
+                        runlen_at[j + l] = 0;
                     }
                     if l == max_l {
                         break;
@@ -827,17 +1042,22 @@ fn optimal_parse(
         matcher.collect_matches(buffer, i, n, max_chain, nice_match, &mut cands);
         for c in &cands {
             let d = c.distance as u32;
+            // Skip candidates that coincide with a repeat offset — already
+            // priced (more cheaply) in B1.
+            if rep_match_code(d, &cur_reps, ll).is_some() {
+                continue;
+            }
             let max_l = c.length.min(n - i);
-            let off = offset_price(d, &cur_reps, ll_proxy);
+            let off = model.offset_price(d, &cur_reps, ll);
+            let next_reps = advance_reps(d, ll, &cur_reps);
             let mut l = MIN_MATCH;
             while l <= max_l {
-                let cost = base
-                    .saturating_add(off)
-                    .saturating_add(ll_ml_price(0, l as u32));
-                if cost < price[i + l] {
-                    price[i + l] = cost;
-                    back[i + l] = (i as u32, l as u32, d);
-                    reps_at[i + l] = advance_reps(d, ll_proxy, &cur_reps);
+                let cost = base.saturating_add(off).saturating_add(ll_ml(l as u32));
+                if cost < price[j + l] {
+                    price[j + l] = cost;
+                    back[j + l] = (i as u32, l as u32, d);
+                    reps_at[j + l] = next_reps;
+                    runlen_at[j + l] = 0;
                 }
                 if l == max_l {
                     break;
@@ -849,11 +1069,11 @@ fn optimal_parse(
 
     // Backtrack to recover the chosen steps, then emit sequences forward.
     let mut steps: Vec<(u32, u32)> = Vec::new(); // (match_len, match_dist); 0 = literal
-    let mut i = n;
-    while i > 0 {
-        let (prev, mlen, mdist) = back[i];
+    let mut j = m;
+    while j > 0 {
+        let (prev_abs, mlen, mdist) = back[j];
         steps.push((mlen, mdist));
-        i = prev as usize;
+        j = (prev_abs as usize) - start;
     }
     steps.reverse();
 
@@ -879,6 +1099,180 @@ fn optimal_parse(
     (sequences, block_offsets)
 }
 
+/// Lowest level at which the optimal parser runs a second, statistics-driven
+/// repricing pass over the block. The first pass uses a flat heuristic model;
+/// the second rebuilds the price model from the actual LL/ML/OF code and
+/// literal-byte statistics and reparses, so prices reflect the entropy coding
+/// the block will actually use. Mirrors zstd's `btultra2` two-pass parse.
+const REPRICE_LEVEL: u8 = 15;
+
+/// Build a statistics-driven [`PriceModel`] from a completed parse's sequences
+/// and the block's literal bytes. The FSE code prices come from the same
+/// normalized-count tables [`build_sequences_section`] / [`pick_table`] would
+/// pick; the literal price is the Huffman entropy of the literal bytes.
+fn build_price_model(buffer: &[u8], start: usize, sequences: &[Seq]) -> PriceModel {
+    // Histogram the LL/ML/OF codes exactly as the sequence section will.
+    let mut ll_hist = [0u32; 36];
+    let mut ml_hist = [0u32; 53];
+    let mut of_hist = [0u32; 32];
+    for s in sequences {
+        let (lc, _, _) = ll_code(s.literal_length);
+        let (mc, _, _) = ml_code(s.match_length);
+        let (oc, _, _) = of_code(s.offset_value);
+        ll_hist[lc as usize] += 1;
+        ml_hist[mc as usize] += 1;
+        of_hist[(oc as usize).min(31)] += 1;
+    }
+    let n = sequences.len() as u32;
+
+    let ll = code_prices_for(&ll_hist, &DEFAULT_LL_COUNTS, DEFAULT_LL_ACCURACY_LOG, 9, n);
+    let of = code_prices_for(&of_hist, &DEFAULT_OF_COUNTS, DEFAULT_OF_ACCURACY_LOG, 8, n);
+    let ml = code_prices_for(&ml_hist, &DEFAULT_ML_COUNTS, DEFAULT_ML_ACCURACY_LOG, 9, n);
+
+    // Literal-byte price: average Huffman code length over the actual literal
+    // bytes of this block (the `[start, end)` tail). Reconstruct the literal
+    // stream from the sequences, build the same tree the literals section would,
+    // and average its code lengths.
+    let all_literals = reconstruct_literals(buffer, start, sequences);
+    let lit = literal_price(&all_literals);
+
+    PriceModel { ll, ml, of, lit }
+}
+
+/// Replay `sequences` over `buffer[start..]` to recover the block's literal
+/// bytes (the bytes not covered by any match). Shared by the price model and
+/// the cost predictor so they always agree with [`finish_compressed_block`].
+fn reconstruct_literals(buffer: &[u8], start: usize, sequences: &[Seq]) -> Vec<u8> {
+    let mut out: Vec<u8> = Vec::with_capacity(buffer.len() - start);
+    let mut cursor = start;
+    for s in sequences {
+        let ll = s.literal_length as usize;
+        out.extend_from_slice(&buffer[cursor..cursor + ll]);
+        cursor += ll + s.match_length as usize;
+    }
+    out.extend_from_slice(&buffer[cursor..]);
+    out
+}
+
+/// Pick the normalized FSE counts the encoder would actually emit for a code
+/// stream (custom if it beats predefined, else predefined) and turn them into a
+/// [`CodePrices`]. `al_cap` bounds the custom accuracy log; `n` is the sequence
+/// count (used to size the custom table the way [`pick_table`] does).
+fn code_prices_for(
+    hist: &[u32],
+    default_counts: &[i16],
+    default_al: u8,
+    al_cap: u8,
+    n: u32,
+) -> CodePrices {
+    if n == 0 {
+        return CodePrices::from_counts(default_counts, default_al);
+    }
+    // Mirror pick_table's accuracy-log selection for the custom table.
+    let mut al = al_cap;
+    while al > 5 && (1u32 << al) > n * 4 {
+        al -= 1;
+    }
+    if al < 5 {
+        al = 5;
+    }
+    if let Some(counts) = build_normalised_counts(hist, n, al) {
+        // Compare predicted bitstream cost the way pick_table does; only use the
+        // custom table's prices when it is the one that would be emitted.
+        let pred_custom = predict_fse_bits(&counts, hist, al);
+        let pred_default = predict_fse_bits(default_counts, hist, default_al);
+        let header = encode_fse_table_header(&counts, al);
+        let custom_bytes = (pred_custom / 8 + 1) as usize + header.len();
+        let default_bytes = (pred_default / 8 + 1) as usize;
+        if custom_bytes + 2 < default_bytes {
+            return CodePrices::from_counts(&counts, al);
+        }
+    }
+    CodePrices::from_counts(default_counts, default_al)
+}
+
+/// Average Huffman code length (as `bits << PRICE_SHIFT`) over `literals`,
+/// using the same canonical tree the literals section builds. Falls back to a
+/// flat 8 bits when the alphabet can't be Huffman-coded (tiny/degenerate input).
+fn literal_price(literals: &[u8]) -> u32 {
+    if literals.is_empty() {
+        return 8 * PRICE_ONE;
+    }
+    let freq = histogram(literals);
+    if let Some(lengths) = build_huff_lengths(&freq) {
+        let mut total_bits: u64 = 0;
+        let mut total_syms: u64 = 0;
+        for (b, &f) in freq.iter().enumerate() {
+            if f == 0 {
+                continue;
+            }
+            let len = lengths[b] as u64;
+            total_bits += (f as u64) * len.max(1);
+            total_syms += f as u64;
+        }
+        // Average bits/symbol in fixed point. Clamp to ≥1 bit so the DP never
+        // treats literals as free.
+        if let Some(avg) = (total_bits * PRICE_ONE as u64).checked_div(total_syms) {
+            return (avg as u32).max(PRICE_ONE);
+        }
+    }
+    8 * PRICE_ONE
+}
+
+/// Drive the optimal parser: a first heuristic-priced pass, then (at high
+/// levels) one or more statistics-driven repricing passes. Each repricing pass
+/// rebuilds the price model from the previous pass's sequences and reparses;
+/// the best (smallest predicted body) result wins. Returns the chosen sequences
+/// and final repeat-offset ring.
+fn optimal_parse_2pass(
+    matcher: &mut MatchFinder,
+    buffer: &[u8],
+    cfg: &ParseConfig,
+    reprice_passes: u32,
+) -> (Vec<Seq>, [u32; 3]) {
+    let start = cfg.start;
+    // Pass 1: flat heuristic prices. Splices the current block into the chains
+    // lazily (history was pre-indexed by the caller); later passes reuse them.
+    let mut model = PriceModel::heuristic();
+    let (mut best_seqs, mut best_offsets) = optimal_parse(matcher, buffer, cfg, &model, false);
+    if best_seqs.is_empty() {
+        return (best_seqs, best_offsets);
+    }
+    let mut best_cost = predicted_block_cost(buffer, start, &best_seqs);
+
+    // Repricing passes: rebuild the model from the current best parse and
+    // reparse. Keep iterating while it strictly improves the predicted cost.
+    for _ in 0..reprice_passes {
+        model = build_price_model(buffer, start, &best_seqs);
+        let (seqs, offsets) = optimal_parse(
+            matcher, buffer, cfg, &model, true, // chains already populated by pass 1
+        );
+        if seqs.is_empty() {
+            break;
+        }
+        let cost = predicted_block_cost(buffer, start, &seqs);
+        if cost < best_cost {
+            best_cost = cost;
+            best_seqs = seqs;
+            best_offsets = offsets;
+        } else {
+            break; // no further improvement
+        }
+    }
+
+    (best_seqs, best_offsets)
+}
+
+/// Predict the total compressed body size (in bytes) for a parse, used to pick
+/// the better of two repricing passes. Sums the literals section and sequences
+/// section the encoder would actually emit. Cheap relative to the parse itself.
+fn predicted_block_cost(buffer: &[u8], start: usize, sequences: &[Seq]) -> usize {
+    let all_literals = reconstruct_literals(buffer, start, sequences);
+    let (lit_section, _) = build_literals_section(&all_literals, None);
+    let seq_section = build_sequences_section(sequences);
+    lit_section.len() + seq_section.len()
+}
+
 /// Find the best (distance, length) match at `pos`, mixing repeat-offset
 /// probes with a hash-chain search.
 ///
@@ -1518,6 +1912,7 @@ impl RawEncoder for Encoder {
 
     fn raw_reset(&mut self) {
         self.state = State::Accepting;
+        self.history.clear();
         self.pending.clear();
         self.out_buf.clear();
         self.out_idx = 0;
diff --git a/tests/lz4_frame.rs b/tests/lz4_frame.rs
index bc4eee4..b850efb 100644
--- a/tests/lz4_frame.rs
+++ b/tests/lz4_frame.rs
@@ -418,6 +418,75 @@ fn linked_blocks_back_reference_across_boundary() {
     round_trip_with(cfg, &v);
 }
 
+/// Linked blocks must compress strictly better than independent blocks when
+/// the payload's second block repeats data from the first block's tail —
+/// because only linked mode can back-reference across the boundary.
+#[test]
+fn linked_beats_independent_across_boundary() {
+    // First 64 KiB: pseudo-random (so it is not itself very compressible).
+    // Second block: a verbatim copy of the first block's last stretch, which
+    // only a cross-block reference can exploit.
+    let mut first = Vec::with_capacity(64 * 1024);
+    let mut state: u32 = 0x1357_9BDF;
+    for _ in 0..(64 * 1024) {
+        state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+        first.push((state >> 16) as u8);
+    }
+    let mut payload = first.clone();
+    // Second block reuses the tail of the first block verbatim.
+    payload.extend_from_slice(&first[16 * 1024..]);
+
+    let linked = EncoderConfig {
+        block_max_size: BlockMaxSize::Max64KB,
+        block_independence: false,
+        level: 9,
+        ..Default::default()
+    };
+    let independent = EncoderConfig {
+        block_independence: true,
+        ..linked
+    };
+
+    let enc_linked = encode_with_cfg_chunked(linked, &payload, 8192, 8192);
+    let enc_indep = encode_with_cfg_chunked(independent, &payload, 8192, 8192);
+
+    // Both must round-trip.
+    round_trip_with(linked, &payload);
+    round_trip_with(independent, &payload);
+
+    assert!(
+        enc_linked.len() < enc_indep.len(),
+        "linked {} should be smaller than independent {}",
+        enc_linked.len(),
+        enc_indep.len()
+    );
+}
+
+/// Our linked-block (default) encoder output must decode byte-for-byte through
+/// the system `lz4` tool across multiple blocks — the core cross-block
+/// reference path.
+#[test]
+fn cross_tool_linked_multiblock_our_encode_system_decode() {
+    if !system_lz4_available() {
+        eprintln!("skipping cross-tool test: `lz4` not on PATH");
+        return;
+    }
+    // ~200 KiB so it spans several 64 KiB blocks, with long-range repetition
+    // that the linked window exploits across boundaries.
+    let unit = b"the quick brown fox jumps over the lazy dog. ".repeat(64);
+    let mut payload = Vec::new();
+    while payload.len() < 200 * 1024 {
+        payload.extend_from_slice(&unit);
+    }
+    let cfg = EncoderConfig {
+        level: 9,
+        ..Default::default()
+    };
+    let encoded = encode_with_cfg_chunked(cfg, &payload, 4096, 4096);
+    let decoded = system_decompress(&encoded).expect("system lz4 -dc failed");
+    assert_eq!(decoded, payload);
+}
+
 // ─── Cross-tool: system `lz4 -dc` ────────────────────────────────────────
 
 fn system_lz4_available() -> bool {