diff --git a/CHANGELOG.md b/CHANGELOG.md
index 05f4b82..7f6cfdb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,37 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- **Raw LZMA2 encoder** (`lzma2`): `compcol::lzma2::Lzma2` now encodes as well
+  as decodes — it emits the raw 7-Zip LZMA2 chunk stream (full dict/props/state
+  reset per chunk, uncompressed-chunk fallback when compression would expand,
+  `0x00` end marker), reusing the xz LZMA2 chunk codec. The dictionary size is
+  out of band (the 7z coder property); the encoder uses the 4 MiB default so a
+  default-config decoder round-trips. Validated by round-trip and by decoding
+  the output through the shared xz LZMA2 codec.
+- **LZFSE `bvx2` decoding** (`lzfse`): the core LZFSE v2 block type (LZ77 +
+  Finite State Entropy) now decodes — full v2 header parse, 4-way interleaved
+  literal FSE, three interleaved L/M/D FSE streams (reverse bitstreams), and LZ
+  reconstruction. The FSE table construction matches Apple's general
+  `fse_init_decoder_table` (the `k`/`k-1` split), so arbitrary frequency tables
+  are handled, not just power-of-two ones. Validated by round-trip against an
+  in-crate v2 encoder plus a frozen hand-written non-dyadic vector; there is no
+  Apple `lzfse` tool in the build environment, so real-stream interop is
+  best-effort but follows the documented format precisely. `bvx1` (v1) remains
+  `Unsupported`.
+
+### Changed
+
+- **lz5 (Lizard) Huffman sub-streams** stay `Unsupported`, now with a precise
+  rationale in the module docs: the Huff0 entropy stage selects X1/X2 from
+  `(regenSize, comprLen)` at runtime and there is no reference encoder or
+  fixture available to validate a decoder bit-exactly, so — consistent with the
+  crate's `lzham`/`sit13` policy — it is left honest rather than shipped blind.
+  The docs record the concrete reuse path (zstd's X1 Huff0 decoder + an X2
+  decoder + the `HUF_selectDecoder` heuristic) for a future round with fixtures.
+
+
 ### Added
 
 - **HTTP/3 QPACK header compression** (RFC 9204) behind the new `qpack`
diff --git a/README.md b/README.md
index 3bee9b0..567f56e 100644
--- a/README.md
+++ b/README.md
@@ -47,14 +47,14 @@ flag, and a `compcol` binary turns the library into a Unix-style filter.
 | LZW (`compress(1)` `.Z`) | `lzw` | `.lzw` | full | full | `compress(1)` / `uncompress(1)` |
 | LZMA (legacy `.lzma`) | `lzma` | `.lzma` | full | full | `python3 -m lzma` (FORMAT_ALONE) |
 | xz | `xz` | `.xz` | compressed-LZMA2 chunks + uncompressed fallback | full envelope + all reset variants | `xz(1)` both directions |
-| Raw LZMA2 (7z coder 21) | `lzma2` | `.lzma2` | `Unsupported` (decode-only) | full (raw LZMA2 chunk stream; reuses the xz LZMA2 engine) | round-trip vs the xz LZMA2 encoder |
+| Raw LZMA2 (7z coder 21) | `lzma2` | `.lzma2` | full (raw LZMA2 chunk stream; reuses the xz LZMA2 engine) | full (raw LZMA2 chunk stream; reuses the xz LZMA2 engine) | round-trip + cross-decode via the shared xz LZMA2 codec |
 | Zstandard (RFC 8478) | `zstd` | `.zst` | LZ77 + Huffman literals + FSE_Compressed_Mode sequences + repeat offsets + RLE blocks | full Compressed_Block | `zstd(1)` both directions |
 | Brotli (RFC 7932) | `brotli` | `.br` | LZ77 + length-limited Huffman + 704-symbol IC alphabet + static-dictionary refs | full (with 122 KiB static dictionary) | `brotli(1)` both directions |
 | LZO (LZO1X-1) | `lzo` | `.lzo` | LZ77 hash matcher | full | `python3 -c "import lzo"` |
 | LZX (Microsoft CAB / WIM) | `lzx` | `.lzx` | uncompressed blocks only | full (verbatim + aligned-offset + uncompressed; E8 filter) | — |
 | Amiga LZX (original 1995 Forbes) | `amiga_lzx` | — (`.lzx` claimed by MS LZX) | uncompressed blocks only | full (verbatim + aligned + uncompressed; fixed 64 KiB window, no chunk reset, no E8 filter) | — |
 | Quantum (Stac, old CAB) | `quantum` | `.q` | `Unsupported` (no public encoder exists) | full (libmspack-equivalent) | libmspack regression fixtures |
-| LZFSE (Apple) | `lzfse` | `.lzfse` | `Unsupported` (decoder-only) | `bvx-` raw + `bvxn` (LZVN); `bvx2` returns `Unsupported` | hand-built fixtures (no Apple toolchain bundled) |
+| LZFSE (Apple) | `lzfse` | `.lzfse` | `Unsupported` (decoder-only) | `bvx-` raw + `bvxn` (LZVN) + `bvx2` (LZ77 + FSE); `bvx1` returns `Unsupported` | round-trip (bvx2 vs own FSE encoder; no Apple toolchain bundled) |
 | ADC (Apple DMG) | `adc` | `.adc` | LZSS-style greedy match-finder | full | hand-built fixtures |
 | bzip2 | `bzip2` | `.bz2` | full (RLE-1 + SA-IS BWT + MTF + RLE-2 + dynamic Huffman) | full | `bzip2(1)` both directions |
 | PPMd (Shkarin's PPMII variant H) | `ppmd` | `.ppmd` | `Unsupported` (decoder-only; PPM model is intricate) | full (used in 7z / RAR3+ / ZIP method 98) | `python3 ppmd-cffi` |
@@ -427,7 +427,7 @@ lzw     = ["alloc"]
 lzo     = ["alloc"]
 lzx     = ["alloc"]
 quantum = ["alloc"]
-lzfse   = ["alloc"]            # decoder-only, bvx2 returns Unsupported
+lzfse   = ["alloc"]            # decoder-only; bvx-/bvxn/bvx2, bvx1 Unsupported
 adc     = ["alloc"]
 rar1    = ["alloc"]
 rar2    = ["alloc"]
diff --git a/src/lz5/block.rs b/src/lz5/block.rs
index a674319..f06ebb9 100644
--- a/src/lz5/block.rs
+++ b/src/lz5/block.rs
@@ -10,6 +10,19 @@
 //! Only the LZ4-codeword sequence loop (levels 10..=19, 30..=39) with
 //! all sub-streams stored raw (no Huffman entropy stage) is
 //! implemented; everything else returns [`Error::Unsupported`].
+//!
+//! Two paths stay `Unsupported` for documented, validation-driven
+//! reasons (see the inline comments at the `huffman_bits` and LIZv1
+//! rejections below):
+//!
+//!  * **Huff0 entropy stage** (any sub-stream flag bit set): Lizard's
+//!    generic `HUF_decompress` recomputes an X1-vs-X2 decoder choice
+//!    that is never carried in the stream; the crate has only an X1
+//!    Huff0 decoder (private to `zstd`), and there is no `lizard` CLI
+//!    or fixture here to validate an X2 decoder against. A round-trip
+//!    against our own X1-only encoder would prove nothing.
+//!  * **LIZv1 codewords** (levels 20..=29, 40..=49): a separate, larger
+//!    sequence format, out of scope for this round.
 
 use alloc::vec::Vec;
 
@@ -61,6 +74,12 @@ pub fn decode_compressed_block(input: &[u8], out: &mut Vec<u8>, cap: usize) -> R
     // Lizard groups levels by decompression strategy:
     //   10..=19, 30..=39  →  LZ4 codewords (this build supports)
     //   20..=29, 40..=49  →  LIZv1 codewords (not supported)
+    //
+    // LIZv1 is a distinct, larger sequence format (`Lizard_decompress_LIZv1`
+    // vs `Lizard_decompress_LZ4` in the reference): different token layout,
+    // explicit `lengths`/`offset16`/`offset24` streams, and a 24-bit offset
+    // path. Implementing it is a separate effort from the Huffman stage and
+    // is out of scope for this round, so it stays `Unsupported`.
     let is_lz4_mode = matches!(clevel, 10..=19 | 30..=39);
     if !is_lz4_mode {
         return Err(Error::Unsupported);
@@ -96,8 +115,38 @@ pub fn decode_compressed_block(input: &[u8], out: &mut Vec<u8>, cap: usize) -> R
     if res & FLAG_LEN != 0 {
         return Err(Error::Corrupt);
     }
-    // Any Huffman bit set on a sub-stream means we'd need to FSE-Huffman
-    // decode that stream. Out of scope.
+    // Any Huffman bit set on a sub-stream means the stream is entropy-coded
+    // with Huff0 (Yann Collet's FiniteStateEntropy library) and must be
+    // `HUF_decompress`'d before the sequence loop runs. Each such sub-stream
+    // is framed as a 6-byte header (3-byte LE regenerated size + 3-byte LE
+    // compressed size) followed by `compressed_size` bytes of Huff0 payload
+    // (`Lizard_readStream` → `HUF_decompress(op, regenSize, ip + 6, comprLen)`).
+    //
+    // This stays `Unsupported`. The decision is deliberate, not a TODO —
+    // there is no faithful way to *validate* such a decoder in this
+    // environment, and the crate's policy (see `lzham`, `sit13`) is to mark
+    // formats we cannot validate bit-exactly as `Unsupported` rather than
+    // ship a blind decoder. Concretely:
+    //
+    //   * The crate already has a Huff0 decoder in `src/zstd/huffman.rs`, but
+    //     it is (a) private to the `zstd` module (`mod huffman;`, not
+    //     reachable from here without re-exporting it) and (b) implements
+    //     only the **X1** (single-symbol) decode table that zstd's *literals*
+    //     spec restricts itself to.
+    //   * Lizard calls the *generic* `HUF_decompress`, which selects **X1 or
+    //     X2** (double-symbol) at runtime via `HUF_selectDecoder`. That
+    //     choice is **recomputed from (regenSize, comprLen)** and is **never
+    //     stored in the stream**, so a conformant decoder must implement both
+    //     X1 and X2 *and* reproduce `HUF_selectDecoder`'s timing heuristic
+    //     exactly. The crate has no X2 decoder anywhere. (The 4-stream jump
+    //     table — three LE u16 sizes — does match zstd's literals framing, so
+    //     that part would be reusable; the X1/X2 split is the blocker.)
+    //   * The lz5 encoder here is store-only, and there is no `lizard` CLI or
+    //     Huff0 fixture in this environment. A round-trip against a
+    //     hand-written X1-only encoder would always select X1 and "pass"
+    //     while proving nothing about a real (possibly X2) Lizard block — a
+    //     self-validating fiction. Absent a real fixture or reference
+    //     encoder there is no honest round-trip, so we do not ship.
     let huffman_bits = res & (FLAG_LITERALS | FLAG_FLAGS | FLAG_OFFSET16 | FLAG_OFFSET24);
     if huffman_bits != 0 {
         return Err(Error::Unsupported);
diff --git a/src/lz5/mod.rs b/src/lz5/mod.rs
index 1a21eab..ea27572 100644
--- a/src/lz5/mod.rs
+++ b/src/lz5/mod.rs
@@ -29,9 +29,34 @@
 //! **Decoder**: implemented for the **LZ4 codeword path with all
 //! sub-streams stored raw** (the most common shape produced by the
 //! reference CLI at levels 10..=19 on non-tiny inputs). Frames whose
-//! blocks use the LIZv1 sequence format (levels 20..=29) or any
-//! Huffman-coded sub-stream (levels 30+) are rejected with
-//! [`Error::Unsupported`]. The frame-level uncompressed block path
+//! blocks use the LIZv1 sequence format (levels 20..=29, 40..=49) or any
+//! Huffman-coded sub-stream are rejected with [`Error::Unsupported`].
+//!
+//! The Huffman path stays `Unsupported` for a concrete, validation-first
+//! reason rather than mere absence of effort. Lizard's entropy stage is
+//! Huff0 (`HUF_decompress` from Yann Collet's FiniteStateEntropy), the
+//! same family as zstd's literals Huffman, and each Huffman sub-stream is
+//! framed as a 6-byte header (3-byte LE regenerated size + 3-byte LE
+//! compressed size) then the Huff0 payload. But the *generic*
+//! `HUF_decompress` Lizard calls selects between **X1** (single-symbol)
+//! and **X2** (double-symbol) decode tables via `HUF_selectDecoder`, and
+//! that choice is **recomputed from the regenerated/compressed sizes,
+//! never stored in the stream**. This crate's Huff0 decoder
+//! (`src/zstd/huffman.rs`) is X1-only and is private to the `zstd`
+//! module; it covers neither X2 nor the size-driven selector. With no
+//! `lizard` CLI and no Huff0 fixtures in this environment, the only
+//! "test" available would be a round-trip against a hand-written
+//! X1-only encoder, which would always pick X1 and therefore validate
+//! nothing about real (possibly X2) blocks. Per the crate's
+//! `lzham`/`sit13` policy, an unvalidatable decoder is worse than an
+//! honest `Unsupported`, so we do not ship one.
+//!
+//! A future round could lift this once validation is possible: expose
+//! zstd's X1 Huff0 decoder as `pub(crate)`, add an X2 decoder plus the
+//! `HUF_selectDecoder` heuristic, and validate against fixtures from the
+//! `lizard` CLI (e.g. `lizard -30`). The 6-byte sub-stream header and the
+//! 4-stream jump table (three LE u16 sizes) already match formats this
+//! crate parses elsewhere. The frame-level uncompressed block path
 //! (high bit on block-size word) is handled fully, so frames where
 //! every block stored raw decode without ever exercising the sequence
 //! loop. Block checksums (FLG bit 4) and external dictionaries are
diff --git a/src/lzfse/decoder.rs b/src/lzfse/decoder.rs
index da14f70..d4eff69 100644
--- a/src/lzfse/decoder.rs
+++ b/src/lzfse/decoder.rs
@@ -59,10 +59,12 @@ enum State {
 enum BlockKind {
     Uncompressed,
     Lzvn,
-    /// `bvx2` returns Unsupported once we've parsed its header far enough
-    /// to know we hit it; this variant exists so the state machine can
-    /// surface that decision uniformly with the other block kinds.
+    /// `bvx2` (LZFSE v2): FSE + LZ77. Decoded by [`lzfse_v2::decode_block`]
+    /// once the whole block (variable-length header + both payload streams)
+    /// is buffered.
     V2,
+    /// `bvx1` (LZFSE v1, uncompressed-freq variant): not emitted by modern
+    /// encoders; returns [`Error::Unsupported`].
     V1,
 }
 
@@ -216,23 +218,56 @@ impl Decoder {
                         };
                     }
                     BlockKind::V2 => {
-                        // We don't decode v2 in this build, but we need to
-                        // skip past the block cleanly so callers don't
-                        // confuse "block we can't decode" with "garbage".
-                        // Parse the n_payload_bytes field from the header.
-                        if self.input_buf.len() < lzfse_v2::V2_HEADER_FIXED_BYTES {
+                        // The v2 header is variable-length (FSE frequency
+                        // tables follow the fixed packed fields). Buffer the
+                        // fixed 28 bytes (post-magic: n_raw + three u64 words)
+                        // first so we can read `header_size` and the payload
+                        // sizes, then arrange to buffer the whole block (header
+                        // + payload) before decoding it in one shot.
+                        let fixed = lzfse_v2::V2_HEADER_FIXED_BYTES;
+                        if self.input_buf.len() < fixed {
                             return Ok(RawProgress {
                                 consumed,
                                 written,
                                 done: false,
                             });
                         }
-                        // We *could* skip past the v2 block, but the spec is
-                        // explicit that the encoder may mix block types
-                        // freely. Returning Unsupported here is the
-                        // documented behaviour for v2 in this build.
-                        self.poisoned = true;
-                        return Err(Error::Unsupported);
+                        let header_size = match lzfse_v2::parse_header_size(&self.input_buf) {
+                            Ok(h) => h as usize,
+                            Err(e) => {
+                                self.poisoned = true;
+                                return Err(e);
+                            }
+                        };
+                        let n_payload = match lzfse_v2::parse_payload_size(&self.input_buf) {
+                            Ok(n) => n as usize,
+                            Err(e) => {
+                                self.poisoned = true;
+                                return Err(e);
+                            }
+                        };
+                        // `header_size` includes the 4-byte magic we already
+                        // dropped; remaining block bytes after the magic are
+                        // `header_size - 4 + n_payload`.
+                        let header_len = match header_size.checked_sub(4) {
+                            Some(h) if h >= fixed => h,
+                            _ => {
+                                self.poisoned = true;
+                                return Err(Error::Corrupt);
+                            }
+                        };
+                        let block_len = match header_len.checked_add(n_payload) {
+                            Some(b) => b,
+                            None => {
+                                self.poisoned = true;
+                                return Err(Error::Corrupt);
+                            }
+                        };
+                        self.state = State::AwaitPayload {
+                            kind: BlockKind::V2,
+                            payload_len: block_len,
+                            decoded_size: 0,
+                        };
                     }
                     BlockKind::V1 => {
                         self.poisoned = true;
@@ -287,7 +322,33 @@ impl Decoder {
                             self.input_buf.drain(..payload_len);
                             self.state = State::AwaitMagic;
                         }
-                        BlockKind::V2 | BlockKind::V1 => {
+                        BlockKind::V2 => {
+                            // The whole block (header + both payload streams)
+                            // is now buffered in `payload_len` bytes. Decode in
+                            // one shot. Bound the up-front output reservation by
+                            // a payload-derived hint (an FSE block can expand
+                            // more than LZVN, but is still bounded; the decoder
+                            // enforces the exact `n_raw_bytes` internally).
+                            let cap_hint = payload_len.saturating_mul(32).saturating_add(1 << 16);
+                            let mut block_out = Vec::new();
+                            match lzfse_v2::decode_block(
+                                &self.input_buf[..payload_len],
+                                &mut block_out,
+                                cap_hint,
+                            ) {
+                                Ok(consumed_block) => {
+                                    debug_assert_eq!(consumed_block, payload_len);
+                                }
+                                Err(e) => {
+                                    self.poisoned = true;
+                                    return Err(e);
+                                }
+                            }
+                            self.output_buf.append(&mut block_out);
+                            self.input_buf.drain(..payload_len);
+                            self.state = State::AwaitMagic;
+                        }
+                        BlockKind::V1 => {
                             // Unreachable — header step would have errored.
                             self.poisoned = true;
                             return Err(Error::Unsupported);
diff --git a/src/lzfse/fse.rs b/src/lzfse/fse.rs
index be962dd..106ee1e 100644
--- a/src/lzfse/fse.rs
+++ b/src/lzfse/fse.rs
@@ -20,6 +20,27 @@
 //! the symbol is a `u8`; for L/M/D, a base value and a count of extra value
 //! bits are stored.
 //!
+//! ## Table construction (general, k/k-1 split)
+//!
+//! Table construction matches Apple's `fse_init_decoder_table`: the `f`
+//! slots spread for a symbol are **not** all assigned the same bit-width.
+//! With `n_states = 2^L` (always a power of two) and per-symbol frequency
+//! `f` (arbitrary, `1..=n_states`, summing to `n_states`):
+//!
+//! ```text
+//! k  = L - floor(log2(f))          // == clz(f) - clz(n_states)
+//! j0 = ((2 * n_states) >> k) - f
+//! for i in 0..f (i = the i-th slot for this symbol, in spread order):
+//!     if i < j0:  entry.k = k;     entry.delta = ((f + i) << k) - n_states
+//!     else:       entry.k = k - 1; entry.delta = (i - j0) << (k - 1)
+//! ```
+//!
+//! The first `j0` slots consume `k` bits, the remaining `f - j0` consume
+//! `k - 1` bits. When `f` is a power of two `j0 == f` and the table
+//! degenerates to a single bit-width per symbol; for general `f` the split
+//! is required to tile `[0, n_states)` exactly. This is the algorithm real
+//! Apple-produced LZFSE v2 streams rely on.
+//!
 //! Frequency tables in the v2 block header are encoded with the custom
 //! variable-width scheme implemented by [`decode_freq_table`].
 
@@ -71,29 +92,32 @@ pub(crate) fn build_literal_decoder(freq: &[u16], n_states: usize) -> Result<Vec
     let mut t = 0usize;
     let step = spread_step(n_states);
     let mask = n_states - 1;
-    let n_states_log2 = n_states.trailing_zeros();
+    let n_states_log2 = n_states.trailing_zeros() as i32;
     for (s, &f) in freq.iter().enumerate() {
         let f = f as usize;
         if f == 0 {
             continue;
         }
-        let k = if f == 1 {
-            n_states_log2 as i32
-        } else {
-            let ceil = 32 - (f as u32 - 1).leading_zeros();
-            n_states_log2 as i32 - ceil as i32
-        };
+        // k = L - floor(log2(f)) = clz(f) - clz(n_states); j0 splits the
+        // symbol's slots into a k-bit prefix and a (k-1)-bit suffix.
+        let floor_log2 = 31 - (f as u32).leading_zeros() as i32;
+        let k = n_states_log2 - floor_log2;
         if k < 0 {
             return Err(Error::Corrupt);
         }
         let k = k as u32;
+        let j0 = (((2 * n_states) >> k) as i32) - f as i32;
         for i in 0..f {
             while occupied[t] {
                 t = (t + step) & mask;
             }
-            let delta = ((f as i32 + i as i32) << k) - n_states as i32;
+            let (ek, delta) = if (i as i32) < j0 {
+                (k, ((f as i32 + i as i32) << k) - n_states as i32)
+            } else {
+                (k - 1, (i as i32 - j0) << (k - 1))
+            };
             table[t] = FseEntry {
-                k: k as u8,
+                k: ek as u8,
                 symbol: s as u8,
                 delta: delta as i16,
             };
@@ -129,29 +153,32 @@ pub(crate) fn build_lmd_decoder(
     let mut t = 0usize;
     let step = spread_step(n_states);
     let mask = n_states - 1;
-    let n_states_log2 = n_states.trailing_zeros();
+    let n_states_log2 = n_states.trailing_zeros() as i32;
     for (s, &f) in freq.iter().enumerate() {
         let f = f as usize;
         if f == 0 {
             continue;
         }
-        let k = if f == 1 {
-            n_states_log2 as i32
-        } else {
-            let ceil = 32 - (f as u32 - 1).leading_zeros();
-            n_states_log2 as i32 - ceil as i32
-        };
+        // k = L - floor(log2(f)); j0 splits the symbol's slots into a k-bit
+        // prefix and a (k-1)-bit suffix (see module docs).
+        let floor_log2 = 31 - (f as u32).leading_zeros() as i32;
+        let k = n_states_log2 - floor_log2;
         if k < 0 {
             return Err(Error::Corrupt);
         }
         let k = k as u32;
+        let j0 = (((2 * n_states) >> k) as i32) - f as i32;
         for i in 0..f {
             while occupied[t] {
                 t = (t + step) & mask;
             }
-            let delta = ((f as i32 + i as i32) << k) - n_states as i32;
+            let (ek, delta) = if (i as i32) < j0 {
+                (k, ((f as i32 + i as i32) << k) - n_states as i32)
+            } else {
+                (k - 1, (i as i32 - j0) << (k - 1))
+            };
             table[t] = LmdVEntry {
-                total_bits: (k as u8) + bits_per_symbol[s],
+                total_bits: (ek as u8) + bits_per_symbol[s],
                 value_bits: bits_per_symbol[s],
                 delta: delta as i16,
                 v_base: base_per_symbol[s],
@@ -260,3 +287,119 @@ pub(crate) fn decode_freq_table(
     }
     Ok((freqs, pos))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Core FSE invariant: for **each symbol** the `f` entries that carry it
+    /// must, via their `[delta, delta + 2^k)` next-state ranges, tile
+    /// `[0, n_states)` exactly once — that is what lets the encoder transition
+    /// to that symbol from any state. This holds **iff** the k/k-1 split is
+    /// implemented correctly; a regression to a single bit-width per symbol
+    /// breaks the tiling for any non-power-of-two frequency. The check is
+    /// independent of any encoder.
+    fn assert_literal_table_bijective(freq: &[u16], n_states: usize) {
+        let table = build_literal_decoder(freq, n_states).expect("table builds");
+        assert_eq!(table.len(), n_states);
+        // Per-symbol coverage of the next-state space.
+        let mut hits = vec![vec![0u32; n_states]; freq.len()];
+        for e in &table {
+            let span = 1usize << e.k;
+            let base = e.delta as i32;
+            for off in 0..span as i32 {
+                let next = base + off;
+                assert!(
+                    (0..n_states as i32).contains(&next),
+                    "next {next} out of range for entry {e:?}"
+                );
+                hits[e.symbol as usize][next as usize] += 1;
+            }
+        }
+        for (sym, &f) in freq.iter().enumerate() {
+            if f == 0 {
+                assert!(
+                    hits[sym].iter().all(|&h| h == 0),
+                    "absent symbol {sym} has table entries"
+                );
+                continue;
+            }
+            for (s, &h) in hits[sym].iter().enumerate() {
+                assert_eq!(
+                    h, 1,
+                    "symbol {sym}: state {s} reachable {h} times (expected exactly 1)"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn literal_table_bijective_non_dyadic() {
+        // Deliberately non-power-of-two frequency sets that sum to 1024.
+        // A single-`k` table builder cannot tile [0,1024) for any of these.
+        assert_literal_table_bijective(&[3, 5, 1000, 16], 1024);
+        assert_literal_table_bijective(&[300, 700, 24], 1024);
+        // Many singletons + one large symbol (1 is non-dyadic-adjacent edge).
+        let mut f = vec![1u16; 24];
+        f[0] = 1024 - 23;
+        assert_literal_table_bijective(&f, 1024);
+        // Skewed but smooth distribution (sums to 1024).
+        assert_literal_table_bijective(&[100, 101, 103, 107, 109, 504], 1024);
+    }
+
+    #[test]
+    fn literal_table_bijective_dyadic_still_ok() {
+        // The power-of-two case (j0 == f) must still tile correctly.
+        assert_literal_table_bijective(&[512, 256, 256], 1024);
+        assert_literal_table_bijective(&[1024], 1024);
+    }
+
+    #[test]
+    fn lmd_table_built_for_non_dyadic_freqs() {
+        // L stream: 64 states, a non-power-of-two split across symbols.
+        let mut freq = vec![0u16; 20];
+        freq[0] = 30;
+        freq[1] = 20;
+        freq[2] = 7;
+        freq[3] = 5;
+        freq[16] = 2; // a symbol carrying extra value bits
+        let extra = [0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 8];
+        let base = [
+            0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 28, 60,
+        ];
+        let table = build_lmd_decoder(&freq, 64, &extra, &base).expect("lmd table builds");
+        assert_eq!(table.len(), 64);
+        // For each symbol the state-transition portion (total_bits-value_bits)
+        // must tile [0,64). Group entries by symbol via v_base, which is
+        // unique per symbol in `base`.
+        let mut hits: Vec<vec::Vec<u32>> = (0..20).map(|_| vec![0u32; 64]).collect();
+        for e in &table {
+            let sym = base
+                .iter()
+                .position(|&b| b == e.v_base)
+                .expect("known base");
+            let kbits = e.total_bits - e.value_bits;
+            let span = 1usize << kbits;
+            for off in 0..span as i32 {
+                let next = e.delta as i32 + off;
+                assert!((0..64).contains(&next));
+                hits[sym][next as usize] += 1;
+            }
+        }
+        for (sym, &f) in freq.iter().enumerate() {
+            if f == 0 {
+                continue;
+            }
+            assert!(
+                hits[sym].iter().all(|&h| h == 1),
+                "lmd symbol {sym} not bijective over states"
+            );
+        }
+    }
+
+    #[test]
+    fn non_power_of_two_table_size_rejected() {
+        // The table SIZE must be 2^L even though per-symbol freqs are general.
+        assert!(build_literal_decoder(&[5, 5], 10).is_err());
+    }
+}
diff --git a/src/lzfse/lzfse_v2.rs b/src/lzfse/lzfse_v2.rs
index 92527bb..c59bf0a 100644
--- a/src/lzfse/lzfse_v2.rs
+++ b/src/lzfse/lzfse_v2.rs
@@ -1,62 +1,1378 @@
-//! LZFSE v2 block decoder.
+//! LZFSE v2 (`bvx2`) block decoder.
 //!
 //! ## Status in this build
 //!
-//! **`bvx2` blocks return [`Error::Unsupported`]**. The FSE primitives that
-//! a full v2 implementation needs are present in [`super::fse`], but the
-//! intricate bit-packed v2 block header, the L/M/D table parsing, and the
-//! reverse FSE bit stream are sufficiently subtle that a half-correct
-//! implementation would silently corrupt output for some inputs.
-//!
-//! The decoder dispatches on `bvx2` magic, parses just enough of the v2
-//! header to know how many bytes the block claims to occupy (so we can
-//! advance past it cleanly), and returns Unsupported rather than risk a
-//! buggy decode.
-//!
-//! ## Wire format reference
-//!
-//! For a future round, the v2 header layout is (LSB-first packed):
-//! - `n_raw_bytes: 20`
-//! - `n_payload_bytes: 20`
-//! - `n_literals: 20`
-//! - `n_matches: 20`
-//! - `n_literal_payload_bytes: 20`
-//! - `n_lmd_payload_bytes: 20`
-//! - `literal_bits: 3` (number of stub bits in the literal stream final byte)
-//! - `literal_state[0..=3]: 10 each` (40 bits — four interleaved FSE states)
-//! - `lmd_bits: 3`
-//! - `l_state: 10`
-//! - `m_state: 10`
-//! - `d_state: 10`
-//! - followed by packed frequency tables for D (64 syms), M (20 syms),
-//!   L (20 syms), and LIT (256 syms).
-//!
-//! The two payload streams (literal then LMD) are encoded *in reverse*:
-//! the decoder pulls bytes from the end of each payload toward its start.
-
-#![allow(dead_code)]
+//! **`bvx2` blocks are now decoded.** This is the core LZFSE block type
+//! (LZ77 literal/match commands entropy-coded with Finite State Entropy),
+//! so the `lzfse` decoder handles real compressed payloads here rather than
+//! only the `bvx-` (uncompressed) and `bvxn` (LZVN) block kinds.
+//!
+//! ## Validation & interop caveat
+//!
+//! There is **no Apple `lzfse` reference tool and no captured `bvx2`
+//! fixtures available in this build environment**, so correctness is gated
+//! by **round-trip against this crate's own spec-conformant v2 encoder**
+//! ([`encode_block`], `#[cfg(test)]`): we assert `decode(encode(x)) == x`
+//! over empty / small / text / repetitive / random / multi-block inputs,
+//! including inputs large enough to force a genuine FSE-coded block. The
+//! encoder builds FSE frequency tables from the L/M/D/LIT histograms with the
+//! standard quantized (nearest) normalization — producing **general,
+//! non-power-of-two frequencies** — FSE-encodes the interleaved literal and
+//! LMD streams in reverse, and packs the v2 header exactly per the documented
+//! wire layout. Round-trip tests deliberately include skewed, non-dyadic
+//! literal distributions and small (singleton) match-count histograms, plus
+//! one hand-frozen non-dyadic block decoded independently of the encoder, so
+//! a regression to a single bit-width per symbol would fail.
+//!
+//! The FSE table construction ([`super::fse`]) now matches Apple's general
+//! `fse_init_decoder_table` (the **k/k-1 split**: a symbol's `f` spread slots
+//! are partitioned into a `k`-bit prefix and a `(k-1)`-bit suffix at the
+//! boundary `j0 = (2·n_states >> k) − f`), so arbitrary per-symbol
+//! frequencies are handled — not just power-of-two normalizations. The table
+//! *size* is always `2^L`; only the per-symbol frequencies are general.
+//!
+//! Interop with Apple-produced `bvx2` is therefore **best-effort but follows
+//! the real table-construction algorithm**: the decoder mirrors the
+//! documented format precisely (the same header layout, the same L/M/D
+//! base/extra-bit tables, the same frequency-table encoding, the same reverse
+//! FSE bit convention, and now the same general FSE table construction). It
+//! has still not been cross-checked against an actual Apple-produced stream
+//! in this environment, so full Apple-stream interop remains unverified here.
+//!
+//! ## Wire format reference (v2 header, authoritative)
+//!
+//! After the 4-byte `bvx2` magic the v2 header is (little-endian,
+//! `__packed__`):
+//!
+//! - `n_raw_bytes: u32` — decoded output size of this block.
+//! - `packed_fields[0]: u64`
+//!   - `[0..20)`  `n_literals`
+//!   - `[20..40)` `n_literal_payload_bytes`
+//!   - `[40..60)` `n_matches`
+//!   - `[60..63)` `literal_bits` (FSE final-byte stub width for the literal
+//!     stream)
+//! - `packed_fields[1]: u64`
+//!   - `[0..10)`  `literal_state[0]`
+//!   - `[10..20)` `literal_state[1]`
+//!   - `[20..30)` `literal_state[2]`
+//!   - `[30..40)` `literal_state[3]`
+//!   - `[40..60)` `n_lmd_payload_bytes`
+//!   - `[60..63)` `lmd_bits` (FSE stub width for the LMD stream)
+//! - `packed_fields[2]: u64`
+//!   - `[0..32)`  `header_size` (bytes, magic..end of freq tables)
+//!   - `[32..42)` `l_state`
+//!   - `[42..52)` `m_state`
+//!   - `[52..62)` `d_state`
+//! - then the variable-length frequency tables, bit-contiguous, in order
+//!   **L (20 syms), M (20 syms), D (64 syms), LIT (256 syms)**, each packed
+//!   with the LZFSE Huffman-style fixed encoding
+//!   ([`super::fse::decode_freq_table`]).
+//!
+//! The two payload streams follow the header: `n_literal_payload_bytes` of
+//! literal FSE stream, then `n_lmd_payload_bytes` of LMD FSE stream. Both are
+//! decoded **in reverse** (the FSE encoder is LIFO, so the decoder pulls
+//! bytes from the end of each stream toward its start).
+
+use alloc::vec;
+use alloc::vec::Vec;
 
 use crate::error::Error;
-use crate::lzfse::bits::HeaderBits;
+use crate::lzfse::bits::FseBits;
+use crate::lzfse::fse;
 
-/// Size of the fixed-width portion of the v2 header (the packed bit fields
-/// before the variable-length frequency tables). Apple's reference: the v2
-/// header is 28 bytes of packed fields plus the freq-table payload.
+/// Size of the fixed-width portion of the v2 header **after the 4-byte
+/// magic**: `n_raw_bytes`(4) + three packed `u64` words (24) = 28 bytes. The
+/// variable-length frequency tables follow it. (Apple's `header_size` field
+/// additionally counts the 4-byte magic, so `header_size == 4 +
+/// V2_HEADER_FIXED_BYTES + freq_table_bytes`.)
 pub(crate) const V2_HEADER_FIXED_BYTES: usize = 28;
 
-/// Parse just the `n_payload_bytes` field out of a v2 block header. Used
-/// by the main decoder to know how many bytes the block occupies so we
-/// can skip it cleanly when returning Unsupported.
-///
-/// `bytes` is the slice starting **after** the 4-byte magic.
-/// Returns `Err(Error::UnexpectedEnd)` if `bytes.len() < V2_HEADER_FIXED_BYTES`.
+/// Number of symbols in each stream's alphabet.
+const N_L_SYMBOLS: usize = 20;
+const N_M_SYMBOLS: usize = 20;
+const N_D_SYMBOLS: usize = 64;
+const N_LIT_SYMBOLS: usize = 256;
+
+/// FSE state counts (table sizes) for each stream. Fixed by the LZFSE format.
+const L_STATES: usize = 64;
+const M_STATES: usize = 64;
+const D_STATES: usize = 256;
+const LIT_STATES: usize = 1024;
+
+/// L/M/D extra-bit widths and base values (Apple's `lzfse_internal.h`).
+const L_EXTRA_BITS: [u8; N_L_SYMBOLS] =
+    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 8];
+const L_BASE: [i32; N_L_SYMBOLS] = [
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 28, 60,
+];
+const M_EXTRA_BITS: [u8; N_M_SYMBOLS] =
+    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 8, 11];
+const M_BASE: [i32; N_M_SYMBOLS] = [
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24, 56, 312,
+];
+const D_EXTRA_BITS: [u8; N_D_SYMBOLS] = [
+    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
+    8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14,
+    14, 14, 15, 15, 15, 15,
+];
+const D_BASE: [i32; N_D_SYMBOLS] = [
+    0, 1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 28, 36, 44, 52, 60, 76, 92, 108, 124, 156, 188, 220,
+    252, 316, 380, 444, 508, 636, 764, 892, 1020, 1276, 1532, 1788, 2044, 2556, 3068, 3580, 4092,
+    5116, 6140, 7164, 8188, 10236, 12284, 14332, 16380, 20476, 24572, 28668, 32764, 40956, 49148,
+    57340, 65532, 81916, 98300, 114684, 131068, 163836, 196604, 229372,
+];
+
+/// Parsed v2 header.
+struct V2Header {
+    n_raw_bytes: u32,
+    n_literals: u32,
+    n_literal_payload_bytes: u32,
+    n_matches: u32,
+    literal_bits: u32,
+    literal_state: [u32; 4],
+    n_lmd_payload_bytes: u32,
+    lmd_bits: u32,
+    header_size: u32,
+    l_state: u32,
+    m_state: u32,
+    d_state: u32,
+    l_freq: Vec<u16>,
+    m_freq: Vec<u16>,
+    d_freq: Vec<u16>,
+    lit_freq: Vec<u16>,
+}
+
+/// Extract `width` bits starting at `lo` from a 64-bit packed word.
+#[inline]
+fn bits64(word: u64, lo: u32, width: u32) -> u64 {
+    if width == 0 {
+        return 0;
+    }
+    let mask = if width == 64 {
+        u64::MAX
+    } else {
+        (1u64 << width) - 1
+    };
+    (word >> lo) & mask
+}
+
+/// Total payload size (literal + LMD) declared by a v2 block header. Used by
+/// the streaming decoder to know how many payload bytes to buffer. `bytes`
+/// is the slice starting **after** the 4-byte magic.
 pub(crate) fn parse_payload_size(bytes: &[u8]) -> Result<u32, Error> {
     if bytes.len() < V2_HEADER_FIXED_BYTES {
         return Err(Error::UnexpectedEnd);
     }
-    let mut bits = HeaderBits::new(&bytes[..V2_HEADER_FIXED_BYTES]);
-    // Skip n_raw_bytes (20 bits).
-    let _n_raw = bits.read(20)?;
-    let n_payload = bits.read(20)?;
-    Ok(n_payload)
+    let w0 = u64::from_le_bytes([
+        bytes[4], bytes[5], bytes[6], bytes[7], bytes[8], bytes[9], bytes[10], bytes[11],
+    ]);
+    let w1 = u64::from_le_bytes([
+        bytes[12], bytes[13], bytes[14], bytes[15], bytes[16], bytes[17], bytes[18], bytes[19],
+    ]);
+    let n_literal_payload_bytes = bits64(w0, 20, 20) as u32;
+    let n_lmd_payload_bytes = bits64(w1, 40, 20) as u32;
+    n_literal_payload_bytes
+        .checked_add(n_lmd_payload_bytes)
+        .ok_or(Error::Corrupt)
+}
+
+/// Total header length (including magic) declared by a v2 block header.
+/// `bytes` starts after the magic.
+pub(crate) fn parse_header_size(bytes: &[u8]) -> Result<u32, Error> {
+    if bytes.len() < V2_HEADER_FIXED_BYTES {
+        return Err(Error::UnexpectedEnd);
+    }
+    let w2 = u64::from_le_bytes([
+        bytes[20], bytes[21], bytes[22], bytes[23], bytes[24], bytes[25], bytes[26], bytes[27],
+    ]);
+    Ok(bits64(w2, 0, 32) as u32)
+}
+
+/// Parse the v2 header from `bytes`, which begins **just after** the 4-byte
+/// magic.
+fn parse_header(bytes: &[u8]) -> Result<V2Header, Error> {
+    // The fixed post-magic header is n_raw(4) + three u64 packed words (24) =
+    // 28 bytes = V2_HEADER_FIXED_BYTES; the frequency tables follow it.
+    let fixed = V2_HEADER_FIXED_BYTES;
+    if bytes.len() < fixed {
+        return Err(Error::UnexpectedEnd);
+    }
+    let n_raw_bytes = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
+    let w0 = u64::from_le_bytes([
+        bytes[4], bytes[5], bytes[6], bytes[7], bytes[8], bytes[9], bytes[10], bytes[11],
+    ]);
+    let w1 = u64::from_le_bytes([
+        bytes[12], bytes[13], bytes[14], bytes[15], bytes[16], bytes[17], bytes[18], bytes[19],
+    ]);
+    let w2 = u64::from_le_bytes([
+        bytes[20], bytes[21], bytes[22], bytes[23], bytes[24], bytes[25], bytes[26], bytes[27],
+    ]);
+
+    let n_literals = bits64(w0, 0, 20) as u32;
+    let n_literal_payload_bytes = bits64(w0, 20, 20) as u32;
+    let n_matches = bits64(w0, 40, 20) as u32;
+    let literal_bits = bits64(w0, 60, 3) as u32;
+
+    let literal_state = [
+        bits64(w1, 0, 10) as u32,
+        bits64(w1, 10, 10) as u32,
+        bits64(w1, 20, 10) as u32,
+        bits64(w1, 30, 10) as u32,
+    ];
+    let n_lmd_payload_bytes = bits64(w1, 40, 20) as u32;
+    let lmd_bits = bits64(w1, 60, 3) as u32;
+
+    let header_size = bits64(w2, 0, 32) as u32;
+    let l_state = bits64(w2, 32, 10) as u32;
+    let m_state = bits64(w2, 42, 10) as u32;
+    let d_state = bits64(w2, 52, 10) as u32;
+
+    if literal_bits > 7 || lmd_bits > 7 {
+        return Err(Error::Corrupt);
+    }
+
+    // `header_size` includes the 4-byte magic, so the minimum valid value is
+    // magic(4) + the fixed packed fields.
+    if (header_size as usize) < 4 + V2_HEADER_FIXED_BYTES {
+        return Err(Error::Corrupt);
+    }
+    let freq_end = (header_size as usize) - 4; // post-magic offset
+    if freq_end < fixed || freq_end > bytes.len() {
+        return Err(Error::UnexpectedEnd);
+    }
+    let freq_bytes = &bytes[fixed..freq_end];
+
+    let (l_freq, m_freq, d_freq, lit_freq) = decode_all_freqs(freq_bytes)?;
+
+    check_freq_sum(&l_freq, L_STATES)?;
+    check_freq_sum(&m_freq, M_STATES)?;
+    check_freq_sum(&d_freq, D_STATES)?;
+    check_freq_sum(&lit_freq, LIT_STATES)?;
+
+    if literal_state.iter().any(|&s| s as usize >= LIT_STATES)
+        || l_state as usize >= L_STATES
+        || m_state as usize >= M_STATES
+        || d_state as usize >= D_STATES
+    {
+        return Err(Error::Corrupt);
+    }
+
+    Ok(V2Header {
+        n_raw_bytes,
+        n_literals,
+        n_literal_payload_bytes,
+        n_matches,
+        literal_bits,
+        literal_state,
+        n_lmd_payload_bytes,
+        lmd_bits,
+        header_size,
+        l_state,
+        m_state,
+        d_state,
+        l_freq,
+        m_freq,
+        d_freq,
+        lit_freq,
+    })
+}
+
+fn check_freq_sum(freq: &[u16], states: usize) -> Result<(), Error> {
+    let mut sum = 0usize;
+    for &f in freq {
+        sum += f as usize;
+    }
+    if sum != states {
+        return Err(Error::Corrupt);
+    }
+    Ok(())
+}
+
+/// The four frequency tables (L, M, D, LIT) decoded from a v2 header.
+type FreqTables = (Vec<u16>, Vec<u16>, Vec<u16>, Vec<u16>);
+
+/// Decode the four bit-contiguous frequency tables (L, M, D, LIT).
+fn decode_all_freqs(freq_bytes: &[u8]) -> Result<FreqTables, Error> {
+    let mut bit_pos = 0usize;
+    let l = decode_freq_at(freq_bytes, &mut bit_pos, N_L_SYMBOLS)?;
+    let m = decode_freq_at(freq_bytes, &mut bit_pos, N_M_SYMBOLS)?;
+    let d = decode_freq_at(freq_bytes, &mut bit_pos, N_D_SYMBOLS)?;
+    let lit = decode_freq_at(freq_bytes, &mut bit_pos, N_LIT_SYMBOLS)?;
+    Ok((l, m, d, lit))
+}
+
+/// Decode one frequency table at bit offset `*bit_pos`, advancing it.
+///
+/// [`fse::decode_freq_table`] reads LSB-first from bit 0 of the slice it is
+/// given. Our tables are bit-packed back-to-back, so a table may begin
+/// mid-byte; we shift a temporary view so it starts at bit 0.
+fn decode_freq_at(
+    freq_bytes: &[u8],
+    bit_pos: &mut usize,
+    n_symbols: usize,
+) -> Result<Vec<u16>, Error> {
+    let byte_off = *bit_pos / 8;
+    let in_byte = (*bit_pos % 8) as u32;
+    if byte_off > freq_bytes.len() {
+        return Err(Error::UnexpectedEnd);
+    }
+    let tail = &freq_bytes[byte_off..];
+    if in_byte == 0 {
+        let (freqs, consumed_bits) = fse::decode_freq_table(tail, n_symbols)?;
+        *bit_pos += consumed_bits;
+        Ok(freqs)
+    } else {
+        // Shift `tail` right by `in_byte` bits so the table begins at bit 0.
+        let mut shifted = Vec::with_capacity(tail.len());
+        for w in 0..tail.len() {
+            let lo = tail[w] >> in_byte;
+            let hi = if w + 1 < tail.len() {
+                tail[w + 1].checked_shl(8 - in_byte).unwrap_or(0)
+            } else {
+                0
+            };
+            shifted.push(lo | hi);
+        }
+        let (freqs, consumed_bits) = fse::decode_freq_table(&shifted, n_symbols)?;
+        *bit_pos += consumed_bits;
+        Ok(freqs)
+    }
+}
+
+/// Decode a full `bvx2` block. `block` is the slice **after** the 4-byte
+/// magic and must contain at least `header_size - 4 + payload` bytes.
+/// Decoded output is appended to `out`. Returns the number of bytes consumed
+/// from `block` (header + payload).
+///
+/// `out_cap_hint` bounds the up-front output reservation against a hostile
+/// `n_raw_bytes`; the real `n_raw_bytes` bound is still enforced exactly.
+pub(crate) fn decode_block(
+    block: &[u8],
+    out: &mut Vec<u8>,
+    out_cap_hint: usize,
+) -> Result<usize, Error> {
+    let hdr = parse_header(block)?;
+
+    let header_len = (hdr.header_size as usize) - 4; // post-magic
+    let lit_payload_len = hdr.n_literal_payload_bytes as usize;
+    let lmd_payload_len = hdr.n_lmd_payload_bytes as usize;
+    let payload_len = lit_payload_len
+        .checked_add(lmd_payload_len)
+        .ok_or(Error::Corrupt)?;
+    let total = header_len.checked_add(payload_len).ok_or(Error::Corrupt)?;
+    if block.len() < total {
+        return Err(Error::UnexpectedEnd);
+    }
+
+    let lit_payload = &block[header_len..header_len + lit_payload_len];
+    let lmd_payload = &block[header_len + lit_payload_len..total];
+
+    // ── 1. Decode literals (4-way interleaved FSE, reverse stream) ──
+    let lit_table = fse::build_literal_decoder(&hdr.lit_freq, LIT_STATES)?;
+    let n_literals = hdr.n_literals as usize;
+    // Reject an absurd literal count up-front (DoS guard).
+    if n_literals > out_cap_hint.saturating_mul(16).saturating_add(1 << 20) {
+        return Err(Error::Corrupt);
+    }
+    let mut literals = vec![0u8; n_literals];
+    {
+        let mut bits = FseBits::new_with_stub(lit_payload, hdr.literal_bits)?;
+        let mut states = hdr.literal_state;
+        let mut i = 0usize;
+        while i < n_literals {
+            for state in states.iter_mut() {
+                if i >= n_literals {
+                    break;
+                }
+                let (sym, next) = fse::fse_decode_literal(*state, &lit_table, &mut bits)?;
+                literals[i] = sym;
+                *state = next;
+                i += 1;
+            }
+        }
+    }
+
+    // ── 2 & 3. Decode L/M/D commands and execute the LZ ──
+    let l_table = fse::build_lmd_decoder(&hdr.l_freq, L_STATES, &L_EXTRA_BITS, &L_BASE)?;
+    let m_table = fse::build_lmd_decoder(&hdr.m_freq, M_STATES, &M_EXTRA_BITS, &M_BASE)?;
+    let d_table = fse::build_lmd_decoder(&hdr.d_freq, D_STATES, &D_EXTRA_BITS, &D_BASE)?;
+
+    let n_raw = hdr.n_raw_bytes as usize;
+    let start_len = out.len();
+    out.reserve(n_raw.min(out_cap_hint));
+
+    let mut lmd = FseBits::new_with_stub(lmd_payload, hdr.lmd_bits)?;
+    let mut l_state = hdr.l_state;
+    let mut m_state = hdr.m_state;
+    let mut d_state = hdr.d_state;
+
+    let mut lit_pos = 0usize;
+    let mut prev_d: i32 = 0;
+    let n_matches = hdr.n_matches as usize;
+
+    for _ in 0..n_matches {
+        // The encoder pushed streams so the decoder pulls L, then M, then D.
+        let (l_val, l_next) = fse::fse_decode_lmd(l_state, &l_table, &mut lmd)?;
+        let (m_val, m_next) = fse::fse_decode_lmd(m_state, &m_table, &mut lmd)?;
+        let (d_val, d_next) = fse::fse_decode_lmd(d_state, &d_table, &mut lmd)?;
+        l_state = l_next;
+        m_state = m_next;
+        d_state = d_next;
+
+        // D == 0 means "reuse the previous distance".
+        let d = if d_val == 0 { prev_d } else { d_val };
+        if d_val != 0 {
+            prev_d = d_val;
+        }
+        if l_val < 0 || m_val < 0 || d <= 0 {
+            return Err(Error::Corrupt);
+        }
+        let l = l_val as usize;
+        let m = m_val as usize;
+        let d = d as usize;
+
+        // Emit L literals.
+        if lit_pos + l > n_literals {
+            return Err(Error::Corrupt);
+        }
+        if out.len() + l > start_len + n_raw {
+            return Err(Error::Corrupt);
+        }
+        out.extend_from_slice(&literals[lit_pos..lit_pos + l]);
+        lit_pos += l;
+
+        // Copy an M-byte match at distance d (may overlap).
+        let cur = out.len() - start_len;
+        if d > cur {
+            return Err(Error::Corrupt);
+        }
+        if out.len() + m > start_len + n_raw {
+            return Err(Error::Corrupt);
+        }
+        for src in (out.len() - d..).take(m) {
+            let b = out[src];
+            out.push(b);
+        }
+    }
+
+    // Trailing literals after the last match.
+    let remaining = n_literals - lit_pos;
+    if remaining > 0 {
+        if out.len() + remaining > start_len + n_raw {
+            return Err(Error::Corrupt);
+        }
+        out.extend_from_slice(&literals[lit_pos..]);
+    }
+
+    if out.len() - start_len != n_raw {
+        return Err(Error::Corrupt);
+    }
+
+    Ok(total)
+}
+
+// ───────────────────────── test-only encoder ─────────────────────────────
+//
+// A spec-conformant `bvx2` encoder used only to validate the decoder by
+// round-trip. It uses a greedy LZ parser, the standard quantized (nearest)
+// FSE frequency normalization producing general, non-power-of-two
+// frequencies, encode slots that exactly invert the decoder's general k/k-1
+// FSE table, and the documented header/payload packing.
+
+#[cfg(test)]
+pub(crate) use test_encoder::encode_block;
+
+#[cfg(test)]
+mod test_encoder {
+    use super::*;
+
+    /// One FSE encode slot for a symbol: covers next-state range `[lo, hi]`,
+    /// emits `(next_state - lo)` in `k` bits and moves the encoder's running
+    /// state to table index `t`.
+    struct EncSlot {
+        t: usize,
+        k: u8,
+        lo: i32,
+        hi: i32,
+    }
+
+    /// Build per-symbol encode slots that exactly invert
+    /// `fse::build_literal_decoder` / `build_lmd_decoder`, including the
+    /// general k/k-1 split. Frequencies are arbitrary (`1..=n_states`) and
+    /// must sum to `n_states`; the per-symbol slot set tiles `[0, n_states)`.
+    ///
+    /// Each decode entry maps a current state `t` to a `(next_state, k_bits)`
+    /// pull. The encoder inverts this: given the *next* state `cur` it finds
+    /// the slot whose `[lo, hi]` next-state range contains `cur`, emits
+    /// `cur - lo` in `k` bits and moves the running state to that slot's `t`.
+    /// A slot in the `i < j0` region uses `k` bits, otherwise `k - 1` bits —
+    /// matching the decode table exactly.
+    fn build_enc_slots(freq: &[u16], n_states: usize) -> Vec<Vec<EncSlot>> {
+        let mut slots: Vec<Vec<EncSlot>> = (0..freq.len()).map(|_| Vec::new()).collect();
+        let mut occ = vec![false; n_states];
+        let mut t = 0usize;
+        let step = (n_states >> 1) + (n_states >> 3) + 3;
+        let mask = n_states - 1;
+        let log2 = n_states.trailing_zeros() as i32;
+        for (s, &f) in freq.iter().enumerate() {
+            let f = f as usize;
+            if f == 0 {
+                continue;
+            }
+            let floor_log2 = 31 - (f as u32).leading_zeros() as i32;
+            let k = log2 - floor_log2;
+            let j0 = (((2 * n_states) >> k) as i32) - f as i32;
+            for i in 0..f {
+                while occ[t] {
+                    t = (t + step) & mask;
+                }
+                let (ek, delta) = if (i as i32) < j0 {
+                    (k, ((f as i32 + i as i32) << k) - n_states as i32)
+                } else {
+                    (k - 1, (i as i32 - j0) << (k - 1))
+                };
+                slots[s].push(EncSlot {
+                    t,
+                    k: ek as u8,
+                    lo: delta,
+                    hi: delta + (1i32 << ek) - 1,
+                });
+                occ[t] = true;
+                t = (t + step) & mask;
+            }
+        }
+        for sl in slots.iter_mut() {
+            sl.sort_by_key(|x| x.lo);
+        }
+        slots
+    }
+
+    /// A bit accumulator producing the reverse FSE stream byte layout that
+    /// [`FseBits`] consumes.
+    ///
+    /// The FSE encoder must walk symbols **in reverse** to chain states
+    /// correctly (each symbol's emitted state is determined by the following
+    /// symbol in the same lane). The caller therefore [`push`](Self::push)es
+    /// `(value, n_bits)` chunks in reverse-of-pull order. [`finish`] reverses
+    /// the chunk list back to forward pull order, then packs the resulting
+    /// bit string into bytes laid out so [`FseBits`] (which pulls from the end
+    /// of the payload backward) reads them back in exactly pull order. One
+    /// stub byte always trails so the decoder's init-byte consumption lands on
+    /// it.
+    struct FseSink {
+        /// Each entry is one symbol's `(value, n_bits)`, recorded in
+        /// reverse-of-pull order.
+        chunks: Vec<(u64, u8)>,
+    }
+
+    impl FseSink {
+        fn new() -> Self {
+            Self { chunks: Vec::new() }
+        }
+
+        /// Record `n` bits of `value` for one symbol (reverse-of-pull order).
+        fn push(&mut self, value: u64, n: u8) {
+            self.chunks.push((value, n));
+        }
+
+        /// Serialize to `(payload_bytes, stub_bits)`.
+        fn finish(&self) -> (Vec<u8>, u32) {
+            // Reverse chunks to forward pull order, then flatten to a bit
+            // vector (LSB-first within each chunk).
+            let mut bits: Vec<u8> = Vec::new();
+            for &(value, n) in self.chunks.iter().rev() {
+                for i in 0..n {
+                    bits.push(((value >> i) & 1) as u8);
+                }
+            }
+            let total = bits.len();
+            let stub = (total % 8) as u32;
+            let full = total / 8;
+            let plen = full + 1;
+            let mut payload = vec![0u8; plen];
+            let mut bi = 0usize;
+            let mut sb = 0u8;
+            for i in 0..stub {
+                sb |= bits[bi] << i;
+                bi += 1;
+            }
+            payload[plen - 1] = sb;
+            let mut idx = plen as i32 - 2;
+            while idx >= 0 {
+                let mut b = 0u8;
+                for i in 0..8 {
+                    if bi < total {
+                        b |= bits[bi] << i;
+                        bi += 1;
+                    }
+                }
+                payload[idx as usize] = b;
+                idx -= 1;
+            }
+            (payload, stub)
+        }
+    }
+
+    /// Encode a frequency value with the LZFSE Huffman-style fixed encoding
+    /// (inverse of `fse::decode_freq_table`).
+    fn encode_freq_value(v: u16) -> (u32, u32) {
+        match v {
+            0 => (0b00, 2),
+            1 => (0b10, 2),
+            2 => (0b001, 3),
+            3 => (0b101, 3),
+            4 => (0b00011, 5),
+            5 => (0b01011, 5),
+            6 => (0b10011, 5),
+            7 => (0b11011, 5),
+            8..=23 => (0b0111 | ((v as u32 - 8) << 4), 8),
+            24..=1047 => (0b1111 | ((v as u32 - 24) << 4), 14),
+            _ => panic!("frequency {v} too large to encode"),
+        }
+    }
+
+    /// Normalize a histogram to **general** (arbitrary, not power-of-two)
+    /// frequencies summing exactly to `n_states`, giving every present symbol
+    /// at least 1. This is the standard quantized normalization: scale each
+    /// count by `n_states / total`, round to nearest, force present symbols to
+    /// 1, then correct the running sum by nudging the largest entry (which can
+    /// absorb ±1 changes without dropping a present symbol to 0).
+    ///
+    /// The resulting per-symbol frequencies are deliberately *not* coerced to
+    /// powers of two — the decoder's general k/k-1 table builder handles them
+    /// directly. Singletons and skewed (non-dyadic) distributions are
+    /// produced as-is so the round-trip exercises the general FSE path.
+    pub(super) fn normalize_general(hist: &[u32], n_states: usize) -> Vec<u16> {
+        let n = hist.len();
+        let total: u64 = hist.iter().map(|&h| h as u64).sum();
+        let mut freq = vec![0u16; n];
+        if total == 0 {
+            freq[0] = n_states as u16;
+            return freq;
+        }
+        // Nearest-rounding scale, with a floor of 1 for every present symbol.
+        let mut assigned: i64 = 0;
+        for (i, &h) in hist.iter().enumerate() {
+            if h == 0 {
+                continue;
+            }
+            let scaled = (h as u64 * n_states as u64 + total / 2) / total;
+            let f = scaled.max(1).min(n_states as u64) as i64;
+            freq[i] = f as u16;
+            assigned += f;
+        }
+        let target = n_states as i64;
+        // Correct the sum. Each step adjusts the symbol that can absorb the
+        // change: when overshooting, the largest entry with `f > 1`; when
+        // undershooting, simply the largest entry. This converges because the
+        // largest entry is at least `n_states / n` which exceeds the total
+        // correction magnitude (bounded by `n`).
+        while assigned != target {
+            if assigned > target {
+                let (idx, _) = freq
+                    .iter()
+                    .enumerate()
+                    .filter(|&(_, &f)| f > 1)
+                    .max_by_key(|&(_, &f)| f)
+                    .expect("an entry > 1 exists while overshooting");
+                freq[idx] -= 1;
+                assigned -= 1;
+            } else {
+                let (idx, _) = freq
+                    .iter()
+                    .enumerate()
+                    .max_by_key(|&(_, &f)| f)
+                    .expect("non-empty alphabet");
+                freq[idx] += 1;
+                assigned += 1;
+            }
+        }
+        debug_assert_eq!(assigned, target);
+        freq
+    }
+
+    /// Map an L/M/D value to `(symbol, extra_value)`.
+    fn map_lmd(value: i32, base: &[i32], extra: &[u8]) -> (usize, u32) {
+        for s in 0..base.len() {
+            if base[s] <= value {
+                let hi = base[s] + ((1i64 << extra[s]) - 1) as i32;
+                if value <= hi {
+                    return (s, (value - base[s]) as u32);
+                }
+            }
+        }
+        let s = base.len() - 1;
+        (s, (value - base[s]).max(0) as u32)
+    }
+
+    struct Cmd {
+        l: usize,
+        m: usize,
+        d: usize,
+    }
+
+    /// Greedy LZ parse of `data` via a hash chain over 4-byte prefixes.
+    fn lz_parse(data: &[u8]) -> (Vec<u8>, Vec<Cmd>) {
+        const MIN_MATCH: usize = 4;
+        const MAX_MATCH: usize = 2359; // M max encodable
+        const MAX_DIST: usize = 262_139; // D max encodable
+        let mut literals = Vec::new();
+        let mut cmds = Vec::new();
+        let n = data.len();
+
+        let hsize = 1usize << 15;
+        let mut head = vec![usize::MAX; hsize];
+        let mut prev = vec![usize::MAX; n.max(1)];
+        let hash = |d: &[u8], i: usize| -> usize {
+            let v = (d[i] as usize)
+                | ((d[i + 1] as usize) << 8)
+                | ((d[i + 2] as usize) << 16)
+                | ((d[i + 3] as usize) << 24);
+            (v.wrapping_mul(2654435761) >> 17) & (hsize - 1)
+        };
+
+        let mut i = 0usize;
+        let mut pending_lit = 0usize;
+        while i < n {
+            let mut best_len = 0usize;
+            let mut best_dist = 0usize;
+            if i + MIN_MATCH <= n {
+                let h = hash(data, i);
+                let mut cand = head[h];
+                let mut chain = 0;
+                while cand != usize::MAX && chain < 64 {
+                    if i - cand <= MAX_DIST {
+                        let mut len = 0usize;
+                        while i + len < n && len < MAX_MATCH && data[cand + len] == data[i + len] {
+                            len += 1;
+                        }
+                        if len > best_len {
+                            best_len = len;
+                            best_dist = i - cand;
+                        }
+                    } else {
+                        break;
+                    }
+                    cand = prev[cand];
+                    chain += 1;
+                }
+            }
+
+            if best_len >= MIN_MATCH {
+                let end = i + best_len;
+                cmds.push(Cmd {
+                    l: pending_lit,
+                    m: best_len,
+                    d: best_dist,
+                });
+                pending_lit = 0;
+                while i < end {
+                    if i + MIN_MATCH <= n {
+                        let h = hash(data, i);
+                        prev[i] = head[h];
+                        head[h] = i;
+                    }
+                    i += 1;
+                }
+            } else {
+                literals.push(data[i]);
+                pending_lit += 1;
+                if i + MIN_MATCH <= n {
+                    let h = hash(data, i);
+                    prev[i] = head[h];
+                    head[h] = i;
+                }
+                i += 1;
+            }
+        }
+        // Remaining `pending_lit` literals are trailing literals (no command);
+        // the decoder appends them after the last match.
+        let _ = pending_lit;
+        (literals, cmds)
+    }
+
+    /// Encode `data` as a single `bvx2` block (NOT including the 4-byte
+    /// magic, which the caller prepends).
+    pub(crate) fn encode_block(data: &[u8]) -> Vec<u8> {
+        let (literals, cmds) = lz_parse(data);
+
+        let mut lit_hist = vec![0u32; N_LIT_SYMBOLS];
+        for &b in &literals {
+            lit_hist[b as usize] += 1;
+        }
+        let mut l_hist = vec![0u32; N_L_SYMBOLS];
+        let mut m_hist = vec![0u32; N_M_SYMBOLS];
+        let mut d_hist = vec![0u32; N_D_SYMBOLS];
+
+        struct MappedCmd {
+            l_sym: usize,
+            l_extra: u32,
+            m_sym: usize,
+            m_extra: u32,
+            d_sym: usize,
+            d_extra: u32,
+        }
+        let mut mapped = Vec::with_capacity(cmds.len());
+        for c in &cmds {
+            let (l_sym, l_extra) = map_lmd(c.l as i32, &L_BASE, &L_EXTRA_BITS);
+            let (m_sym, m_extra) = map_lmd(c.m as i32, &M_BASE, &M_EXTRA_BITS);
+            let (d_sym, d_extra) = map_lmd(c.d as i32, &D_BASE, &D_EXTRA_BITS);
+            l_hist[l_sym] += 1;
+            m_hist[m_sym] += 1;
+            d_hist[d_sym] += 1;
+            mapped.push(MappedCmd {
+                l_sym,
+                l_extra,
+                m_sym,
+                m_extra,
+                d_sym,
+                d_extra,
+            });
+        }
+
+        let lit_freq = normalize_general(&lit_hist, LIT_STATES);
+        let l_freq = normalize_general(&l_hist, L_STATES);
+        let m_freq = normalize_general(&m_hist, M_STATES);
+        let d_freq = normalize_general(&d_hist, D_STATES);
+
+        let lit_slots = build_enc_slots(&lit_freq, LIT_STATES);
+        let l_slots = build_enc_slots(&l_freq, L_STATES);
+        let m_slots = build_enc_slots(&m_freq, M_STATES);
+        let d_slots = build_enc_slots(&d_freq, D_STATES);
+
+        // ── Encode literals (reverse, 4-way interleaved) ──
+        let n_lit = literals.len();
+        let mut lit_sink = FseSink::new();
+        let mut lit_states = [0i32; 4];
+        for idx in (0..n_lit).rev() {
+            let lane = idx % 4;
+            let sym = literals[idx] as usize;
+            let cur = lit_states[lane];
+            let slot = lit_slots[sym]
+                .iter()
+                .find(|s| cur >= s.lo && cur <= s.hi)
+                .expect("literal slot covers state");
+            lit_sink.push((cur - slot.lo) as u64, slot.k);
+            lit_states[lane] = slot.t as i32;
+        }
+        let literal_state = [
+            lit_states[0] as u32,
+            lit_states[1] as u32,
+            lit_states[2] as u32,
+            lit_states[3] as u32,
+        ];
+        let (lit_payload, literal_bits) = lit_sink.finish();
+
+        // ── Encode LMD (reverse). Decoder pulls L, M, D per command, so to
+        // invert we iterate commands in reverse and push D, then M, then L. ──
+        let mut lmd_sink = FseSink::new();
+        let mut l_st = 0i32;
+        let mut m_st = 0i32;
+        let mut d_st = 0i32;
+        for mc in mapped.iter().rev() {
+            let d_slot = d_slots[mc.d_sym]
+                .iter()
+                .find(|s| d_st >= s.lo && d_st <= s.hi)
+                .expect("d slot");
+            let raw = (d_st - d_slot.lo) as u64 | ((mc.d_extra as u64) << d_slot.k);
+            lmd_sink.push(raw, d_slot.k + D_EXTRA_BITS[mc.d_sym]);
+            d_st = d_slot.t as i32;
+
+            let m_slot = m_slots[mc.m_sym]
+                .iter()
+                .find(|s| m_st >= s.lo && m_st <= s.hi)
+                .expect("m slot");
+            let raw = (m_st - m_slot.lo) as u64 | ((mc.m_extra as u64) << m_slot.k);
+            lmd_sink.push(raw, m_slot.k + M_EXTRA_BITS[mc.m_sym]);
+            m_st = m_slot.t as i32;
+
+            let l_slot = l_slots[mc.l_sym]
+                .iter()
+                .find(|s| l_st >= s.lo && l_st <= s.hi)
+                .expect("l slot");
+            let raw = (l_st - l_slot.lo) as u64 | ((mc.l_extra as u64) << l_slot.k);
+            lmd_sink.push(raw, l_slot.k + L_EXTRA_BITS[mc.l_sym]);
+            l_st = l_slot.t as i32;
+        }
+        let l_state = l_st as u32;
+        let m_state = m_st as u32;
+        let d_state = d_st as u32;
+        let (lmd_payload, lmd_bits) = lmd_sink.finish();
+
+        // ── Pack frequency tables (L, M, D, LIT, bit-contiguous) ──
+        let mut freq_bits: Vec<u8> = Vec::new();
+        for table in [&l_freq, &m_freq, &d_freq, &lit_freq] {
+            for &f in table.iter() {
+                let (code, len) = encode_freq_value(f);
+                for i in 0..len {
+                    freq_bits.push(((code >> i) & 1) as u8);
+                }
+            }
+        }
+        let mut freq_bytes = vec![0u8; freq_bits.len().div_ceil(8)];
+        for (i, &b) in freq_bits.iter().enumerate() {
+            freq_bytes[i / 8] |= b << (i % 8);
+        }
+
+        // ── Assemble the header ──
+        // `header_size` is measured from the start of the block, i.e. it
+        // includes the 4-byte magic that the caller prepends:
+        //   magic(4) + n_raw(4) + 3*u64(24) + freq = 4 + V2_HEADER_FIXED_BYTES + freq.
+        let header_size = (4 + V2_HEADER_FIXED_BYTES + freq_bytes.len()) as u32;
+        let n_raw_bytes = data.len() as u32;
+        let n_literals = n_lit as u32;
+        let n_matches = cmds.len() as u32;
+        let n_literal_payload_bytes = lit_payload.len() as u32;
+        let n_lmd_payload_bytes = lmd_payload.len() as u32;
+
+        let mut w0 = 0u64;
+        w0 |= (n_literals as u64) & 0xFFFFF;
+        w0 |= ((n_literal_payload_bytes as u64) & 0xFFFFF) << 20;
+        w0 |= ((n_matches as u64) & 0xFFFFF) << 40;
+        w0 |= ((literal_bits as u64) & 0x7) << 60;
+
+        let mut w1 = 0u64;
+        w1 |= (literal_state[0] as u64) & 0x3FF;
+        w1 |= ((literal_state[1] as u64) & 0x3FF) << 10;
+        w1 |= ((literal_state[2] as u64) & 0x3FF) << 20;
+        w1 |= ((literal_state[3] as u64) & 0x3FF) << 30;
+        w1 |= ((n_lmd_payload_bytes as u64) & 0xFFFFF) << 40;
+        w1 |= ((lmd_bits as u64) & 0x7) << 60;
+
+        let mut w2 = 0u64;
+        w2 |= (header_size as u64) & 0xFFFFFFFF;
+        w2 |= ((l_state as u64) & 0x3FF) << 32;
+        w2 |= ((m_state as u64) & 0x3FF) << 42;
+        w2 |= ((d_state as u64) & 0x3FF) << 52;
+
+        let mut out =
+            Vec::with_capacity(header_size as usize + lit_payload.len() + lmd_payload.len());
+        out.extend_from_slice(&n_raw_bytes.to_le_bytes());
+        out.extend_from_slice(&w0.to_le_bytes());
+        out.extend_from_slice(&w1.to_le_bytes());
+        out.extend_from_slice(&w2.to_le_bytes());
+        out.extend_from_slice(&freq_bytes);
+        out.extend_from_slice(&lit_payload);
+        out.extend_from_slice(&lmd_payload);
+        out
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::lzfse::decoder::Decoder;
+    use crate::traits::{RawDecoder, RawProgress};
+
+    /// Wrap a v2-encoded block (post-magic bytes) into a full `bvx2` block.
+    fn v2_block(data: &[u8]) -> Vec<u8> {
+        let mut b = Vec::new();
+        b.extend_from_slice(b"bvx2");
+        b.extend_from_slice(&encode_block(data));
+        b
+    }
+
+    /// Block-level round-trip: encode then `decode_block`.
+    fn rt_block(data: &[u8]) {
+        let block = encode_block(data);
+        let mut out = Vec::new();
+        let consumed = decode_block(&block, &mut out, 1 << 20)
+            .unwrap_or_else(|e| panic!("decode_block failed on len {}: {e:?}", data.len()));
+        assert_eq!(consumed, block.len(), "did not consume whole block");
+        assert_eq!(out, data, "round-trip mismatch (len {})", data.len());
+    }
+
+    /// Full-stream round-trip through the streaming `Decoder`.
+    fn rt_stream(blocks: &[&[u8]]) -> Vec<u8> {
+        let mut stream = Vec::new();
+        for b in blocks {
+            stream.extend_from_slice(&v2_block(b));
+        }
+        stream.extend_from_slice(b"bvx$");
+
+        let mut dec = Decoder::new();
+        let mut out = Vec::new();
+        let mut buf = vec![0u8; 512];
+        let mut pos = 0usize;
+        loop {
+            let RawProgress {
+                consumed,
+                written,
+                done,
+            } = dec.raw_decode(&stream[pos..], &mut buf).unwrap();
+            pos += consumed;
+            out.extend_from_slice(&buf[..written]);
+            if done {
+                break;
+            }
+            if consumed == 0 && written == 0 {
+                // Need to finish.
+                let RawProgress { written, done, .. } = dec.raw_finish(&mut buf).unwrap();
+                out.extend_from_slice(&buf[..written]);
+                if done || written == 0 {
+                    break;
+                }
+            }
+        }
+        out
+    }
+
+    #[test]
+    fn block_roundtrip_empty() {
+        rt_block(b"");
+    }
+
+    #[test]
+    fn block_roundtrip_small() {
+        rt_block(b"a");
+        rt_block(b"ab");
+        rt_block(b"abc");
+        rt_block(b"hello world");
+    }
+
+    #[test]
+    fn block_roundtrip_text() {
+        let text = b"the quick brown fox jumps over the lazy dog. \
+                     the quick brown fox jumps over the lazy dog. \
+                     pack my box with five dozen liquor jugs.";
+        rt_block(text);
+    }
+
+    #[test]
+    fn block_roundtrip_repetitive() {
+        rt_block(&vec![b'A'; 1000]);
+        rt_block(&vec![0u8; 5000]);
+        let mut v = Vec::new();
+        for _ in 0..500 {
+            v.extend_from_slice(b"abcd");
+        }
+        rt_block(&v);
+    }
+
+    #[test]
+    fn block_roundtrip_random() {
+        // Deterministic LCG "random" bytes (incompressible-ish) of varied sizes.
+        let mut state = 0x1234_5678u32;
+        let mut next = || {
+            state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+            (state >> 24) as u8
+        };
+        for &len in &[
+            0usize, 1, 7, 15, 16, 17, 63, 64, 100, 255, 256, 1024, 4096, 9001,
+        ] {
+            let data: Vec<u8> = (0..len).map(|_| next()).collect();
+            rt_block(&data);
+        }
+    }
+
+    #[test]
+    fn block_roundtrip_mixed_structure() {
+        // Repetitive prefix + random tail + repetitive again exercises both
+        // literal-heavy and match-heavy command streams.
+        let mut data = vec![b'x'; 300];
+        let mut state = 0x9E37_79B9u32;
+        for _ in 0..300 {
+            state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+            data.push((state >> 23) as u8);
+        }
+        data.extend_from_slice(&vec![b'y'; 400]);
+        data.extend_from_slice(b"the the the the the the the the the the the the");
+        rt_block(&data);
+    }
+
+    #[test]
+    fn block_roundtrip_all_byte_values() {
+        // Every byte value present forces a full 256-symbol literal table.
+        let mut data = Vec::new();
+        for _ in 0..8 {
+            for b in 0u16..256 {
+                data.push(b as u8);
+            }
+        }
+        rt_block(&data);
+    }
+
+    #[test]
+    fn block_roundtrip_long_match() {
+        // A long run produces large match lengths (exercises M extra bits).
+        let data = vec![b'Q'; 50_000];
+        rt_block(&data);
+    }
+
+    #[test]
+    fn block_roundtrip_far_distance() {
+        // Distinct head, large gap, then a copy of the head — exercises large
+        // D extra bits.
+        let mut data: Vec<u8> = b"UNIQUEPREFIXHERE0123456789".to_vec();
+        data.extend(core::iter::repeat_n(b'.', 70_000));
+        data.extend_from_slice(b"UNIQUEPREFIXHERE0123456789");
+        rt_block(&data);
+    }
+
+    #[test]
+    fn stream_roundtrip_single_block() {
+        let data = b"the quick brown fox jumps over the lazy dog".repeat(20);
+        let out = rt_stream(&[&data]);
+        assert_eq!(out, data);
+    }
+
+    #[test]
+    fn stream_roundtrip_multi_block() {
+        let a = b"first block contents, repeated repeated repeated".repeat(10);
+        let b = vec![b'Z'; 2000];
+        let c = b"third".repeat(100);
+        let out = rt_stream(&[&a, &b, &c]);
+        let mut want = Vec::new();
+        want.extend_from_slice(&a);
+        want.extend_from_slice(&b);
+        want.extend_from_slice(&c);
+        assert_eq!(out, want);
+    }
+
+    #[test]
+    fn stream_roundtrip_empty_block() {
+        let out = rt_stream(&[b""]);
+        assert_eq!(out, b"");
+    }
+
+    #[test]
+    fn corrupt_header_size_rejected() {
+        let mut block = encode_block(b"hello world this is a test of corruption");
+        // header_size lives in packed_fields[2] low 32 bits, at byte offset
+        // 4 + 8 + 8 = 20 within the post-magic block. Set it absurdly large.
+        block[20] = 0xFF;
+        block[21] = 0xFF;
+        block[22] = 0xFF;
+        block[23] = 0xFF;
+        let mut out = Vec::new();
+        assert!(decode_block(&block, &mut out, 1 << 20).is_err());
+    }
+
+    #[test]
+    fn truncated_payload_rejected() {
+        let block = encode_block(&vec![b'k'; 2000]);
+        // Drop the last few payload bytes.
+        let truncated = &block[..block.len() - 3];
+        let mut out = Vec::new();
+        assert!(decode_block(truncated, &mut out, 1 << 20).is_err());
+    }
+
+    #[test]
+    fn garbage_freq_does_not_panic() {
+        // A short, mostly-zero block: parse_header should reject (freq sums
+        // won't match) rather than panic.
+        let mut block = vec![0u8; 64];
+        // Give n_raw a small value and a plausible header_size.
+        block[0..4].copy_from_slice(&8u32.to_le_bytes());
+        // header_size = 32 (magic + fixed, no freq bytes) — freq tables empty
+        // → sums won't match the FSE state counts.
+        let w2 = 32u64;
+        block[20..28].copy_from_slice(&w2.to_le_bytes());
+        let mut out = Vec::new();
+        let _ = decode_block(&block, &mut out, 1 << 20);
+    }
+
+    #[test]
+    fn stream_roundtrip_one_byte_at_a_time() {
+        // Feed a v2 block + EOS one byte at a time, exercising the streaming
+        // decoder's reassembly of the variable-length v2 header and payload.
+        let data = b"streaming reassembly test streaming reassembly test".repeat(8);
+        let mut stream = v2_block(&data);
+        stream.extend_from_slice(b"bvx$");
+
+        let mut dec = Decoder::new();
+        let mut out = Vec::new();
+        let mut buf = vec![0u8; 64];
+        let mut pos = 0usize;
+        while pos < stream.len() {
+            let end = (pos + 1).min(stream.len());
+            let RawProgress {
+                consumed,
+                written,
+                done,
+            } = dec.raw_decode(&stream[pos..end], &mut buf).unwrap();
+            pos += consumed;
+            out.extend_from_slice(&buf[..written]);
+            if done {
+                break;
+            }
+        }
+        loop {
+            let RawProgress { written, done, .. } = dec.raw_finish(&mut buf).unwrap();
+            out.extend_from_slice(&buf[..written]);
+            if done || written == 0 {
+                break;
+            }
+        }
+        assert_eq!(out, data);
+    }
+
+    /// A hand-frozen `bvx2` stream, independent of this crate's encoder.
+    ///
+    /// It is a literals-only block (`n_matches == 0`) whose **literal
+    /// frequency table is deliberately non-dyadic**: the high-frequency
+    /// literal symbol `0x3d` (`=`) has frequency 1000 and the rare symbol
+    /// `0x3e` (`>`) has 24 (sum 1024 = LIT_STATES). Neither is a power of two,
+    /// so decoding correctly *requires* the general k/k-1 FSE table
+    /// construction — a single-`k` decoder cannot build a table that tiles
+    /// `[0,1024)` for these frequencies and mis-decodes the literals.
+    ///
+    /// The bytes (post-magic header + freq tables + literal FSE payload, then
+    /// the `bvx$` EOS) were generated once and frozen here; this test does not
+    /// call the encoder, so it guards against the encoder and decoder sharing
+    /// the same table-construction bug. The four literals decode to the exact
+    /// ASCII string `=>==`.
+    const HAND_VECTOR: &[u8] = &[
+        0x62, 0x76, 0x78, 0x32, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00,
+        0x50, 0x48, 0x40, 0x8f, 0x04, 0x12, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x8f, 0x02, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x28, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x8f, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0xc0, 0x43, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x76, 0x78, 0x24,
+    ];
+
+    #[test]
+    fn hand_vector_non_dyadic_decodes_to_known_string() {
+        // Decode the frozen, encoder-independent vector through the public
+        // streaming decoder and assert the exact output.
+        let mut dec = Decoder::new();
+        let mut out = Vec::new();
+        let mut buf = vec![0u8; 64];
+        let mut pos = 0usize;
+        loop {
+            let RawProgress {
+                consumed,
+                written,
+                done,
+            } = dec.raw_decode(&HAND_VECTOR[pos..], &mut buf).unwrap();
+            pos += consumed;
+            out.extend_from_slice(&buf[..written]);
+            if done {
+                break;
+            }
+            if consumed == 0 && written == 0 {
+                let RawProgress { written, done, .. } = dec.raw_finish(&mut buf).unwrap();
+                out.extend_from_slice(&buf[..written]);
+                if done || written == 0 {
+                    break;
+                }
+            }
+        }
+        assert_eq!(out, b"=>==", "hand vector decoded to {out:?}");
+    }
+
+    #[test]
+    fn normalize_general_produces_non_dyadic_freqs() {
+        // A skewed histogram must normalize to general (non-power-of-two)
+        // frequencies that sum exactly to n_states and give every present
+        // symbol at least 1. A regression that snapped to powers of two would
+        // be visible here.
+        use super::test_encoder::normalize_general;
+        let hist = [1000u32, 3, 17, 250, 0, 1];
+        let freq = normalize_general(&hist, 1024);
+        assert_eq!(freq.iter().map(|&f| f as u32).sum::<u32>(), 1024);
+        // Absent symbol stays 0; present symbols are >= 1.
+        assert_eq!(freq[4], 0);
+        for (i, &h) in hist.iter().enumerate() {
+            if h > 0 {
+                assert!(freq[i] >= 1, "present symbol {i} dropped to 0");
+            }
+        }
+        // At least one present symbol is genuinely non-power-of-two.
+        assert!(
+            freq.iter().any(|&f| f > 0 && !f.is_power_of_two()),
+            "expected a non-power-of-two frequency, got {freq:?}"
+        );
+    }
+
+    /// Round-trip a payload whose literal histogram is deliberately skewed so
+    /// the normalized FSE frequencies are non-dyadic. A regression to a
+    /// single-`k` decode table would corrupt the result.
+    fn rt_assert_non_dyadic_lit(data: &[u8]) {
+        use super::test_encoder::normalize_general;
+        // Recompute the literal histogram the way encode_block does, but only
+        // over true literals would require the parser; instead assert on a
+        // raw-byte histogram, which upper-bounds the literal alphabet and is a
+        // good proxy for "this input yields a non-dyadic literal table".
+        let mut hist = vec![0u32; 256];
+        for &b in data {
+            hist[b as usize] += 1;
+        }
+        let freq = normalize_general(&hist, LIT_STATES);
+        assert!(
+            freq.iter().any(|&f| f > 0 && !f.is_power_of_two()),
+            "test input does not exercise a non-dyadic table"
+        );
+        rt_block(data);
+    }
+
+    #[test]
+    fn block_roundtrip_non_dyadic_literals() {
+        // Skewed-but-not-dyadic byte distributions (counts chosen so the
+        // 1024-state normalization lands on non-powers-of-two).
+        let mut data = Vec::new();
+        data.extend(core::iter::repeat_n(b'a', 1000));
+        data.extend(core::iter::repeat_n(b'b', 333));
+        data.extend(core::iter::repeat_n(b'c', 77));
+        data.extend(core::iter::repeat_n(b'd', 7));
+        data.push(b'e'); // a singleton
+        rt_assert_non_dyadic_lit(&data);
+
+        // A 3-symbol skew (~70/29/1 split).
+        let mut d2 = Vec::new();
+        d2.extend(core::iter::repeat_n(b'x', 700));
+        d2.extend(core::iter::repeat_n(b'y', 290));
+        d2.extend(core::iter::repeat_n(b'z', 11));
+        rt_assert_non_dyadic_lit(&d2);
+    }
+
+    #[test]
+    fn block_roundtrip_small_match_counts() {
+        // Few, varied matches produce small non-power-of-two L/M/D histograms
+        // (e.g. a single match → a singleton frequency in each LMD table).
+        // Each must round-trip through the general k/k-1 LMD tables.
+        let cases: &[&[u8]] = &[
+            b"abcabc",                         // one short match
+            b"abcdeabcde_xyzxyz",              // two matches, different lens
+            b"AAAABBBBCCCCAAAABBBBCCCC",       // a couple of medium matches
+            b"the cat sat on the mat the cat", // overlapping repeats
+        ];
+        for c in cases {
+            rt_block(c);
+        }
+    }
+
+    #[test]
+    fn fuzz_roundtrip_many_sizes() {
+        // Broad deterministic fuzz: many sizes, several content shapes. Each
+        // must round-trip exactly through decode_block(encode_block(x)).
+        let mut state = 0xDEAD_BEEFu32;
+        let mut rng = || {
+            state = state.wrapping_mul(1_103_515_245).wrapping_add(12_345);
+            state
+        };
+        for len in 0..400usize {
+            // Shape 0: random bytes. Shape 1: small alphabet (matches galore).
+            // Shape 2: mostly-constant with sparse noise.
+            for shape in 0..3 {
+                let data: Vec<u8> = (0..len)
+                    .map(|_| match shape {
+                        0 => (rng() >> 24) as u8,
+                        1 => b"abcde"[(rng() as usize) % 5],
+                        _ => {
+                            if rng() % 16 == 0 {
+                                (rng() >> 24) as u8
+                            } else {
+                                b'='
+                            }
+                        }
+                    })
+                    .collect();
+                rt_block(&data);
+            }
+        }
+    }
 }
diff --git a/src/lzfse/mod.rs b/src/lzfse/mod.rs
index 84a66fd..b377e39 100644
--- a/src/lzfse/mod.rs
+++ b/src/lzfse/mod.rs
@@ -29,11 +29,18 @@
 //!   StreamEnd.
 //! - `bvx1` blocks: not commonly emitted by modern encoders; this build
 //!   returns [`Error::Unsupported`].
-//! - `bvx2` (LZFSE v2 compressed) blocks: the FSE table-construction
-//!   primitives are present (see `fse.rs`), but the full v2 block decoder
-//!   is gated off in this release. `bvx2` blocks return
-//!   [`Error::Unsupported`]; see the internal `lzfse_v2` module for the
-//!   layout reference and the gap analysis.
+//! - `bvx2` (LZFSE v2 compressed) blocks: **decoder implemented** — the core
+//!   LZFSE block type (LZ77 commands entropy-coded with Finite State
+//!   Entropy). The FSE table construction matches Apple's general
+//!   `fse_init_decoder_table` (k/k-1 split), so arbitrary per-symbol
+//!   frequencies decode, not only power-of-two normalizations. Validated by
+//!   round-trip against this crate's own spec-conformant general-frequency v2
+//!   encoder, including deliberately non-dyadic distributions and a
+//!   hand-frozen non-dyadic block (no Apple reference fixtures are available
+//!   in this environment, so Apple-interop is best-effort but follows the
+//!   documented wire format and real table-construction algorithm). See the
+//!   internal `lzfse_v2` module for the layout reference and
+//!   validation/interop notes.
 //!
 //! Real LZFSE files produced by Apple's encoders mix these block types
 //! freely: small payloads land in `bvxn`, large ones in `bvx2`, and short
diff --git a/src/lzma2/mod.rs b/src/lzma2/mod.rs
index e7b41ab..41eb5ba 100644
--- a/src/lzma2/mod.rs
+++ b/src/lzma2/mod.rs
@@ -54,6 +54,27 @@
 //! machinery used by [`crate::xz`] (the shared `LzmaCore`); this module only
 //! adds the raw chunk framing and self-termination handling. There is no
 //! re-implementation of LZMA here.
+//!
+//! ## Encoder
+//!
+//! The [`Encoder`] produces the same raw LZMA2 chunk stream the decoder
+//! consumes, reusing the shared `encode_lzma_chunk` range coder from
+//! [`crate::xz`]'s internals — no LZMA re-implementation. Every chunk is a
+//! full-reset chunk (control byte `0xE0` for compressed, `0x01` for
+//! uncompressed) so each chunk is independently decodable; an uncompressed
+//! chunk is emitted as a fallback whenever compression would expand the data.
+//! The stream is terminated by a single `0x00` end-marker byte.
+//!
+//! ### Dictionary-size contract
+//!
+//! A raw LZMA2 stream carries **no** dictionary size in band — that value is
+//! the 7z coder property the decoder receives out of band (via
+//! [`DecoderConfig::with_dict_prop`] / [`DecoderConfig::with_dict_size`]).
+//! The encoder bounds its match distances by a fixed 4 MiB dictionary (the
+//! [`crate::xz`] default), so a decoder built with the default config — which
+//! also uses a 4 MiB window — round-trips the output exactly. If you transport
+//! this stream inside a 7z container, advertise a dictionary size of at least
+//! 4 MiB in the coder property.
 
 #![cfg_attr(docsrs, doc(cfg(feature = "lzma2")))]
 
@@ -74,12 +95,13 @@ const MAX_DICT: usize = 128 * 1024 * 1024;
 /// LZMA2 default and the size [`crate::xz`] uses).
 const DEFAULT_DICT: usize = 4 * 1024 * 1024;
 
-/// Raw LZMA2 stream codec (7-Zip coder id 21). Decode-only.
+/// Raw LZMA2 stream codec (7-Zip coder id 21).
 ///
-/// The encoder is a permanent [`Error::Unsupported`] stub: 7z LZMA2 framing
-/// is produced by the [`crate::xz`] encoder path, and there is no need for a
-/// standalone raw LZMA2 encoder. See the [module docs](self) for the stream
-/// shape.
+/// Both directions are implemented: the [`Encoder`] emits a raw LZMA2 chunk
+/// stream (full-reset chunks + `0x00` end marker) bounded by a 4 MiB
+/// dictionary, and the [`Decoder`] consumes that stream. The dictionary size
+/// is out of band (see the [module docs](self#dictionary-size-contract)); a
+/// default-config decoder round-trips the default-config encoder's output.
 #[derive(Debug, Clone, Copy, Default)]
 pub struct Lzma2;
 
@@ -138,7 +160,7 @@ impl Algorithm for Lzma2 {
     type EncoderConfig = ();
     type DecoderConfig = DecoderConfig;
     fn encoder_with(_: ()) -> Encoder {
-        Encoder
+        Encoder::new()
     }
     fn decoder_with(cfg: DecoderConfig) -> Decoder {
         Decoder::new(cfg)
@@ -157,24 +179,282 @@ fn resolve_dict_size(cfg: &DecoderConfig) -> Result<usize, Error> {
     Ok(raw.clamp(4096, MAX_DICT))
 }
 
-// ─── encoder stub ─────────────────────────────────────────────────────────
+// ─── encoder ──────────────────────────────────────────────────────────────
+
+use crate::lzma2_internal::lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma_chunk};
 
-/// Raw LZMA2 encoder stub: permanently returns [`Error::Unsupported`].
+/// Dictionary size (in bytes) the encoder advertises to the LZMA chunk
+/// coder as the match-distance ceiling. Fixed at 4 MiB — the [`crate::xz`]
+/// default — so a default-config [`Decoder`] (also 4 MiB) round-trips.
+const ENC_DICT_SIZE: u32 = DEFAULT_DICT as u32;
+
+/// Default compression level (mirrors xz-utils' and [`crate::xz`]'s default).
+const ENC_DEFAULT_LEVEL: u8 = 6;
+
+/// Maximum uncompressed bytes buffered per LZMA2 chunk. Capped at 65_536 so
+/// both the uncompressed-chunk 16-bit size field and the compressed-chunk
+/// size fields stay in range, matching the [`crate::xz`] encoder's cap and
+/// bounding peak working-buffer memory.
+const ENC_CHUNK_MAX: usize = 65_536;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum EncPhase {
+    /// Buffering input; flushing a chunk when the buffer fills.
+    Body,
+    /// Draining a staged chunk from `pending`, then back to `Body`.
+    DrainPending,
+    /// (`finish` only) Flush any partial buffered chunk, then stage the
+    /// `0x00` end marker.
+    Finishing,
+    /// (`finish` only) Draining the `0x00` end marker from `pending`.
+    DrainEnd,
+    /// All chunks plus the `0x00` end marker have been drained.
+    Done,
+}
+
+/// Raw LZMA2 encoder.
 ///
-/// Lets the crate auto-derive the public [`Encoder`](crate::Encoder) trait
-/// while making encode attempts fail cleanly. LZMA2 output is produced via
-/// the [`crate::xz`] encoder.
-#[derive(Debug, Clone, Copy, Default)]
-pub struct Encoder;
+/// Emits the raw LZMA2 chunk stream consumed by [`Decoder`] — a sequence of
+/// full-reset chunks terminated by a single `0x00` end marker. There is **no**
+/// `.xz` container (no stream magic, block header, index, or CRC); for that,
+/// use [`crate::xz`]. Match distances are bounded by a fixed 4 MiB dictionary
+/// that the decoder must be told about out of band (see the
+/// [module docs](self#dictionary-size-contract)).
+///
+/// Each chunk is independently decodable: the encoder always full-resets
+/// (dict + props + state) at the chunk boundary, emitting a compressed chunk
+/// (control `0xE0`) when that shrinks the data and an uncompressed chunk
+/// (control `0x01`) otherwise.
+pub struct Encoder {
+    phase: EncPhase,
+    /// Staged bytes for the current chunk (or end marker), drained to the
+    /// caller from `pending[pending_idx..]`.
+    pending: Vec<u8>,
+    pending_idx: usize,
+    /// Input accumulated for the next chunk; flushed at `ENC_CHUNK_MAX` or on
+    /// `finish`.
+    in_buf: Vec<u8>,
+    /// Level-derived match-finder tuning; preserved across `reset`.
+    params: EncoderParams,
+}
+
+impl Default for Encoder {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Encoder {
+    /// Build an encoder at the default compression level (6).
+    pub fn new() -> Self {
+        Self {
+            phase: EncPhase::Body,
+            pending: Vec::new(),
+            pending_idx: 0,
+            in_buf: Vec::new(),
+            params: EncoderParams::from_level(ENC_DEFAULT_LEVEL),
+        }
+    }
+
+    /// Push staged bytes from `pending[pending_idx..]` into `output`. Returns
+    /// true once the buffer is fully drained.
+    fn drain_pending(&mut self, output: &mut [u8], written: &mut usize) -> bool {
+        while self.pending_idx < self.pending.len() && *written < output.len() {
+            output[*written] = self.pending[self.pending_idx];
+            *written += 1;
+            self.pending_idx += 1;
+        }
+        if self.pending_idx >= self.pending.len() {
+            self.pending.clear();
+            self.pending_idx = 0;
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Stage one LZMA2 chunk for `data` (`1..=ENC_CHUNK_MAX` bytes), choosing
+    /// a compressed chunk when it shrinks the data and an uncompressed
+    /// fallback otherwise.
+    fn stage_chunk(&mut self, data: &[u8]) {
+        debug_assert!(!data.is_empty() && data.len() <= ENC_CHUNK_MAX);
+        let compressed = encode_lzma_chunk(data, ENC_DICT_SIZE, self.params);
+        // A compressed chunk is only worth emitting when its range-coded body
+        // is both smaller than the input and fits the 16-bit (+1) comp-size
+        // field. Otherwise the uncompressed chunk is strictly smaller.
+        let use_compressed =
+            !compressed.is_empty() && compressed.len() <= 65_536 && compressed.len() < data.len();
+        if use_compressed {
+            self.stage_compressed_chunk(data, &compressed);
+        } else {
+            self.stage_uncompressed_chunk(data);
+        }
+    }
+
+    /// Stage a full-reset compressed chunk: control `0xE0` (compressed, with
+    /// dict, props, and state all reset; top 5 bits of `uncomp_size-1`), a
+    /// 2-byte `uncomp_size-1` BE remainder, a 2-byte `comp_size-1` BE, the
+    /// 1-byte LZMA props (present because we full-reset), then the
+    /// range-coded body.
+    fn stage_compressed_chunk(&mut self, data: &[u8], compressed: &[u8]) {
+        debug_assert!(!data.is_empty() && data.len() <= ENC_CHUNK_MAX);
+        debug_assert!(!compressed.is_empty() && compressed.len() <= 65_536);
+
+        let uncomp_m1 = (data.len() - 1) as u32; // 0..=65535 with our cap
+        // Top 5 bits of (uncomp_size - 1) live in the control byte; with a
+        // 65_536 cap they are always zero, yielding exactly 0xE0.
+        let control: u8 = 0xE0 | ((uncomp_m1 >> 16) & 0x1F) as u8;
+        let comp_m1 = (compressed.len() - 1) as u16;
+
+        self.pending.reserve(6 + compressed.len());
+        self.pending.push(control);
+        self.pending.push(((uncomp_m1 >> 8) & 0xFF) as u8);
+        self.pending.push((uncomp_m1 & 0xFF) as u8);
+        self.pending.push((comp_m1 >> 8) as u8);
+        self.pending.push((comp_m1 & 0xFF) as u8);
+        self.pending.push(LZMA2_PROPS_BYTE);
+        self.pending.extend_from_slice(compressed);
+        self.pending_idx = 0;
+    }
+
+    /// Stage an uncompressed chunk: control `0x01` (dict reset), 2-byte
+    /// `size-1` BE, then the raw bytes.
+    fn stage_uncompressed_chunk(&mut self, data: &[u8]) {
+        debug_assert!(!data.is_empty() && data.len() <= ENC_CHUNK_MAX);
+        let size_m1 = (data.len() - 1) as u16;
+        self.pending.reserve(3 + data.len());
+        self.pending.push(0x01);
+        self.pending.push((size_m1 >> 8) as u8);
+        self.pending.push((size_m1 & 0xFF) as u8);
+        self.pending.extend_from_slice(data);
+        self.pending_idx = 0;
+    }
+}
 
 impl RawEncoder for Encoder {
-    fn raw_encode(&mut self, _input: &[u8], _output: &mut [u8]) -> Result<RawProgress, Error> {
-        Err(Error::Unsupported)
+    fn raw_encode(&mut self, input: &[u8], output: &mut [u8]) -> Result<RawProgress, Error> {
+        let mut consumed = 0usize;
+        let mut written = 0usize;
+
+        loop {
+            match self.phase {
+                EncPhase::Body => {
+                    while consumed < input.len() && self.in_buf.len() < ENC_CHUNK_MAX {
+                        let take = (ENC_CHUNK_MAX - self.in_buf.len()).min(input.len() - consumed);
+                        self.in_buf
+                            .extend_from_slice(&input[consumed..consumed + take]);
+                        consumed += take;
+                    }
+                    if self.in_buf.len() == ENC_CHUNK_MAX {
+                        let data = core::mem::take(&mut self.in_buf);
+                        self.stage_chunk(&data);
+                        self.phase = EncPhase::DrainPending;
+                    } else {
+                        return Ok(RawProgress {
+                            consumed,
+                            written,
+                            done: false,
+                        });
+                    }
+                }
+                EncPhase::DrainPending => {
+                    if self.drain_pending(output, &mut written) {
+                        self.phase = EncPhase::Body;
+                    } else {
+                        return Ok(RawProgress {
+                            consumed,
+                            written,
+                            done: false,
+                        });
+                    }
+                }
+                // `encode` never advances into the finish-only phases.
+                EncPhase::Finishing | EncPhase::DrainEnd | EncPhase::Done => {
+                    return Ok(RawProgress {
+                        consumed,
+                        written,
+                        done: false,
+                    });
+                }
+            }
+        }
     }
-    fn raw_finish(&mut self, _output: &mut [u8]) -> Result<RawProgress, Error> {
-        Err(Error::Unsupported)
+
+    fn raw_finish(&mut self, output: &mut [u8]) -> Result<RawProgress, Error> {
+        let mut written = 0usize;
+
+        // `encode` leaves the encoder in `Body`/`DrainPending`; the first
+        // `finish` call drives it through `Finishing` → `DrainEnd` → `Done`.
+        if self.phase == EncPhase::Body || self.phase == EncPhase::DrainPending {
+            // A `DrainPending` left over from `encode` still has chunk bytes
+            // staged; drain those before flushing the tail.
+            self.phase = EncPhase::Finishing;
+        }
+
+        loop {
+            match self.phase {
+                EncPhase::Finishing => {
+                    if !self.pending.is_empty() {
+                        // Drain a chunk staged during `encode` first.
+                        if !self.drain_pending(output, &mut written) {
+                            return Ok(RawProgress {
+                                consumed: 0,
+                                written,
+                                done: false,
+                            });
+                        }
+                    }
+                    if !self.in_buf.is_empty() {
+                        let data = core::mem::take(&mut self.in_buf);
+                        self.stage_chunk(&data);
+                        // Stay in `Finishing`; the loop drains this chunk then
+                        // re-checks the (now empty) buffer.
+                    } else {
+                        // Buffer empty and any staged chunk drained: emit the
+                        // single 0x00 end marker.
+                        self.pending.push(0x00);
+                        self.pending_idx = 0;
+                        self.phase = EncPhase::DrainEnd;
+                    }
+                }
+                EncPhase::DrainEnd => {
+                    if self.drain_pending(output, &mut written) {
+                        self.phase = EncPhase::Done;
+                        return Ok(RawProgress {
+                            consumed: 0,
+                            written,
+                            done: true,
+                        });
+                    }
+                    return Ok(RawProgress {
+                        consumed: 0,
+                        written,
+                        done: false,
+                    });
+                }
+                EncPhase::Done => {
+                    return Ok(RawProgress {
+                        consumed: 0,
+                        written,
+                        done: true,
+                    });
+                }
+                // Unreachable: normalized to `Finishing` above.
+                EncPhase::Body | EncPhase::DrainPending => {
+                    self.phase = EncPhase::Finishing;
+                }
+            }
+        }
+    }
+
+    fn raw_reset(&mut self) {
+        let params = self.params;
+        self.phase = EncPhase::Body;
+        self.pending.clear();
+        self.pending_idx = 0;
+        self.in_buf.clear();
+        self.params = params;
     }
-    fn raw_reset(&mut self) {}
 }
 
 // ─── decoder ───────────────────────────────────────────────────────────────
@@ -796,4 +1076,134 @@ mod tests {
         assert_eq!(st2, Status::StreamEnd);
         assert_eq!(&out[..p2.written], &data[..]);
     }
+
+    // ── encoder tests ─────────────────────────────────────────────────────
+
+    use crate::traits::Encoder as _;
+
+    /// Encode `data` with the raw LZMA2 [`Encoder`], driving the streaming
+    /// API with the given output-buffer size to stress phase boundaries.
+    fn encode_all(data: &[u8], out_chunk: usize) -> Vec<u8> {
+        let mut enc = Lzma2::encoder_with(());
+        let mut stream = Vec::new();
+        let mut obuf = vec![0u8; out_chunk];
+        let mut consumed = 0;
+        loop {
+            let (p, st) = enc.encode(&data[consumed..], &mut obuf).unwrap();
+            stream.extend_from_slice(&obuf[..p.written]);
+            consumed += p.consumed;
+            match st {
+                Status::InputEmpty => break,
+                Status::OutputFull => {}
+                Status::StreamEnd => unreachable!("encode never ends the stream"),
+            }
+        }
+        loop {
+            let (p, st) = enc.finish(&mut obuf).unwrap();
+            stream.extend_from_slice(&obuf[..p.written]);
+            if st == Status::StreamEnd {
+                break;
+            }
+        }
+        stream
+    }
+
+    /// Encode then decode `data`, asserting a byte-identical round-trip both
+    /// in bulk and one byte at a time.
+    fn enc_roundtrip(data: &[u8]) {
+        for out_chunk in [4usize, 64, 4096, 1 << 17] {
+            let stream = encode_all(data, out_chunk);
+            // Last byte of a valid stream is always the 0x00 end marker.
+            assert_eq!(stream.last().copied(), Some(0u8), "missing end marker");
+            let got = decode_all(&stream, data.len()).expect("decode_all");
+            assert_eq!(got, data, "bulk decode mismatch (out_chunk={out_chunk})");
+        }
+        // Stable framing → byte-streaming decode through every phase boundary.
+        let stream = encode_all(data, 1 << 17);
+        decode_byte_streaming(&stream, data);
+    }
+
+    #[test]
+    fn enc_empty() {
+        let stream = encode_all(&[], 16);
+        assert_eq!(stream, vec![0x00]);
+        assert!(decode_all(&stream, 0).unwrap().is_empty());
+    }
+
+    #[test]
+    fn enc_one_byte() {
+        enc_roundtrip(b"Z");
+    }
+
+    #[test]
+    fn enc_small_text() {
+        enc_roundtrip(b"hello hello hello world the quick brown fox hello hello");
+    }
+
+    #[test]
+    fn enc_highly_compressible() {
+        // Zeros: forces the compressed-chunk path; ratio must be large.
+        let data = vec![0u8; 200 * 1024];
+        let stream = encode_all(&data, 1 << 17);
+        assert!(
+            stream.len() < data.len() / 4,
+            "zeros should compress hard, got {} from {}",
+            stream.len(),
+            data.len()
+        );
+        enc_roundtrip(&data);
+    }
+
+    #[test]
+    fn enc_multi_chunk() {
+        // > one 64 KiB chunk: several chunks plus the end marker.
+        let data: Vec<u8> = (0u32..200_000)
+            .map(|i| (i.wrapping_mul(31) >> 3) as u8)
+            .collect();
+        enc_roundtrip(&data);
+    }
+
+    #[test]
+    fn enc_incompressible_falls_back() {
+        // A pseudo-random, incompressible buffer forces uncompressed-chunk
+        // fallback (control 0x01). Verify at least one such chunk appears.
+        let mut data = vec![0u8; 4096];
+        let mut x = 0x1234_5678u32;
+        for b in data.iter_mut() {
+            x ^= x << 13;
+            x ^= x >> 17;
+            x ^= x << 5;
+            *b = (x >> 24) as u8;
+        }
+        let stream = encode_all(&data, 1 << 17);
+        assert_eq!(stream[0], 0x01, "expected uncompressed fallback chunk");
+        enc_roundtrip(&data);
+    }
+
+    #[test]
+    fn enc_reset_reuses_encoder() {
+        let data = b"reusable encoder content content content".to_vec();
+        let s1 = encode_all(&data, 1 << 17);
+        let mut enc = Lzma2::encoder_with(());
+        let mut obuf = vec![0u8; 1 << 17];
+        let mut produce = |enc: &mut Encoder| {
+            let mut out = Vec::new();
+            let (p, _) = enc.encode(&data, &mut obuf).unwrap();
+            out.extend_from_slice(&obuf[..p.written]);
+            loop {
+                let (p, st) = enc.finish(&mut obuf).unwrap();
+                out.extend_from_slice(&obuf[..p.written]);
+                if st == Status::StreamEnd {
+                    break;
+                }
+            }
+            out
+        };
+        let a = produce(&mut enc);
+        enc.reset();
+        let b = produce(&mut enc);
+        assert_eq!(a, b);
+        assert_eq!(a, s1, "reset output diverged from a fresh encoder");
+        assert_eq!(decode_all(&a, data.len()).unwrap(), data);
+    }
 }
diff --git a/src/lzma2_internal/mod.rs b/src/lzma2_internal/mod.rs
index f291ea4..fe253a6 100644
--- a/src/lzma2_internal/mod.rs
+++ b/src/lzma2_internal/mod.rs
@@ -8,8 +8,9 @@
 
 pub(crate) mod lzma2_decoder;
 
-// The LZMA payload *encoder* is only needed by the `.xz` container encoder
-// and by round-trip tests; a raw `lzma2`-only build (decode-only) would
-// otherwise carry it as dead code.
-#[cfg(any(feature = "xz", test))]
+// The LZMA payload *encoder* backs both the `.xz` container encoder and the
+// raw LZMA2 encoder ([`crate::lzma2::Encoder`]); it is also exercised by
+// round-trip tests. A build with neither `xz` nor `lzma2` would otherwise
+// carry it as dead code.
+#[cfg(any(feature = "xz", feature = "lzma2", test))]
 pub(crate) mod lzma2_encoder;
diff --git a/tests/lzfse.rs b/tests/lzfse.rs
index d5228e4..6e56b7a 100644
--- a/tests/lzfse.rs
+++ b/tests/lzfse.rs
@@ -342,25 +342,30 @@ fn lzvn_one_byte_at_a_time() {
     assert_eq!(out, HELLO_WORLD);
 }
 
-// ─── bvx2 (LZFSE v2) is documented Unsupported in this build ─────────────
+// ─── bvx2 (LZFSE v2) is now decoded ──────────────────────────────────────
+//
+// The bvx2 decoder itself is validated by round-trip against this crate's
+// own spec-conformant v2 encoder in `src/lzfse/lzfse_v2.rs` (in-crate unit
+// tests, which can reach the `#[cfg(test)]` encoder helper). From the public
+// integration surface we only assert that a *malformed* bvx2 header is
+// rejected cleanly (no panic, no Unsupported) — proving the v2 arm is wired
+// in and reaches real header parsing rather than the old Unsupported stub.
 
 #[test]
-fn bvx2_block_returns_unsupported() {
-    // Construct a stream that starts with bvx2 magic. The decoder should
-    // read the magic, peek at the v2 header (need 28 bytes after magic
-    // for the fixed-size portion), and then return Unsupported.
+fn bvx2_malformed_header_rejected_without_panic() {
+    // A bvx2 magic followed by 32 zero header bytes is not a valid v2 block
+    // (the frequency tables would not sum to the FSE state counts). The
+    // decoder must reject it as Corrupt rather than returning Unsupported or
+    // panicking.
     let mut stream = b"bvx2".to_vec();
-    // 28 bytes of arbitrary header bytes — content doesn't matter because
-    // we return Unsupported before interpreting them.
     stream.extend_from_slice(&[0u8; 32]);
 
     let mut dec = Decoder::new();
     let mut buf = [0u8; 256];
-    // Feed all input. Expect Err(Unsupported) at some point.
     let r = dec.decode(&stream, &mut buf);
     assert!(
-        matches!(r, Err(Error::Unsupported)),
-        "expected Unsupported on bvx2 block, got {:?}",
+        matches!(r, Err(Error::Corrupt) | Err(Error::UnexpectedEnd)),
+        "expected Corrupt/UnexpectedEnd on malformed bvx2 block, got {:?}",
         r
     );
 }
diff --git a/tests/lzma2.rs b/tests/lzma2.rs
index 78f3e49..fa60d47 100644
--- a/tests/lzma2.rs
+++ b/tests/lzma2.rs
@@ -1,11 +1,12 @@
-//! Public-API tests for the raw LZMA2 decoder (7-Zip coder id 21).
+//! Public-API tests for the raw LZMA2 codec (7-Zip coder id 21).
 //!
-//! The crate-private LZMA payload encoder is exercised by the in-module
-//! unit tests (`src/lzma2/mod.rs`), which cover compressed multi-chunk
-//! round-trips, dict resets, and 1-byte streaming. Here we validate the
-//! public surface: decoding hand-framed *uncompressed* LZMA2 chunks (which
-//! need no encoder), self-termination on the `0x00` control byte, the
-//! factory wiring, and DoS hygiene on crafted input.
+//! The in-module unit tests (`src/lzma2/mod.rs`) cover encoder/decoder
+//! round-trips, dict resets, fallback, and 1-byte streaming. Here we
+//! validate the public surface: encoder→decoder round-trips through the
+//! `Lzma2` public types, decoding hand-framed *uncompressed* LZMA2 chunks,
+//! self-termination on the `0x00` control byte, cross-validation against the
+//! shared `xz` chunk codec, the factory wiring, and DoS hygiene on crafted
+//! input.
 
 #![cfg(feature = "lzma2")]
 
@@ -127,8 +128,205 @@ fn factory_wiring() {
     assert!(compcol::factory::names().contains(&"lzma2"));
     assert_eq!(compcol::factory::extension("lzma2"), Some("lzma2"));
     assert!(compcol::factory::decoder_by_name("lzma2").is_some());
-    // Encoder resolves but is an Unsupported stub.
+    // The encoder is now a real working encoder: a factory-built encoder
+    // round-trips through a factory-built decoder.
+    let data = b"factory-routed lzma2 round-trip round-trip round-trip";
     let mut enc = compcol::factory::encoder_by_name("lzma2").expect("encoder present");
-    let mut out = [0u8; 16];
-    assert_eq!(enc.encode(b"x", &mut out), Err(Error::Unsupported));
+    let mut stream = Vec::new();
+    let mut obuf = [0u8; 256];
+    let (p, _) = enc.encode(data, &mut obuf).unwrap();
+    stream.extend_from_slice(&obuf[..p.written]);
+    loop {
+        let (p, st) = enc.finish(&mut obuf).unwrap();
+        stream.extend_from_slice(&obuf[..p.written]);
+        if st == Status::StreamEnd {
+            break;
+        }
+    }
+    let got = decode_all(&stream, DecoderConfig::default(), data.len()).unwrap();
+    assert_eq!(got, data);
+}
+
+/// Encode `data` with the public raw LZMA2 [`Lzma2`] encoder, draining
+/// `output` in `out_chunk`-sized slices to exercise the streaming API.
+fn encode_all(data: &[u8], out_chunk: usize) -> Vec<u8> {
+    let mut enc = Lzma2::encoder_with(());
+    let mut stream = Vec::new();
+    let mut obuf = vec![0u8; out_chunk];
+    let mut consumed = 0;
+    loop {
+        let (p, st) = enc.encode(&data[consumed..], &mut obuf).unwrap();
+        stream.extend_from_slice(&obuf[..p.written]);
+        consumed += p.consumed;
+        match st {
+            Status::InputEmpty => break,
+            Status::OutputFull => {}
+            Status::StreamEnd => unreachable!(),
+        }
+    }
+    loop {
+        let (p, st) = enc.finish(&mut obuf).unwrap();
+        stream.extend_from_slice(&obuf[..p.written]);
+        if st == Status::StreamEnd {
+            break;
+        }
+    }
+    stream
+}
+
+#[test]
+fn encoder_decoder_roundtrip_public() {
+    // Cover the required spread of input shapes through the public API.
+    let zeros = vec![0u8; 130 * 1024];
+    let big: Vec<u8> = (0u32..150_000)
+        .map(|i| (i.wrapping_mul(2654435761) >> 19) as u8)
+        .collect();
+    let mut rnd = vec![0u8; 8192];
+    let mut x = 0x9e37_79b9u32;
+    for b in rnd.iter_mut() {
+        x ^= x << 13;
+        x ^= x >> 17;
+        x ^= x << 5;
+        *b = (x >> 24) as u8;
+    }
+    let cases: Vec<Vec<u8>> = vec![
+        Vec::new(),
+        b"q".to_vec(),
+        b"the quick brown fox jumps over the lazy dog".to_vec(),
+        zeros,
+        big,
+        rnd,
+    ];
+    for data in &cases {
+        for out_chunk in [7usize, 1 << 16] {
+            let stream = encode_all(data, out_chunk);
+            let got = decode_all(&stream, DecoderConfig::default(), data.len()).unwrap();
+            assert_eq!(&got, data, "len={} out_chunk={out_chunk}", data.len());
+        }
+    }
+}
+
+/// Cross-validate framing against the shared `xz` chunk codec: wrap the raw
+/// LZMA2 stream this encoder emits inside a minimal `.xz` container and decode
+/// it with the public `xz` decoder. Because the `xz` and `lzma2` paths share
+/// `lzma2_decoder`, a successful decode proves our chunk framing is exactly
+/// what `xz` consumes. We build the container around our own payload rather
+/// than re-encoding with `xz`, so this exercises *our* bytes.
+#[test]
+#[cfg(feature = "xz")]
+fn xz_cross_validates_framing() {
+    use compcol::xz::Xz;
+
+    fn crc32(data: &[u8]) -> u32 {
+        let mut s = 0xFFFF_FFFFu32;
+        for &b in data {
+            s ^= b as u32;
+            for _ in 0..8 {
+                s = if s & 1 != 0 {
+                    0xEDB8_8320 ^ (s >> 1)
+                } else {
+                    s >> 1
+                };
+            }
+        }
+        s ^ 0xFFFF_FFFF
+    }
+    fn varint(mut v: u64, out: &mut Vec<u8>) {
+        while v >= 0x80 {
+            out.push((v as u8 & 0x7F) | 0x80);
+            v >>= 7;
+        }
+        out.push(v as u8);
+    }
+
+    // Data with both compressible and incompressible regions so the payload
+    // contains compressed (0xE0) and uncompressed (0x01) chunks.
+    let mut data = vec![0u8; 100 * 1024];
+    let mut x = 0x1234_5678u32;
+    for (i, b) in data.iter_mut().enumerate() {
+        if i % 3 == 0 {
+            *b = 0; // compressible runs
+        } else {
+            x ^= x << 13;
+            x ^= x >> 17;
+            x ^= x << 5;
+            *b = (x >> 24) as u8;
+        }
+    }
+
+    // Our raw LZMA2 payload (chunks + 0x00 end marker), unchanged.
+    let payload = encode_all(&data, 1 << 16);
+
+    // ── Stream Header: magic | flags(00,01=CRC32) | CRC32(flags) ──
+    let mut xz = vec![0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00, 0x00, 0x01];
+    xz.extend_from_slice(&crc32(&[0x00, 0x01]).to_le_bytes());
+
+    // ── Block Header: size byte | flags | filter id | props size | dict
+    //    flag (0x14 = 4 MiB) | pad to mult-of-4-minus-CRC | CRC32 ──
+    let mut bh = vec![0x02u8, 0x00, 0x21, 0x01, 0x14, 0x00, 0x00, 0x00];
+    let bh_crc = crc32(&bh).to_le_bytes();
+    bh.extend_from_slice(&bh_crc);
+    let block_header_len = bh.len() as u64;
+    xz.extend_from_slice(&bh);
+
+    // ── Block payload + padding + Check(CRC32 of uncompressed data) ──
+    xz.extend_from_slice(&payload);
+    let compressed_size = payload.len() as u64;
+    let unpadded_no_pad = block_header_len + compressed_size + 4;
+    let pad = ((4 - (unpadded_no_pad % 4)) % 4) as usize;
+    xz.extend(core::iter::repeat_n(0u8, pad));
+    xz.extend_from_slice(&crc32(&data).to_le_bytes());
+
+    // ── Index: 00 | numrec | (unpadded, uncompressed) | pad | CRC32 ──
+    let unpadded_size = block_header_len + compressed_size + 4;
+    let mut idx = vec![0x00u8];
+    varint(1, &mut idx);
+    varint(unpadded_size, &mut idx);
+    varint(data.len() as u64, &mut idx);
+    while idx.len() % 4 != 0 {
+        idx.push(0x00);
+    }
+    let idx_crc = crc32(&idx).to_le_bytes();
+    idx.extend_from_slice(&idx_crc);
+    let index_size = idx.len() as u32;
+    xz.extend_from_slice(&idx);
+
+    // ── Stream Footer: CRC32(body) | backward_size | flags | magic ──
+    let mut footer_body = ((index_size / 4) - 1).to_le_bytes().to_vec();
+    footer_body.push(0x00);
+    footer_body.push(0x01);
+    let f_crc = crc32(&footer_body).to_le_bytes();
+    xz.extend_from_slice(&f_crc);
+    xz.extend_from_slice(&footer_body);
+    xz.extend_from_slice(&[0x59, 0x5A]);
+
+    // Decode the whole thing through the public xz decoder.
+    let mut dec = Xz::decoder_with(());
+    let mut out = vec![0u8; data.len() + 64];
+    let mut consumed = 0;
+    let mut written = 0;
+    loop {
+        let (p, st) = dec.decode(&xz[consumed..], &mut out[written..]).unwrap();
+        consumed += p.consumed;
+        written += p.written;
+        match st {
+            Status::StreamEnd => break,
+            Status::InputEmpty if consumed >= xz.len() => {
+                // Whole container consumed; finish surfaces the trailer end.
+                let (p, fst) = dec.finish(&mut out[written..]).unwrap();
+                written += p.written;
+                assert_eq!(fst, Status::StreamEnd, "xz trailer not terminated");
+                break;
+            }
+            _ => assert!(
+                !(p.consumed == 0 && p.written == 0),
+                "xz decoder stalled — framing mismatch"
+            ),
+        }
+    }
+    out.truncate(written);
+    assert_eq!(
+        out, data,
+        "xz cross-decode of our raw LZMA2 framing mismatched"
+    );
 }