diff --git a/CHANGELOG.md b/CHANGELOG.md index 33bfdae..6f0cc26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- **HTTP/2 HPACK header compression** (RFC 7541) behind the new `hpack` + feature. `compcol::hpack::{HpackEncoder, HpackDecoder}` implement the full + header codec — static + dynamic indexing tables, N-bit-prefix integers, + string literals, and all field representations (indexed, literal + with/without indexing, never-indexed, dynamic-table size update). Validated + byte-for-byte against the RFC 7541 Appendix C worked examples. The §5.2 + string Huffman primitive is also exposed as the `Http2Huffman` codec + (name `h2-huffman`) through the uniform `Encoder`/`Decoder` traits. +- **LHA `-lh2-`** added to the `lha` feature: 8 KiB-window LZSS with adaptive + (dynamic) Huffman for both literals/lengths and match positions. Like `lh1` + it is continuous and size-terminated, so its decoder takes the uncompressed + length via `DecoderConfig::with_len`. Clean-room, round-trip validated. + ## [0.6.0](https://github.com/KarpelesLab/compcol/compare/v0.5.1...v0.6.0) - 2026-06-03 ### Other diff --git a/Cargo.toml b/Cargo.toml index 92b5f45..edc85dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,6 +50,7 @@ all = [ "lha", "bcj", "bcj2", "delta", "arc_crunch", "arc_squeeze", "arc_squash", + "hpack", ] # Enables `alloc`-backed conveniences (e.g. the `factory` module, the # `compcol::vec` one-shot helpers). Pulled in automatically by features @@ -239,6 +240,12 @@ arc_squeeze = ["alloc"] # block-mode CLEAR code and no header byte (no RLE pre-pass). Encoder and # decoder both implemented and validated by round-trip. arc_squash = ["alloc"] +# HTTP/2 HPACK header compression (RFC 7541): static + dynamic indexing +# tables, integer/string coding, and the §5.2 string Huffman code. The +# Huffman primitive is also exposed as the `Http2Huffman` codec +# (name `"h2-huffman"`). The full header codec lives behind its own +# `compcol::hpack` API (it is stateful over header lists, not a byte stream). +hpack = ["alloc"] # `compcol::tokio_io` — async mirrors of compcol::io for the tokio # runtime. Pulls the tokio dependency for its AsyncRead/AsyncWrite # trait definitions; the rest of the crate stays dep-free. diff --git a/README.md b/README.md index 3d3b348..d853c3f 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ flag, and a `compcol` binary turns the library into a Unix-style filter. | Microsoft Xpress (plain LZ77) | `xpress` | `.xpress` | full | full (per [MS-XCA] §2.2) | hand-built fixtures | | Microsoft Xpress Huffman | `xpress_huffman` | `.xph` | full (LZ77 + canonical Huffman) | full (per [MS-XCA] §2.1; used in WIM / CompactOS NTFS) | hand-built fixtures | | LZNT1 (NTFS native compression) | `lznt1` | `.lznt1` | full | full (per [MS-XCA] §2.5; 4 KiB-chunked LZ77, no entropy coding) | hand-built fixtures | -| LHA / LZH (`-lh1-`/`-lh4-`/`-lh5-`/`-lh6-`/`-lh7-`) | `lha` | `.lzh` | full (lh1 adaptive Huffman; lh4/5/6/7 static Huffman) | full (clean-room from Okumura LZHUF / ar002) | own round-trip (no reference fixture) | +| LHA / LZH (`-lh1-`/`-lh2-`/`-lh4-`/`-lh5-`/`-lh6-`/`-lh7-`) | `lha` | `.lzh` | full (lh1/lh2 adaptive Huffman; lh4/5/6/7 static Huffman) | full (clean-room from Okumura LZHUF / ar002) | own round-trip (no reference fixture) | | BCJ branch filters (x86, ARM, ARMT, ARM64, PPC, SPARC, IA-64, RISC-V) | `bcj` | `bcj-` | full (reversible filter) | full | round-trip identity (public-domain LZMA SDK transform) | | BCJ2 (7z 4-stream x86 filter) | `bcj2` | — | `bcj2::encode` (fn API) | `bcj2::decode` (fn API) | round-trip identity (LZMA SDK algorithm) | | Delta filter (distance 1..=256) | `delta` | `delta` | full (reversible filter) | full | round-trip identity | @@ -76,6 +76,13 @@ flag, and a `compcol` binary turns the library into a Unix-style filter. | RAR 2.x | `rar2` | `.rar` | `Unsupported` (license) | full LZ77+Huffman + audio predictor | real rar-2.60 fixtures | | RAR 3.x | `rar3` | `.rar` | `Unsupported` (license) | full LZ77+Huffman + E8 filter; PPMd & VM filters refused | libarchive RAR3 fixtures | | RAR 5.x | `rar5` | `.rar` | `Unsupported` (license) | full LZ77+Huffman + x86 filter; Delta/ARM refused | RARLAB-CLI fixtures | +| HTTP/2 HPACK (RFC 7541) | `hpack` | — | full (header codec + `h2-huffman` string codec) | full (static+dynamic tables, integer/string coding) | RFC 7541 Appendix C vectors | + +HPACK is HTTP/2's header-compression codec, not a byte-stream codec: it +operates on `(name, value)` header lists with per-connection dynamic-table +state, so it lives behind its own `compcol::hpack` API (`HpackEncoder` / +`HpackDecoder`). The §5.2 string Huffman primitive is also exposed as the +`Http2Huffman` codec (name `h2-huffman`) through the uniform trait surface. The RAR encoders are permanently `Unsupported` per RARLAB's unRAR license terms (every clean-room RAR reader — libarchive, The diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 0134bd3..519cd00 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -199,6 +199,13 @@ test = false doc = false bench = false +[[bin]] +name = "decoder_h2_huffman" +path = "fuzz_targets/decoder_h2_huffman.rs" +test = false +doc = false +bench = false + [[bin]] name = "decoder_bcj" path = "fuzz_targets/decoder_bcj.rs" diff --git a/fuzz/fuzz_targets/decoder_h2_huffman.rs b/fuzz/fuzz_targets/decoder_h2_huffman.rs new file mode 100644 index 0000000..f7929ce --- /dev/null +++ b/fuzz/fuzz_targets/decoder_h2_huffman.rs @@ -0,0 +1,20 @@ +#![no_main] +use compcol::hpack::{huffman, HpackDecoder}; +use libfuzzer_sys::fuzz_target; + +// Smoke property: neither the HPACK header decoder nor the standalone +// "h2 huffman" string decoder may panic on arbitrary attacker-controlled +// input. libfuzzer feeds us garbage; any panic/abort trips the harness. +// +// Both are pure whole-buffer transforms (the HPACK header block decoder is +// the primary attack surface — it walks the integer/string/index +// representations), so we just call them and discard the result. +fuzz_target!(|data: &[u8]| { + // HPACK header block: bounded table so a hostile size update can't grow + // state without limit. + let mut dec = HpackDecoder::with_max_table_size(4096); + let _ = dec.decode(data); + + // The §5.2 Huffman string primitive on its own. + let _ = huffman::decode(data); +}); diff --git a/src/factory.rs b/src/factory.rs index f987285..4c172d5 100644 --- a/src/factory.rs +++ b/src/factory.rs @@ -127,6 +127,8 @@ pub fn encoder_by_name(name: &str) -> Option> { #[cfg(feature = "lha")] crate::lha::Lh1::NAME => Some(Box::new(::encoder())), #[cfg(feature = "lha")] + crate::lha::Lh2::NAME => Some(Box::new(::encoder())), + #[cfg(feature = "lha")] crate::lha::Lh4::NAME => Some(Box::new(::encoder())), #[cfg(feature = "lha")] crate::lha::Lh5::NAME => Some(Box::new(::encoder())), @@ -134,6 +136,10 @@ pub fn encoder_by_name(name: &str) -> Option> { crate::lha::Lh6::NAME => Some(Box::new(::encoder())), #[cfg(feature = "lha")] crate::lha::Lh7::NAME => Some(Box::new(::encoder())), + #[cfg(feature = "hpack")] + crate::hpack::Http2Huffman::NAME => Some(Box::new( + ::encoder(), + )), #[cfg(feature = "bcj")] crate::bcj::BcjX86::NAME => Some(Box::new(::encoder())), #[cfg(feature = "bcj")] @@ -359,6 +365,8 @@ pub fn decoder_by_name(name: &str) -> Option> { #[cfg(feature = "lha")] crate::lha::Lh1::NAME => Some(Box::new(::decoder())), #[cfg(feature = "lha")] + crate::lha::Lh2::NAME => Some(Box::new(::decoder())), + #[cfg(feature = "lha")] crate::lha::Lh4::NAME => Some(Box::new(::decoder())), #[cfg(feature = "lha")] crate::lha::Lh5::NAME => Some(Box::new(::decoder())), @@ -366,6 +374,10 @@ pub fn decoder_by_name(name: &str) -> Option> { crate::lha::Lh6::NAME => Some(Box::new(::decoder())), #[cfg(feature = "lha")] crate::lha::Lh7::NAME => Some(Box::new(::decoder())), + #[cfg(feature = "hpack")] + crate::hpack::Http2Huffman::NAME => Some(Box::new( + ::decoder(), + )), #[cfg(feature = "bcj")] crate::bcj::BcjX86::NAME => Some(Box::new(::decoder())), #[cfg(feature = "bcj")] @@ -701,6 +713,9 @@ pub const fn extension(name: &str) -> Option<&'static str> { if str_eq(name, "lh1") && cfg!(feature = "lha") { return Some("lzh"); } + if str_eq(name, "lh2") && cfg!(feature = "lha") { + return Some("lzh"); + } if str_eq(name, "lh4") && cfg!(feature = "lha") { return Some("lzh"); } @@ -861,6 +876,8 @@ pub const fn names() -> &'static [&'static str] { #[cfg(feature = "lha")] crate::lha::Lh1::NAME, #[cfg(feature = "lha")] + crate::lha::Lh2::NAME, + #[cfg(feature = "lha")] crate::lha::Lh4::NAME, #[cfg(feature = "lha")] crate::lha::Lh5::NAME, @@ -868,6 +885,8 @@ pub const fn names() -> &'static [&'static str] { crate::lha::Lh6::NAME, #[cfg(feature = "lha")] crate::lha::Lh7::NAME, + #[cfg(feature = "hpack")] + crate::hpack::Http2Huffman::NAME, #[cfg(feature = "bcj")] crate::bcj::BcjX86::NAME, #[cfg(feature = "bcj")] diff --git a/src/hpack/huffman.rs b/src/hpack/huffman.rs new file mode 100644 index 0000000..cb0851b --- /dev/null +++ b/src/hpack/huffman.rs @@ -0,0 +1,421 @@ +//! HPACK string Huffman coding — RFC 7541 §5.2, table from Appendix B. +//! +//! This is the "h2 huffman" primitive: a fixed canonical Huffman code over +//! 257 symbols (the 256 byte values plus a 30-bit EOS used only for +//! padding). Strings are coded MSB-first; the final byte is padded with the +//! most-significant bits of the EOS code (all `1`s). +//! +//! The codec wrapper [`Http2Huffman`] exposes this primitive through the +//! crate's uniform [`Encoder`](crate::Encoder) / [`Decoder`](crate::Decoder) +//! traits (encode = compress a byte string, decode = expand one). The full +//! HPACK header codec lives in the parent module. +//! +//! Clean-room: the code table is transcribed from RFC 7541 Appendix B. + +extern crate alloc; +use alloc::vec::Vec; + +use crate::error::Error; +use crate::traits::{Algorithm, RawDecoder, RawEncoder, RawProgress}; + +/// Number of real symbols (byte values 0..=255); index 256 is EOS. +const EOS: u16 = 256; + +/// `(code, bit_length)` for symbols 0..=256, transcribed from RFC 7541 +/// Appendix B. Index = symbol; entry 256 is the EOS marker. +#[rustfmt::skip] +pub(crate) const CODES: [(u32, u8); 257] = [ + (0x1ff8, 13), (0x7fffd8, 23), (0xfffffe2, 28), (0xfffffe3, 28), + (0xfffffe4, 28), (0xfffffe5, 28), (0xfffffe6, 28), (0xfffffe7, 28), + (0xfffffe8, 28), (0xffffea, 24), (0x3ffffffc, 30), (0xfffffe9, 28), + (0xfffffea, 28), (0x3ffffffd, 30), (0xfffffeb, 28), (0xfffffec, 28), + (0xfffffed, 28), (0xfffffee, 28), (0xfffffef, 28), (0xffffff0, 28), + (0xffffff1, 28), (0xffffff2, 28), (0x3ffffffe, 30), (0xffffff3, 28), + (0xffffff4, 28), (0xffffff5, 28), (0xffffff6, 28), (0xffffff7, 28), + (0xffffff8, 28), (0xffffff9, 28), (0xffffffa, 28), (0xffffffb, 28), + (0x14, 6), (0x3f8, 10), (0x3f9, 10), (0xffa, 12), + (0x1ff9, 13), (0x15, 6), (0xf8, 8), (0x7fa, 11), + (0x3fa, 10), (0x3fb, 10), (0xf9, 8), (0x7fb, 11), + (0xfa, 8), (0x16, 6), (0x17, 6), (0x18, 6), + (0x0, 5), (0x1, 5), (0x2, 5), (0x19, 6), + (0x1a, 6), (0x1b, 6), (0x1c, 6), (0x1d, 6), + (0x1e, 6), (0x1f, 6), (0x5c, 7), (0xfb, 8), + (0x7ffc, 15), (0x20, 6), (0xffb, 12), (0x3fc, 10), + (0x1ffa, 13), (0x21, 6), (0x5d, 7), (0x5e, 7), + (0x5f, 7), (0x60, 7), (0x61, 7), (0x62, 7), + (0x63, 7), (0x64, 7), (0x65, 7), (0x66, 7), + (0x67, 7), (0x68, 7), (0x69, 7), (0x6a, 7), + (0x6b, 7), (0x6c, 7), (0x6d, 7), (0x6e, 7), + (0x6f, 7), (0x70, 7), (0x71, 7), (0x72, 7), + (0xfc, 8), (0x73, 7), (0xfd, 8), (0x1ffb, 13), + (0x7fff0, 19), (0x1ffc, 13), (0x3ffc, 14), (0x22, 6), + (0x7ffd, 15), (0x3, 5), (0x23, 6), (0x4, 5), + (0x24, 6), (0x5, 5), (0x25, 6), (0x26, 6), + (0x27, 6), (0x6, 5), (0x74, 7), (0x75, 7), + (0x28, 6), (0x29, 6), (0x2a, 6), (0x7, 5), + (0x2b, 6), (0x76, 7), (0x2c, 6), (0x8, 5), + (0x9, 5), (0x2d, 6), (0x77, 7), (0x78, 7), + (0x79, 7), (0x7a, 7), (0x7b, 7), (0x7ffe, 15), + (0x7fc, 11), (0x3ffd, 14), (0x1ffd, 13), (0xffffffc, 28), + (0xfffe6, 20), (0x3fffd2, 22), (0xfffe7, 20), (0xfffe8, 20), + (0x3fffd3, 22), (0x3fffd4, 22), (0x3fffd5, 22), (0x7fffd9, 23), + (0x3fffd6, 22), (0x7fffda, 23), (0x7fffdb, 23), (0x7fffdc, 23), + (0x7fffdd, 23), (0x7fffde, 23), (0xffffeb, 24), (0x7fffdf, 23), + (0xffffec, 24), (0xffffed, 24), (0x3fffd7, 22), (0x7fffe0, 23), + (0xffffee, 24), (0x7fffe1, 23), (0x7fffe2, 23), (0x7fffe3, 23), + (0x7fffe4, 23), (0x1fffdc, 21), (0x3fffd8, 22), (0x7fffe5, 23), + (0x3fffd9, 22), (0x7fffe6, 23), (0x7fffe7, 23), (0xffffef, 24), + (0x3fffda, 22), (0x1fffdd, 21), (0xfffe9, 20), (0x3fffdb, 22), + (0x3fffdc, 22), (0x7fffe8, 23), (0x7fffe9, 23), (0x1fffde, 21), + (0x7fffea, 23), (0x3fffdd, 22), (0x3fffde, 22), (0xfffff0, 24), + (0x1fffdf, 21), (0x3fffdf, 22), (0x7fffeb, 23), (0x7fffec, 23), + (0x1fffe0, 21), (0x1fffe1, 21), (0x3fffe0, 22), (0x1fffe2, 21), + (0x7fffed, 23), (0x3fffe1, 22), (0x7fffee, 23), (0x7fffef, 23), + (0xfffea, 20), (0x3fffe2, 22), (0x3fffe3, 22), (0x3fffe4, 22), + (0x7ffff0, 23), (0x3fffe5, 22), (0x3fffe6, 22), (0x7ffff1, 23), + (0x3ffffe0, 26), (0x3ffffe1, 26), (0xfffeb, 20), (0x7fff1, 19), + (0x3fffe7, 22), (0x7ffff2, 23), (0x3fffe8, 22), (0x1ffffec, 25), + (0x3ffffe2, 26), (0x3ffffe3, 26), (0x3ffffe4, 26), (0x7ffffde, 27), + (0x7ffffdf, 27), (0x3ffffe5, 26), (0xfffff1, 24), (0x1ffffed, 25), + (0x7fff2, 19), (0x1fffe3, 21), (0x3ffffe6, 26), (0x7ffffe0, 27), + (0x7ffffe1, 27), (0x3ffffe7, 26), (0x7ffffe2, 27), (0xfffff2, 24), + (0x1fffe4, 21), (0x1fffe5, 21), (0x3ffffe8, 26), (0x3ffffe9, 26), + (0xffffffd, 28), (0x7ffffe3, 27), (0x7ffffe4, 27), (0x7ffffe5, 27), + (0xfffec, 20), (0xfffff3, 24), (0xfffed, 20), (0x1fffe6, 21), + (0x3fffe9, 22), (0x1fffe7, 21), (0x1fffe8, 21), (0x7ffff3, 23), + (0x3fffea, 22), (0x3fffeb, 22), (0x1ffffee, 25), (0x1ffffef, 25), + (0xfffff4, 24), (0xfffff5, 24), (0x3ffffea, 26), (0x7ffff4, 23), + (0x3ffffeb, 26), (0x7ffffe6, 27), (0x3ffffec, 26), (0x3ffffed, 26), + (0x7ffffe7, 27), (0x7ffffe8, 27), (0x7ffffe9, 27), (0x7ffffea, 27), + (0x7ffffeb, 27), (0xffffffe, 28), (0x7ffffec, 27), (0x7ffffed, 27), + (0x7ffffee, 27), (0x7ffffef, 27), (0x7fffff0, 27), (0x3ffffee, 26), + (0x3fffffff, 30), +]; + +const MAX_LEN: usize = 30; + +/// Canonical decode tables reconstructed from [`CODES`]. Cheap to build +/// (257-entry sweep); built per decode call. +struct DecodeTable { + /// `first_code[len]` = numeric value of the first codeword of length + /// `len` (1..=30). + first_code: [u32; MAX_LEN + 1], + /// `first_index[len]` = offset into `symbols` of the first codeword of + /// length `len`. + first_index: [usize; MAX_LEN + 1], + /// Symbols ordered by (length asc, code asc). + symbols: Vec, + count: [u32; MAX_LEN + 1], +} + +impl DecodeTable { + fn build() -> Self { + let mut count = [0u32; MAX_LEN + 1]; + for &(_, len) in CODES.iter() { + count[len as usize] += 1; + } + // Symbols sorted by length then symbol number. For a canonical code + // (which Appendix B is) that is also code-ascending order. + let mut symbols: Vec = Vec::with_capacity(CODES.len()); + for len in 1..=MAX_LEN { + for (sym, &(_, l)) in CODES.iter().enumerate() { + if l as usize == len { + symbols.push(sym as u16); + } + } + } + let mut first_code = [0u32; MAX_LEN + 1]; + let mut first_index = [0usize; MAX_LEN + 1]; + let mut code = 0u32; + let mut index = 0usize; + for len in 1..=MAX_LEN { + first_code[len] = code; + first_index[len] = index; + code = (code + count[len]) << 1; + index += count[len] as usize; + } + DecodeTable { + first_code, + first_index, + symbols, + count, + } + } + + /// If `acc` (a value of exactly `len` bits) is a complete codeword, + /// return its symbol. + fn lookup(&self, acc: u32, len: usize) -> Option { + let c = self.count[len]; + if c == 0 { + return None; + } + let off = acc.checked_sub(self.first_code[len])?; + if off < c { + Some(self.symbols[self.first_index[len] + off as usize]) + } else { + None + } + } +} + +/// Huffman-encode `data` (RFC 7541 §5.2): each byte's codeword MSB-first, +/// final byte padded with EOS-prefix `1` bits. +pub fn encode(data: &[u8]) -> Vec { + let mut out = Vec::with_capacity(data.len()); + let mut acc: u64 = 0; + let mut nbits: u32 = 0; + for &b in data { + let (code, len) = CODES[b as usize]; + acc = (acc << len) | code as u64; + nbits += len as u32; + while nbits >= 8 { + nbits -= 8; + out.push((acc >> nbits) as u8); + } + } + if nbits > 0 { + // Pad the low (8 - nbits) bits with 1s (the MSBs of EOS). + let pad = 8 - nbits; + let byte = ((acc << pad) | ((1u64 << pad) - 1)) as u8; + out.push(byte); + } + out +} + +/// Number of bytes [`encode`] would produce for `data`, without allocating. +/// Used by the HPACK encoder to choose Huffman vs raw per RFC 7541 §5.2. +pub fn encoded_len(data: &[u8]) -> usize { + let bits: usize = data.iter().map(|&b| CODES[b as usize].1 as usize).sum(); + bits.div_ceil(8) +} + +/// Huffman-decode `data`. Rejects (RFC 7541 §5.2): padding longer than 7 +/// bits, padding not consisting of EOS-prefix `1`s, and any appearance of +/// the EOS symbol — all as [`Error::Corrupt`]. +pub fn decode(data: &[u8]) -> Result, Error> { + let table = DecodeTable::build(); + let mut out = Vec::with_capacity(data.len() * 2); + let mut acc: u32 = 0; + let mut nbits: usize = 0; + for &byte in data { + for i in (0..8).rev() { + let bit = ((byte >> i) & 1) as u32; + acc = (acc << 1) | bit; + nbits += 1; + if nbits > MAX_LEN { + // No codeword is longer than 30 bits. + return Err(Error::Corrupt); + } + if let Some(sym) = table.lookup(acc, nbits) { + if sym == EOS { + return Err(Error::Corrupt); + } + out.push(sym as u8); + acc = 0; + nbits = 0; + } + } + } + // Trailing bits are padding: must be < 8 bits, all 1s. A prefix-free code + // guarantees these EOS-prefix 1s cannot complete a real symbol above. + if nbits >= 8 { + return Err(Error::Corrupt); + } + if nbits > 0 { + let mask = (1u32 << nbits) - 1; + if acc & mask != mask { + return Err(Error::Corrupt); + } + } + Ok(out) +} + +// ─── codec wrapper (uniform Encoder/Decoder surface) ───────────────────── + +/// HTTP/2 HPACK string Huffman coding ([RFC 7541] §5.2) as a standalone +/// compcol codec. `NAME = "h2-huffman"`. +/// +/// Encoding compresses a byte string with the fixed HPACK code; decoding +/// expands one. There is no framing — the whole input is one Huffman string, +/// exactly as it appears inside an HPACK string literal. +/// +/// [RFC 7541]: https://www.rfc-editor.org/rfc/rfc7541 +#[derive(Debug, Clone, Copy, Default)] +pub struct Http2Huffman; + +impl Algorithm for Http2Huffman { + const NAME: &'static str = "h2-huffman"; + type Encoder = Encoder; + type Decoder = Decoder; + type EncoderConfig = (); + type DecoderConfig = (); + fn encoder_with(_: ()) -> Encoder { + Encoder::default() + } + fn decoder_with(_: ()) -> Decoder { + Decoder::default() + } +} + +/// Streaming wrapper that buffers the whole input, then Huffman-encodes it +/// in `finish` and drains the result. (The padding can't be emitted until +/// the input ends, so the transform is whole-buffer.) +#[derive(Debug, Default)] +pub struct Encoder { + input: Vec, + output: Vec, + cursor: usize, + done: bool, +} + +impl RawEncoder for Encoder { + fn raw_encode(&mut self, input: &[u8], _out: &mut [u8]) -> Result { + self.input.extend_from_slice(input); + Ok(RawProgress { + consumed: input.len(), + written: 0, + done: false, + }) + } + + fn raw_finish(&mut self, output: &mut [u8]) -> Result { + if !self.done { + self.output = encode(&self.input); + self.done = true; + } + Ok(drain(&self.output, &mut self.cursor, output)) + } + + fn raw_reset(&mut self) { + self.input.clear(); + self.output.clear(); + self.cursor = 0; + self.done = false; + } +} + +/// Streaming wrapper that buffers the whole input, then Huffman-decodes it +/// in `finish` and drains the result. +#[derive(Debug, Default)] +pub struct Decoder { + input: Vec, + output: Vec, + cursor: usize, + decoded: bool, +} + +impl RawDecoder for Decoder { + fn raw_decode(&mut self, input: &[u8], output: &mut [u8]) -> Result { + if !self.decoded { + self.input.extend_from_slice(input); + return Ok(RawProgress { + consumed: input.len(), + written: 0, + done: false, + }); + } + Ok(drain(&self.output, &mut self.cursor, output)) + } + + fn raw_finish(&mut self, output: &mut [u8]) -> Result { + if !self.decoded { + self.output = decode(&self.input)?; + self.decoded = true; + } + Ok(drain(&self.output, &mut self.cursor, output)) + } + + fn raw_reset(&mut self) { + self.input.clear(); + self.output.clear(); + self.cursor = 0; + self.decoded = false; + } +} + +fn drain(buf: &[u8], cursor: &mut usize, output: &mut [u8]) -> RawProgress { + let remaining = buf.len() - *cursor; + let take = remaining.min(output.len()); + output[..take].copy_from_slice(&buf[*cursor..*cursor + take]); + *cursor += take; + RawProgress { + consumed: 0, + written: take, + done: *cursor >= buf.len(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn table_is_canonical_and_complete() { + // Reconstructing codes from lengths must reproduce the table exactly, + // which both validates the transcription and proves the code is + // canonical (so the decoder's first_code math is correct). + let table = DecodeTable::build(); + assert_eq!(table.symbols.len(), 257); + let mut next = table.first_code; + for &sym in &table.symbols { + let (code, len) = CODES[sym as usize]; + let l = len as usize; + assert_eq!(next[l], code, "symbol {sym} code mismatch"); + next[l] += 1; + } + } + + #[test] + fn rfc_c4_string_vectors() { + // RFC 7541 C.4.1: "www.example.com" → f1e3 c2e5 f23a 6ba0 ab90 f4ff + let enc = encode(b"www.example.com"); + assert_eq!( + enc, + [ + 0xf1, 0xe3, 0xc2, 0xe5, 0xf2, 0x3a, 0x6b, 0xa0, 0xab, 0x90, 0xf4, 0xff + ] + ); + assert_eq!(decode(&enc).unwrap(), b"www.example.com"); + + // C.4.2: "no-cache" → a8eb 1064 9cbf + let enc = encode(b"no-cache"); + assert_eq!(enc, [0xa8, 0xeb, 0x10, 0x64, 0x9c, 0xbf]); + assert_eq!(decode(&enc).unwrap(), b"no-cache"); + + // C.4.3: "custom-key" → 25a8 49e9 5ba9 7d7f + assert_eq!( + encode(b"custom-key"), + [0x25, 0xa8, 0x49, 0xe9, 0x5b, 0xa9, 0x7d, 0x7f] + ); + // C.4.3: "custom-value" → 25a8 49e9 5bb8 e8b4 bf + assert_eq!( + encode(b"custom-value"), + [0x25, 0xa8, 0x49, 0xe9, 0x5b, 0xb8, 0xe8, 0xb4, 0xbf] + ); + } + + #[test] + fn round_trip_all_bytes_and_empty() { + assert_eq!(encode(b""), b""); + assert_eq!(decode(b"").unwrap(), b""); + let all: Vec = (0..=255).collect(); + assert_eq!(decode(&encode(&all)).unwrap(), all); + } + + #[test] + fn eos_symbol_rejected() { + // 30 one-bits = EOS code; as a full byte-aligned input it decodes to + // the EOS symbol and must be rejected. + let bytes = [0xffu8, 0xff, 0xff, 0xff, 0xc0]; // 30 ones + 10 pad ones + // (40 bits: first 30 = EOS) → Corrupt + assert!(matches!(decode(&bytes), Err(Error::Corrupt))); + } + + #[test] + fn bad_padding_rejected() { + // "0" encodes as symbol 48 = 00000 (5 bits); pad with zeros instead of + // ones → invalid padding. + let bad = [0b0000_0000u8]; // 5-bit code 00000 then 000 padding + assert!(matches!(decode(&bad), Err(Error::Corrupt))); + } +} diff --git a/src/hpack/integer.rs b/src/hpack/integer.rs new file mode 100644 index 0000000..b5cf831 --- /dev/null +++ b/src/hpack/integer.rs @@ -0,0 +1,118 @@ +//! HPACK integer representation — RFC 7541 §5.1. +//! +//! HPACK integers carry an `N`-bit prefix that shares its byte with +//! preceding flag bits. Values below `2^N − 1` fit entirely in the prefix; +//! larger values store `2^N − 1` in the prefix and the remainder as a +//! little-endian base-128 continuation (each byte's high bit = "more"). + +extern crate alloc; +use alloc::vec::Vec; + +use crate::error::Error; + +/// Encode `value` with an `n`-bit prefix (`1 <= n <= 8`), OR-ing `flags` +/// (the high `8 - n` bits already positioned) into the prefix byte. +/// +/// Appends to `out`. +pub fn encode_int(out: &mut Vec, value: usize, n: u32, flags: u8) { + debug_assert!((1..=8).contains(&n)); + let max_prefix = (1usize << n) - 1; + if value < max_prefix { + out.push(flags | value as u8); + return; + } + out.push(flags | max_prefix as u8); + let mut v = value - max_prefix; + while v >= 128 { + out.push((v & 0x7f) as u8 | 0x80); + v >>= 7; + } + out.push(v as u8); +} + +/// Decode an `n`-bit-prefix integer starting at `buf[pos]` (`1 <= n <= 8`). +/// +/// Returns `(value, next_pos)`. The prefix flag bits above the low `n` bits +/// of `buf[pos]` are ignored (the caller already dispatched on them). +/// +/// Rejects continuations that would overflow `usize` or run past the buffer +/// (`Error::Corrupt` / `Error::UnexpectedEnd`) — the standard HPACK +/// integer-overflow guard. +pub fn decode_int(buf: &[u8], pos: usize, n: u32) -> Result<(usize, usize), Error> { + debug_assert!((1..=8).contains(&n)); + let max_prefix = (1usize << n) - 1; + let first = *buf.get(pos).ok_or(Error::UnexpectedEnd)? as usize; + let mut value = first & max_prefix; + let mut p = pos + 1; + if value < max_prefix { + return Ok((value, p)); + } + // Continuation: base-128, high bit = "more". Cap the shift so a crafted + // run of 0x80 bytes can't spin or overflow. + let mut shift = 0u32; + loop { + let b = *buf.get(p).ok_or(Error::UnexpectedEnd)? as usize; + p += 1; + // 7 fresh bits at `shift`; guard against usize overflow. + if shift >= usize::BITS { + return Err(Error::Corrupt); + } + let add = (b & 0x7f).checked_shl(shift).ok_or(Error::Corrupt)?; + value = value.checked_add(add).ok_or(Error::Corrupt)?; + if b & 0x80 == 0 { + break; + } + shift += 7; + } + Ok((value, p)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn rfc_c1_examples() { + // C.1.1: 10 with a 5-bit prefix → 0x0a + let mut out = Vec::new(); + encode_int(&mut out, 10, 5, 0); + assert_eq!(out, [0x0a]); + assert_eq!(decode_int(&out, 0, 5).unwrap(), (10, 1)); + + // C.1.2: 1337 with a 5-bit prefix → 0x1f 0x9a 0x0a + let mut out = Vec::new(); + encode_int(&mut out, 1337, 5, 0); + assert_eq!(out, [0x1f, 0x9a, 0x0a]); + assert_eq!(decode_int(&out, 0, 5).unwrap(), (1337, 3)); + + // C.1.3: 42 with an 8-bit prefix → 0x2a + let mut out = Vec::new(); + encode_int(&mut out, 42, 8, 0); + assert_eq!(out, [0x2a]); + assert_eq!(decode_int(&out, 0, 8).unwrap(), (42, 1)); + } + + #[test] + fn flags_preserved_and_ignored() { + let mut out = Vec::new(); + encode_int(&mut out, 2, 6, 0b1100_0000); + assert_eq!(out, [0b1100_0010]); + // decode ignores the two high flag bits + assert_eq!(decode_int(&out, 0, 6).unwrap(), (2, 1)); + } + + #[test] + fn overlong_continuation_rejected() { + // Many 0x80 bytes never terminate within usize → Corrupt, not a hang. + let mut buf = alloc::vec![0xffu8]; // 5-bit prefix max + buf.extend(core::iter::repeat_n(0x80, 64)); + buf.push(0x00); + assert!(matches!(decode_int(&buf, 0, 5), Err(Error::Corrupt))); + } + + #[test] + fn truncated_continuation_rejected() { + let buf = [0x1fu8, 0x9a]; // promises more but ends + assert!(matches!(decode_int(&buf, 0, 5), Err(Error::UnexpectedEnd))); + } +} diff --git a/src/hpack/mod.rs b/src/hpack/mod.rs new file mode 100644 index 0000000..b27574b --- /dev/null +++ b/src/hpack/mod.rs @@ -0,0 +1,338 @@ +//! HTTP/2 HPACK header compression — [RFC 7541]. +//! +//! HPACK compresses an ordered list of `(name, value)` header fields against +//! a static table (61 common fields) and a per-connection dynamic table. +//! Unlike the byte-stream codecs elsewhere in this crate, an HPACK codec is +//! **stateful across header blocks** (the dynamic table evolves) and operates +//! on header *lists*, not a byte stream — so it has its own API +//! ([`HpackEncoder`] / [`HpackDecoder`]) rather than the +//! [`Encoder`](crate::Encoder) / [`Decoder`](crate::Decoder) traits. +//! +//! The string-literal Huffman coding (§5.2) is also exposed on its own as the +//! [`Http2Huffman`] codec (name `"h2-huffman"`), which *does* use the uniform +//! trait surface. +//! +//! ``` +//! use compcol::hpack::{HpackEncoder, HpackDecoder, HeaderField}; +//! +//! let mut enc = HpackEncoder::new(); +//! let mut dec = HpackDecoder::new(); +//! let block = enc.encode(&[ +//! HeaderField::new(b":method", b"GET"), +//! HeaderField::new(b"custom", b"value"), +//! ]); +//! let out = dec.decode(&block).unwrap(); +//! assert_eq!(out[0].name, b":method"); +//! assert_eq!(out[1].value, b"value"); +//! ``` +//! +//! Clean-room from RFC 7541 (the static/Huffman tables are transcribed from +//! its appendices). +//! +//! [RFC 7541]: https://www.rfc-editor.org/rfc/rfc7541 + +#![cfg_attr(docsrs, doc(cfg(feature = "hpack")))] + +extern crate alloc; +use alloc::vec::Vec; + +use crate::error::Error; + +pub mod huffman; +mod integer; +mod table; + +pub use huffman::Http2Huffman; + +use integer::{decode_int, encode_int}; +use table::DynamicTable; + +/// HTTP/2's protocol-default dynamic table size +/// (`SETTINGS_HEADER_TABLE_SIZE`, RFC 7540 §6.5.2). +pub const DEFAULT_TABLE_SIZE: usize = 4096; + +/// A decoded/encodable header field. +/// +/// `sensitive` marks a field that must never be indexed (RFC 7541 §7.1.3 — +/// e.g. `cookie`/`authorization`); the encoder emits it as "literal never +/// indexed" and never places it in the dynamic table. The decoder sets it on +/// fields received with that representation. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct HeaderField { + pub name: Vec, + pub value: Vec, + pub sensitive: bool, +} + +impl HeaderField { + /// A non-sensitive field. + pub fn new(name: &[u8], value: &[u8]) -> Self { + HeaderField { + name: name.to_vec(), + value: value.to_vec(), + sensitive: false, + } + } + + /// A sensitive field (never indexed). + pub fn sensitive(name: &[u8], value: &[u8]) -> Self { + HeaderField { + name: name.to_vec(), + value: value.to_vec(), + sensitive: true, + } + } +} + +// ─── encoder ───────────────────────────────────────────────────────────── + +/// HPACK encoder. Holds the dynamic table across [`encode`](Self::encode) +/// calls, one call per header block. +#[derive(Debug)] +pub struct HpackEncoder { + table: DynamicTable, + use_huffman: bool, + pending_size_update: Option, +} + +impl Default for HpackEncoder { + fn default() -> Self { + Self::new() + } +} + +impl HpackEncoder { + /// New encoder with the protocol-default 4096-byte dynamic table and + /// Huffman string coding enabled. + pub fn new() -> Self { + HpackEncoder { + table: DynamicTable::new(DEFAULT_TABLE_SIZE), + use_huffman: true, + pending_size_update: None, + } + } + + /// New encoder whose dynamic table is bounded to `max` bytes. A + /// dynamic-table-size-update (§6.3) is emitted at the start of the next + /// header block so the peer's decoder tracks the same bound. + pub fn with_max_table_size(max: usize) -> Self { + HpackEncoder { + table: DynamicTable::new(max), + use_huffman: true, + pending_size_update: Some(max), + } + } + + /// Enable/disable Huffman coding of string literals (default on). When + /// off, strings are emitted raw; when on, the shorter of Huffman/raw is + /// chosen per string (§5.2). + pub fn set_huffman(&mut self, on: bool) { + self.use_huffman = on; + } + + /// Encode one header block. + pub fn encode(&mut self, fields: &[HeaderField]) -> Vec { + let mut out = Vec::new(); + if let Some(max) = self.pending_size_update.take() { + // §6.3: 001 pattern, 5-bit prefix. + encode_int(&mut out, max, 5, 0x20); + } + for f in fields { + self.encode_field(&mut out, f); + } + out + } + + fn encode_field(&mut self, out: &mut Vec, f: &HeaderField) { + if f.sensitive { + // §6.2.3 literal never indexed (0001 pattern, 4-bit name index). + let name_idx = self.table.find(&f.name, &f.value).map(|m| m.index); + // Use only a name match (never index, so a value match is moot). + let name_idx = match name_idx { + Some(i) if self.table.get(i).map(|(n, _)| n == f.name).unwrap_or(false) => Some(i), + _ => None, + }; + self.emit_literal(out, 0x10, 4, name_idx, &f.name, &f.value); + return; + } + match self.table.find(&f.name, &f.value) { + Some(m) if m.value_matched => { + // §6.1 indexed header field (1 pattern, 7-bit index). + encode_int(out, m.index, 7, 0x80); + } + Some(m) => { + // Name match → §6.2.1 literal with incremental indexing. + self.emit_literal(out, 0x40, 6, Some(m.index), &f.name, &f.value); + self.table.insert(&f.name, &f.value); + } + None => { + self.emit_literal(out, 0x40, 6, None, &f.name, &f.value); + self.table.insert(&f.name, &f.value); + } + } + } + + fn emit_literal( + &self, + out: &mut Vec, + pattern: u8, + prefix: u32, + name_idx: Option, + name: &[u8], + value: &[u8], + ) { + match name_idx { + Some(i) => encode_int(out, i, prefix, pattern), + None => { + encode_int(out, 0, prefix, pattern); + self.emit_string(out, name); + } + } + self.emit_string(out, value); + } + + fn emit_string(&self, out: &mut Vec, s: &[u8]) { + if self.use_huffman && huffman::encoded_len(s) < s.len() { + let coded = huffman::encode(s); + encode_int(out, coded.len(), 7, 0x80); // H flag set + out.extend_from_slice(&coded); + } else { + encode_int(out, s.len(), 7, 0x00); + out.extend_from_slice(s); + } + } +} + +// ─── decoder ───────────────────────────────────────────────────────────── + +/// HPACK decoder. Holds the dynamic table across [`decode`](Self::decode) +/// calls, one call per header block. +#[derive(Debug)] +pub struct HpackDecoder { + table: DynamicTable, + /// Connection limit on the dynamic table size: a peer size-update may not + /// exceed this (§6.3). + size_limit: usize, +} + +impl Default for HpackDecoder { + fn default() -> Self { + Self::new() + } +} + +impl HpackDecoder { + /// New decoder with the protocol-default 4096-byte dynamic table. + pub fn new() -> Self { + Self::with_max_table_size(DEFAULT_TABLE_SIZE) + } + + /// New decoder whose dynamic table (and the size-update ceiling) is + /// `max` bytes. + pub fn with_max_table_size(max: usize) -> Self { + HpackDecoder { + table: DynamicTable::new(max), + size_limit: max, + } + } + + /// Decode one header block into its field list. Returns [`Error::Corrupt`] + /// on any malformed representation, bad table index, or an over-limit + /// size update; [`Error::UnexpectedEnd`] on truncation. + pub fn decode(&mut self, block: &[u8]) -> Result, Error> { + let mut fields = Vec::new(); + let mut pos = 0; + while pos < block.len() { + let b = block[pos]; + if b & 0x80 != 0 { + // §6.1 indexed header field. + let (idx, np) = decode_int(block, pos, 7)?; + pos = np; + if idx == 0 { + return Err(Error::Corrupt); + } + let (n, v) = self.table.get(idx).ok_or(Error::Corrupt)?; + fields.push(HeaderField::new(n, v)); + } else if b & 0x40 != 0 { + // §6.2.1 literal with incremental indexing. + let (name, value, np) = self.read_literal(block, pos, 6)?; + pos = np; + self.table.insert(&name, &value); + fields.push(HeaderField { + name, + value, + sensitive: false, + }); + } else if b & 0x20 != 0 { + // §6.3 dynamic table size update. + let (new_max, np) = decode_int(block, pos, 5)?; + pos = np; + if new_max > self.size_limit { + return Err(Error::Corrupt); + } + self.table.set_max_size(new_max); + } else { + // §6.2.2 (without indexing) or §6.2.3 (never indexed). Both + // have a 4-bit prefix; bit 0x10 distinguishes "never indexed". + let sensitive = b & 0x10 != 0; + let (name, value, np) = self.read_literal(block, pos, 4)?; + pos = np; + fields.push(HeaderField { + name, + value, + sensitive, + }); + } + } + Ok(fields) + } + + /// Read a literal field's name (indexed or string) and value (string) + /// starting at `pos`. `prefix` is the index field's prefix width. + fn read_literal( + &self, + block: &[u8], + pos: usize, + prefix: u32, + ) -> Result<(Vec, Vec, usize), Error> { + let (idx, mut p) = decode_int(block, pos, prefix)?; + let name = if idx == 0 { + let (n, np) = read_string(block, p)?; + p = np; + n + } else { + let (n, _) = self.table.get(idx).ok_or(Error::Corrupt)?; + n.to_vec() + }; + let (value, np) = read_string(block, p)?; + Ok((name, value, np)) + } + + /// Current dynamic table size limit (for tests/inspection). + #[cfg(test)] + pub(crate) fn table_max_size(&self) -> usize { + self.table.max_size() + } +} + +/// Read an HPACK string literal (§5.2) at `pos`: H-flagged 7-bit length, then +/// that many octets, Huffman-decoded if H was set. +fn read_string(block: &[u8], pos: usize) -> Result<(Vec, usize), Error> { + let first = *block.get(pos).ok_or(Error::UnexpectedEnd)?; + let huff = first & 0x80 != 0; + let (len, p) = decode_int(block, pos, 7)?; + let end = p.checked_add(len).ok_or(Error::Corrupt)?; + if end > block.len() { + return Err(Error::UnexpectedEnd); + } + let raw = &block[p..end]; + let data = if huff { + huffman::decode(raw)? + } else { + raw.to_vec() + }; + Ok((data, end)) +} + +#[cfg(test)] +mod tests; diff --git a/src/hpack/table.rs b/src/hpack/table.rs new file mode 100644 index 0000000..b1d7ed0 --- /dev/null +++ b/src/hpack/table.rs @@ -0,0 +1,195 @@ +//! HPACK index address space — RFC 7541 §2.3. +//! +//! A single index space overlays the static table (indices 1..=61, RFC 7541 +//! Appendix A) and the dynamic table (indices 62.., newest entry first, +//! §2.3.3). The dynamic table is a FIFO with byte-size accounting and +//! eviction (§4). + +extern crate alloc; +use alloc::collections::VecDeque; +use alloc::vec::Vec; + +/// Static table (RFC 7541 Appendix A), 61 `(name, value)` entries. Index 1 +/// is `STATIC_TABLE[0]`. +#[rustfmt::skip] +pub(crate) const STATIC_TABLE: [(&[u8], &[u8]); 61] = [ + (b":authority", b""), + (b":method", b"GET"), + (b":method", b"POST"), + (b":path", b"/"), + (b":path", b"/index.html"), + (b":scheme", b"http"), + (b":scheme", b"https"), + (b":status", b"200"), + (b":status", b"204"), + (b":status", b"206"), + (b":status", b"304"), + (b":status", b"400"), + (b":status", b"404"), + (b":status", b"500"), + (b"accept-charset", b""), + (b"accept-encoding", b"gzip, deflate"), + (b"accept-language", b""), + (b"accept-ranges", b""), + (b"accept", b""), + (b"access-control-allow-origin", b""), + (b"age", b""), + (b"allow", b""), + (b"authorization", b""), + (b"cache-control", b""), + (b"content-disposition", b""), + (b"content-encoding", b""), + (b"content-language", b""), + (b"content-length", b""), + (b"content-location", b""), + (b"content-range", b""), + (b"content-type", b""), + (b"cookie", b""), + (b"date", b""), + (b"etag", b""), + (b"expect", b""), + (b"expires", b""), + (b"from", b""), + (b"host", b""), + (b"if-match", b""), + (b"if-modified-since", b""), + (b"if-none-match", b""), + (b"if-range", b""), + (b"if-unmodified-since", b""), + (b"last-modified", b""), + (b"link", b""), + (b"location", b""), + (b"max-forwards", b""), + (b"proxy-authenticate", b""), + (b"proxy-authorization", b""), + (b"range", b""), + (b"referer", b""), + (b"refresh", b""), + (b"retry-after", b""), + (b"server", b""), + (b"set-cookie", b""), + (b"strict-transport-security", b""), + (b"transfer-encoding", b""), + (b"user-agent", b""), + (b"vary", b""), + (b"via", b""), + (b"www-authenticate", b""), +]; + +/// Number of static entries; the first dynamic index is `STATIC_LEN + 1`. +pub(crate) const STATIC_LEN: usize = STATIC_TABLE.len(); + +/// Per-entry overhead added to name+value lengths for size accounting +/// (RFC 7541 §4.1). +const ENTRY_OVERHEAD: usize = 32; + +/// A reference to an index in the combined static+dynamic space, plus +/// whether the value (not just the name) matched. +pub(crate) struct Match { + pub index: usize, + pub value_matched: bool, +} + +/// HPACK dynamic table: newest entry at the front. Size is bounded by +/// `max_size` (the connection's table-size limit); inserting evicts from the +/// back until the new entry fits (§4.4). +#[derive(Debug)] +pub(crate) struct DynamicTable { + entries: VecDeque<(Vec, Vec)>, + size: usize, + max_size: usize, +} + +impl DynamicTable { + pub fn new(max_size: usize) -> Self { + DynamicTable { + entries: VecDeque::new(), + size: 0, + max_size, + } + } + + #[cfg(test)] + pub fn max_size(&self) -> usize { + self.max_size + } + + /// Apply a dynamic-table size update (§6.3), evicting as needed. + pub fn set_max_size(&mut self, new_max: usize) { + self.max_size = new_max; + self.evict_to_fit(0); + } + + fn entry_size(name: &[u8], value: &[u8]) -> usize { + name.len() + value.len() + ENTRY_OVERHEAD + } + + fn evict_to_fit(&mut self, incoming: usize) { + while self.size + incoming > self.max_size { + match self.entries.pop_back() { + Some((n, v)) => self.size -= Self::entry_size(&n, &v), + None => break, + } + } + } + + /// Insert a new entry at the front (§4.4). If it is larger than the whole + /// table, the table ends up empty (the spec result of evicting everything). + pub fn insert(&mut self, name: &[u8], value: &[u8]) { + let need = Self::entry_size(name, value); + self.evict_to_fit(need); + if need <= self.max_size { + self.entries.push_front((name.to_vec(), value.to_vec())); + self.size += need; + } + } + + /// Look up a 1-based combined index. Returns `(name, value)`. + pub fn get(&self, index: usize) -> Option<(&[u8], &[u8])> { + if index == 0 { + return None; + } + if index <= STATIC_LEN { + let (n, v) = STATIC_TABLE[index - 1]; + return Some((n, v)); + } + let dyn_pos = index - STATIC_LEN - 1; + self.entries + .get(dyn_pos) + .map(|(n, v)| (n.as_slice(), v.as_slice())) + } + + /// Find the best index for `(name, value)`: prefer a full name+value + /// match, falling back to a name-only match. Searches the static table + /// first (so the canonical low indices win), then the dynamic table. + pub fn find(&self, name: &[u8], value: &[u8]) -> Option { + let mut name_only: Option = None; + for (i, (n, v)) in STATIC_TABLE.iter().enumerate() { + if *n == name { + if *v == value { + return Some(Match { + index: i + 1, + value_matched: true, + }); + } + name_only.get_or_insert(i + 1); + } + } + for (pos, (n, v)) in self.entries.iter().enumerate() { + if n.as_slice() == name { + let index = STATIC_LEN + 1 + pos; + if v.as_slice() == value { + return Some(Match { + index, + value_matched: true, + }); + } + name_only.get_or_insert(index); + } + } + name_only.map(|index| Match { + index, + value_matched: false, + }) + } +} diff --git a/src/hpack/tests.rs b/src/hpack/tests.rs new file mode 100644 index 0000000..23f36ae --- /dev/null +++ b/src/hpack/tests.rs @@ -0,0 +1,212 @@ +//! RFC 7541 Appendix C worked-example vectors + round-trip / error tests. + +use super::*; +use alloc::vec; + +fn f(name: &[u8], value: &[u8]) -> HeaderField { + HeaderField::new(name, value) +} + +/// RFC 7541 C.3 — request sequence WITHOUT Huffman. Asserts byte-exact +/// encoding (with the dynamic table evolving across the three blocks) and +/// that a decoder reproduces the field lists. +#[test] +fn rfc_c3_request_sequence_raw() { + let mut enc = HpackEncoder::new(); + enc.set_huffman(false); + let mut dec = HpackDecoder::new(); + + // C.3.1 + let req1 = [ + f(b":method", b"GET"), + f(b":scheme", b"http"), + f(b":path", b"/"), + f(b":authority", b"www.example.com"), + ]; + let b1 = enc.encode(&req1); + assert_eq!( + b1, + [ + 0x82, 0x86, 0x84, 0x41, 0x0f, 0x77, 0x77, 0x77, 0x2e, 0x65, 0x78, 0x61, 0x6d, 0x70, + 0x6c, 0x65, 0x2e, 0x63, 0x6f, 0x6d + ] + ); + assert_eq!(dec.decode(&b1).unwrap(), req1); + + // C.3.2 + let req2 = [ + f(b":method", b"GET"), + f(b":scheme", b"http"), + f(b":path", b"/"), + f(b":authority", b"www.example.com"), + f(b"cache-control", b"no-cache"), + ]; + let b2 = enc.encode(&req2); + assert_eq!( + b2, + [ + 0x82, 0x86, 0x84, 0xbe, 0x58, 0x08, 0x6e, 0x6f, 0x2d, 0x63, 0x61, 0x63, 0x68, 0x65 + ] + ); + assert_eq!(dec.decode(&b2).unwrap(), req2); + + // C.3.3 + let req3 = [ + f(b":method", b"GET"), + f(b":scheme", b"https"), + f(b":path", b"/index.html"), + f(b":authority", b"www.example.com"), + f(b"custom-key", b"custom-value"), + ]; + let b3 = enc.encode(&req3); + assert_eq!( + b3, + [ + 0x82, 0x87, 0x85, 0xbf, 0x40, 0x0a, 0x63, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x2d, 0x6b, + 0x65, 0x79, 0x0c, 0x63, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x2d, 0x76, 0x61, 0x6c, 0x75, + 0x65 + ] + ); + assert_eq!(dec.decode(&b3).unwrap(), req3); +} + +/// RFC 7541 C.4 — the same request sequence WITH Huffman string coding. +#[test] +fn rfc_c4_request_sequence_huffman() { + let mut enc = HpackEncoder::new(); // Huffman on by default + let mut dec = HpackDecoder::new(); + + let req1 = [ + f(b":method", b"GET"), + f(b":scheme", b"http"), + f(b":path", b"/"), + f(b":authority", b"www.example.com"), + ]; + let b1 = enc.encode(&req1); + assert_eq!( + b1, + [ + 0x82, 0x86, 0x84, 0x41, 0x8c, 0xf1, 0xe3, 0xc2, 0xe5, 0xf2, 0x3a, 0x6b, 0xa0, 0xab, + 0x90, 0xf4, 0xff + ] + ); + assert_eq!(dec.decode(&b1).unwrap(), req1); + + let req2 = [ + f(b":method", b"GET"), + f(b":scheme", b"http"), + f(b":path", b"/"), + f(b":authority", b"www.example.com"), + f(b"cache-control", b"no-cache"), + ]; + let b2 = enc.encode(&req2); + assert_eq!( + b2, + [ + 0x82, 0x86, 0x84, 0xbe, 0x58, 0x86, 0xa8, 0xeb, 0x10, 0x64, 0x9c, 0xbf + ] + ); + assert_eq!(dec.decode(&b2).unwrap(), req2); + + let req3 = [ + f(b":method", b"GET"), + f(b":scheme", b"https"), + f(b":path", b"/index.html"), + f(b":authority", b"www.example.com"), + f(b"custom-key", b"custom-value"), + ]; + let b3 = enc.encode(&req3); + assert_eq!( + b3, + [ + 0x82, 0x87, 0x85, 0xbf, 0x40, 0x88, 0x25, 0xa8, 0x49, 0xe9, 0x5b, 0xa9, 0x7d, 0x7f, + 0x89, 0x25, 0xa8, 0x49, 0xe9, 0x5b, 0xb8, 0xe8, 0xb4, 0xbf + ] + ); + assert_eq!(dec.decode(&b3).unwrap(), req3); +} + +/// Dynamic-table eviction at a small bound (the scenario RFC 7541 C.5/C.6 +/// exercise): each response entry is ~63–98 bytes, so a 256-byte table holds +/// at most a few. Verified behaviorally through a shared encoder/decoder. +#[test] +fn eviction_at_table_size_256() { + let mut enc = HpackEncoder::with_max_table_size(256); + let mut dec = HpackDecoder::with_max_table_size(256); + + let resp = [ + f(b":status", b"302"), + f(b"cache-control", b"private"), + f(b"date", b"Mon, 21 Oct 2013 20:13:21 GMT"), + f(b"location", b"https://www.example.com"), + ]; + let block = enc.encode(&resp); + // First byte is the queued size update (001 prefix → 0x3f continuation + // since 256 > 31): 0x3f 0xe1 0x01. + assert_eq!(&block[..3], &[0x3f, 0xe1, 0x01]); + assert_eq!(dec.decode(&block).unwrap(), resp); + assert_eq!(dec.table_max_size(), 256); + + // A second, identical response must still round-trip after eviction + // churn (entries indexed from the dynamic table where they survived). + let block2 = enc.encode(&resp); + assert_eq!(dec.decode(&block2).unwrap(), resp); +} + +#[test] +fn round_trip_many_fields() { + let mut enc = HpackEncoder::new(); + let mut dec = HpackDecoder::new(); + let fields: vec::Vec = (0..50) + .map(|i| { + let name = alloc::format!("x-header-{i}"); + let val = alloc::format!("value-{}-{}", i, "blahblah".repeat(i % 4)); + f(name.as_bytes(), val.as_bytes()) + }) + .collect(); + let block = enc.encode(&fields); + assert_eq!(dec.decode(&block).unwrap(), fields); +} + +#[test] +fn sensitive_field_never_indexed() { + let mut enc = HpackEncoder::new(); + let mut dec = HpackDecoder::new(); + let fields = [HeaderField::sensitive(b"authorization", b"secret-token")]; + let block = enc.encode(&fields); + // 0001 pattern with name index 23 (authorization) → 0x10 | 23 = 0x1f... + // (23 < 15? no: 4-bit prefix max is 15, so 0x1f then continuation 0x08). + assert_eq!(block[0], 0x1f); + let out = dec.decode(&block).unwrap(); + assert_eq!(out, fields); + assert!(out[0].sensitive); +} + +#[test] +fn decode_rejects_bad_index() { + let mut dec = HpackDecoder::new(); + // Indexed header field, index 0 → invalid. + assert!(matches!(dec.decode(&[0x80]), Err(Error::Corrupt))); + // Indexed header field, index 99 (no such dynamic entry) → invalid. + assert!(matches!(dec.decode(&[0xe3]), Err(Error::Corrupt))); +} + +#[test] +fn decode_rejects_oversized_size_update() { + let mut dec = HpackDecoder::with_max_table_size(256); + // Size update to 4096 (> 256 connection limit): 0x3f 0xe1 0x1f. + assert!(matches!( + dec.decode(&[0x3f, 0xe1, 0x1f]), + Err(Error::Corrupt) + )); +} + +#[test] +fn decode_rejects_truncated_string() { + let mut dec = HpackDecoder::new(); + // Literal, new name, length 5 but only 2 bytes follow. + assert!(matches!( + dec.decode(&[0x40, 0x05, b'a', b'b']), + Err(Error::UnexpectedEnd) + )); +} diff --git a/src/lha/dynamic_huff.rs b/src/lha/dynamic_huff.rs new file mode 100644 index 0000000..2c3ad34 --- /dev/null +++ b/src/lha/dynamic_huff.rs @@ -0,0 +1,448 @@ +//! LHA -lh2- : LZSS + dynamic (adaptive) Huffman over an 8 KiB window. +//! +//! `-lh2-` is the transitional LHA method: it keeps the adaptive-Huffman +//! literal/length coding of [`lh1`](super::lzhuf) but enlarges the window to +//! 8 KiB (13-bit positions, matches up to 256 bytes) **and** codes the match +//! *position* with a second adaptive Huffman tree rather than lh1's fixed +//! position buckets. Both sides update both trees identically after every +//! symbol, so no tables are carried in the stream. +//! +//! Coding, per symbol: +//! - A literal/length symbol from the **char tree** (alphabet `NC` = 510: +//! 256 byte literals + 254 length codes for match lengths 3..=256). Symbol +//! `< 256` is a literal; `256 + (len - MIN_MATCH)` is a match of that length. +//! - For a match, a **position-class** symbol `p` from the **position tree** +//! (alphabet `NP` = 14), where `p` is the number of significant bits of the +//! ring distance-1. For `p >= 2`, the low `p - 1` bits follow raw. +//! +//! Like `lh1`, the raw `-lh2-` stream is continuous and size-terminated (no +//! in-band end marker), so the decoder needs the uncompressed length out of +//! band via [`DecoderConfig::with_len`](super::DecoderConfig::with_len). +//! +//! Clean-room from the public LHA dynamic-Huffman *description* (the adaptive +//! sibling-property tree is Okumura's documented, public-domain `reconst` / +//! `update` procedure; the 8 KiB window, 256-byte match limit, and dynamic +//! position-class tree are reproduced from the format description, not copied +//! from any licensed source). Validated by this crate's own encoder/decoder +//! round-trip — there is no public `-lh2-` reference fixture (no mainstream +//! tool emits the method), so bit-exact interop with archives in the wild is +//! best-effort. + +extern crate alloc; +use alloc::vec; +use alloc::vec::Vec; + +use crate::error::Error; +use crate::lha::bits::{BitReader, BitWriter}; + +// ─── LZSS parameters (8 KiB window) ────────────────────────────────────── + +const DICBIT: usize = 13; +const N: usize = 1 << DICBIT; // 8192 ring buffer +const MIN_MATCH: usize = 3; +const MAXMATCH: usize = 256; +/// Char-tree alphabet: 256 literals + (MAXMATCH - MIN_MATCH + 1) length codes. +const NC: usize = 256 + (MAXMATCH - MIN_MATCH + 1); // 510 +/// Position-tree alphabet: bit-length classes 0..=DICBIT. +const NP: usize = DICBIT + 1; // 14 + +const MAX_FREQ: u32 = 0x8000; // adaptive-tree rebuild threshold + +// ─── generalized adaptive Huffman tree ─────────────────────────────────── + +/// Adaptive Huffman tree (Okumura's sibling-property layout, parameterized by +/// alphabet size). `son[v]` is the left child of internal node `v`, the right +/// child is `son[v] + 1`; a node is a leaf when `son[v] >= t`, with symbol +/// `son[v] - t`. `prnt[v]` is the parent; `prnt[symbol + t]` locates a leaf. +/// +/// This is the same machinery as [`super::lzhuf::Tree`] but with a runtime +/// alphabet size so both the 510-symbol char tree and the 14-symbol position +/// tree share one implementation. +struct AdaptiveTree { + t: usize, // 2 * n_char - 1 internal+leaf nodes + root: usize, // t - 1 + n_char: usize, + freq: Vec, + son: Vec, + prnt: Vec, +} + +impl AdaptiveTree { + fn new(n_char: usize) -> Self { + let t = n_char * 2 - 1; + let root = t - 1; + let mut freq = vec![0u32; t + 1]; + let mut son = vec![0usize; t]; + let mut prnt = vec![0usize; t + n_char]; + + for i in 0..n_char { + freq[i] = 1; + son[i] = i + t; + prnt[i + t] = i; + } + let mut i = 0usize; + let mut j = n_char; + while j <= root { + freq[j] = freq[i] + freq[i + 1]; + son[j] = i; + prnt[i] = j; + prnt[i + 1] = j; + i += 2; + j += 1; + } + freq[t] = 0xFFFF; + prnt[root] = 0; + Self { + t, + root, + n_char, + freq, + son, + prnt, + } + } + + fn reconstruct(&mut self) { + let t = self.t; + let n_char = self.n_char; + // Collect leaves to the front, halving frequencies. + let mut j = 0usize; + for i in 0..t { + if self.son[i] >= t { + self.freq[j] = self.freq[i].div_ceil(2); + self.son[j] = self.son[i]; + j += 1; + } + } + // Rebuild internal nodes, inserting each pair-sum at its sorted spot. + let mut i = 0usize; + for j in n_char..t { + let f = self.freq[i] + self.freq[i + 1]; + let mut k = j; + while k > 0 && self.freq[k - 1] > f { + k -= 1; + } + let mut m = j; + while m > k { + self.freq[m] = self.freq[m - 1]; + self.son[m] = self.son[m - 1]; + m -= 1; + } + self.freq[k] = f; + self.son[k] = i; + i += 2; + } + for i in 0..t { + let s = self.son[i]; + self.prnt[s] = i; + if s < t { + self.prnt[s + 1] = i; + } + } + } + + fn update(&mut self, c: usize) { + if self.freq[self.root] >= MAX_FREQ { + self.reconstruct(); + } + let mut node = self.prnt[c + self.t]; + loop { + self.freq[node] += 1; + let f = self.freq[node]; + if node < self.root && f > self.freq[node + 1] { + let mut l = node + 1; + while l < self.root && f > self.freq[l + 1] { + l += 1; + } + self.freq[node] = self.freq[l]; + self.freq[l] = f; + + let sn = self.son[node]; + let sl = self.son[l]; + self.prnt[sl] = node; + if sl < self.t { + self.prnt[sl + 1] = node; + } + self.prnt[sn] = l; + if sn < self.t { + self.prnt[sn + 1] = l; + } + self.son[node] = sl; + self.son[l] = sn; + + node = l; + } + node = self.prnt[node]; + if node == 0 { + break; + } + } + } + + /// Decode one symbol, descending from the root by branch bits, then + /// update the tree. + fn decode(&mut self, br: &mut BitReader<'_>) -> Result { + let mut c = self.son[self.root]; + let mut guard = 0usize; + while c < self.t { + let bit = br.get_bits(1) as usize; + let idx = c + bit; + if idx >= self.t { + return Err(Error::Corrupt); + } + c = self.son[idx]; + guard += 1; + if guard > self.t { + return Err(Error::Corrupt); + } + } + let sym = c - self.t; + if sym >= self.n_char { + return Err(Error::Corrupt); + } + self.update(sym); + Ok(sym) + } + + /// Encode one symbol (root-first branch bits), then update the tree. + fn encode(&mut self, bw: &mut BitWriter, c: usize) { + let mut path: Vec = Vec::with_capacity(32); + let mut k = self.prnt[c + self.t]; + loop { + let p = self.prnt[k]; + path.push((k - self.son[p]) as u8); + if p == self.root { + break; + } + k = p; + if path.len() >= self.t { + break; + } + } + for &bit in path.iter().rev() { + bw.put_bits(1, bit as u32); + } + self.update(c); + } +} + +// ─── position bit-length class ─────────────────────────────────────────── + +/// Number of significant bits of `pos` (0 for `pos == 0`). This is the +/// position-tree symbol; `pos` itself is recovered from the class plus +/// `class - 1` low bits. +fn pos_class(pos: usize) -> usize { + // pos < N = 2^13, so the class is at most DICBIT. + (usize::BITS - pos.leading_zeros()) as usize +} + +// ─── decoder ───────────────────────────────────────────────────────────── + +/// Decode an lh2 payload (length header already stripped) of declared length +/// `expected`. +pub fn decode_payload(payload: &[u8], expected: usize) -> Result, Error> { + let mut out: Vec = Vec::with_capacity(expected.min(1 << 20)); + if expected == 0 { + return Ok(out); + } + + let mut ctree = AdaptiveTree::new(NC); + let mut ptree = AdaptiveTree::new(NP); + let mut ring = vec![b' '; N]; + let mut r = 0usize; + let mut br = BitReader::new(payload); + + while out.len() < expected { + let c = ctree.decode(&mut br)?; + if br.overran() { + return Err(Error::UnexpectedEnd); + } + if c < 256 { + out.push(c as u8); + ring[r] = c as u8; + r = (r + 1) & (N - 1); + } else { + let len = (c - 256) + MIN_MATCH; + if len > MAXMATCH { + return Err(Error::Corrupt); + } + let pos = decode_position(&mut ptree, &mut br)?; + if br.overran() { + return Err(Error::UnexpectedEnd); + } + if pos >= N { + return Err(Error::Corrupt); + } + let src0 = (r + N - pos - 1) & (N - 1); + for k in 0..len { + if out.len() >= expected { + break; + } + let b = ring[(src0 + k) & (N - 1)]; + out.push(b); + ring[r] = b; + r = (r + 1) & (N - 1); + } + } + } + Ok(out) +} + +/// Decode a position: a bit-length class from the adaptive position tree, +/// then `class - 1` low bits (none for class 0 or 1). +fn decode_position(ptree: &mut AdaptiveTree, br: &mut BitReader<'_>) -> Result { + let class = ptree.decode(br)?; + if class == 0 { + return Ok(0); + } + if class == 1 { + return Ok(1); + } + let low = br.get_bits((class - 1) as u32) as usize; + Ok((1usize << (class - 1)) | low) +} + +// ─── encoder ───────────────────────────────────────────────────────────── + +/// Encode `data` into an lh2 payload (no length header). +pub fn encode_payload(data: &[u8]) -> Vec { + let mut bw = BitWriter::new(); + if data.is_empty() { + return bw.finish(); + } + + let mut ctree = AdaptiveTree::new(NC); + let mut ptree = AdaptiveTree::new(NP); + + for t in lz_parse(data) { + match t { + Tok::Lit(b) => ctree.encode(&mut bw, b as usize), + Tok::Match { len, pos } => { + ctree.encode(&mut bw, 256 + (len - MIN_MATCH)); + encode_position(&mut ptree, &mut bw, pos); + } + } + } + bw.finish() +} + +fn encode_position(ptree: &mut AdaptiveTree, bw: &mut BitWriter, pos: usize) { + let class = pos_class(pos); + ptree.encode(bw, class); + if class >= 2 { + bw.put_bits((class - 1) as u32, (pos & ((1 << (class - 1)) - 1)) as u32); + } +} + +enum Tok { + Lit(u8), + Match { len: usize, pos: usize }, +} + +/// Greedy LZSS parse producing distance-based positions (`pos` = distance-1 +/// in `0..N`), mirroring the decoder's `src = r - pos - 1`. +fn lz_parse(data: &[u8]) -> Vec { + let n = data.len(); + let mut tokens = Vec::new(); + + const HASH_BITS: u32 = 15; + const HASH_SIZE: usize = 1 << HASH_BITS; + let mut head = vec![usize::MAX; HASH_SIZE]; + let mut prev = vec![usize::MAX; n]; + + let hash3 = |d: &[u8], i: usize| -> usize { + let a = d[i] as usize; + let b = d[i + 1] as usize; + let c = d[i + 2] as usize; + ((a << 10) ^ (b << 5) ^ c).wrapping_mul(2654435761) >> (32 - HASH_BITS) & (HASH_SIZE - 1) + }; + + let max_chain = 128usize; + let mut i = 0usize; + while i < n { + let mut best_len = 0usize; + let mut best_pos = 0usize; + if i + MIN_MATCH <= n { + let h = hash3(data, i); + let mut cand = head[h]; + let mut chain = 0usize; + let max_match = MAXMATCH.min(n - i); + let min_pos = i.saturating_sub(N); + while cand != usize::MAX && cand >= min_pos && chain < max_chain { + let mut l = 0usize; + while l < max_match && data[cand + l] == data[i + l] { + l += 1; + } + if l > best_len { + best_len = l; + best_pos = i - cand - 1; // distance - 1 + if l >= max_match { + break; + } + } + cand = prev[cand]; + chain += 1; + } + } + + if best_len >= MIN_MATCH { + tokens.push(Tok::Match { + len: best_len, + pos: best_pos, + }); + let end = i + best_len; + while i < end { + if i + MIN_MATCH <= n { + let h = hash3(data, i); + prev[i] = head[h]; + head[h] = i; + } + i += 1; + } + } else { + tokens.push(Tok::Lit(data[i])); + if i + MIN_MATCH <= n { + let h = hash3(data, i); + prev[i] = head[h]; + head[h] = i; + } + i += 1; + } + } + tokens +} + +#[cfg(test)] +mod tests { + use super::*; + + fn round_trip(data: &[u8]) { + let enc = encode_payload(data); + let dec = decode_payload(&enc, data.len()).unwrap(); + assert_eq!(dec, data, "round-trip mismatch ({} bytes)", data.len()); + } + + #[test] + fn round_trips() { + round_trip(b""); + round_trip(b"a"); + round_trip(b"abracadabra abracadabra abracadabra"); + round_trip(&[0u8; 1000]); + let mut v = Vec::new(); + for i in 0..5000u32 { + v.push((i.wrapping_mul(2654435761) >> 13) as u8); + } + round_trip(&v); + // long run to exercise 256-byte matches over the 8 KiB window + round_trip(&b"xyz".repeat(4000)); + } + + #[test] + fn pos_class_is_significant_bits() { + assert_eq!(pos_class(0), 0); + assert_eq!(pos_class(1), 1); + assert_eq!(pos_class(2), 2); + assert_eq!(pos_class(3), 2); + assert_eq!(pos_class(8191), 13); + } +} diff --git a/src/lha/mod.rs b/src/lha/mod.rs index e35688f..84e0d4f 100644 --- a/src/lha/mod.rs +++ b/src/lha/mod.rs @@ -1,5 +1,5 @@ -//! LHA / LZH compression methods: `-lh1-`, `-lh4-`, `-lh5-`, `-lh6-`, -//! `-lh7-`. +//! LHA / LZH compression methods: `-lh1-`, `-lh2-`, `-lh4-`, `-lh5-`, +//! `-lh6-`, `-lh7-`. //! //! These are LZSS sliding-dictionary back-reference schemes whose //! literal/length and position codes are Huffman-coded. They are the @@ -7,7 +7,9 @@ //! like the rar/zip-method codecs elsewhere in this crate. //! //! - `lh1`: 4 KiB dictionary, **adaptive** Huffman (the classic LZHUF -//! scheme of Yoshizaki & Okumura). +//! scheme of Yoshizaki & Okumura), fixed position code. +//! - `lh2`: 8 KiB dictionary, **adaptive** Huffman for both literals/lengths +//! *and* positions (see [`dynamic_huff`]). //! - `lh4`: 4 KiB dictionary, static Huffman. //! - `lh5`: 16 KiB dictionary, static Huffman — the dominant method. //! - `lh6`: 64 KiB dictionary, static Huffman. @@ -16,7 +18,9 @@ //! `lh4`/`lh5`/`lh6`/`lh7` share the static-Huffman block structure //! (Okumura's public-domain ar002 layout — see [`static_huff`]) and differ //! only in dictionary size and the number of position-code bits. `lh1` -//! uses the adaptive-Huffman tree-update scheme (see [`lzhuf`]). +//! uses the adaptive-Huffman tree-update scheme (see [`lzhuf`]); `lh2` +//! extends it to an 8 KiB window with an adaptive position tree (see +//! [`dynamic_huff`]). //! //! ## Framing — none //! @@ -27,8 +31,8 @@ //! //! - `lh4`/`lh5`/`lh6`/`lh7` are block-structured and self-terminate — decode //! the input and call `finish()`, like any other codec here. -//! - `lh1` is a continuous, size-terminated stream with no in-band end, so its -//! decoder needs the uncompressed length out of band via +//! - `lh1` and `lh2` are continuous, size-terminated streams with no in-band +//! end, so their decoders need the uncompressed length out of band via //! [`DecoderConfig::with_len`] (the size a container reader already has). //! `with_len` is accepted by every method and bounds decompressed size for //! decompression-bomb safety. @@ -52,6 +56,7 @@ extern crate alloc; use alloc::vec::Vec; mod bits; +pub mod dynamic_huff; mod huffman; pub mod lzhuf; pub mod static_huff; @@ -65,6 +70,7 @@ use static_huff::Params; #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum Method { Lh1, + Lh2, Lh4, Lh5, Lh6, @@ -75,14 +81,17 @@ impl Method { fn name(self) -> &'static str { match self { Method::Lh1 => "lh1", + Method::Lh2 => "lh2", Method::Lh4 => "lh4", Method::Lh5 => "lh5", Method::Lh6 => "lh6", Method::Lh7 => "lh7", } } + /// `lh4`/`lh5`/`lh6`/`lh7` use the static-Huffman block structure; `lh1` + /// and `lh2` use adaptive (dynamic) Huffman. fn is_static(self) -> bool { - !matches!(self, Method::Lh1) + !matches!(self, Method::Lh1 | Method::Lh2) } } @@ -153,6 +162,12 @@ define_method!( "lh1", "LHA `-lh1-`: 4 KiB dictionary, adaptive Huffman (LZHUF)." ); +define_method!( + Lh2, + Lh2, + "lh2", + "LHA `-lh2-`: 8 KiB dictionary, adaptive Huffman for literals/lengths and positions." +); define_method!( Lh4, Lh4, @@ -207,11 +222,13 @@ impl Encoder { } fn finalize(&mut self) { - let payload = if self.method.is_static() { - let params = Params::for_method(self.method.name()); - static_huff::encode_payload(&self.input, params) - } else { - lzhuf::encode_payload(&self.input) + let payload = match self.method { + Method::Lh1 => lzhuf::encode_payload(&self.input), + Method::Lh2 => dynamic_huff::encode_payload(&self.input), + _ => { + let params = Params::for_method(self.method.name()); + static_huff::encode_payload(&self.input, params) + } }; self.output.extend_from_slice(&payload); } @@ -302,13 +319,16 @@ impl Decoder { let params = Params::for_method(self.method.name()); static_huff::decode_payload(payload, self.expected_len, params)? } else { - // lh1 (LZHUF) is a continuous, size-terminated code stream with no - // in-band end marker — the uncompressed length must be supplied - // out of band via `DecoderConfig::with_len`. Without it we cannot - // know where the data ends (the trailing bits are padding), so we - // refuse rather than emit garbage. + // lh1 (LZHUF) and lh2 are continuous, size-terminated code streams + // with no in-band end marker — the uncompressed length must be + // supplied out of band via `DecoderConfig::with_len`. Without it + // we cannot know where the data ends (the trailing bits are + // padding), so we refuse rather than emit garbage. match self.expected_len { - Some(n) => lzhuf::decode_payload(payload, n)?, + Some(n) => match self.method { + Method::Lh2 => dynamic_huff::decode_payload(payload, n)?, + _ => lzhuf::decode_payload(payload, n)?, + }, None if payload.is_empty() => Vec::new(), None => return Err(Error::Unsupported), } diff --git a/src/lib.rs b/src/lib.rs index bbfd340..c9d7560 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -176,6 +176,9 @@ pub mod arc_squeeze; #[cfg(feature = "lha")] pub mod lha; +#[cfg(feature = "hpack")] +pub mod hpack; + #[cfg(feature = "factory")] pub mod factory; diff --git a/tests/hpack.rs b/tests/hpack.rs new file mode 100644 index 0000000..735a48a --- /dev/null +++ b/tests/hpack.rs @@ -0,0 +1,128 @@ +#![cfg(feature = "hpack")] +//! Integration tests for the HPACK module: the public header-codec API, the +//! `Http2Huffman` streaming codec, and factory-by-name lookup. + +use compcol::hpack::{HeaderField, HpackDecoder, HpackEncoder, Http2Huffman}; +use compcol::{Algorithm, Decoder, Encoder, Status}; + +/// Drive a streaming encoder to completion over the whole input. +fn run_enc(mut enc: E, data: &[u8]) -> Vec { + let mut out = Vec::new(); + let mut buf = vec![0u8; 256]; + let mut consumed = 0; + while consumed < data.len() { + let (p, _) = enc.encode(&data[consumed..], &mut buf).unwrap(); + out.extend_from_slice(&buf[..p.written]); + consumed += p.consumed; + if p.consumed == 0 && p.written == 0 { + break; + } + } + loop { + let (p, st) = enc.finish(&mut buf).unwrap(); + out.extend_from_slice(&buf[..p.written]); + if matches!(st, Status::StreamEnd) { + break; + } + } + out +} + +/// Drive a streaming decoder to completion over the whole input. +fn run_dec(mut dec: D, data: &[u8]) -> Vec { + let mut out = Vec::new(); + let mut buf = vec![0u8; 256]; + let mut consumed = 0; + while consumed < data.len() { + let (p, _) = dec.decode(&data[consumed..], &mut buf).unwrap(); + out.extend_from_slice(&buf[..p.written]); + consumed += p.consumed; + if p.consumed == 0 && p.written == 0 { + break; + } + } + loop { + let (p, st) = dec.finish(&mut buf).unwrap(); + out.extend_from_slice(&buf[..p.written]); + if matches!(st, Status::StreamEnd) { + break; + } + } + out +} + +#[test] +fn http2_huffman_codec_round_trips() { + let inputs: [&[u8]; 4] = [ + b"", + b"www.example.com", + b"the quick brown fox jumps over the lazy dog", + &[0u8, 1, 2, 254, 255, 128, 127], + ]; + for inp in inputs { + let encoded = run_enc(Http2Huffman::encoder(), inp); + let decoded = run_dec(Http2Huffman::decoder(), &encoded); + assert_eq!(decoded, inp, "round-trip mismatch for {inp:?}"); + } +} + +#[test] +fn http2_huffman_known_vector() { + // RFC 7541 C.4.1 string. + let encoded = run_enc(Http2Huffman::encoder(), b"www.example.com"); + assert_eq!( + encoded, + [ + 0xf1, 0xe3, 0xc2, 0xe5, 0xf2, 0x3a, 0x6b, 0xa0, 0xab, 0x90, 0xf4, 0xff + ] + ); +} + +#[cfg(feature = "factory")] +#[test] +fn factory_exposes_h2_huffman() { + assert!(compcol::factory::encoder_by_name("h2-huffman").is_some()); + assert!(compcol::factory::decoder_by_name("h2-huffman").is_some()); + assert!(compcol::factory::names().contains(&"h2-huffman")); +} + +#[test] +fn full_hpack_round_trip_across_blocks() { + // A single encoder/decoder pair carrying dynamic-table state across two + // header blocks (a realistic HTTP/2 connection). + let mut enc = HpackEncoder::new(); + let mut dec = HpackDecoder::new(); + + let block_a = [ + HeaderField::new(b":method", b"GET"), + HeaderField::new(b":path", b"/resource"), + HeaderField::new(b"accept", b"text/html"), + HeaderField::sensitive(b"authorization", b"Bearer xyz"), + ]; + let e = enc.encode(&block_a); + assert_eq!(dec.decode(&e).unwrap(), block_a); + + // Second block reuses fields now in the dynamic table. + let block_b = [ + HeaderField::new(b":method", b"GET"), + HeaderField::new(b":path", b"/resource"), + HeaderField::new(b"accept", b"text/html"), + ]; + let e = enc.encode(&block_b); + // Reused fields should compress to a few bytes (all indexed). + assert!(e.len() <= 4, "expected indexed reuse, got {e:?}"); + assert_eq!(dec.decode(&e).unwrap(), block_b); +} + +#[test] +fn raw_mode_matches_decoder() { + let mut enc = HpackEncoder::new(); + enc.set_huffman(false); + let mut dec = HpackDecoder::new(); + let fields = [ + HeaderField::new(b"x-custom-header", b"some long-ish value 123456"), + HeaderField::new(b"another", b"\x00\x01\x02 binary-ish"), + ]; + let block = enc.encode(&fields); + assert_eq!(dec.decode(&block).unwrap(), fields); +} diff --git a/tests/lha.rs b/tests/lha.rs index 5a81b85..5fae093 100644 --- a/tests/lha.rs +++ b/tests/lha.rs @@ -1,7 +1,7 @@ #![cfg(feature = "lha")] //! Streaming round-trip + error-path tests for the LHA/LZH methods. -use compcol::lha::{DecoderConfig, Lh1, Lh4, Lh5, Lh6, Lh7}; +use compcol::lha::{DecoderConfig, Lh1, Lh2, Lh4, Lh5, Lh6, Lh7}; use compcol::{Algorithm, Decoder, Encoder, Error, Status}; /// Encode `data` with `enc`, feeding `in_chunk` bytes at a time and @@ -186,6 +186,38 @@ fn roundtrip_lh1() { } } +#[test] +fn roundtrip_lh2() { + // lh2 is continuous + size-terminated like lh1, so it needs with_len. + for data in sample_inputs() { + roundtrip_mode::(&data, true); + } +} + +#[test] +fn lh2_without_len_refuses() { + // Non-empty lh2 stream with no out-of-band length must error (it has no + // in-band end marker), not emit garbage. + let data = b"some data to compress with lh2, repeated a bit".repeat(8); + let payload = encode_chunked(Lh2::encoder(), &data, 1 << 16, 1 << 16); + let mut dec = Lh2::decoder(); // default config: no expected_len + let mut out = vec![0u8; 4096]; + let _ = dec.decode(&payload, &mut out); // buffers the stream + assert!(matches!(dec.finish(&mut out), Err(Error::Unsupported))); +} + +#[test] +fn large_window_match_lh2() { + // A match reachable only with lh2's 8 KiB window. + let mut data = vec![0u8; 20_000]; + for (i, b) in data.iter_mut().enumerate() { + *b = (i % 241) as u8; + } + let head: Vec = data[..6000].to_vec(); + data.extend_from_slice(&head); + roundtrip_mode::(&data, true); +} + #[test] fn large_window_match_lh7() { // A match at a distance only reachable with the 128 KiB lh7 window. @@ -276,7 +308,7 @@ fn names_registered() { #[cfg(feature = "factory")] { let names = compcol::factory::names(); - for n in ["lh1", "lh4", "lh5", "lh6", "lh7"] { + for n in ["lh1", "lh2", "lh4", "lh5", "lh6", "lh7"] { assert!(names.contains(&n), "{n} not registered"); assert!(compcol::factory::encoder_by_name(n).is_some()); assert!(compcol::factory::decoder_by_name(n).is_some());