diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fcc8ad..13a5fbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- *(decoder bridge)* a decoder that buffers a whole block internally (notably + `bzip2`) could fail a naive decode loop with `UnexpectedEnd`. When the + caller's `output` buffer filled mid-block, the `RawDecoder`→`Decoder` bridge + reported `InputEmpty` (because the decoder had already absorbed all the input) + instead of `OutputFull`; a loop that stops on `InputEmpty` then called + `finish` on a half-drained stream and got `UnexpectedEnd`. The bridge now + returns `OutputFull` whenever the output buffer is full, which is always the + correct "drain and call again" signal. Affected `bzip2` round-trips whenever a + decoded block was larger than the output buffer. - *(lzma2/xz)* eliminated quadratic encode time on incompressible/low-match input. The match finder's hash head table was a fixed 64 Ki buckets, so as the input grew the per-bucket chains lengthened and every probe walked work that diff --git a/src/traits.rs b/src/traits.rs index d30e502..2a0fe64 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -364,6 +364,17 @@ impl Decoder for T { let p = self.raw_decode(input, output)?; let status = if p.done { Status::StreamEnd + } else if p.written == output.len() && !output.is_empty() { + // Output buffer is full. A decoder that buffers input internally + // (e.g. bzip2 absorbs a whole block before draining it) reports + // `consumed == input.len()` here even though it still has decoded + // bytes pending — so keying the status off `consumed` alone would + // wrongly say `InputEmpty` and a standard loop would stop early + // (then `finish` sees a half-drained stream → `UnexpectedEnd`). + // A filled output always means "drain and call again", which is + // exactly `OutputFull`'s contract; calling again with no remaining + // input simply yields `InputEmpty` once the pending bytes are out. + Status::OutputFull } else if p.consumed >= input.len() { Status::InputEmpty } else { diff --git a/tests/bzip2.rs b/tests/bzip2.rs index c7ac941..ab3107d 100644 --- a/tests/bzip2.rs +++ b/tests/bzip2.rs @@ -268,6 +268,57 @@ fn round_trip_streaming_one_byte() { assert_eq!(decoded, input); } +/// Decode following the *exact* streaming loop documented on the `Decoder` +/// trait — no defensive "drain after InputEmpty" workaround like +/// [`decode_chunked`] has. A naive caller breaks out of the decode loop on +/// `InputEmpty` and then calls `finish`. This is the pattern that regressed: +/// the decoder buffers a whole block internally, so when the small `output` +/// fills mid-block it used to report `InputEmpty` (instead of `OutputFull`), +/// the loop stopped with the block half-drained, and `finish` then failed with +/// `UnexpectedEnd`. +fn decode_documented_loop(encoded: &[u8], out_chunk: usize) -> Result, Error> { + let mut dec = Decoder::new(); + let mut buf = vec![0u8; out_chunk]; + let mut out = Vec::new(); + let mut consumed = 0; + loop { + let (p, status) = dec.decode(&encoded[consumed..], &mut buf)?; + out.extend_from_slice(&buf[..p.written]); + consumed += p.consumed; + match status { + Status::OutputFull => continue, + Status::InputEmpty => break, + Status::StreamEnd => return Ok(out), + } + } + loop { + let (p, status) = dec.finish(&mut buf)?; + out.extend_from_slice(&buf[..p.written]); + if matches!(status, Status::StreamEnd) { + break; + } + } + Ok(out) +} + +#[test] +fn round_trip_small_output_buffer_naive_loop() { + // Inputs larger than the output buffer force the decoder to drain a single + // decoded block across several `decode` calls. Before the fix this failed + // with `UnexpectedEnd` for any block bigger than `out_chunk`. + for &n in &[100_000usize, 600_000, 1_000_000] { + let input: Vec = (0..n) + .map(|i| (i.wrapping_mul(2654435761) >> 13) as u8) + .collect(); + let encoded = encode_all(&input); + for &out_chunk in &[1usize, 64, 4096, 65536] { + let decoded = decode_documented_loop(&encoded, out_chunk) + .unwrap_or_else(|e| panic!("n={n} out_chunk={out_chunk}: {e:?}")); + assert_eq!(decoded, input, "n={n} out_chunk={out_chunk}"); + } + } +} + // ─── reset / reuse ───────────────────────────────────────────────────── #[test]