Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- *(decoder bridge)* a decoder that buffers a whole block internally (notably
`bzip2`) could fail a naive decode loop with `UnexpectedEnd`. When the
caller's `output` buffer filled mid-block, the `RawDecoder`→`Decoder` bridge
reported `InputEmpty` (because the decoder had already absorbed all the input)
instead of `OutputFull`; a loop that stops on `InputEmpty` then called
`finish` on a half-drained stream and got `UnexpectedEnd`. The bridge now
returns `OutputFull` whenever the output buffer is full, which is always the
correct "drain and call again" signal. Affected `bzip2` round-trips whenever a
decoded block was larger than the output buffer.
- *(lzma2/xz)* eliminated quadratic encode time on incompressible/low-match
input. The match finder's hash head table was a fixed 64 Ki buckets, so as the
input grew the per-bucket chains lengthened and every probe walked work that
Expand Down
11 changes: 11 additions & 0 deletions src/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,17 @@ impl<T: RawDecoder> Decoder for T {
let p = self.raw_decode(input, output)?;
let status = if p.done {
Status::StreamEnd
} else if p.written == output.len() && !output.is_empty() {
// Output buffer is full. A decoder that buffers input internally
// (e.g. bzip2 absorbs a whole block before draining it) reports
// `consumed == input.len()` here even though it still has decoded
// bytes pending — so keying the status off `consumed` alone would
// wrongly say `InputEmpty` and a standard loop would stop early
// (then `finish` sees a half-drained stream → `UnexpectedEnd`).
// A filled output always means "drain and call again", which is
// exactly `OutputFull`'s contract; calling again with no remaining
// input simply yields `InputEmpty` once the pending bytes are out.
Status::OutputFull
} else if p.consumed >= input.len() {
Status::InputEmpty
} else {
Expand Down
51 changes: 51 additions & 0 deletions tests/bzip2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,57 @@ fn round_trip_streaming_one_byte() {
assert_eq!(decoded, input);
}

/// Decode following the *exact* streaming loop documented on the `Decoder`
/// trait — no defensive "drain after InputEmpty" workaround like
/// [`decode_chunked`] has. A naive caller breaks out of the decode loop on
/// `InputEmpty` and then calls `finish`. This is the pattern that regressed:
/// the decoder buffers a whole block internally, so when the small `output`
/// fills mid-block it used to report `InputEmpty` (instead of `OutputFull`),
/// the loop stopped with the block half-drained, and `finish` then failed with
/// `UnexpectedEnd`.
fn decode_documented_loop(encoded: &[u8], out_chunk: usize) -> Result<Vec<u8>, Error> {
let mut dec = Decoder::new();
let mut buf = vec![0u8; out_chunk];
let mut out = Vec::new();
let mut consumed = 0;
loop {
let (p, status) = dec.decode(&encoded[consumed..], &mut buf)?;
out.extend_from_slice(&buf[..p.written]);
consumed += p.consumed;
match status {
Status::OutputFull => continue,
Status::InputEmpty => break,
Status::StreamEnd => return Ok(out),
}
}
loop {
let (p, status) = dec.finish(&mut buf)?;
out.extend_from_slice(&buf[..p.written]);
if matches!(status, Status::StreamEnd) {
break;
}
}
Ok(out)
}

#[test]
fn round_trip_small_output_buffer_naive_loop() {
// Inputs larger than the output buffer force the decoder to drain a single
// decoded block across several `decode` calls. Before the fix this failed
// with `UnexpectedEnd` for any block bigger than `out_chunk`.
for &n in &[100_000usize, 600_000, 1_000_000] {
let input: Vec<u8> = (0..n)
.map(|i| (i.wrapping_mul(2654435761) >> 13) as u8)
.collect();
let encoded = encode_all(&input);
for &out_chunk in &[1usize, 64, 4096, 65536] {
let decoded = decode_documented_loop(&encoded, out_chunk)
.unwrap_or_else(|e| panic!("n={n} out_chunk={out_chunk}: {e:?}"));
assert_eq!(decoded, input, "n={n} out_chunk={out_chunk}");
}
}
}

// ─── reset / reuse ─────────────────────────────────────────────────────

#[test]
Expand Down
Loading