From a2a63e15f4d34f5de35eb7dc32eadb0b02f77327 Mon Sep 17 00:00:00 2001 From: patrick Date: Sun, 31 May 2026 11:06:53 +0200 Subject: [PATCH] fix(curl): passthrough binary downloads to prevent UTF-8 corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `rtk curl ` silently corrupted any binary download (tarballs, zip, png, pdf, ELF, ...) because the response body was captured via `String::from_utf8_lossy` in `core::stream::exec_capture`. Every non-UTF-8 byte got replaced with U+FFFD (3 bytes: 0xEF 0xBF 0xBD), so gzip magic 1f 8b 08 00 arrived at downstream consumers as 1f ef bf bd 08 00 and `tar -xzf` complained "not in gzip format". Now `rtk curl` runs `Command::output()` directly to keep stdout as `Vec`, then checks `std::str::from_utf8(&bytes).is_err()`. If the response is not valid UTF-8 (i.e. the lossy conversion would corrupt it), raw bytes are written through to stdout via `write_all` and tracking is recorded as passthrough (0% savings — token counts over binary content have no meaning). The text/JSON code path is unchanged. Verified live on 18 real binary formats (rtk's own release artifacts, ripgrep, bat, fd, hyperfine, gh, tokei, kubectl ELF 51MB, rustup-init 20MB, W3C PDF, ISO 9899 PDF 4MB, GitHub avatar PNG, Wikimedia GIF, WebP sample, MP3/MP4/WAV samples) — all byte-identical to raw curl. 10 text regression tests (JSON/HTML/Markdown/Cargo.toml/RFC) confirm the text path keeps its existing behavior. Closes #1087 --- src/cmds/cloud/curl_cmd.rs | 91 +++++++++++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 10 deletions(-) diff --git a/src/cmds/cloud/curl_cmd.rs b/src/cmds/cloud/curl_cmd.rs index ed5b04a86..7073c2e42 100644 --- a/src/cmds/cloud/curl_cmd.rs +++ b/src/cmds/cloud/curl_cmd.rs @@ -5,13 +5,18 @@ //! The condensed-form-with-tee-hint path is reserved for non-JSON bodies on //! a real terminal where a human reads the output and the tee file gives the //! LLM a way to recover the raw response. +//! +//! Binary downloads (any non-UTF-8 byte sequence) are written through to +//! stdout as raw bytes, bypassing the UTF-8 lossy conversion that would +//! otherwise replace non-UTF-8 bytes with U+FFFD and corrupt the stream +//! (`#1087`). use crate::core::tee::force_tee_hint; use crate::core::tracking; -use crate::core::{stream::exec_capture, utils::resolved_command}; +use crate::core::utils::resolved_command; use anyhow::{Context, Result}; use std::borrow::Cow; -use std::io::IsTerminal; +use std::io::{IsTerminal, Write}; const MAX_RESPONSE_SIZE: usize = 500; @@ -28,22 +33,46 @@ pub fn run(args: &[String], verbose: u8) -> Result { eprintln!("Running: curl -s {}", args.join(" ")); } - let result = exec_capture(&mut cmd).context("Failed to run curl")?; + // Capture stdout as raw bytes (not UTF-8 String) so binary downloads + // survive intact. `String::from_utf8_lossy` would otherwise replace + // every non-UTF-8 byte with U+FFFD (3 bytes), corrupting e.g. gzip + // magic `1f 8b 08 00` into `1f ef bf bd 08 00` (#1087). + let output = cmd.output().context("Failed to run curl")?; + let exit_code = output.status.code().unwrap_or(1); // Skip filtering on failure: curl can return HTML error bodies that would // be misleading to summarize, and we want the real exit code surfaced. - if !result.success() { - let msg = if result.stderr.trim().is_empty() { - result.stdout.trim().to_string() + if !output.status.success() { + let stderr_str = String::from_utf8_lossy(&output.stderr); + let stdout_str = String::from_utf8_lossy(&output.stdout); + let msg = if stderr_str.trim().is_empty() { + stdout_str.trim().to_string() } else { - result.stderr.trim().to_string() + stderr_str.trim().to_string() }; eprintln!("FAILED: curl {}", msg); - return Ok(result.exit_code); + return Ok(exit_code); } - let exit_code = result.exit_code; - let raw = result.stdout; + // Binary detection: if the body is not valid UTF-8, `from_utf8_lossy` + // would replace every invalid byte with U+FFFD and corrupt the stream + // (gzip, zip, png, pdf, elf, ... — any binary format). Write raw bytes + // through and skip filtering. Tracking is recorded as passthrough + // (0% savings) since token counts over binary content have no meaning. + if is_binary(&output.stdout) { + let stdout = std::io::stdout(); + let mut handle = stdout.lock(); + handle + .write_all(&output.stdout) + .context("Failed to write binary response to stdout")?; + timer.track_passthrough( + &format!("curl {}", args.join(" ")), + &format!("rtk curl {}", args.join(" ")), + ); + return Ok(exit_code); + } + + let raw = String::from_utf8_lossy(&output.stdout).into_owned(); let is_tty = std::io::stdout().is_terminal(); let filtered = filter_curl_output(&raw, is_tty); @@ -62,6 +91,17 @@ pub fn run(args: &[String], verbose: u8) -> Result { Ok(exit_code) } +/// Returns `true` if `bytes` is not valid UTF-8 — which is exactly the +/// condition under which `from_utf8_lossy` would replace invalid bytes +/// with U+FFFD and corrupt downstream consumers (`#1087`). +/// +/// This is correct by construction: the only reason to passthrough raw +/// bytes is to avoid the lossy conversion, and the only bytes that suffer +/// from it are the non-UTF-8 ones. +fn is_binary(bytes: &[u8]) -> bool { + std::str::from_utf8(bytes).is_err() +} + fn filter_curl_output(raw: &str, is_tty: bool) -> FilterResult<'_> { let trimmed = raw.trim(); @@ -239,4 +279,35 @@ mod tests { let json_result = filter_curl_output(&json_payload, true); assert!(matches!(json_result.content, Cow::Borrowed(_))); } + + // --- is_binary tests ---------------------------------------------------- + + #[test] + fn test_is_binary_gzip_magic_is_not_utf8() { + // gzip magic 1f 8b — 0x8b is an invalid UTF-8 continuation byte + let bytes = [0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03]; + assert!(is_binary(&bytes)); + } + + #[test] + fn test_is_binary_valid_utf8_text_is_not_binary() { + assert!(!is_binary(br#"{"key": "value"}"#)); + assert!(!is_binary(b"\nHi")); + assert!(!is_binary(b"Plain ASCII text")); + assert!(!is_binary("Héllo wörld — emojis 🚀 ✓".as_bytes())); + } + + #[test] + fn test_is_binary_empty_is_not_binary() { + // Empty input is technically valid UTF-8 and trivially safe to filter. + assert!(!is_binary(&[])); + } + + #[test] + fn test_is_binary_text_with_nul_is_not_binary() { + // NUL is valid UTF-8 (U+0000). Unusual in HTTP responses but the + // function honors UTF-8 strictly — caller can still filter such + // content safely. The bug we're fixing is only invalid UTF-8 bytes. + assert!(!is_binary(b"text with\0embedded nul")); + } }