Skip to content

Commit 7aef871

Browse files
committed
feat(bench): add zeph-bench crate scaffold, CLI subcommand, and deterministic mode
Closes #2828, #2829, #2831. Part of epic #2827.

- New optional crate `crates/zeph-bench/` gated on the `bench` feature flag (excluded from `full`). Implements `BenchmarkChannel`, satisfying the `Channel` trait for headless benchmark execution: scripted prompt queue, response capture buffer, token usage recording, auto-confirm, `elicit` returns Declined, `send_tool_output` is a no-op.
- `DatasetRegistry` with 5 built-in datasets: LongMemEval, LOCOMO, FRAMES, tau-bench, GAIA.
- `zeph bench` CLI subcommand (list, download, run, show) gated on `cfg(feature = "bench")`. Unknown dataset and missing cache exit 1 with a diagnostic message.
- Deterministic mode: applies `GenerationOverrides { temperature: 0.0, seed: 0 }` before agent construction; disabled with `--no-deterministic`.
- 16 unit tests covering channel behavior, dataset registry, and deterministic override (including the skip branch).
1 parent efda2e2 commit 7aef871

File tree

14 files changed

+686
-0
lines changed

14 files changed

+686
-0
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
66

77
## [Unreleased]
88

9+
### Added
10+
11+
- **`zeph-bench` crate scaffold with `BenchmarkChannel`** (`#2828`): new optional crate `crates/zeph-bench/` gated on the `bench` feature flag (not included in `full`). `BenchmarkChannel` implements the `Channel` trait for headless benchmark execution: `recv()` drains an injected prompt queue, `send()`/`send_chunk()`/`flush_chunks()` accumulate LLM responses into a capture buffer, `send_usage()` records token stats, `confirm()` auto-approves, `elicit()` returns `Declined`, `send_tool_output()` is a no-op (tool outputs excluded from benchmark metrics). `DatasetRegistry` lists the 5 supported datasets: LongMemEval, LOCOMO, FRAMES, tau-bench, GAIA.
12+
13+
- **`zeph bench` CLI subcommand** (`#2829`): top-level `bench` subcommand added to the `zeph` binary, gated on `#[cfg(feature = "bench")]`. Subcommands: `list` (print all datasets with cache status), `download --dataset <name>` (fetch and cache a dataset), `run --dataset <name> --output <path> [--scenario <id>] [--provider <name>] [--baseline] [--resume] [--no-deterministic]` (execute a benchmark), `show --results <path>` (pretty-print a results JSON file). Unknown dataset names and missing cache exit with code 1 and a diagnostic message.
14+
15+
- **Deterministic mode for benchmark runs** (`#2831`): bench runner applies `GenerationOverrides { temperature: Some(0.0), seed: Some(0) }` to the active LLM provider before constructing the agent, ensuring reproducible results across runs. Disabled with `--no-deterministic`.
16+
917
### Fixed
1018

1119
- **MCP handshake timeout not enforced** (`#2815`): `connect()`, `connect_url()`, and `connect_url_with_headers()` now wrap `handler.serve(transport)` with `tokio::time::timeout(timeout, ...)`, returning `McpError::Timeout` on expiry. `list_tools()` applies the same guard to `list_all_tools()`. Previously, a stalled MCP server during the initialize handshake or tool listing would block `connect_all()` indefinitely, causing TUI startup to hang at "Connecting tools..." forever. Only `call_tool` had a timeout; the fix brings the other paths to parity.

Cargo.lock

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ uuid = "1.23"
123123
wiremock = "0.6.5"
124124
zeroize = { version = "1", default-features = false }
125125
zeph-a2a = { path = "crates/zeph-a2a", version = "0.18.5" }
126+
zeph-bench = { path = "crates/zeph-bench", version = "0.18.6" }
126127
zeph-acp = { path = "crates/zeph-acp", version = "0.18.5" }
127128
zeph-db = { path = "crates/zeph-db", version = "0.18.5", default-features = false }
128129
zeph-channels = { path = "crates/zeph-channels", version = "0.18.5" }
@@ -175,6 +176,7 @@ server = ["gateway", "a2a", "otel"]
175176
chat = ["discord", "slack"]
176177
ml = ["candle", "pdf"]
177178
full = ["desktop", "ide", "server", "chat", "pdf", "scheduler", "classifiers"]
179+
bench = ["dep:zeph-bench"]
178180

179181
# === Individual feature flags ===
180182
a2a = ["dep:zeph-a2a", "zeph-a2a?/server", "zeph-a2a?/ibct"]
@@ -245,6 +247,7 @@ zeph-skills.workspace = true
245247
zeph-subagent.workspace = true
246248
zeph-tools.workspace = true
247249
zeph-tui = { workspace = true, optional = true }
250+
zeph-bench = { workspace = true, optional = true }
248251

249252
[dev-dependencies]
250253
serial_test.workspace = true

crates/zeph-bench/Cargo.toml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
[package]
name = "zeph-bench"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
homepage.workspace = true
documentation.workspace = true
keywords.workspace = true
categories.workspace = true
description = "Benchmark harness for evaluating Zeph agent performance on standardized datasets"
publish = true

[dependencies]
# CLI argument parsing for the `zeph bench` subcommand surface.
clap = { workspace = true, features = ["derive"] }
# Dataset and results (de)serialization.
serde.workspace = true
serde_json.workspace = true
# Crate-local error enums.
thiserror.workspace = true
# Async runtime; only the timer feature is needed here.
tokio = { workspace = true, features = ["time"] }
# Zeph internals the benchmark harness drives.
zeph-config.workspace = true
zeph-core.workspace = true
zeph-llm.workspace = true

[lints]
workspace = true

crates/zeph-bench/src/channel.rs

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2+
// SPDX-License-Identifier: MIT OR Apache-2.0
3+
4+
use std::collections::VecDeque;
5+
use std::time::Instant;
6+
7+
use zeph_core::channel::{ChannelError, ChannelMessage, ToolOutputEvent};
8+
9+
/// A single captured agent response for one benchmark prompt.
///
/// Produced by `BenchmarkChannel` when the agent sends a complete message or
/// flushes a streamed response; one entry corresponds to one prompt turn.
//
// `PartialEq`/`Eq` derived (all fields are `Eq`-capable) so benchmark tests
// and result comparisons can assert whole responses directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CapturedResponse {
    /// Zero-based index of the prompt this response answers.
    pub prompt_index: usize,
    /// Full response text (streamed chunks concatenated, or the single send).
    pub text: String,
    /// Wall-clock time from the first streamed chunk to flush; zero for
    /// non-streamed sends.
    pub elapsed: std::time::Duration,
    /// Input-token count reported via `send_usage` for this turn.
    pub input_tokens: u64,
    /// Output-token count reported via `send_usage` for this turn.
    pub output_tokens: u64,
    /// Provider context-window size reported via `send_usage` for this turn.
    pub context_window: u64,
}
19+
20+
/// Headless channel that feeds pre-loaded prompts and captures agent responses.
21+
///
22+
/// Used by the bench runner to drive the agent loop without a real terminal or
23+
/// network connection. `recv()` drains the prompt queue; `send()` / `flush_chunks()`
24+
/// accumulate the response into `responses`.
25+
pub struct BenchmarkChannel {
26+
prompts: VecDeque<String>,
27+
responses: Vec<CapturedResponse>,
28+
current_index: usize,
29+
total: usize,
30+
// Streaming chunk accumulation
31+
chunk_buffer: String,
32+
chunk_start: Option<Instant>,
33+
// Token usage for the current prompt (updated by send_usage)
34+
pending_input_tokens: u64,
35+
pending_output_tokens: u64,
36+
pending_context_window: u64,
37+
}
38+
39+
impl BenchmarkChannel {
40+
/// Create a new channel from a list of prompt strings.
41+
#[must_use]
42+
pub fn new(prompts: Vec<String>) -> Self {
43+
let total = prompts.len();
44+
Self {
45+
prompts: VecDeque::from(prompts),
46+
responses: Vec::new(),
47+
current_index: 0,
48+
total,
49+
chunk_buffer: String::new(),
50+
chunk_start: None,
51+
pending_input_tokens: 0,
52+
pending_output_tokens: 0,
53+
pending_context_window: 0,
54+
}
55+
}
56+
57+
/// Total number of prompts this channel was initialised with.
58+
#[must_use]
59+
pub fn total(&self) -> usize {
60+
self.total
61+
}
62+
63+
/// Consume and return all captured responses.
64+
#[must_use]
65+
pub fn into_responses(self) -> Vec<CapturedResponse> {
66+
self.responses
67+
}
68+
69+
/// Borrow the captured responses.
70+
#[must_use]
71+
pub fn responses(&self) -> &[CapturedResponse] {
72+
&self.responses
73+
}
74+
75+
fn flush_chunk_buffer(&mut self) {
76+
if self.chunk_buffer.is_empty() {
77+
return;
78+
}
79+
let elapsed = self
80+
.chunk_start
81+
.map_or(std::time::Duration::ZERO, |s| s.elapsed());
82+
self.responses.push(CapturedResponse {
83+
prompt_index: self.current_index.saturating_sub(1),
84+
text: std::mem::take(&mut self.chunk_buffer),
85+
elapsed,
86+
input_tokens: self.pending_input_tokens,
87+
output_tokens: self.pending_output_tokens,
88+
context_window: self.pending_context_window,
89+
});
90+
self.chunk_start = None;
91+
self.pending_input_tokens = 0;
92+
self.pending_output_tokens = 0;
93+
self.pending_context_window = 0;
94+
}
95+
}
96+
97+
impl zeph_core::channel::Channel for BenchmarkChannel {
98+
async fn recv(&mut self) -> Result<Option<ChannelMessage>, ChannelError> {
99+
match self.prompts.pop_front() {
100+
Some(text) => {
101+
self.current_index += 1;
102+
Ok(Some(ChannelMessage {
103+
text,
104+
attachments: vec![],
105+
}))
106+
}
107+
None => Ok(None),
108+
}
109+
}
110+
111+
fn supports_exit(&self) -> bool {
112+
false
113+
}
114+
115+
async fn send(&mut self, text: &str) -> Result<(), ChannelError> {
116+
self.responses.push(CapturedResponse {
117+
prompt_index: self.current_index.saturating_sub(1),
118+
text: text.to_owned(),
119+
elapsed: std::time::Duration::ZERO,
120+
input_tokens: self.pending_input_tokens,
121+
output_tokens: self.pending_output_tokens,
122+
context_window: self.pending_context_window,
123+
});
124+
self.pending_input_tokens = 0;
125+
self.pending_output_tokens = 0;
126+
self.pending_context_window = 0;
127+
Ok(())
128+
}
129+
130+
async fn send_chunk(&mut self, chunk: &str) -> Result<(), ChannelError> {
131+
if self.chunk_start.is_none() {
132+
self.chunk_start = Some(Instant::now());
133+
}
134+
self.chunk_buffer.push_str(chunk);
135+
Ok(())
136+
}
137+
138+
async fn flush_chunks(&mut self) -> Result<(), ChannelError> {
139+
self.flush_chunk_buffer();
140+
Ok(())
141+
}
142+
143+
async fn send_usage(
144+
&mut self,
145+
input_tokens: u64,
146+
output_tokens: u64,
147+
context_window: u64,
148+
) -> Result<(), ChannelError> {
149+
self.pending_input_tokens = input_tokens;
150+
self.pending_output_tokens = output_tokens;
151+
self.pending_context_window = context_window;
152+
Ok(())
153+
}
154+
155+
// TODO(bench-runner): tool output is intentionally dropped here.
156+
// The default trait impl calls self.send(&formatted), which would push tool output
157+
// into responses and corrupt benchmark metrics. Override to no-op until Phase 2
158+
// when tool calls are captured separately.
159+
async fn send_tool_output(&mut self, _event: ToolOutputEvent<'_>) -> Result<(), ChannelError> {
160+
Ok(())
161+
}
162+
}
163+
164+
#[cfg(test)]
mod tests {
    use zeph_core::channel::{
        Channel, ElicitationField, ElicitationFieldType, ElicitationRequest, ElicitationResponse,
        ToolOutputEvent,
    };

    use super::*;

    #[tokio::test]
    async fn recv_drains_queue_and_returns_none_when_empty() {
        let mut channel = BenchmarkChannel::new(vec!["hello".into(), "world".into()]);
        let first = channel.recv().await.unwrap().unwrap();
        assert_eq!(first.text, "hello");
        let second = channel.recv().await.unwrap().unwrap();
        assert_eq!(second.text, "world");
        assert!(channel.recv().await.unwrap().is_none());
    }

    #[tokio::test]
    async fn send_accumulates_response() {
        let mut channel = BenchmarkChannel::new(vec!["prompt".into()]);
        let _ = channel.recv().await.unwrap();
        channel.send("response text").await.unwrap();
        let captured = channel.responses();
        assert_eq!(captured.len(), 1);
        assert_eq!(captured[0].text, "response text");
    }

    #[tokio::test]
    async fn confirm_returns_true() {
        let mut channel = BenchmarkChannel::new(vec![]);
        assert!(channel.confirm("delete?").await.unwrap());
    }

    #[tokio::test]
    async fn elicit_returns_declined() {
        let mut channel = BenchmarkChannel::new(vec![]);
        let request = ElicitationRequest {
            server_name: "test-server".into(),
            message: "provide input".into(),
            fields: vec![ElicitationField {
                name: "field".into(),
                description: None,
                field_type: ElicitationFieldType::String,
                required: true,
            }],
        };
        let response = channel.elicit(request).await.unwrap();
        assert!(matches!(response, ElicitationResponse::Declined));
    }

    #[tokio::test]
    async fn send_chunk_and_flush_captures_response() {
        let mut channel = BenchmarkChannel::new(vec!["p".into()]);
        let _ = channel.recv().await.unwrap();
        channel.send_chunk("part1").await.unwrap();
        channel.send_chunk(" part2").await.unwrap();
        channel.flush_chunks().await.unwrap();
        let captured = channel.responses();
        assert_eq!(captured.len(), 1);
        assert_eq!(captured[0].text, "part1 part2");
    }

    #[tokio::test]
    async fn supports_exit_returns_false() {
        let channel = BenchmarkChannel::new(vec![]);
        assert!(!channel.supports_exit());
    }

    #[tokio::test]
    async fn send_usage_captured_on_send() {
        let mut channel = BenchmarkChannel::new(vec!["p".into()]);
        let _ = channel.recv().await.unwrap();
        channel.send_usage(10, 20, 128_000).await.unwrap();
        channel.send("answer").await.unwrap();
        let first = &channel.responses()[0];
        assert_eq!(first.input_tokens, 10);
        assert_eq!(first.output_tokens, 20);
        assert_eq!(first.context_window, 128_000);
    }

    #[tokio::test]
    async fn send_tool_output_does_not_add_to_responses() {
        let mut channel = BenchmarkChannel::new(vec!["p".into()]);
        let _ = channel.recv().await.unwrap();
        let event = ToolOutputEvent {
            tool_name: "bash",
            body: "some tool output",
            diff: None,
            filter_stats: None,
            kept_lines: None,
            locations: None,
            tool_call_id: "tc-1",
            is_error: false,
            parent_tool_use_id: None,
            raw_response: None,
            started_at: None,
        };
        channel.send_tool_output(event).await.unwrap();
        // Tool output must not be captured as a benchmark response.
        assert_eq!(channel.responses().len(), 0);
    }
}

0 commit comments

Comments
 (0)