Skip to content

Commit 7aef871

Browse files
committed
feat(bench): add zeph-bench crate scaffold, CLI subcommand, and deterministic mode
Closes #2828, #2829, #2831. Part of epic #2827.

- New optional crate `crates/zeph-bench/` gated on the `bench` feature flag (excluded from `full`). Implements `BenchmarkChannel`, satisfying the `Channel` trait for headless benchmark execution: scripted prompt queue, response capture buffer, token usage recording, auto-confirm, `elicit` returns Declined, `send_tool_output` is a no-op.
- `DatasetRegistry` with 5 built-in datasets: LongMemEval, LOCOMO, FRAMES, tau-bench, GAIA.
- `zeph bench` CLI subcommand (list, download, run, show) gated on `cfg(feature = "bench")`. Unknown dataset and missing cache exit 1 with a diagnostic message.
- Deterministic mode: applies `GenerationOverrides { temperature: 0.0, seed: 0 }` before agent construction; disabled with `--no-deterministic`.
- 16 unit tests covering channel behavior, dataset registry, and deterministic override (including the skip branch).
1 parent efda2e2 commit 7aef871

File tree

14 files changed

+686
-0
lines changed

14 files changed

+686
-0
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
66

77
## [Unreleased]
88

9+
### Added
10+
11+
- **`zeph-bench` crate scaffold with `BenchmarkChannel`** (`#2828`): new optional crate `crates/zeph-bench/` gated on the `bench` feature flag (not included in `full`). `BenchmarkChannel` implements the `Channel` trait for headless benchmark execution: `recv()` drains an injected prompt queue, `send()`/`send_chunk()`/`flush_chunks()` accumulate LLM responses into a capture buffer, `send_usage()` records token stats, `confirm()` auto-approves, `elicit()` returns `Declined`, `send_tool_output()` is a no-op (tool outputs excluded from benchmark metrics). `DatasetRegistry` lists the 5 supported datasets: LongMemEval, LOCOMO, FRAMES, tau-bench, GAIA.
12+
13+
- **`zeph bench` CLI subcommand** (`#2829`): top-level `bench` subcommand added to the `zeph` binary, gated on `#[cfg(feature = "bench")]`. Subcommands: `list` (print all datasets with cache status), `download --dataset <name>` (fetch and cache a dataset), `run --dataset <name> --output <path> [--scenario <id>] [--provider <name>] [--baseline] [--resume] [--no-deterministic]` (execute a benchmark), `show --results <path>` (pretty-print a results JSON file). Unknown dataset names and missing cache exit with code 1 and a diagnostic message.
14+
15+
- **Deterministic mode for benchmark runs** (`#2831`): bench runner applies `GenerationOverrides { temperature: Some(0.0), seed: Some(0) }` to the active LLM provider before constructing the agent, ensuring reproducible results across runs. Disabled with `--no-deterministic`.
16+
917
### Fixed
1018

1119
- **MCP handshake timeout not enforced** (`#2815`): `connect()`, `connect_url()`, and `connect_url_with_headers()` now wrap `handler.serve(transport)` with `tokio::time::timeout(timeout, ...)`, returning `McpError::Timeout` on expiry. `list_tools()` applies the same guard to `list_all_tools()`. Previously, a stalled MCP server during the initialize handshake or tool listing would block `connect_all()` indefinitely, causing TUI startup to hang at "Connecting tools..." forever. Only `call_tool` had a timeout; the fix brings the other paths to parity.

Cargo.lock

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ uuid = "1.23"
123123
wiremock = "0.6.5"
124124
zeroize = { version = "1", default-features = false }
125125
zeph-a2a = { path = "crates/zeph-a2a", version = "0.18.5" }
126+
zeph-bench = { path = "crates/zeph-bench", version = "0.18.6" }
126127
zeph-acp = { path = "crates/zeph-acp", version = "0.18.5" }
127128
zeph-db = { path = "crates/zeph-db", version = "0.18.5", default-features = false }
128129
zeph-channels = { path = "crates/zeph-channels", version = "0.18.5" }
@@ -175,6 +176,7 @@ server = ["gateway", "a2a", "otel"]
175176
chat = ["discord", "slack"]
176177
ml = ["candle", "pdf"]
177178
full = ["desktop", "ide", "server", "chat", "pdf", "scheduler", "classifiers"]
179+
bench = ["dep:zeph-bench"]
178180

179181
# === Individual feature flags ===
180182
a2a = ["dep:zeph-a2a", "zeph-a2a?/server", "zeph-a2a?/ibct"]
@@ -245,6 +247,7 @@ zeph-skills.workspace = true
245247
zeph-subagent.workspace = true
246248
zeph-tools.workspace = true
247249
zeph-tui = { workspace = true, optional = true }
250+
zeph-bench = { workspace = true, optional = true }
248251

249252
[dev-dependencies]
250253
serial_test.workspace = true

crates/zeph-bench/Cargo.toml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
[package]
name = "zeph-bench"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
homepage.workspace = true
documentation.workspace = true
keywords.workspace = true
categories.workspace = true
description = "Benchmark harness for evaluating Zeph agent performance on standardized datasets"
publish = true

[dependencies]
# CLI argument parsing for the `zeph bench` subcommand surface.
clap = { workspace = true, features = ["derive"] }
# Dataset and results (de)serialization.
serde.workspace = true
serde_json.workspace = true
# Crate-local error enums.
thiserror.workspace = true
# Async runtime; only the timer feature is needed here.
tokio = { workspace = true, features = ["time"] }
# Zeph internals the benchmark harness drives.
zeph-config.workspace = true
zeph-core.workspace = true
zeph-llm.workspace = true

[lints]
workspace = true

crates/zeph-bench/src/channel.rs

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2+
// SPDX-License-Identifier: MIT OR Apache-2.0
3+
4+
use std::collections::VecDeque;
5+
use std::time::Instant;
6+
7+
use zeph_core::channel::{ChannelError, ChannelMessage, ToolOutputEvent};
8+
9+
/// A single captured agent response for one benchmark prompt.
///
/// Produced by `BenchmarkChannel` when the agent sends a complete message or
/// flushes a streamed response; one entry corresponds to one prompt turn.
//
// `PartialEq`/`Eq` derived (all fields are `Eq`-capable) so benchmark tests
// and result comparisons can assert whole responses directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CapturedResponse {
    /// Zero-based index of the prompt this response answers.
    pub prompt_index: usize,
    /// Full response text (streamed chunks concatenated, or the single send).
    pub text: String,
    /// Wall-clock time from the first streamed chunk to flush; zero for
    /// non-streamed sends.
    pub elapsed: std::time::Duration,
    /// Input-token count reported via `send_usage` for this turn.
    pub input_tokens: u64,
    /// Output-token count reported via `send_usage` for this turn.
    pub output_tokens: u64,
    /// Provider context-window size reported via `send_usage` for this turn.
    pub context_window: u64,
}
19+
20+
/// Headless channel that feeds pre-loaded prompts and captures agent responses.
21+
///
22+
/// Used by the bench runner to drive the agent loop without a real terminal or
23+
/// network connection. `recv()` drains the prompt queue; `send()` / `flush_chunks()`
24+
/// accumulate the response into `responses`.
25+
pub struct BenchmarkChannel {
26+
prompts: VecDeque<String>,
27+
responses: Vec<CapturedResponse>,
28+
current_index: usize,
29+
total: usize,
30+
// Streaming chunk accumulation
31+
chunk_buffer: String,
32+
chunk_start: Option<Instant>,
33+
// Token usage for the current prompt (updated by send_usage)
34+
pending_input_tokens: u64,
35+
pending_output_tokens: u64,
36+
pending_context_window: u64,
37+
}
38+
39+
impl BenchmarkChannel {
40+
/// Create a new channel from a list of prompt strings.
41+
#[must_use]
42+
pub fn new(prompts: Vec<String>) -> Self {
43+
let total = prompts.len();
44+
Self {
45+
prompts: VecDeque::from(prompts),
46+
responses: Vec::new(),
47+
current_index: 0,
48+
total,
49+
chunk_buffer: String::new(),
50+
chunk_start: None,
51+
pending_input_tokens: 0,
52+
pending_output_tokens: 0,
53+
pending_context_window: 0,
54+
}
55+
}
56+
57+
/// Total number of prompts this channel was initialised with.
58+
#[must_use]
59+
pub fn total(&self) -> usize {
60+
self.total
61+
}
62+
63+
/// Consume and return all captured responses.
64+
#[must_use]
65+
pub fn into_responses(self) -> Vec<CapturedResponse> {
66+
self.responses
67+
}
68+
69+
/// Borrow the captured responses.
70+
#[must_use]
71+
pub fn responses(&self) -> &[CapturedResponse] {
72+
&self.responses
73+
}
74+
75+
fn flush_chunk_buffer(&mut self) {
76+
if self.chunk_buffer.is_empty() {
77+
return;
78+
}
79+
let elapsed = self
80+
.chunk_start
81+
.map_or(std::time::Duration::ZERO, |s| s.elapsed());
82+
self.responses.push(CapturedResponse {
83+
prompt_index: self.current_index.saturating_sub(1),
84+
text: std::mem::take(&mut self.chunk_buffer),
85+
elapsed,
86+
input_tokens: self.pending_input_tokens,
87+
output_tokens: self.pending_output_tokens,
88+
context_window: self.pending_context_window,
89+
});
90+
self.chunk_start = None;
91+
self.pending_input_tokens = 0;
92+
self.pending_output_tokens = 0;
93+
self.pending_context_window = 0;
94+
}
95+
}
96+
97+
impl zeph_core::channel::Channel for BenchmarkChannel {
98+
async fn recv(&mut self) -> Result<Option<ChannelMessage>, ChannelError> {
99+
match self.prompts.pop_front() {
100+
Some(text) => {
101+
self.current_index += 1;
102+
Ok(Some(ChannelMessage {
103+
text,
104+
attachments: vec![],
105+
}))
106+
}
107+
None => Ok(None),
108+
}
109+
}
110+
111+
fn supports_exit(&self) -> bool {
112+
false
113+
}
114+
115+
async fn send(&mut self, text: &str) -> Result<(), ChannelError> {
116+
self.responses.push(CapturedResponse {
117+
prompt_index: self.current_index.saturating_sub(1),
118+
text: text.to_owned(),
119+
elapsed: std::time::Duration::ZERO,
120+
input_tokens: self.pending_input_tokens,
121+
output_tokens: self.pending_output_tokens,
122+
context_window: self.pending_context_window,
123+
});
124+
self.pending_input_tokens = 0;
125+
self.pending_output_tokens = 0;
126+
self.pending_context_window = 0;
127+
Ok(())
128+
}
129+
130+
async fn send_chunk(&mut self, chunk: &str) -> Result<(), ChannelError> {
131+
if self.chunk_start.is_none() {
132+
self.chunk_start = Some(Instant::now());
133+
}
134+
self.chunk_buffer.push_str(chunk);
135+
Ok(())
136+
}
137+
138+
async fn flush_chunks(&mut self) -> Result<(), ChannelError> {
139+
self.flush_chunk_buffer();
140+
Ok(())
141+
}
142+
143+
async fn send_usage(
144+
&mut self,
145+
input_tokens: u64,
146+
output_tokens: u64,
147+
context_window: u64,
148+
) -> Result<(), ChannelError> {
149+
self.pending_input_tokens = input_tokens;
150+
self.pending_output_tokens = output_tokens;
151+
self.pending_context_window = context_window;
152+
Ok(())
153+
}
154+
155+
// TODO(bench-runner): tool output is intentionally dropped here.
156+
// The default trait impl calls self.send(&formatted), which would push tool output
157+
// into responses and corrupt benchmark metrics. Override to no-op until Phase 2
158+
// when tool calls are captured separately.
159+
async fn send_tool_output(&mut self, _event: ToolOutputEvent<'_>) -> Result<(), ChannelError> {
160+
Ok(())
161+
}
162+
}
163+
164+
#[cfg(test)]
mod tests {
    use zeph_core::channel::{
        Channel, ElicitationField, ElicitationFieldType, ElicitationRequest, ElicitationResponse,
        ToolOutputEvent,
    };

    use super::*;

    #[tokio::test]
    async fn recv_drains_queue_and_returns_none_when_empty() {
        let mut channel = BenchmarkChannel::new(vec!["hello".into(), "world".into()]);
        let first = channel.recv().await.unwrap().unwrap();
        assert_eq!(first.text, "hello");
        let second = channel.recv().await.unwrap().unwrap();
        assert_eq!(second.text, "world");
        assert!(channel.recv().await.unwrap().is_none());
    }

    #[tokio::test]
    async fn send_accumulates_response() {
        let mut channel = BenchmarkChannel::new(vec!["prompt".into()]);
        let _ = channel.recv().await.unwrap();
        channel.send("response text").await.unwrap();
        let captured = channel.responses();
        assert_eq!(captured.len(), 1);
        assert_eq!(captured[0].text, "response text");
    }

    #[tokio::test]
    async fn confirm_returns_true() {
        let mut channel = BenchmarkChannel::new(vec![]);
        assert!(channel.confirm("delete?").await.unwrap());
    }

    #[tokio::test]
    async fn elicit_returns_declined() {
        let mut channel = BenchmarkChannel::new(vec![]);
        let request = ElicitationRequest {
            server_name: "test-server".into(),
            message: "provide input".into(),
            fields: vec![ElicitationField {
                name: "field".into(),
                description: None,
                field_type: ElicitationFieldType::String,
                required: true,
            }],
        };
        let response = channel.elicit(request).await.unwrap();
        assert!(matches!(response, ElicitationResponse::Declined));
    }

    #[tokio::test]
    async fn send_chunk_and_flush_captures_response() {
        let mut channel = BenchmarkChannel::new(vec!["p".into()]);
        let _ = channel.recv().await.unwrap();
        channel.send_chunk("part1").await.unwrap();
        channel.send_chunk(" part2").await.unwrap();
        channel.flush_chunks().await.unwrap();
        let captured = channel.responses();
        assert_eq!(captured.len(), 1);
        assert_eq!(captured[0].text, "part1 part2");
    }

    #[tokio::test]
    async fn supports_exit_returns_false() {
        let channel = BenchmarkChannel::new(vec![]);
        assert!(!channel.supports_exit());
    }

    #[tokio::test]
    async fn send_usage_captured_on_send() {
        let mut channel = BenchmarkChannel::new(vec!["p".into()]);
        let _ = channel.recv().await.unwrap();
        channel.send_usage(10, 20, 128_000).await.unwrap();
        channel.send("answer").await.unwrap();
        let first = &channel.responses()[0];
        assert_eq!(first.input_tokens, 10);
        assert_eq!(first.output_tokens, 20);
        assert_eq!(first.context_window, 128_000);
    }

    #[tokio::test]
    async fn send_tool_output_does_not_add_to_responses() {
        let mut channel = BenchmarkChannel::new(vec!["p".into()]);
        let _ = channel.recv().await.unwrap();
        let event = ToolOutputEvent {
            tool_name: "bash",
            body: "some tool output",
            diff: None,
            filter_stats: None,
            kept_lines: None,
            locations: None,
            tool_call_id: "tc-1",
            is_error: false,
            parent_tool_use_id: None,
            raw_response: None,
            started_at: None,
        };
        channel.send_tool_output(event).await.unwrap();
        // Tool output must not be captured as a benchmark response.
        assert_eq!(channel.responses().len(), 0);
    }
}

0 commit comments

Comments
 (0)