bug-ops · bug-ops · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -126,11 +126,15 @@ jobs:
         run: cargo +nightly fmt --check
 
   lint-clippy:
-    name: Lint (clippy)
+    name: Lint (clippy, ${{ matrix.features }})
     needs: detect-changes
     if: needs.detect-changes.outputs.run-full-ci == 'true'
     runs-on: ubuntu-latest
     timeout-minutes: 10
+    strategy:
+      fail-fast: false  # a failure in one feature set must not cancel the other matrix leg
+      matrix:
+        features: [full, bench]  # `bench` is not part of `full`, so each set is linted on its own
     env:
       RUSTC_WRAPPER: sccache
       SCCACHE_GHA_ENABLED: "true"
@@ -143,10 +147,10 @@ jobs:
       - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2
         with:
           cache-targets: "false"
-          shared-key: "ci"
+          shared-key: "ci-clippy-${{ matrix.features }}"  # one cache key per matrix feature set
       - uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9
       - name: Clippy
-        run: cargo clippy --profile ci --workspace --features full -- -D warnings
+        run: cargo clippy --profile ci --workspace --features ${{ matrix.features }} -- -D warnings
 
   build-tests:
     name: Build Tests (${{ matrix.os }})
@@ -334,7 +338,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        bundle: [desktop, ide, server, chat, ml]
+        bundle: [desktop, ide, server, chat, ml, bench]
         include:
           - bundle: ml
             allow_failure: true

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,6 +27,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ### Added
 
+- **`zeph-bench` crate scaffold with `BenchmarkChannel`** (`#2828`): new optional crate `crates/zeph-bench/` gated on the `bench` feature flag (not included in `full`). `BenchmarkChannel` implements the `Channel` trait for headless benchmark execution: `recv()` drains an injected prompt queue, `send()`/`send_chunk()`/`flush_chunks()` accumulate LLM responses into a capture buffer, `send_usage()` records token stats, `confirm()` auto-approves, `elicit()` returns `Declined`, `send_tool_output()` is a no-op (tool outputs excluded from benchmark metrics). `DatasetRegistry` lists the 5 supported datasets: LongMemEval, LOCOMO, FRAMES, tau-bench, GAIA.
+
+- **`zeph bench` CLI subcommand** (`#2829`): top-level `bench` subcommand added to the `zeph` binary, gated on `#[cfg(feature = "bench")]`. Subcommands: `list` (print all datasets with cache status), `download --dataset <name>` (fetch and cache a dataset), `run --dataset <name> --output <path> [--scenario <id>] [--provider <name>] [--baseline] [--resume] [--no-deterministic]` (execute a benchmark), `show --results <path>` (pretty-print a results JSON file). An unknown dataset name or a missing cache causes the command to exit with code 1 and print a diagnostic message.
+
+- **Deterministic mode for benchmark runs** (`#2831`): bench runner applies `GenerationOverrides { temperature: Some(0.0), seed: Some(0) }` to the active LLM provider before constructing the agent, ensuring reproducible results across runs. Disabled with `--no-deterministic`.
+
 - **Supervised bounded background task management** (`#2816`, `#2821`): introduced `BackgroundSupervisor` in `zeph-core` with per-class concurrency limits (Enrichment=4, Telemetry=8) and drop-on-overflow policy. Background tasks use an `InflightGuard` drop-guard to free concurrency slots immediately on completion. Metrics (`bg_inflight`, `bg_dropped`, `bg_completed`) added to `AgentMetrics`. `persist_message()` refactored into two phases: foreground commit (SQLite/Qdrant write, essential metrics) and background enrichment (summarization, graph extraction, persona extraction, trajectory extraction). Two fire-and-forget `tokio::spawn` sites in `corrections.rs` migrated to the supervisor. Foreground turns no longer await enrichment work; tail latency from post-persist processing is eliminated.
 
 - **Bounded Candle inference worker** (`#2818`): replaced `Arc<Mutex<ModelWeights>>` with a

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -123,6 +123,7 @@ uuid = "1.23"
 wiremock = "0.6.5"
 zeroize = { version = "1", default-features = false }
 zeph-a2a = { path = "crates/zeph-a2a", version = "0.18.5" }
+zeph-bench = { path = "crates/zeph-bench", version = "0.18.6" }  # NOTE(review): sibling zeph-* entries pin 0.18.5 — confirm 0.18.6 is intended
 zeph-acp = { path = "crates/zeph-acp", version = "0.18.5" }
 zeph-db = { path = "crates/zeph-db", version = "0.18.5", default-features = false }
 zeph-channels = { path = "crates/zeph-channels", version = "0.18.5" }
@@ -175,6 +176,7 @@ server  = ["gateway", "a2a", "otel"]
 chat    = ["discord", "slack"]
 ml      = ["candle", "pdf"]
 full    = ["desktop", "ide", "server", "chat", "pdf", "scheduler", "classifiers"]
+bench   = ["dep:zeph-bench"]  # opt-in bundle; `full` above does not include it
 
 # === Individual feature flags ===
 a2a = ["dep:zeph-a2a", "zeph-a2a?/server", "zeph-a2a?/ibct"]
@@ -245,6 +247,7 @@ zeph-skills.workspace = true
 zeph-subagent.workspace = true
 zeph-tools.workspace = true
 zeph-tui = { workspace = true, optional = true }
+zeph-bench = { workspace = true, optional = true }  # optional: only built when the `bench` feature enables it
 
 [dev-dependencies]
 serial_test.workspace = true

diff --git a/crates/zeph-bench/Cargo.toml b/crates/zeph-bench/Cargo.toml
@@ -0,0 +1,26 @@
+[package]  # manifest for the benchmark-harness crate
+name = "zeph-bench"
+version.workspace = true  # this and the other `*.workspace = true` keys inherit from the workspace manifest
+edition.workspace = true
+authors.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+documentation.workspace = true
+keywords.workspace = true
+categories.workspace = true
+description = "Benchmark harness for evaluating Zeph agent performance on standardized datasets"
+publish = true  # NOTE(review): `true` is already Cargo's default — confirm this crate is meant to be published
+
+[dependencies]
+clap = { workspace = true, features = ["derive"] }  # `derive` is added on top of the workspace-level entry
+serde.workspace = true
+serde_json.workspace = true
+thiserror.workspace = true
+tokio = { workspace = true, features = ["time"] }  # `time` is added on top of the workspace-level entry
+zeph-config.workspace = true
+zeph-core.workspace = true
+zeph-llm.workspace = true
+
+[lints]
+workspace = true  # use the workspace-wide lint configuration