Skip to content

Commit 2fe5fce

Browse files
committed
feat(bench): implement LOCOMO, FRAMES, and GAIA dataset loaders
Closes #2836, #2837, #2839 Add shared Scenario/Evaluator traits and metric functions: - token_f1: whitespace-token overlap F1 score - exact_match: case-insensitive, punctuation-stripped equality - gaia_normalized_exact_match: strips articles, punctuation, collapses whitespace Loaders and evaluators: - LocomoLoader: parses lmlab/locomo JSON array; one Scenario per QA pair; LocomoEvaluator uses token F1 with threshold 0.5 - FramesLoader: parses google/frames-benchmark JSONL; stores reasoning_types in metadata; FramesEvaluator uses exact match - GaiaLoader: parses gaia-benchmark/GAIA JSONL with optional --level filter; GaiaEvaluator uses GAIA-normalized exact match 52 unit tests across all new modules; all 7788 workspace tests pass.
1 parent c57cdb9 commit 2fe5fce

6 files changed

Lines changed: 713 additions & 0 deletions

File tree

crates/zeph-bench/src/lib.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,17 @@ pub mod cli;
66
pub mod dataset;
77
pub mod deterministic;
88
pub mod error;
9+
pub mod loaders;
910
pub mod results;
11+
pub mod scenario;
1012

1113
pub use channel::BenchmarkChannel;
1214
pub use cli::BenchCommand;
1315
pub use dataset::{DatasetFormat, DatasetMeta, DatasetRegistry};
1416
pub use deterministic::apply_deterministic_overrides;
1517
pub use error::BenchError;
1618
pub use results::{Aggregate, BenchRun, ResultWriter, RunStatus, ScenarioResult};
19+
pub use scenario::{
20+
DatasetLoader, EvalResult, Evaluator, Scenario, exact_match, gaia_normalized_exact_match,
21+
token_f1,
22+
};
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2+
// SPDX-License-Identifier: MIT OR Apache-2.0
3+
4+
use std::{
5+
io::{BufRead as _, BufReader},
6+
path::Path,
7+
};
8+
9+
use serde::Deserialize;
10+
11+
use crate::{
12+
error::BenchError,
13+
scenario::{DatasetLoader, EvalResult, Evaluator, Scenario, exact_match},
14+
};
15+
16+
/// One JSONL record from the google/frames-benchmark dataset.
///
/// Upstream keys are capitalized (`"Prompt"`, `"Answer"`), hence the serde
/// renames; `reasoning_types` is optional and kept as raw JSON.
#[derive(Debug, Deserialize)]
struct FramesRecord {
    /// The question posed to the agent (upstream key: `"Prompt"`).
    #[serde(rename = "Prompt")]
    prompt: String,
    /// The gold answer (upstream key: `"Answer"`).
    #[serde(rename = "Answer")]
    answer: String,
    /// Optional reasoning-category tags, stored verbatim as JSON.
    reasoning_types: Option<serde_json::Value>,
}
24+
25+
/// Loads FRAMES benchmark scenarios from a JSONL file.
///
/// Schema (google/frames-benchmark on HuggingFace):
/// ```json
/// {"Prompt": "...", "Answer": "...", "reasoning_types": [...], "wiki_links": [...]}
/// ```
///
/// Each line becomes one [`Scenario`] with id `"frames_{line_number}"`.
/// `reasoning_types` is stored in `metadata`.
///
/// Stateless unit struct; construct directly as `FramesLoader`.
#[derive(Debug)]
pub struct FramesLoader;
36+
37+
impl DatasetLoader for FramesLoader {
38+
fn name(&self) -> &'static str {
39+
"frames"
40+
}
41+
42+
/// # Errors
43+
///
44+
/// Returns [`BenchError::Io`] when the file cannot be read and
45+
/// [`BenchError::InvalidFormat`] when a JSONL line cannot be parsed.
46+
fn load(&self, path: &Path) -> Result<Vec<Scenario>, BenchError> {
47+
let file = std::fs::File::open(path)?;
48+
let reader = BufReader::new(file);
49+
50+
let mut scenarios = Vec::new();
51+
for (line_number, line) in reader.lines().enumerate() {
52+
let line = line?;
53+
let trimmed = line.trim();
54+
if trimmed.is_empty() {
55+
continue;
56+
}
57+
let record: FramesRecord = serde_json::from_str(trimmed)
58+
.map_err(|e| BenchError::InvalidFormat(format!("line {line_number}: {e}")))?;
59+
60+
let metadata = record.reasoning_types.unwrap_or(serde_json::Value::Null);
61+
62+
scenarios.push(Scenario {
63+
id: format!("frames_{line_number}"),
64+
prompt: record.prompt,
65+
expected: record.answer,
66+
metadata,
67+
});
68+
}
69+
Ok(scenarios)
70+
}
71+
}
72+
73+
/// Evaluates FRAMES responses using exact match.
///
/// Scoring is binary: 1.0 when the response exactly matches the expected
/// answer (per [`exact_match`] normalization), 0.0 otherwise.
#[derive(Debug)]
pub struct FramesEvaluator;
76+
77+
impl Evaluator for FramesEvaluator {
78+
fn evaluate(&self, scenario: &Scenario, agent_response: &str) -> EvalResult {
79+
let passed = exact_match(agent_response, &scenario.expected);
80+
EvalResult {
81+
scenario_id: scenario.id.clone(),
82+
score: if passed { 1.0 } else { 0.0 },
83+
passed,
84+
details: format!("exact_match={}", if passed { "true" } else { "false" }),
85+
}
86+
}
87+
}
88+
89+
#[cfg(test)]
mod tests {
    use super::*;

    const FIXTURE: &str = r#"{"Prompt": "What is 2+2?", "Answer": "4", "reasoning_types": ["math"], "wiki_links": []}
{"Prompt": "Capital of France?", "Answer": "Paris", "reasoning_types": ["geography"]}
"#;

    /// Writes `jsonl` to a temporary file and runs [`FramesLoader`] over it.
    fn load_fixture(jsonl: &str) -> Vec<Scenario> {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("frames.jsonl");
        std::fs::write(&path, jsonl).unwrap();
        FramesLoader.load(&path).unwrap()
    }

    #[test]
    fn load_parses_scenario_count() {
        assert_eq!(load_fixture(FIXTURE).len(), 2);
    }

    #[test]
    fn load_builds_correct_ids() {
        let scenarios = load_fixture(FIXTURE);
        assert_eq!(scenarios[0].id, "frames_0");
        assert_eq!(scenarios[1].id, "frames_1");
    }

    #[test]
    fn load_maps_prompt_and_expected() {
        let scenarios = load_fixture(FIXTURE);
        let first = &scenarios[0];
        assert_eq!(first.prompt, "What is 2+2?");
        assert_eq!(first.expected, "4");
    }

    #[test]
    fn load_stores_reasoning_types_in_metadata() {
        // reasoning_types is a JSON array in the fixture and must survive intact.
        assert!(load_fixture(FIXTURE)[0].metadata.is_array());
    }

    #[test]
    fn evaluator_exact_match_passes() {
        let scenarios = load_fixture(FIXTURE);
        let result = FramesEvaluator.evaluate(&scenarios[0], "4");
        assert!(result.passed);
        assert!((result.score - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn evaluator_wrong_answer_fails() {
        let scenarios = load_fixture(FIXTURE);
        let result = FramesEvaluator.evaluate(&scenarios[0], "5");
        assert!(!result.passed);
        assert!(result.score < f64::EPSILON);
    }

    #[test]
    fn evaluator_case_insensitive_match() {
        // Expected answer is "Paris"; a lowercase response must still pass.
        let scenarios = load_fixture(FIXTURE);
        assert!(FramesEvaluator.evaluate(&scenarios[1], "paris").passed);
    }

    #[test]
    fn load_invalid_jsonl_returns_error() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("bad.jsonl");
        std::fs::write(&path, "not json\n").unwrap();
        assert!(FramesLoader.load(&path).is_err());
    }
}
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2+
// SPDX-License-Identifier: MIT OR Apache-2.0
3+
4+
use std::{
5+
io::{BufRead as _, BufReader},
6+
path::Path,
7+
};
8+
9+
use serde::Deserialize;
10+
11+
use crate::{
12+
error::BenchError,
13+
scenario::{DatasetLoader, EvalResult, Evaluator, Scenario, gaia_normalized_exact_match},
14+
};
15+
16+
/// One JSONL record from the gaia-benchmark/GAIA dataset.
///
/// Upstream keys are capitalized and may contain spaces (`"Final answer"`,
/// `"Annotator Metadata"`), hence the serde renames.
#[derive(Debug, Deserialize)]
struct GaiaRecord {
    /// Unique task identifier; becomes the scenario id.
    task_id: String,
    /// The question posed to the agent (upstream key: `"Question"`).
    #[serde(rename = "Question")]
    question: String,
    /// Difficulty level; matched against the loader's optional level filter.
    #[serde(rename = "Level")]
    level: u8,
    /// Gold answer (upstream key: `"Final answer"`).
    #[serde(rename = "Final answer")]
    final_answer: String,
    /// Optional annotator notes, kept as raw JSON.
    #[serde(rename = "Annotator Metadata")]
    annotator_metadata: Option<serde_json::Value>,
}
28+
29+
/// Loads GAIA benchmark scenarios from a JSONL file.
///
/// Schema (gaia-benchmark/GAIA on HuggingFace):
/// ```json
/// {"task_id": "...", "Question": "...", "Level": 1, "Final answer": "...", "Annotator Metadata": {...}}
/// ```
///
/// When `level` is `Some(n)`, only scenarios of that level are loaded.
///
/// See [`GaiaLoader::all_levels`] and [`GaiaLoader::with_level`] for
/// convenience constructors.
#[derive(Debug)]
pub struct GaiaLoader {
    /// Optional level filter. When `Some`, only scenarios with a matching `Level` are loaded.
    pub level: Option<u8>,
}
42+
43+
impl GaiaLoader {
44+
/// Create a loader that returns all levels.
45+
#[must_use]
46+
pub fn all_levels() -> Self {
47+
Self { level: None }
48+
}
49+
50+
/// Create a loader that filters to a specific difficulty level.
51+
#[must_use]
52+
pub fn with_level(level: u8) -> Self {
53+
Self { level: Some(level) }
54+
}
55+
}
56+
57+
impl DatasetLoader for GaiaLoader {
58+
fn name(&self) -> &'static str {
59+
"gaia"
60+
}
61+
62+
/// # Errors
63+
///
64+
/// Returns [`BenchError::Io`] when the file cannot be read and
65+
/// [`BenchError::InvalidFormat`] when a JSONL line cannot be parsed.
66+
fn load(&self, path: &Path) -> Result<Vec<Scenario>, BenchError> {
67+
let file = std::fs::File::open(path)?;
68+
let reader = BufReader::new(file);
69+
70+
let mut scenarios = Vec::new();
71+
for (line_number, line) in reader.lines().enumerate() {
72+
let line = line?;
73+
let trimmed = line.trim();
74+
if trimmed.is_empty() {
75+
continue;
76+
}
77+
let record: GaiaRecord = serde_json::from_str(trimmed)
78+
.map_err(|e| BenchError::InvalidFormat(format!("line {line_number}: {e}")))?;
79+
80+
if let Some(filter_level) = self.level
81+
&& record.level != filter_level
82+
{
83+
continue;
84+
}
85+
86+
let metadata = serde_json::json!({
87+
"level": record.level,
88+
"annotator_metadata": record.annotator_metadata,
89+
});
90+
91+
scenarios.push(Scenario {
92+
id: record.task_id,
93+
prompt: record.question,
94+
expected: record.final_answer,
95+
metadata,
96+
});
97+
}
98+
Ok(scenarios)
99+
}
100+
}
101+
102+
/// Evaluates GAIA responses using GAIA-normalized exact match.
///
/// Scoring is binary: 1.0 when the response matches the expected answer
/// after [`gaia_normalized_exact_match`] normalization, 0.0 otherwise.
#[derive(Debug)]
pub struct GaiaEvaluator;
105+
106+
impl Evaluator for GaiaEvaluator {
107+
fn evaluate(&self, scenario: &Scenario, agent_response: &str) -> EvalResult {
108+
let passed = gaia_normalized_exact_match(agent_response, &scenario.expected);
109+
EvalResult {
110+
scenario_id: scenario.id.clone(),
111+
score: if passed { 1.0 } else { 0.0 },
112+
passed,
113+
details: format!(
114+
"gaia_normalized_exact_match={}",
115+
if passed { "true" } else { "false" }
116+
),
117+
}
118+
}
119+
}
120+
121+
#[cfg(test)]
mod tests {
    use super::*;

    const FIXTURE: &str = r#"{"task_id": "t1", "Question": "What year did WWII end?", "Level": 1, "Final answer": "1945", "Annotator Metadata": {"difficulty": "easy"}}
{"task_id": "t2", "Question": "Who wrote Hamlet?", "Level": 2, "Final answer": "Shakespeare", "Annotator Metadata": null}
{"task_id": "t3", "Question": "Capital of Japan?", "Level": 1, "Final answer": "Tokyo", "Annotator Metadata": null}
"#;

    /// Writes `jsonl` to a temporary file and loads it with the given filter.
    fn load_fixture(jsonl: &str, level: Option<u8>) -> Vec<Scenario> {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("gaia.jsonl");
        std::fs::write(&path, jsonl).unwrap();
        GaiaLoader { level }.load(&path).unwrap()
    }

    #[test]
    fn load_all_levels_parses_scenario_count() {
        assert_eq!(load_fixture(FIXTURE, None).len(), 3);
    }

    #[test]
    fn load_filters_by_level() {
        let scenarios = load_fixture(FIXTURE, Some(1));
        assert_eq!(scenarios.len(), 2);
        assert!(scenarios.iter().all(|s| s.metadata["level"] == 1));
    }

    #[test]
    fn load_maps_task_id_to_scenario_id() {
        let scenarios = load_fixture(FIXTURE, None);
        assert_eq!(scenarios[0].id, "t1");
        assert_eq!(scenarios[1].id, "t2");
    }

    #[test]
    fn load_maps_prompt_and_expected() {
        let scenarios = load_fixture(FIXTURE, None);
        let first = &scenarios[0];
        assert_eq!(first.prompt, "What year did WWII end?");
        assert_eq!(first.expected, "1945");
    }

    #[test]
    fn load_stores_level_in_metadata() {
        assert_eq!(load_fixture(FIXTURE, None)[1].metadata["level"], 2);
    }

    #[test]
    fn evaluator_normalized_match_passes() {
        let scenarios = load_fixture(FIXTURE, None);
        // The exact answer "1945" matches expected "1945" after normalization.
        assert!(GaiaEvaluator.evaluate(&scenarios[0], "1945").passed);
    }

    #[test]
    fn evaluator_wrong_answer_fails() {
        let scenarios = load_fixture(FIXTURE, None);
        let result = GaiaEvaluator.evaluate(&scenarios[0], "1944");
        assert!(!result.passed);
        assert!(result.score < f64::EPSILON);
    }

    #[test]
    fn evaluator_strips_article_the() {
        let scenarios = load_fixture(FIXTURE, None);
        // scenarios[2] expects "Tokyo"; a leading article must be stripped.
        assert!(GaiaEvaluator.evaluate(&scenarios[2], "The Tokyo").passed);
    }

    #[test]
    fn load_invalid_jsonl_returns_error() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("bad.jsonl");
        std::fs::write(&path, "not json\n").unwrap();
        assert!(GaiaLoader::all_levels().load(&path).is_err());
    }

    #[test]
    fn all_levels_constructor() {
        assert!(GaiaLoader::all_levels().level.is_none());
    }

    #[test]
    fn with_level_constructor() {
        assert_eq!(GaiaLoader::with_level(2).level, Some(2));
    }
}

0 commit comments

Comments
 (0)