From 2d1d4cda23eaf145afade04b998381fc2b0f2ba5 Mon Sep 17 00:00:00 2001 From: Nick Mitchell Date: Tue, 17 Feb 2026 19:07:14 -0500 Subject: [PATCH] feat(bench): add ragcsv benchmark for CSV-based RAG evaluation Standalone benchmark that reads a CSV of evaluation queries, executes each as a span query with document fragments as context, then grades response accuracy via a second LLM call. Reports quantile statistics for accuracy and response time. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Nick Mitchell --- Cargo.lock | 1 + spnl/Cargo.toml | 5 + spnl/benches/README-RAGCSV.md | 148 ++++++++++ spnl/benches/ragcsv.rs | 491 ++++++++++++++++++++++++++++++++++ 4 files changed, 645 insertions(+) create mode 100644 spnl/benches/README-RAGCSV.md create mode 100644 spnl/benches/ragcsv.rs diff --git a/Cargo.lock b/Cargo.lock index 16547833..4181ba7f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10328,6 +10328,7 @@ dependencies = [ "cargo-husky", "clap", "criterion", + "csv", "derive_builder", "dirs", "either", diff --git a/spnl/Cargo.toml b/spnl/Cargo.toml index 2bd65ca5..5e33ac13 100644 --- a/spnl/Cargo.toml +++ b/spnl/Cargo.toml @@ -115,6 +115,7 @@ tokio = { version = "1.44.1", features = ["rt-multi-thread"] } dirs = "6.0" reqwest = { version = "0.12", features = ["blocking", "json"] } serde_json = "1.0" +csv = "1.3" [[bench]] name = "haystack" @@ -137,3 +138,7 @@ required-features = ["tok"] name = "ruler" harness = false required-features = ["tok"] + +[[bench]] +name = "ragcsv" +harness = false diff --git a/spnl/benches/README-RAGCSV.md b/spnl/benches/README-RAGCSV.md new file mode 100644 index 00000000..60127530 --- /dev/null +++ b/spnl/benches/README-RAGCSV.md @@ -0,0 +1,148 @@ +# RAGCSV Benchmark for SPNL + +Evaluates RAG answer accuracy from a CSV of evaluation queries using SPNL span queries. Each row contains a question, document fragments, and an expected answer. The benchmark executes each query, then uses a second LLM call to grade the response accuracy on a 0-100 scale. + +## Overview + +1. Load a CSV file with evaluation data (questions, document fragments, expected answers) +2. For each row, construct a span query that provides the document fragments as context and asks the question +3. Grade the model's response against the expected answer using a second LLM query +4. Report quantile statistics for accuracy and response time + +## CSV Schema + +The CSV has no headers. Columns are positional: + +| Index | Content | +|-------|---------| +| 0 | Expected result | +| 1 | Fragments -- Python-style single-quoted JSON array of `{'page_content': '...', 'metadata': {'title': '...'}, ...}` | +| 2 | Hallucination detection proposal (unused) | +| 3 | Timestamp (unused) | +| 4 | Question to pose | + +## Configuration via Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `RAGCSV_FILE` | (required) | Path to the CSV file | +| `RAGCSV_MODEL` | `ollama/granite3.3:2b` | Model for inference | +| `RAGCSV_GRADING_MODEL` | same as `RAGCSV_MODEL` | Model for accuracy grading | +| `RAGCSV_CONCURRENCY` | `1` | Max parallel query executions | +| `RAGCSV_LIMIT` | all rows | Limit number of rows to process | +| `RAGCSV_MAX_TOKENS` | `512` | Max tokens for primary query | +| `RAGCSV_DEBUG` | `false` | Print first row's query/response details | + +## Usage Examples + +### Basic Usage + +```bash +RAGCSV_FILE=~/data/eval.csv cargo bench --bench ragcsv +``` + +### Debug Mode + +```bash +RAGCSV_FILE=~/data/eval.csv RAGCSV_DEBUG=1 RAGCSV_LIMIT=3 \ + cargo bench --bench ragcsv +``` + +### Concurrent Execution + +```bash +RAGCSV_FILE=~/data/eval.csv \ +RAGCSV_MODEL="ollama/granite3.3:2b" \ +RAGCSV_CONCURRENCY=4 \ +RAGCSV_LIMIT=100 \ + cargo bench --bench ragcsv +``` + +### Separate Grading Model + +```bash +RAGCSV_FILE=~/data/eval.csv \ +RAGCSV_MODEL="ollama/granite3.3:2b" \ +RAGCSV_GRADING_MODEL="ollama/llama3.2:3b" \ + cargo bench --bench ragcsv +``` + +## Output Format + +Progress is shown during execution: + +``` +[00:01:23] 42/100 | Avg Acc=72.3% | Pass(>=75%)=28/42 +``` + +Final report includes quantile statistics for accuracy and total time: + +``` +=== RAGCSV Eval Accuracy (n=100) === + min: 10.0% + p25: 55.0% + p50: 75.0% + p75: 85.0% + p90: 95.0% + p99: 100.0% + max: 100.0% + avg: 71.2% + pass (>=75%): 58/100 + +=== RAGCSV Eval Total Time (n=100) === + min: 234ms + p25: 456ms + p50: 623ms + p75: 891ms + p90: 1203ms + p99: 2456ms + max: 3012ms + avg: 702ms +``` + +## Implementation Details + +### Query Construction + +Each row produces a span query of the form: + +``` +g model + (cross + (system "You are a helpful assistant. Answer the question based only on the provided documents.") + (plus [user "Document: {title}\n{page_content}", ...]) + (user question) + ) + temperature + max_tokens +``` + +### Accuracy Grading + +After getting the model response, a second query grades accuracy: + +``` +g grading_model + (cross + (system "You are an accuracy evaluator...") + (user "Expected answer: ...\n\nActual answer: ...\n\nAccuracy score (0-100):") + ) + 0.0 // deterministic + 16 // small max_tokens +``` + +The grading model returns an integer 0-100, which is parsed from the response. + +### Concurrency + +Rows are dispatched as tokio tasks with a semaphore limiting parallelism to `RAGCSV_CONCURRENCY`. Results are collected via an mpsc channel. + +## Future Work + +- Capture TTFT (time to first token) and ITL (inter-token latency) metrics programmatically +- Support additional CSV schemas +- Add per-row result CSV export + +--- + +Made with Bob diff --git a/spnl/benches/ragcsv.rs b/spnl/benches/ragcsv.rs new file mode 100644 index 00000000..974cfecd --- /dev/null +++ b/spnl/benches/ragcsv.rs @@ -0,0 +1,491 @@ +mod bench_progress; + +use spnl::{ + ExecuteOptions, execute, + ir::{Message::Assistant, Query}, + spnl, +}; +use std::sync::Arc; +use std::time::Instant; +use tokio::sync::{Semaphore, mpsc}; + +// --------------------------------------------------------------------------- +// CSV row and fragment types +// --------------------------------------------------------------------------- + +#[derive(Debug)] +struct EvalRow { + index: usize, + expected: String, + fragments: Vec, + question: String, +} + +#[derive(Debug, serde::Deserialize)] +#[allow(dead_code)] +struct Fragment { + page_content: String, + metadata: FragmentMetadata, +} + +#[derive(Debug, serde::Deserialize)] +#[allow(dead_code)] +struct FragmentMetadata { + #[serde(default)] + title: String, +} + +// --------------------------------------------------------------------------- +// Metrics +// --------------------------------------------------------------------------- + +struct RowMetrics { + #[allow(dead_code)] + row_index: usize, + accuracy: f64, + #[allow(dead_code)] + ttft_ms: Option, + #[allow(dead_code)] + itl_ms: Option, + total_time_ms: f64, +} + +// --------------------------------------------------------------------------- +// CSV loading +// --------------------------------------------------------------------------- + +/// Convert Python repr-style text to valid JSON. +/// +/// Python's `repr()` mixes single- and double-quoted strings and uses +/// `None`, `True`, `False` instead of JSON's `null`, `true`, `false`. +/// This state machine handles both quote styles: +/// +/// - Single-quoted strings (`'...'`): converted to `"..."`, with `\'` → `'` +/// and any literal `"` inside → `\"` +/// - Double-quoted strings (`"..."`): passed through as-is (already JSON-compatible) +/// - Outside strings: `None` → `null`, `True` → `true`, `False` → `false` +fn python_repr_to_json(input: &str) -> String { + let mut out = String::with_capacity(input.len()); + let bytes = input.as_bytes(); + let len = bytes.len(); + let mut i = 0; + + while i < len { + let c = bytes[i] as char; + + match c { + '\'' => { + // Single-quoted string → convert to double-quoted + out.push('"'); + i += 1; + while i < len { + let sc = bytes[i] as char; + match sc { + '\\' if i + 1 < len => { + let next = bytes[i + 1] as char; + if next == '\'' { + // \' → plain apostrophe (no longer needs escaping) + out.push('\''); + i += 2; + } else { + // pass through other escapes + out.push('\\'); + out.push(next); + i += 2; + } + } + '\'' => { + // end of single-quoted string + out.push('"'); + i += 1; + break; + } + '"' => { + // literal double quote inside → must escape for JSON + out.push('\\'); + out.push('"'); + i += 1; + } + _ => { + out.push(sc); + i += 1; + } + } + } + } + '"' => { + // Double-quoted string → pass through as-is + out.push('"'); + i += 1; + while i < len { + let sc = bytes[i] as char; + match sc { + '\\' if i + 1 < len => { + // pass through escaped char + out.push('\\'); + out.push(bytes[i + 1] as char); + i += 2; + } + '"' => { + out.push('"'); + i += 1; + break; + } + _ => { + out.push(sc); + i += 1; + } + } + } + } + // Python keywords → JSON equivalents (only match outside strings) + 'N' if input[i..].starts_with("None") => { + out.push_str("null"); + i += 4; + } + 'T' if input[i..].starts_with("True") => { + out.push_str("true"); + i += 4; + } + 'F' if input[i..].starts_with("False") => { + out.push_str("false"); + i += 5; + } + _ => { + out.push(c); + i += 1; + } + } + } + + out +} + +fn load_csv(path: &str, limit: Option) -> Vec { + let mut rdr = csv::ReaderBuilder::new() + .has_headers(true) + .from_path(path) + .unwrap_or_else(|e| panic!("Failed to open CSV at {path}: {e}")); + + let mut rows = Vec::new(); + for (idx, result) in rdr.records().enumerate() { + if let Some(limit) = limit { + if idx >= limit { + break; + } + } + let record = result.unwrap_or_else(|e| panic!("CSV parse error at row {idx}: {e}")); + + let expected = record.get(0).unwrap_or("").to_string(); + let fragments_raw = record.get(1).unwrap_or("[]").to_string(); + // columns 2 (hallucination_proposal) and 3 (timestamp) are unused + let question = record.get(4).unwrap_or("").to_string(); + + // Parse fragments: convert Python repr to valid JSON + let fragments_json = python_repr_to_json(&fragments_raw); + let fragments: Vec = serde_json::from_str(&fragments_json).unwrap_or_else(|e| { + if idx == 0 { + eprintln!( + "Warning: failed to parse fragments for row {idx}: {e}\n raw: {}", + &fragments_raw[..fragments_raw.len().min(200)] + ); + } + vec![] + }); + + rows.push(EvalRow { + index: idx, + expected, + fragments, + question, + }); + } + + rows +} + +// --------------------------------------------------------------------------- +// Query construction helpers +// --------------------------------------------------------------------------- + +fn build_primary_query( + model: &str, + question: &str, + fragments: &[Fragment], + max_tokens: i32, +) -> Query { + let model = model.to_string(); + let system_prompt = + "You are a helpful assistant. Answer the question based only on the provided Documents." + .to_string(); + + let doc_messages: Vec = fragments + .iter() + .enumerate() + .map(|(idx, f)| { + let text = format!("Document {idx}: {}", f.page_content); + spnl!(user text) + }) + .collect(); + + let question = question.to_string(); + let temperature: f32 = 0.0; + + spnl!( + g model + (cross + (system system_prompt) + (plus doc_messages) + (user question) + ) + temperature + max_tokens + ) +} + +fn build_grading_query(model: &str, expected: &str, actual: &str) -> Query { + let model = model.to_string(); + let system_prompt = "You are an accuracy evaluator. Compare the expected answer to the actual answer and return ONLY a single integer 0-100 representing accuracy percentage. 100 means perfectly correct, 0 means completely wrong.".to_string(); + let user_prompt = format!( + "Expected answer: {expected}\n\nActual answer: {actual}\n\nAccuracy score (0-100):" + ); + let temperature: f32 = 0.0; + let max_tokens: i32 = 16; + + spnl!( + g model + (cross + (system system_prompt) + (user user_prompt) + ) + temperature + max_tokens + ) +} + +fn parse_accuracy(response: &str) -> f64 { + // Extract the first integer from the response + let trimmed = response.trim(); + trimmed + .split(|c: char| !c.is_ascii_digit()) + .find(|s| !s.is_empty()) + .and_then(|s| s.parse::().ok()) + .map(|v| v.clamp(0.0, 100.0)) + .unwrap_or(0.0) +} + +// --------------------------------------------------------------------------- +// Quantile computation +// --------------------------------------------------------------------------- + +fn compute_quantiles(values: &[f64]) -> (f64, f64, f64, f64, f64, f64, f64, f64) { + let mut sorted = values.to_vec(); + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let len = sorted.len(); + + let min = sorted[0]; + let p25 = sorted[len * 25 / 100]; + let p50 = sorted[len * 50 / 100]; + let p75 = sorted[len * 75 / 100]; + let p90 = sorted[len * 90 / 100]; + let p99 = sorted[(len * 99 / 100).min(len - 1)]; + let max = sorted[len - 1]; + let avg = sorted.iter().sum::() / len as f64; + + (min, p25, p50, p75, p90, p99, max, avg) +} + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- + +fn main() { + // Read env vars + let csv_path = std::env::var("RAGCSV_FILE") + .expect("RAGCSV_FILE environment variable is required (path to CSV file)"); + let model = + std::env::var("RAGCSV_MODEL").unwrap_or_else(|_| "ollama/granite3.3:2b".to_string()); + let grading_model = std::env::var("RAGCSV_GRADING_MODEL").unwrap_or_else(|_| model.clone()); + let concurrency: usize = std::env::var("RAGCSV_CONCURRENCY") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(1); + let limit: Option = std::env::var("RAGCSV_LIMIT") + .ok() + .and_then(|s| s.parse().ok()); + let max_tokens: i32 = std::env::var("RAGCSV_MAX_TOKENS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(512); + let debug = std::env::var("RAGCSV_DEBUG") + .map(|v| v == "1" || v.to_lowercase() == "true") + .unwrap_or(false); + + // Load CSV + let rows = load_csv(&csv_path, limit); + let total = rows.len(); + eprintln!("Loaded {total} rows from {csv_path}"); + eprintln!( + "Model: {model} | Grading: {grading_model} | Concurrency: {concurrency} | Max tokens: {max_tokens}" + ); + + if total == 0 { + eprintln!("No rows to process."); + return; + } + + // Build tokio runtime + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + runtime.block_on(async { + let semaphore = Arc::new(Semaphore::new(concurrency)); + let (tx, mut rx) = mpsc::channel::(total); + + let options = ExecuteOptions { + silent: true, + ..Default::default() + }; + + // Spawn all row tasks + for row in rows { + let sem = Arc::clone(&semaphore); + let tx = tx.clone(); + let model = model.clone(); + let grading_model = grading_model.clone(); + let options = ExecuteOptions { + silent: options.silent, + ..Default::default() + }; + + tokio::spawn(async move { + let _permit = sem.acquire().await.unwrap(); + + let row_idx = row.index; + let start = Instant::now(); + + // Build and execute primary query + let query = build_primary_query(&model, &row.question, &row.fragments, max_tokens); + + if debug && row_idx == 0 { + eprintln!("\n=== DEBUG: Row 0 Query ===\n{query:?}"); + } + + let actual = match execute(&query, &options).await { + Ok(Query::Message(Assistant(s))) => s, + Ok(other) => { + eprintln!("Row {row_idx}: unexpected response type: {other:?}"); + String::new() + } + Err(e) => { + eprintln!("Row {row_idx}: primary query error: {e}"); + String::new() + } + }; + + let total_time_ms = start.elapsed().as_secs_f64() * 1000.0; + + if debug && row_idx == 0 { + eprintln!("=== DEBUG: Row 0 Response ===\n{actual}"); + eprintln!("=== DEBUG: Row 0 Expected ===\n{}", row.expected); + } + + // Build and execute grading query + let grading_query = build_grading_query(&grading_model, &row.expected, &actual); + let accuracy = match execute(&grading_query, &options).await { + Ok(Query::Message(Assistant(s))) => { + let acc = parse_accuracy(&s); + if debug && row_idx == 0 { + eprintln!("=== DEBUG: Row 0 Grading Response ===\n{s}"); + eprintln!("=== DEBUG: Row 0 Parsed Accuracy === {acc}%"); + } + acc + } + Ok(_) => 0.0, + Err(e) => { + eprintln!("Row {row_idx}: grading query error: {e}"); + 0.0 + } + }; + + let _ = tx + .send(RowMetrics { + row_index: row_idx, + accuracy, + ttft_ms: None, + itl_ms: None, + total_time_ms, + }) + .await; + }); + } + + // Drop the sender so the receiver knows when all tasks are done + drop(tx); + + // Collect results with progress bar + let pb = bench_progress::create_benchmark_progress(total as u64, "RAGCSV Eval"); + let mut metrics: Vec = Vec::with_capacity(total); + let mut accuracy_sum = 0.0; + let mut pass_count = 0usize; + + while let Some(m) = rx.recv().await { + accuracy_sum += m.accuracy; + if m.accuracy >= 75.0 { + pass_count += 1; + } + metrics.push(m); + + let n = metrics.len(); + let avg_acc = accuracy_sum / n as f64; + pb.set_position(n as u64); + pb.set_message(format!( + "{n}/{total} | Avg Acc={avg_acc:.1}% | Pass(>=75%)={pass_count}/{n}" + )); + } + + bench_progress::finish_benchmark_progress( + &pb, + format!( + "Done {}/{total} | Avg Acc={:.1}% | Pass(>=75%)={pass_count}/{total}", + metrics.len(), + accuracy_sum / metrics.len().max(1) as f64 + ), + ); + + // Print final report + if metrics.is_empty() { + eprintln!("\nNo results collected."); + return; + } + + let accuracies: Vec = metrics.iter().map(|m| m.accuracy).collect(); + let (min, p25, p50, p75, p90, p99, max, avg) = compute_quantiles(&accuracies); + + eprintln!("\n=== RAGCSV Eval Accuracy (n={}) ===", metrics.len()); + eprintln!(" min: {min:.1}%"); + eprintln!(" p25: {p25:.1}%"); + eprintln!(" p50: {p50:.1}%"); + eprintln!(" p75: {p75:.1}%"); + eprintln!(" p90: {p90:.1}%"); + eprintln!(" p99: {p99:.1}%"); + eprintln!(" max: {max:.1}%"); + eprintln!(" avg: {avg:.1}%"); + eprintln!(" pass (>=75%): {pass_count}/{}", metrics.len()); + + let times: Vec = metrics.iter().map(|m| m.total_time_ms).collect(); + let (tmin, t25, t50, t75, t90, t99, tmax, tavg) = compute_quantiles(×); + + eprintln!("\n=== RAGCSV Eval Total Time (n={}) ===", metrics.len()); + eprintln!(" min: {tmin:.0}ms"); + eprintln!(" p25: {t25:.0}ms"); + eprintln!(" p50: {t50:.0}ms"); + eprintln!(" p75: {t75:.0}ms"); + eprintln!(" p90: {t90:.0}ms"); + eprintln!(" p99: {t99:.0}ms"); + eprintln!(" max: {tmax:.0}ms"); + eprintln!(" avg: {tavg:.0}ms"); + }); +}