From 2d1d4cda23eaf145afade04b998381fc2b0f2ba5 Mon Sep 17 00:00:00 2001
From: Nick Mitchell <nickm@us.ibm.com>
Date: Tue, 17 Feb 2026 19:07:14 -0500
Subject: [PATCH] feat(bench): add ragcsv benchmark for CSV-based RAG
 evaluation

Standalone benchmark that reads a CSV of evaluation queries, executes
each as a span query with document fragments as context, then grades
response accuracy via a second LLM call. Reports quantile statistics
for accuracy and response time.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Nick Mitchell <nickm@us.ibm.com>
---
 Cargo.lock                    |   1 +
 spnl/Cargo.toml               |   5 +
 spnl/benches/README-RAGCSV.md | 148 ++++++++++
 spnl/benches/ragcsv.rs        | 491 ++++++++++++++++++++++++++++++++++
 4 files changed, 645 insertions(+)
 create mode 100644 spnl/benches/README-RAGCSV.md
 create mode 100644 spnl/benches/ragcsv.rs

diff --git a/Cargo.lock b/Cargo.lock
index 16547833..4181ba7f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10328,6 +10328,7 @@ dependencies = [
  "cargo-husky",
  "clap",
  "criterion",
+ "csv",
  "derive_builder",
  "dirs",
  "either",
diff --git a/spnl/Cargo.toml b/spnl/Cargo.toml
index 2bd65ca5..5e33ac13 100644
--- a/spnl/Cargo.toml
+++ b/spnl/Cargo.toml
@@ -115,6 +115,7 @@ tokio = { version = "1.44.1", features = ["rt-multi-thread"] }
 dirs = "6.0"
 reqwest = { version = "0.12", features = ["blocking", "json"] }
 serde_json = "1.0"
+csv = "1.3"
 
 [[bench]]
 name = "haystack"
@@ -137,3 +138,7 @@ required-features = ["tok"]
 name = "ruler"
 harness = false
 required-features = ["tok"]
+
+[[bench]]
+name = "ragcsv"
+harness = false
diff --git a/spnl/benches/README-RAGCSV.md b/spnl/benches/README-RAGCSV.md
new file mode 100644
index 00000000..60127530
--- /dev/null
+++ b/spnl/benches/README-RAGCSV.md
@@ -0,0 +1,148 @@
+# RAGCSV Benchmark for SPNL
+
+Evaluates RAG answer accuracy from a CSV of evaluation queries using SPNL span queries. Each row contains a question, document fragments, and an expected answer. The benchmark executes each query, then uses a second LLM call to grade the response accuracy on a 0-100 scale.
+
+## Overview
+
+1. Load a CSV file with evaluation data (questions, document fragments, expected answers)
+2. For each row, construct a span query that provides the document fragments as context and asks the question
+3. Grade the model's response against the expected answer using a second LLM query
+4. Report quantile statistics for accuracy and response time
+
+## CSV Schema
+
+The CSV has no headers. Columns are positional:
+
+| Index | Content |
+|-------|---------|
+| 0 | Expected result |
+| 1 | Fragments -- Python-style single-quoted JSON array of `{'page_content': '...', 'metadata': {'title': '...'}, ...}` |
+| 2 | Hallucination detection proposal (unused) |
+| 3 | Timestamp (unused) |
+| 4 | Question to pose |
+
+## Configuration via Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `RAGCSV_FILE` | (required) | Path to the CSV file |
+| `RAGCSV_MODEL` | `ollama/granite3.3:2b` | Model for inference |
+| `RAGCSV_GRADING_MODEL` | same as `RAGCSV_MODEL` | Model for accuracy grading |
+| `RAGCSV_CONCURRENCY` | `1` | Max parallel query executions |
+| `RAGCSV_LIMIT` | all rows | Limit number of rows to process |
+| `RAGCSV_MAX_TOKENS` | `512` | Max tokens for primary query |
+| `RAGCSV_DEBUG` | `false` | Print first row's query/response details |
+
+## Usage Examples
+
+### Basic Usage
+
+```bash
+RAGCSV_FILE=~/data/eval.csv cargo bench --bench ragcsv
+```
+
+### Debug Mode
+
+```bash
+RAGCSV_FILE=~/data/eval.csv RAGCSV_DEBUG=1 RAGCSV_LIMIT=3 \
+  cargo bench --bench ragcsv
+```
+
+### Concurrent Execution
+
+```bash
+RAGCSV_FILE=~/data/eval.csv \
+RAGCSV_MODEL="ollama/granite3.3:2b" \
+RAGCSV_CONCURRENCY=4 \
+RAGCSV_LIMIT=100 \
+  cargo bench --bench ragcsv
+```
+
+### Separate Grading Model
+
+```bash
+RAGCSV_FILE=~/data/eval.csv \
+RAGCSV_MODEL="ollama/granite3.3:2b" \
+RAGCSV_GRADING_MODEL="ollama/llama3.2:3b" \
+  cargo bench --bench ragcsv
+```
+
+## Output Format
+
+Progress is shown during execution:
+
+```
+[00:01:23] 42/100 | Avg Acc=72.3% | Pass(>=75%)=28/42
+```
+
+Final report includes quantile statistics for accuracy and total time:
+
+```
+=== RAGCSV Eval Accuracy (n=100) ===
+  min:  10.0%
+  p25:  55.0%
+  p50:  75.0%
+  p75:  85.0%
+  p90:  95.0%
+  p99:  100.0%
+  max:  100.0%
+  avg:  71.2%
+  pass (>=75%): 58/100
+
+=== RAGCSV Eval Total Time (n=100) ===
+  min:  234ms
+  p25:  456ms
+  p50:  623ms
+  p75:  891ms
+  p90:  1203ms
+  p99:  2456ms
+  max:  3012ms
+  avg:  702ms
+```
+
+## Implementation Details
+
+### Query Construction
+
+Each row produces a span query of the form:
+
+```
+g model
+    (cross
+        (system "You are a helpful assistant. Answer the question based only on the provided documents.")
+        (plus [user "Document: {title}\n{page_content}", ...])
+        (user question)
+    )
+    temperature
+    max_tokens
+```
+
+### Accuracy Grading
+
+After getting the model response, a second query grades accuracy:
+
+```
+g grading_model
+    (cross
+        (system "You are an accuracy evaluator...")
+        (user "Expected answer: ...\n\nActual answer: ...\n\nAccuracy score (0-100):")
+    )
+    0.0   // deterministic
+    16    // small max_tokens
+```
+
+The grading model returns an integer 0-100, which is parsed from the response.
+
+### Concurrency
+
+Rows are dispatched as tokio tasks with a semaphore limiting parallelism to `RAGCSV_CONCURRENCY`. Results are collected via an mpsc channel.
+
+## Future Work
+
+- Capture TTFT (time to first token) and ITL (inter-token latency) metrics programmatically
+- Support additional CSV schemas
+- Add per-row result CSV export
+
+---
+
+Made with Bob
diff --git a/spnl/benches/ragcsv.rs b/spnl/benches/ragcsv.rs
new file mode 100644
index 00000000..974cfecd
--- /dev/null
+++ b/spnl/benches/ragcsv.rs
@@ -0,0 +1,491 @@
+mod bench_progress;
+
+use spnl::{
+    ExecuteOptions, execute,
+    ir::{Message::Assistant, Query},
+    spnl,
+};
+use std::sync::Arc;
+use std::time::Instant;
+use tokio::sync::{Semaphore, mpsc};
+
+// ---------------------------------------------------------------------------
+// CSV row and fragment types
+// ---------------------------------------------------------------------------
+
+#[derive(Debug)]
+struct EvalRow {
+    index: usize,
+    expected: String,
+    fragments: Vec<Fragment>,
+    question: String,
+}
+
+#[derive(Debug, serde::Deserialize)]
+#[allow(dead_code)]
+struct Fragment {
+    page_content: String,
+    metadata: FragmentMetadata,
+}
+
+#[derive(Debug, serde::Deserialize)]
+#[allow(dead_code)]
+struct FragmentMetadata {
+    #[serde(default)]
+    title: String,
+}
+
+// ---------------------------------------------------------------------------
+// Metrics
+// ---------------------------------------------------------------------------
+
+struct RowMetrics {
+    #[allow(dead_code)]
+    row_index: usize,
+    accuracy: f64,
+    #[allow(dead_code)]
+    ttft_ms: Option<f64>,
+    #[allow(dead_code)]
+    itl_ms: Option<f64>,
+    total_time_ms: f64,
+}
+
+// ---------------------------------------------------------------------------
+// CSV loading
+// ---------------------------------------------------------------------------
+
+/// Convert Python repr-style text to valid JSON.
+///
+/// Python's `repr()` mixes single- and double-quoted strings and uses
+/// `None`, `True`, `False` instead of JSON's `null`, `true`, `false`.
+/// This state machine handles both quote styles:
+///
+/// - Single-quoted strings (`'...'`): converted to `"..."`, with `\'` → `'`
+///   and any literal `"` inside → `\"`
+/// - Double-quoted strings (`"..."`): passed through as-is (already JSON-compatible)
+/// - Outside strings: `None` → `null`, `True` → `true`, `False` → `false`
+fn python_repr_to_json(input: &str) -> String {
+    let mut out = String::with_capacity(input.len());
+    let bytes = input.as_bytes();
+    let len = bytes.len();
+    let mut i = 0;
+
+    while i < len {
+        let c = bytes[i] as char;
+
+        match c {
+            '\'' => {
+                // Single-quoted string → convert to double-quoted
+                out.push('"');
+                i += 1;
+                while i < len {
+                    let sc = bytes[i] as char;
+                    match sc {
+                        '\\' if i + 1 < len => {
+                            let next = bytes[i + 1] as char;
+                            if next == '\'' {
+                                // \' → plain apostrophe (no longer needs escaping)
+                                out.push('\'');
+                                i += 2;
+                            } else {
+                                // pass through other escapes
+                                out.push('\\');
+                                out.push(next);
+                                i += 2;
+                            }
+                        }
+                        '\'' => {
+                            // end of single-quoted string
+                            out.push('"');
+                            i += 1;
+                            break;
+                        }
+                        '"' => {
+                            // literal double quote inside → must escape for JSON
+                            out.push('\\');
+                            out.push('"');
+                            i += 1;
+                        }
+                        _ => {
+                            out.push(sc);
+                            i += 1;
+                        }
+                    }
+                }
+            }
+            '"' => {
+                // Double-quoted string → pass through as-is
+                out.push('"');
+                i += 1;
+                while i < len {
+                    let sc = bytes[i] as char;
+                    match sc {
+                        '\\' if i + 1 < len => {
+                            // pass through escaped char
+                            out.push('\\');
+                            out.push(bytes[i + 1] as char);
+                            i += 2;
+                        }
+                        '"' => {
+                            out.push('"');
+                            i += 1;
+                            break;
+                        }
+                        _ => {
+                            out.push(sc);
+                            i += 1;
+                        }
+                    }
+                }
+            }
+            // Python keywords → JSON equivalents (only match outside strings)
+            'N' if input[i..].starts_with("None") => {
+                out.push_str("null");
+                i += 4;
+            }
+            'T' if input[i..].starts_with("True") => {
+                out.push_str("true");
+                i += 4;
+            }
+            'F' if input[i..].starts_with("False") => {
+                out.push_str("false");
+                i += 5;
+            }
+            _ => {
+                out.push(c);
+                i += 1;
+            }
+        }
+    }
+
+    out
+}
+
+fn load_csv(path: &str, limit: Option<usize>) -> Vec<EvalRow> {
+    let mut rdr = csv::ReaderBuilder::new()
+        .has_headers(true)
+        .from_path(path)
+        .unwrap_or_else(|e| panic!("Failed to open CSV at {path}: {e}"));
+
+    let mut rows = Vec::new();
+    for (idx, result) in rdr.records().enumerate() {
+        if let Some(limit) = limit {
+            if idx >= limit {
+                break;
+            }
+        }
+        let record = result.unwrap_or_else(|e| panic!("CSV parse error at row {idx}: {e}"));
+
+        let expected = record.get(0).unwrap_or("").to_string();
+        let fragments_raw = record.get(1).unwrap_or("[]").to_string();
+        // columns 2 (hallucination_proposal) and 3 (timestamp) are unused
+        let question = record.get(4).unwrap_or("").to_string();
+
+        // Parse fragments: convert Python repr to valid JSON
+        let fragments_json = python_repr_to_json(&fragments_raw);
+        let fragments: Vec<Fragment> = serde_json::from_str(&fragments_json).unwrap_or_else(|e| {
+            if idx == 0 {
+                eprintln!(
+                    "Warning: failed to parse fragments for row {idx}: {e}\n  raw: {}",
+                    &fragments_raw[..fragments_raw.len().min(200)]
+                );
+            }
+            vec![]
+        });
+
+        rows.push(EvalRow {
+            index: idx,
+            expected,
+            fragments,
+            question,
+        });
+    }
+
+    rows
+}
+
+// ---------------------------------------------------------------------------
+// Query construction helpers
+// ---------------------------------------------------------------------------
+
+fn build_primary_query(
+    model: &str,
+    question: &str,
+    fragments: &[Fragment],
+    max_tokens: i32,
+) -> Query {
+    let model = model.to_string();
+    let system_prompt =
+        "You are a helpful assistant. Answer the question based only on the provided Documents."
+            .to_string();
+
+    let doc_messages: Vec<Query> = fragments
+        .iter()
+        .enumerate()
+        .map(|(idx, f)| {
+            let text = format!("Document {idx}: {}", f.page_content);
+            spnl!(user text)
+        })
+        .collect();
+
+    let question = question.to_string();
+    let temperature: f32 = 0.0;
+
+    spnl!(
+        g model
+            (cross
+                (system system_prompt)
+                (plus doc_messages)
+                (user question)
+            )
+            temperature
+            max_tokens
+    )
+}
+
+fn build_grading_query(model: &str, expected: &str, actual: &str) -> Query {
+    let model = model.to_string();
+    let system_prompt = "You are an accuracy evaluator. Compare the expected answer to the actual answer and return ONLY a single integer 0-100 representing accuracy percentage. 100 means perfectly correct, 0 means completely wrong.".to_string();
+    let user_prompt = format!(
+        "Expected answer: {expected}\n\nActual answer: {actual}\n\nAccuracy score (0-100):"
+    );
+    let temperature: f32 = 0.0;
+    let max_tokens: i32 = 16;
+
+    spnl!(
+        g model
+            (cross
+                (system system_prompt)
+                (user user_prompt)
+            )
+            temperature
+            max_tokens
+    )
+}
+
+fn parse_accuracy(response: &str) -> f64 {
+    // Extract the first integer from the response
+    let trimmed = response.trim();
+    trimmed
+        .split(|c: char| !c.is_ascii_digit())
+        .find(|s| !s.is_empty())
+        .and_then(|s| s.parse::<f64>().ok())
+        .map(|v| v.clamp(0.0, 100.0))
+        .unwrap_or(0.0)
+}
+
+// ---------------------------------------------------------------------------
+// Quantile computation
+// ---------------------------------------------------------------------------
+
+fn compute_quantiles(values: &[f64]) -> (f64, f64, f64, f64, f64, f64, f64, f64) {
+    let mut sorted = values.to_vec();
+    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    let len = sorted.len();
+
+    let min = sorted[0];
+    let p25 = sorted[len * 25 / 100];
+    let p50 = sorted[len * 50 / 100];
+    let p75 = sorted[len * 75 / 100];
+    let p90 = sorted[len * 90 / 100];
+    let p99 = sorted[(len * 99 / 100).min(len - 1)];
+    let max = sorted[len - 1];
+    let avg = sorted.iter().sum::<f64>() / len as f64;
+
+    (min, p25, p50, p75, p90, p99, max, avg)
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+fn main() {
+    // Read env vars
+    let csv_path = std::env::var("RAGCSV_FILE")
+        .expect("RAGCSV_FILE environment variable is required (path to CSV file)");
+    let model =
+        std::env::var("RAGCSV_MODEL").unwrap_or_else(|_| "ollama/granite3.3:2b".to_string());
+    let grading_model = std::env::var("RAGCSV_GRADING_MODEL").unwrap_or_else(|_| model.clone());
+    let concurrency: usize = std::env::var("RAGCSV_CONCURRENCY")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(1);
+    let limit: Option<usize> = std::env::var("RAGCSV_LIMIT")
+        .ok()
+        .and_then(|s| s.parse().ok());
+    let max_tokens: i32 = std::env::var("RAGCSV_MAX_TOKENS")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(512);
+    let debug = std::env::var("RAGCSV_DEBUG")
+        .map(|v| v == "1" || v.to_lowercase() == "true")
+        .unwrap_or(false);
+
+    // Load CSV
+    let rows = load_csv(&csv_path, limit);
+    let total = rows.len();
+    eprintln!("Loaded {total} rows from {csv_path}");
+    eprintln!(
+        "Model: {model} | Grading: {grading_model} | Concurrency: {concurrency} | Max tokens: {max_tokens}"
+    );
+
+    if total == 0 {
+        eprintln!("No rows to process.");
+        return;
+    }
+
+    // Build tokio runtime
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
+    runtime.block_on(async {
+        let semaphore = Arc::new(Semaphore::new(concurrency));
+        let (tx, mut rx) = mpsc::channel::<RowMetrics>(total);
+
+        let options = ExecuteOptions {
+            silent: true,
+            ..Default::default()
+        };
+
+        // Spawn all row tasks
+        for row in rows {
+            let sem = Arc::clone(&semaphore);
+            let tx = tx.clone();
+            let model = model.clone();
+            let grading_model = grading_model.clone();
+            let options = ExecuteOptions {
+                silent: options.silent,
+                ..Default::default()
+            };
+
+            tokio::spawn(async move {
+                let _permit = sem.acquire().await.unwrap();
+
+                let row_idx = row.index;
+                let start = Instant::now();
+
+                // Build and execute primary query
+                let query = build_primary_query(&model, &row.question, &row.fragments, max_tokens);
+
+                if debug && row_idx == 0 {
+                    eprintln!("\n=== DEBUG: Row 0 Query ===\n{query:?}");
+                }
+
+                let actual = match execute(&query, &options).await {
+                    Ok(Query::Message(Assistant(s))) => s,
+                    Ok(other) => {
+                        eprintln!("Row {row_idx}: unexpected response type: {other:?}");
+                        String::new()
+                    }
+                    Err(e) => {
+                        eprintln!("Row {row_idx}: primary query error: {e}");
+                        String::new()
+                    }
+                };
+
+                let total_time_ms = start.elapsed().as_secs_f64() * 1000.0;
+
+                if debug && row_idx == 0 {
+                    eprintln!("=== DEBUG: Row 0 Response ===\n{actual}");
+                    eprintln!("=== DEBUG: Row 0 Expected ===\n{}", row.expected);
+                }
+
+                // Build and execute grading query
+                let grading_query = build_grading_query(&grading_model, &row.expected, &actual);
+                let accuracy = match execute(&grading_query, &options).await {
+                    Ok(Query::Message(Assistant(s))) => {
+                        let acc = parse_accuracy(&s);
+                        if debug && row_idx == 0 {
+                            eprintln!("=== DEBUG: Row 0 Grading Response ===\n{s}");
+                            eprintln!("=== DEBUG: Row 0 Parsed Accuracy === {acc}%");
+                        }
+                        acc
+                    }
+                    Ok(_) => 0.0,
+                    Err(e) => {
+                        eprintln!("Row {row_idx}: grading query error: {e}");
+                        0.0
+                    }
+                };
+
+                let _ = tx
+                    .send(RowMetrics {
+                        row_index: row_idx,
+                        accuracy,
+                        ttft_ms: None,
+                        itl_ms: None,
+                        total_time_ms,
+                    })
+                    .await;
+            });
+        }
+
+        // Drop the sender so the receiver knows when all tasks are done
+        drop(tx);
+
+        // Collect results with progress bar
+        let pb = bench_progress::create_benchmark_progress(total as u64, "RAGCSV Eval");
+        let mut metrics: Vec<RowMetrics> = Vec::with_capacity(total);
+        let mut accuracy_sum = 0.0;
+        let mut pass_count = 0usize;
+
+        while let Some(m) = rx.recv().await {
+            accuracy_sum += m.accuracy;
+            if m.accuracy >= 75.0 {
+                pass_count += 1;
+            }
+            metrics.push(m);
+
+            let n = metrics.len();
+            let avg_acc = accuracy_sum / n as f64;
+            pb.set_position(n as u64);
+            pb.set_message(format!(
+                "{n}/{total} | Avg Acc={avg_acc:.1}% | Pass(>=75%)={pass_count}/{n}"
+            ));
+        }
+
+        bench_progress::finish_benchmark_progress(
+            &pb,
+            format!(
+                "Done {}/{total} | Avg Acc={:.1}% | Pass(>=75%)={pass_count}/{total}",
+                metrics.len(),
+                accuracy_sum / metrics.len().max(1) as f64
+            ),
+        );
+
+        // Print final report
+        if metrics.is_empty() {
+            eprintln!("\nNo results collected.");
+            return;
+        }
+
+        let accuracies: Vec<f64> = metrics.iter().map(|m| m.accuracy).collect();
+        let (min, p25, p50, p75, p90, p99, max, avg) = compute_quantiles(&accuracies);
+
+        eprintln!("\n=== RAGCSV Eval Accuracy (n={}) ===", metrics.len());
+        eprintln!("  min:  {min:.1}%");
+        eprintln!("  p25:  {p25:.1}%");
+        eprintln!("  p50:  {p50:.1}%");
+        eprintln!("  p75:  {p75:.1}%");
+        eprintln!("  p90:  {p90:.1}%");
+        eprintln!("  p99:  {p99:.1}%");
+        eprintln!("  max:  {max:.1}%");
+        eprintln!("  avg:  {avg:.1}%");
+        eprintln!("  pass (>=75%): {pass_count}/{}", metrics.len());
+
+        let times: Vec<f64> = metrics.iter().map(|m| m.total_time_ms).collect();
+        let (tmin, t25, t50, t75, t90, t99, tmax, tavg) = compute_quantiles(&times);
+
+        eprintln!("\n=== RAGCSV Eval Total Time (n={}) ===", metrics.len());
+        eprintln!("  min:  {tmin:.0}ms");
+        eprintln!("  p25:  {t25:.0}ms");
+        eprintln!("  p50:  {t50:.0}ms");
+        eprintln!("  p75:  {t75:.0}ms");
+        eprintln!("  p90:  {t90:.0}ms");
+        eprintln!("  p99:  {t99:.0}ms");
+        eprintln!("  max:  {tmax:.0}ms");
+        eprintln!("  avg:  {tavg:.0}ms");
+    });
+}