From e445b3446bcca5ad3d4822758f16f661ee0b4508 Mon Sep 17 00:00:00 2001
From: Nick Mitchell <nickm@us.ibm.com>
Date: Tue, 17 Feb 2026 09:50:57 -0500
Subject: [PATCH] feat(bench): add map-reduce chunking to NIAH benchmark

Implement chunk-based map-reduce structure in niah.rs benchmark, matching
the pattern from haystack.rs. This allows testing retrieval performance
with different chunking strategies.

Changes:
- Add chunk parameter to run_niah_test() function
- Implement map-reduce logic: split context into chunks, process each
  chunk separately (map), then combine results (reduce)
- Add BENCH_CHUNK_SIZES environment variable (default: 0,2,4)
- Always include chunk size in benchmark IDs for proper filtering
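- Update documentation with chunking examples and filtering guide
- Change the default BENCH_MODEL and BENCH_TOKENIZER_MODEL from the
  granite 3.3 8b variants to the 2b variants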

Made with Bob

Signed-off-by: Nick Mitchell <nickm@us.ibm.com>
---
 spnl/benches/README-NIAH.md | 147 ++++++++++++++++-
 spnl/benches/niah.rs        | 313 +++++++++++++++++++++++-------------
 2 files changed, 343 insertions(+), 117 deletions(-)

diff --git a/spnl/benches/README-NIAH.md b/spnl/benches/README-NIAH.md
index c2af3828..b09abb27 100644
--- a/spnl/benches/README-NIAH.md
+++ b/spnl/benches/README-NIAH.md
@@ -61,9 +61,60 @@ Enable debug output for the first sample to see detailed information:
 BENCH_DEBUG=1 cargo bench --bench niah --features tok
 ```
 
+## Map-Reduce Chunking
+
+The benchmark supports splitting the context into chunks and using a map-reduce approach, similar to the haystack benchmark. This can help test how well models handle distributed retrieval tasks.
+
+### Basic Chunking
+
+Split context into 2 chunks:
+```bash
+BENCH_CHUNK_SIZES="2" cargo bench --bench niah --features tok
+```
+
+Test multiple chunk sizes:
+```bash
+BENCH_CHUNK_SIZES="2,4" cargo bench --bench niah --features tok
+```
+
+### How It Works
+
+When `chunk > 0`:
+1. **Map step**: Context is split into N chunks (based on token count)
+2. Each chunk gets its own query to answer the question
+3. **Reduce step**: If multiple chunks exist, a final query combines the answers
+
+When `chunk = 0`: the context is not split and the original single (non-chunked) query is used
+
+### Filtering Chunked Benchmarks
+
+All benchmarks include the chunk size in their ID: `"chunk={size}/len={length}/depth={percent}"`.
+
+Run only chunk=0 (non-chunked) benchmarks:
+```bash
+cargo bench --bench niah --features tok -- "chunk=0"
+```
+
+Run only chunk=2 benchmarks:
+```bash
+cargo bench --bench niah --features tok -- "chunk=2"
+```
+
+Run only chunked benchmarks (chunk > 0):
+```bash
+cargo bench --bench niah --features tok -- "chunk=(2|4)"
+```
+
+Compare all chunk sizes for a specific configuration:
+```bash
+cargo bench --bench niah --features tok -- "len=2000/depth=50"
+```
+
 ## Command-line Filtering
 
-Use Criterion's built-in filtering to run specific benchmark configurations. The benchmark IDs follow the pattern: `retrieval/len={context_length}/depth={depth_percent}`
+Use Criterion's built-in filtering to run specific benchmark configurations. Every benchmark ID includes the chunk size and follows this pattern:
+- `retrieval/chunk={chunk_size}/len={context_length}/depth={depth_percent}`
+- `chunk=0` identifies the non-chunked baseline
 
 ### Filter by Context Length
 
@@ -89,6 +140,35 @@ Run only middle depth:
 cargo bench --bench niah --features tok -- "depth=50"
 ```
 
+### Filter by Chunk Size
+
+All benchmarks include chunk size in their ID, making filtering straightforward.
+
+Run only chunk=0 (non-chunked) benchmarks:
+```bash
+cargo bench --bench niah --features tok -- "chunk=0"
+```
+
+Run only chunk=2 benchmarks:
+```bash
+cargo bench --bench niah --features tok -- "chunk=2"
+```
+
+Run only chunked benchmarks (chunk > 0):
+```bash
+cargo bench --bench niah --features tok -- "chunk=(2|4)"
+```
+
+Run chunk=2 with specific context length:
+```bash
+cargo bench --bench niah --features tok -- "chunk=2/len=4000"
+```
+
+Compare all chunk sizes for same configuration:
+```bash
+cargo bench --bench niah --features tok -- "len=2000/depth=50"
+```
+
 ### Filter by Specific Configuration
 
 Run a single specific configuration:
@@ -96,6 +176,11 @@ Run a single specific configuration:
 cargo bench --bench niah --features tok -- "len=2000/depth=50"
 ```
 
+Run chunked configuration:
+```bash
+cargo bench --bench niah --features tok -- "chunk=2/len=2000/depth=50"
+```
+
 Run multiple specific configurations:
 ```bash
 cargo bench --bench niah --features tok -- "len=1000/depth=(0|50|100)"
@@ -158,6 +243,23 @@ BENCH_DEPTH_PERCENTAGES="0,100" cargo bench --bench niah --features tok
 BENCH_DEPTH_PERCENTAGES="0,10,20,30,40,50,60,70,80,90,100" cargo bench --bench niah --features tok
 ```
 
+### `BENCH_CHUNK_SIZES` (default: `"0,2,4"`)
+
+Comma-separated chunk counts for map-reduce (0 means no chunking):
+```bash
+# Test with default chunk sizes (no chunking, 2-way, and 4-way)
+cargo bench --bench niah --features tok
+
+# Test with no chunking only
+BENCH_CHUNK_SIZES="0" cargo bench --bench niah --features tok
+
+# Test with 2-way chunking only
+BENCH_CHUNK_SIZES="2" cargo bench --bench niah --features tok
+
+# Test only chunked variants
+BENCH_CHUNK_SIZES="2,4" cargo bench --bench niah --features tok
+```
+
 ### `BENCH_MODEL` (default: `"ollama/granite3.3:8b"`)
 
 Model to use for inference (Ollama format):
@@ -220,16 +322,42 @@ BENCH_SAMPLE_SIZE=1 \
 cargo bench --bench niah --features tok -- "len=2000/depth=50"
 ```
 
+### Test Map-Reduce Chunking
+```bash
+# Compare non-chunked vs chunked performance
+BENCH_CHUNK_SIZES="0,2,4" \
+BENCH_CONTEXT_LENGTHS="4000,8000" \
+BENCH_DEPTH_PERCENTAGES="50" \
+cargo bench --bench niah --features tok
+```
+
+### Quick Chunking Test
+```bash
+# Fast test of chunking with minimal samples
+BENCH_SAMPLE_SIZE=3 \
+BENCH_CHUNK_SIZES="2" \
+BENCH_CONTEXT_LENGTHS="2000" \
+BENCH_DEPTH_PERCENTAGES="50" \
+cargo bench --bench niah --features tok
+```
+
 ## Progress Bars
 
 The benchmark displays real-time progress with running statistics:
+
+Non-chunked (sample predates chunking; current output begins with `chunk=0`):
 ```
 [00:45] ⠋ len=2000 depth=50% | n=7 | Acc=85.7% | Perfect=6/7
 ```
 
+Chunked:
+```
+[00:45] ⠋ chunk=2 len=2000 depth=50% | n=7 | Acc=85.7% | Perfect=6/7
+```
+
 - **Elapsed time**: `[00:45]`
 - **Spinner**: `⠋` (animated)
-- **Configuration**: `len=2000 depth=50%`
+- **Configuration**: `chunk=2 len=2000 depth=50%` (non-chunked runs show `chunk=0`)
 - **Sample count**: `n=7`
 - **Running accuracy**: `Acc=85.7%`
 - **Perfect retrievals**: `Perfect=6/7` (responses that got the exact answer)
@@ -238,6 +366,7 @@ The benchmark displays real-time progress with running statistics:
 
 After each configuration completes, you'll see detailed statistics:
 
+Non-chunked (sample predates chunking; the current header begins with `chunk=0`):
 ```
 === Accuracy Stats: len=2000 depth=50% (n=10) ===
   avg:  85.0%
@@ -251,6 +380,20 @@ After each configuration completes, you'll see detailed statistics:
   perfect: 8/10
 ```
 
+Chunked:
+```
+=== Accuracy Stats: chunk=2 len=2000 depth=50% (n=10) ===
+  avg:  85.0%
+  min:  0.0%
+  p25:  100.0%
+  p50:  100.0%
+  p75:  100.0%
+  p90:  100.0%
+  p99:  100.0%
+  max:  100.0%
+  perfect: 8/10
+```
+
 - **avg**: Mean accuracy across all samples
 - **min/max**: Minimum and maximum accuracy
 - **p25/p50/p75/p90/p99**: Percentile statistics
diff --git a/spnl/benches/niah.rs b/spnl/benches/niah.rs
index e27947bb..3c567e93 100644
--- a/spnl/benches/niah.rs
+++ b/spnl/benches/niah.rs
@@ -26,6 +26,7 @@
 //! - `BENCH_MEASUREMENT_TIME`: Measurement time in seconds (default: 60)
 //! - `BENCH_CONTEXT_LENGTHS`: Comma-separated context lengths in TOKENS (default: "1000,2000,4000,8000")
 //! - `BENCH_DEPTH_PERCENTAGES`: Comma-separated depth percentages (default: "0,25,50,75,100")
+//! - `BENCH_CHUNK_SIZES`: Comma-separated chunk counts for map-reduce; 0 disables chunking (default: "0,2,4")
 //! - `BENCH_MODEL`: Model to use for inference (default: "ollama/granite3.3:2b")
 //! - `BENCH_TOKENIZER_MODEL`: HuggingFace model for tokenizer (default: "ibm-granite/granite-3.3-2b-instruct")
 //! - `BENCH_FINAL_CONTEXT_LENGTH_BUFFER`: Buffer for system/question/response (default: 200)
@@ -47,6 +48,21 @@
 //! # Custom configuration
 //! BENCH_SAMPLE_SIZE=20 BENCH_CONTEXT_LENGTHS="2000,4000,8000" \
 //!   cargo bench --bench niah --features tok
+//!
+//! # Test with map-reduce chunking (split context into 2 or 4 chunks)
+//! BENCH_CHUNK_SIZES="2,4" cargo bench --bench niah --features tok
+//!
+//! # Filter to run only chunk=0 (non-chunked) benchmarks
+//! cargo bench --bench niah --features tok -- "chunk=0"
+//!
+//! # Filter to run only chunk=2 benchmarks
+//! cargo bench --bench niah --features tok -- "chunk=2"
+//!
+//! # Run only chunked benchmarks (chunk > 0)
+//! cargo bench --bench niah --features tok -- "chunk=(2|4)"
+//!
+//! # Compare all chunk sizes for same configuration
+//! cargo bench --bench niah --features tok -- "len=2000/depth=50"
 //! ```
 
 mod bench_progress;
@@ -369,6 +385,7 @@ async fn run_niah_test(
     tokenizer: &Tokenizer,
     max_context_length: usize,
     debug: bool,
+    chunk: usize,
 ) -> Result<f64, Box<dyn std::error::Error>> {
     // Generate context with needle inserted
     let context_with_needle = generate_context(config, tokenizer, max_context_length)?;
@@ -410,16 +427,61 @@ async fn run_niah_test(
     let question = &config.question;
     let max_tokens = 300; // Match original Python implementation
 
-    let query: Query = spnl!(
-        g model
-            (cross
-                (system system_prompt)
-                (user context_with_needle)
-                (user question)
+    let query: Query = if chunk > 0 {
+        // Split context into token-based chunks for map-reduce
+        let encoding = tokenizer
+            .encode(context_with_needle.as_str(), false)
+            .map_err(|e| format!("Encoding error: {}", e))?;
+        let tokens = encoding.get_ids();
+
+        // Calculate chunk size in tokens
+        let chunk_size_tokens = (tokens.len() + chunk - 1) / chunk; // Round up division
+
+        let chunks: Vec<Query> = tokens
+            .chunks(chunk_size_tokens)
+            .map(|chunk_tokens| tokenizer.decode(chunk_tokens, false).unwrap_or_default())
+            .map(|chunk_text| {
+                spnl!(
+                    g model
+                        (cross
+                            (system system_prompt)
+                            (user chunk_text)
+                            (user question)
+                        )
+                        temperature
+                        max_tokens
+                )
+            })
+            .collect();
+
+        if chunks.len() == 1 {
+            chunks[0].clone()
+        } else {
+            // Reduce step: combine answers from all chunks
+            spnl!(
+                g model
+                    (cross
+                        (system system_prompt)
+                        (plus chunks)
+                        (user "Based on the above responses, what is the final answer to the question? Be concise and direct.")
+                    )
+                    temperature
+                    max_tokens
             )
-            temperature
-            max_tokens
-    );
+        }
+    } else {
+        // Original non-chunked query
+        spnl!(
+            g model
+                (cross
+                    (system system_prompt)
+                    (user context_with_needle)
+                    (user question)
+                )
+                temperature
+                max_tokens
+        )
+    };
 
     // Execute query
     let options = ExecuteOptions {
@@ -515,11 +577,20 @@ fn niah_benchmark(c: &mut Criterion) {
         })
         .unwrap_or_else(|| vec![0, 25, 50, 75, 100]);
 
-    let model = std::env::var("BENCH_MODEL").unwrap_or_else(|_| "ollama/granite3.3:8b".to_string());
+    let chunk_sizes: Vec<usize> = std::env::var("BENCH_CHUNK_SIZES")
+        .ok()
+        .and_then(|s| {
+            s.split(',')
+                .map(|n| n.trim().parse().ok())
+                .collect::<Option<Vec<_>>>()
+        })
+        .unwrap_or_else(|| vec![0, 2, 4]); // default: no chunking, 2-way, 4-way
+
+    let model = std::env::var("BENCH_MODEL").unwrap_or_else(|_| "ollama/granite3.3:2b".to_string());
 
     // Tokenizer model (HuggingFace format) - separate from inference model
     let tokenizer_model = std::env::var("BENCH_TOKENIZER_MODEL")
-        .unwrap_or_else(|_| "ibm-granite/granite-3.3-8b-instruct".to_string());
+        .unwrap_or_else(|_| "ibm-granite/granite-3.3-2b-instruct".to_string());
 
     let final_context_length_buffer = std::env::var("BENCH_FINAL_CONTEXT_LENGTH_BUFFER")
         .ok()
@@ -545,6 +616,7 @@ fn niah_benchmark(c: &mut Criterion) {
     eprintln!("Model: {}", model);
     eprintln!("Context lengths (tokens): {:?}", context_lengths);
     eprintln!("Depth percentages: {:?}", depth_percentages);
+    eprintln!("Chunk sizes: {:?}", chunk_sizes);
     eprintln!("Sample size: {}", sample_size);
     eprintln!("Temperature: {}", temperature);
     eprintln!(
@@ -552,113 +624,124 @@ fn niah_benchmark(c: &mut Criterion) {
         final_context_length_buffer
     );
 
-    // Run benchmarks for each combination of context length and depth
-    for context_length in &context_lengths {
-        for depth_percent in &depth_percentages {
-            let accuracy_values = Arc::new(Mutex::new(Vec::new()));
-            let accuracy_clone = Arc::clone(&accuracy_values);
-
-            // Create progress bar
-            let base_msg = format!("len={} depth={}%", context_length, depth_percent);
-            let pb =
-                bench_progress::create_benchmark_progress(sample_size as u64, base_msg.clone());
-            let pb_clone = Arc::clone(&pb);
-            let base_msg = Arc::new(base_msg);
-            let base_msg_clone = Arc::clone(&base_msg);
-
-            group.bench_with_input(
-                BenchmarkId::new(
-                    "retrieval",
-                    format!("len={}/depth={}", context_length, depth_percent),
-                ),
-                &(*context_length, *depth_percent),
-                |b, &(len, depth)| {
-                    b.to_async(&runtime).iter(|| {
-                        let accuracy_clone = Arc::clone(&accuracy_clone);
-                        let pb = Arc::clone(&pb_clone);
-                        let base_msg = Arc::clone(&base_msg_clone);
-                        let model = model.clone();
-                        let tokenizer = tokenizer.clone();
-                        let debug_counter = Arc::clone(&debug_counter);
-
-                        async move {
-                            // Only debug first sample
-                            let mut counter = debug_counter.lock().unwrap();
-                            let should_debug = debug && *counter == 0;
-                            *counter += 1;
-                            drop(counter);
-                            let config = NeedleConfig {
-                                context_length: len,
-                                depth_percent: depth,
-                                final_context_length_buffer,
-                                ..Default::default()
-                            };
-
-                            let accuracy = run_niah_test(
-                                &config,
-                                &model,
-                                temperature,
-                                &tokenizer,
-                                max_context_length,
-                                should_debug,
-                            )
-                            .await
-                            .unwrap_or(0.0);
-
-                            // Collect metrics
-                            accuracy_clone.lock().unwrap().push(accuracy);
-
-                            // Update progress bar
-                            let accuracies = accuracy_clone.lock().unwrap();
-                            let total_count = accuracies.len();
-                            let avg_acc = accuracies.iter().sum::<f64>() / total_count as f64;
-                            let perfect_count = accuracies.iter().filter(|&&a| a >= 1.0).count();
-                            drop(accuracies);
-
-                            pb.set_message(format!(
-                                "{} | n={} | Acc={:.1}% | Perfect={}/{}",
-                                base_msg,
-                                total_count,
-                                avg_acc * 100.0,
-                                perfect_count,
-                                total_count
-                            ));
-                            pb.inc(1);
-
-                            accuracy
-                        }
-                    });
-                },
-            );
+    // Run benchmarks for each combination of context length, depth, and chunk size
+    for chunk_size in &chunk_sizes {
+        for context_length in &context_lengths {
+            for depth_percent in &depth_percentages {
+                let accuracy_values = Arc::new(Mutex::new(Vec::new()));
+                let accuracy_clone = Arc::clone(&accuracy_values);
+
+                // Create progress bar
+                let base_msg = format!(
+                    "chunk={} len={} depth={}%",
+                    chunk_size, context_length, depth_percent
+                );
+                let pb =
+                    bench_progress::create_benchmark_progress(sample_size as u64, base_msg.clone());
+                let pb_clone = Arc::clone(&pb);
+                let base_msg = Arc::new(base_msg);
+                let base_msg_clone = Arc::clone(&base_msg);
+
+                let bench_id = format!(
+                    "chunk={}/len={}/depth={}",
+                    chunk_size, context_length, depth_percent
+                );
 
-            // Finish progress bar
-            bench_progress::finish_benchmark_progress(
-                &pb,
-                format!("✓ len={} depth={}%", context_length, depth_percent),
-            );
+                group.bench_with_input(
+                    BenchmarkId::new("retrieval", bench_id),
+                    &(*context_length, *depth_percent, *chunk_size),
+                    |b, &(len, depth, chunk)| {
+                        b.to_async(&runtime).iter(|| {
+                            let accuracy_clone = Arc::clone(&accuracy_clone);
+                            let pb = Arc::clone(&pb_clone);
+                            let base_msg = Arc::clone(&base_msg_clone);
+                            let model = model.clone();
+                            let tokenizer = tokenizer.clone();
+                            let debug_counter = Arc::clone(&debug_counter);
+
+                            async move {
+                                // Only debug first sample
+                                let mut counter = debug_counter.lock().unwrap();
+                                let should_debug = debug && *counter == 0;
+                                *counter += 1;
+                                drop(counter);
+                                let config = NeedleConfig {
+                                    context_length: len,
+                                    depth_percent: depth,
+                                    final_context_length_buffer,
+                                    ..Default::default()
+                                };
+
+                                let accuracy = run_niah_test(
+                                    &config,
+                                    &model,
+                                    temperature,
+                                    &tokenizer,
+                                    max_context_length,
+                                    should_debug,
+                                    chunk,
+                                )
+                                .await
+                                .unwrap_or(0.0);
+
+                                // Collect metrics
+                                accuracy_clone.lock().unwrap().push(accuracy);
+
+                                // Update progress bar
+                                let accuracies = accuracy_clone.lock().unwrap();
+                                let total_count = accuracies.len();
+                                let avg_acc = accuracies.iter().sum::<f64>() / total_count as f64;
+                                let perfect_count =
+                                    accuracies.iter().filter(|&&a| a >= 1.0).count();
+                                drop(accuracies);
+
+                                pb.set_message(format!(
+                                    "{} | n={} | Acc={:.1}% | Perfect={}/{}",
+                                    base_msg,
+                                    total_count,
+                                    avg_acc * 100.0,
+                                    perfect_count,
+                                    total_count
+                                ));
+                                pb.inc(1);
+
+                                accuracy
+                            }
+                        });
+                    },
+                );
 
-            // Print statistics
-            let accuracies = accuracy_values.lock().unwrap();
-            if !accuracies.is_empty() {
-                let (min, p25, p50, p75, p90, p99, max) = compute_quantiles(&accuracies);
-                let avg = accuracies.iter().sum::<f64>() / accuracies.len() as f64;
-                let perfect_count = accuracies.iter().filter(|&&a| a >= 1.0).count();
-
-                eprintln!(
-                    "\n=== Accuracy Stats: len={} depth={}% (n={}) ===",
-                    context_length,
-                    depth_percent,
-                    accuracies.len()
+                // Finish progress bar
+                let finish_msg = format!(
+                    "✓ chunk={} len={} depth={}%",
+                    chunk_size, context_length, depth_percent
                 );
-                eprintln!("  avg:  {:.1}%", avg * 100.0);
-                eprintln!("  min:  {:.1}%", min * 100.0);
-                eprintln!("  p25:  {:.1}%", p25 * 100.0);
-                eprintln!("  p50:  {:.1}%", p50 * 100.0);
-                eprintln!("  p75:  {:.1}%", p75 * 100.0);
-                eprintln!("  p90:  {:.1}%", p90 * 100.0);
-                eprintln!("  p99:  {:.1}%", p99 * 100.0);
-                eprintln!("  max:  {:.1}%", max * 100.0);
-                eprintln!("  perfect: {}/{}\n", perfect_count, accuracies.len());
+                bench_progress::finish_benchmark_progress(&pb, finish_msg);
+
+                // Print statistics
+                let accuracies = accuracy_values.lock().unwrap();
+                if !accuracies.is_empty() {
+                    let (min, p25, p50, p75, p90, p99, max) = compute_quantiles(&accuracies);
+                    let avg = accuracies.iter().sum::<f64>() / accuracies.len() as f64;
+                    let perfect_count = accuracies.iter().filter(|&&a| a >= 1.0).count();
+
+                    eprintln!(
+                        "\n=== Accuracy Stats: chunk={} len={} depth={}% (n={}) ===",
+                        chunk_size,
+                        context_length,
+                        depth_percent,
+                        accuracies.len()
+                    );
+                    eprintln!("  avg:  {:.1}%", avg * 100.0);
+                    eprintln!("  min:  {:.1}%", min * 100.0);
+                    eprintln!("  p25:  {:.1}%", p25 * 100.0);
+                    eprintln!("  p50:  {:.1}%", p50 * 100.0);
+                    eprintln!("  p75:  {:.1}%", p75 * 100.0);
+                    eprintln!("  p90:  {:.1}%", p90 * 100.0);
+                    eprintln!("  p99:  {:.1}%", p99 * 100.0);
+                    eprintln!("  max:  {:.1}%", max * 100.0);
+                    eprintln!("  perfect: {}/{}\n", perfect_count, accuracies.len());
+                }
             }
         }
     }