From e445b3446bcca5ad3d4822758f16f661ee0b4508 Mon Sep 17 00:00:00 2001 From: Nick Mitchell Date: Tue, 17 Feb 2026 09:50:57 -0500 Subject: [PATCH] feat(bench): add map-reduce chunking to NIAH benchmark Implement chunk-based map-reduce structure in niah.rs benchmark, matching the pattern from haystack.rs. This allows testing retrieval performance with different chunking strategies. Changes: - Add chunk parameter to run_niah_test() function - Implement map-reduce logic: split context into chunks, process each chunk separately (map), then combine results (reduce) - Add BENCH_CHUNK_SIZES environment variable (default: 0,2,4) - Always include chunk size in benchmark IDs for proper filtering - Update documentation with chunking examples and filtering guide Made with Bob Signed-off-by: Nick Mitchell --- spnl/benches/README-NIAH.md | 147 ++++++++++++++++- spnl/benches/niah.rs | 313 +++++++++++++++++++++++------------- 2 files changed, 343 insertions(+), 117 deletions(-) diff --git a/spnl/benches/README-NIAH.md b/spnl/benches/README-NIAH.md index c2af3828..b09abb27 100644 --- a/spnl/benches/README-NIAH.md +++ b/spnl/benches/README-NIAH.md @@ -61,9 +61,60 @@ Enable debug output for the first sample to see detailed information: BENCH_DEBUG=1 cargo bench --bench niah --features tok ``` +## Map-Reduce Chunking + +The benchmark supports splitting the context into chunks and using a map-reduce approach, similar to the haystack benchmark. This can help test how well models handle distributed retrieval tasks. + +### Basic Chunking + +Split context into 2 chunks: +```bash +BENCH_CHUNK_SIZES="2" cargo bench --bench niah --features tok +``` + +Test multiple chunk sizes: +```bash +BENCH_CHUNK_SIZES="2,4" cargo bench --bench niah --features tok +``` + +### How It Works + +When `chunk > 0`: +1. **Map step**: Context is split into N chunks (based on token count) +2. Each chunk gets its own query to answer the question +3. 
**Reduce step**: If multiple chunks exist, a final query combines the answers + +When `chunk = 0` (no chunking): Uses the original non-chunked query + +### Filtering Chunked Benchmarks + +All benchmarks include the chunk size in their ID: `"chunk={size}/len={length}/depth={percent}"`. + +Run only chunk=0 (non-chunked) benchmarks: +```bash +cargo bench --bench niah --features tok -- "chunk=0" +``` + +Run only chunk=2 benchmarks: +```bash +cargo bench --bench niah --features tok -- "chunk=2" +``` + +Run only chunked benchmarks (chunk > 0): +```bash +cargo bench --bench niah --features tok -- "chunk=(2|4)" +``` + +Compare all chunk sizes for a specific configuration: +```bash +cargo bench --bench niah --features tok -- "len=2000/depth=50" +``` + ## Command-line Filtering -Use Criterion's built-in filtering to run specific benchmark configurations. The benchmark IDs follow the pattern: `retrieval/len={context_length}/depth={depth_percent}` +Use Criterion's built-in filtering to run specific benchmark configurations. Every benchmark ID includes the chunk size and follows a single pattern: +- Non-chunked: `retrieval/chunk=0/len={context_length}/depth={depth_percent}` +- Chunked: `retrieval/chunk={chunk_size}/len={context_length}/depth={depth_percent}` ### Filter by Context Length @@ -89,6 +140,35 @@ Run only middle depth: cargo bench --bench niah --features tok -- "depth=50" ``` +### Filter by Chunk Size + +All benchmarks include chunk size in their ID, making filtering straightforward. 
+ +Run only chunk=0 (non-chunked) benchmarks: +```bash +cargo bench --bench niah --features tok -- "chunk=0" +``` + +Run only chunk=2 benchmarks: +```bash +cargo bench --bench niah --features tok -- "chunk=2" +``` + +Run only chunked benchmarks (chunk > 0): +```bash +cargo bench --bench niah --features tok -- "chunk=(2|4)" +``` + +Run chunk=2 with specific context length: +```bash +cargo bench --bench niah --features tok -- "chunk=2/len=4000" +``` + +Compare all chunk sizes for same configuration: +```bash +cargo bench --bench niah --features tok -- "len=2000/depth=50" +``` + ### Filter by Specific Configuration Run a single specific configuration: @@ -96,6 +176,11 @@ Run a single specific configuration: cargo bench --bench niah --features tok -- "len=2000/depth=50" ``` +Run chunked configuration: +```bash +cargo bench --bench niah --features tok -- "chunk=2/len=2000/depth=50" +``` + Run multiple specific configurations: ```bash cargo bench --bench niah --features tok -- "len=1000/depth=(0|50|100)" @@ -158,6 +243,23 @@ BENCH_DEPTH_PERCENTAGES="0,100" cargo bench --bench niah --features tok BENCH_DEPTH_PERCENTAGES="0,10,20,30,40,50,60,70,80,90,100" cargo bench --bench niah --features tok ``` +### `BENCH_CHUNK_SIZES` (default: `"0,2,4"`) + +Comma-separated chunk counts for map-reduce (0 means no chunking): +```bash +# Test with default chunk sizes (no chunking, 2-way, and 4-way) +cargo bench --bench niah --features tok + +# Test with no chunking only +BENCH_CHUNK_SIZES="0" cargo bench --bench niah --features tok + +# Test with 2-way chunking only +BENCH_CHUNK_SIZES="2" cargo bench --bench niah --features tok + +# Test only chunked variants +BENCH_CHUNK_SIZES="2,4" cargo bench --bench niah --features tok +``` + ### `BENCH_MODEL` (default: `"ollama/granite3.3:8b"`) Model to use for inference (Ollama format): @@ -220,16 +322,42 @@ BENCH_SAMPLE_SIZE=1 \ cargo bench --bench niah --features tok -- "len=2000/depth=50" ``` +### Test Map-Reduce Chunking +```bash +# Compare 
non-chunked vs chunked performance +BENCH_CHUNK_SIZES="0,2,4" \ +BENCH_CONTEXT_LENGTHS="4000,8000" \ +BENCH_DEPTH_PERCENTAGES="50" \ +cargo bench --bench niah --features tok +``` + +### Quick Chunking Test +```bash +# Fast test of chunking with minimal samples +BENCH_SAMPLE_SIZE=3 \ +BENCH_CHUNK_SIZES="2" \ +BENCH_CONTEXT_LENGTHS="2000" \ +BENCH_DEPTH_PERCENTAGES="50" \ +cargo bench --bench niah --features tok +``` + ## Progress Bars The benchmark displays real-time progress with running statistics: + +Non-chunked: ``` [00:45] ⠋ len=2000 depth=50% | n=7 | Acc=85.7% | Perfect=6/7 ``` +Chunked: +``` +[00:45] ⠋ chunk=2 len=2000 depth=50% | n=7 | Acc=85.7% | Perfect=6/7 +``` + - **Elapsed time**: `[00:45]` - **Spinner**: `⠋` (animated) -- **Configuration**: `len=2000 depth=50%` +- **Configuration**: `chunk=2 len=2000 depth=50%` (the `chunk=` field is always shown; non-chunked runs print `chunk=0`) - **Sample count**: `n=7` - **Running accuracy**: `Acc=85.7%` - **Perfect retrievals**: `Perfect=6/7` (responses that got the exact answer) @@ -238,6 +366,7 @@ The benchmark displays real-time progress with running statistics: After each configuration completes, you'll see detailed statistics: +Non-chunked: ``` === Accuracy Stats: len=2000 depth=50% (n=10) === avg: 85.0% @@ -251,6 +380,20 @@ After each configuration completes, you'll see detailed statistics: perfect: 8/10 ``` +Chunked: +``` +=== Accuracy Stats: chunk=2 len=2000 depth=50% (n=10) === + avg: 85.0% + min: 0.0% + p25: 100.0% + p50: 100.0% + p75: 100.0% + p90: 100.0% + p99: 100.0% + max: 100.0% + perfect: 8/10 +``` + - **avg**: Mean accuracy across all samples - **min/max**: Minimum and maximum accuracy - **p25/p50/p75/p90/p99**: Percentile statistics diff --git a/spnl/benches/niah.rs b/spnl/benches/niah.rs index e27947bb..3c567e93 100644 --- a/spnl/benches/niah.rs +++ b/spnl/benches/niah.rs @@ -26,6 +26,7 @@ //! - `BENCH_MEASUREMENT_TIME`: Measurement time in seconds (default: 60) //! 
- `BENCH_CONTEXT_LENGTHS`: Comma-separated context lengths in TOKENS (default: "1000,2000,4000,8000") //! - `BENCH_DEPTH_PERCENTAGES`: Comma-separated depth percentages (default: "0,25,50,75,100") +//! - `BENCH_CHUNK_SIZES`: Comma-separated chunk counts for map-reduce (default: "0,2,4") //! - `BENCH_MODEL`: Model to use for inference (default: "ollama/granite3.3:2b") //! - `BENCH_TOKENIZER_MODEL`: HuggingFace model for tokenizer (default: "ibm-granite/granite-3.3-2b-instruct") //! - `BENCH_FINAL_CONTEXT_LENGTH_BUFFER`: Buffer for system/question/response (default: 200) @@ -47,6 +48,21 @@ //! # Custom configuration //! BENCH_SAMPLE_SIZE=20 BENCH_CONTEXT_LENGTHS="2000,4000,8000" \ //! cargo bench --bench niah --features tok +//! +//! # Test with map-reduce chunking (split context into 2 or 4 chunks) +//! BENCH_CHUNK_SIZES="2,4" cargo bench --bench niah --features tok +//! +//! # Filter to run only chunk=0 (non-chunked) benchmarks +//! cargo bench --bench niah --features tok -- "chunk=0" +//! +//! # Filter to run only chunk=2 benchmarks +//! cargo bench --bench niah --features tok -- "chunk=2" +//! +//! # Run only chunked benchmarks (chunk > 0) +//! cargo bench --bench niah --features tok -- "chunk=(2|4)" +//! +//! # Compare all chunk sizes for same configuration +//! cargo bench --bench niah --features tok -- "len=2000/depth=50" //! 
``` mod bench_progress; @@ -369,6 +385,7 @@ async fn run_niah_test( tokenizer: &Tokenizer, max_context_length: usize, debug: bool, + chunk: usize, ) -> Result> { // Generate context with needle inserted let context_with_needle = generate_context(config, tokenizer, max_context_length)?; @@ -410,16 +427,61 @@ async fn run_niah_test( let question = &config.question; let max_tokens = 300; // Match original Python implementation - let query: Query = spnl!( - g model - (cross - (system system_prompt) - (user context_with_needle) - (user question) + let query: Query = if chunk > 0 { + // Split context into token-based chunks for map-reduce + let encoding = tokenizer + .encode(context_with_needle.as_str(), false) + .map_err(|e| format!("Encoding error: {}", e))?; + let tokens = encoding.get_ids(); + + // Calculate chunk size in tokens + let chunk_size_tokens = (tokens.len() + chunk - 1) / chunk; // Round up division + + let chunks: Vec = tokens + .chunks(chunk_size_tokens) + .map(|chunk_tokens| tokenizer.decode(chunk_tokens, false).unwrap_or_default()) + .map(|chunk_text| { + spnl!( + g model + (cross + (system system_prompt) + (user chunk_text) + (user question) + ) + temperature + max_tokens + ) + }) + .collect(); + + if chunks.len() == 1 { + chunks[0].clone() + } else { + // Reduce step: combine answers from all chunks + spnl!( + g model + (cross + (system system_prompt) + (plus chunks) + (user "Based on the above responses, what is the final answer to the question? 
Be concise and direct.") + ) + temperature + max_tokens ) - temperature - max_tokens - ); + } + } else { + // Original non-chunked query + spnl!( + g model + (cross + (system system_prompt) + (user context_with_needle) + (user question) + ) + temperature + max_tokens + ) + }; // Execute query let options = ExecuteOptions { @@ -515,11 +577,20 @@ fn niah_benchmark(c: &mut Criterion) { }) .unwrap_or_else(|| vec![0, 25, 50, 75, 100]); - let model = std::env::var("BENCH_MODEL").unwrap_or_else(|_| "ollama/granite3.3:8b".to_string()); + let chunk_sizes: Vec = std::env::var("BENCH_CHUNK_SIZES") + .ok() + .and_then(|s| { + s.split(',') + .map(|n| n.trim().parse().ok()) + .collect::>>() + }) + .unwrap_or_else(|| vec![0, 2, 4]); // default: no chunking, 2-way, 4-way + + let model = std::env::var("BENCH_MODEL").unwrap_or_else(|_| "ollama/granite3.3:2b".to_string()); // Tokenizer model (HuggingFace format) - separate from inference model let tokenizer_model = std::env::var("BENCH_TOKENIZER_MODEL") - .unwrap_or_else(|_| "ibm-granite/granite-3.3-8b-instruct".to_string()); + .unwrap_or_else(|_| "ibm-granite/granite-3.3-2b-instruct".to_string()); let final_context_length_buffer = std::env::var("BENCH_FINAL_CONTEXT_LENGTH_BUFFER") .ok() @@ -545,6 +616,7 @@ fn niah_benchmark(c: &mut Criterion) { eprintln!("Model: {}", model); eprintln!("Context lengths (tokens): {:?}", context_lengths); eprintln!("Depth percentages: {:?}", depth_percentages); + eprintln!("Chunk sizes: {:?}", chunk_sizes); eprintln!("Sample size: {}", sample_size); eprintln!("Temperature: {}", temperature); eprintln!( @@ -552,113 +624,124 @@ fn niah_benchmark(c: &mut Criterion) { final_context_length_buffer ); - // Run benchmarks for each combination of context length and depth - for context_length in &context_lengths { - for depth_percent in &depth_percentages { - let accuracy_values = Arc::new(Mutex::new(Vec::new())); - let accuracy_clone = Arc::clone(&accuracy_values); - - // Create progress bar - let base_msg = 
format!("len={} depth={}%", context_length, depth_percent); - let pb = - bench_progress::create_benchmark_progress(sample_size as u64, base_msg.clone()); - let pb_clone = Arc::clone(&pb); - let base_msg = Arc::new(base_msg); - let base_msg_clone = Arc::clone(&base_msg); - - group.bench_with_input( - BenchmarkId::new( - "retrieval", - format!("len={}/depth={}", context_length, depth_percent), - ), - &(*context_length, *depth_percent), - |b, &(len, depth)| { - b.to_async(&runtime).iter(|| { - let accuracy_clone = Arc::clone(&accuracy_clone); - let pb = Arc::clone(&pb_clone); - let base_msg = Arc::clone(&base_msg_clone); - let model = model.clone(); - let tokenizer = tokenizer.clone(); - let debug_counter = Arc::clone(&debug_counter); - - async move { - // Only debug first sample - let mut counter = debug_counter.lock().unwrap(); - let should_debug = debug && *counter == 0; - *counter += 1; - drop(counter); - let config = NeedleConfig { - context_length: len, - depth_percent: depth, - final_context_length_buffer, - ..Default::default() - }; - - let accuracy = run_niah_test( - &config, - &model, - temperature, - &tokenizer, - max_context_length, - should_debug, - ) - .await - .unwrap_or(0.0); - - // Collect metrics - accuracy_clone.lock().unwrap().push(accuracy); - - // Update progress bar - let accuracies = accuracy_clone.lock().unwrap(); - let total_count = accuracies.len(); - let avg_acc = accuracies.iter().sum::() / total_count as f64; - let perfect_count = accuracies.iter().filter(|&&a| a >= 1.0).count(); - drop(accuracies); - - pb.set_message(format!( - "{} | n={} | Acc={:.1}% | Perfect={}/{}", - base_msg, - total_count, - avg_acc * 100.0, - perfect_count, - total_count - )); - pb.inc(1); - - accuracy - } - }); - }, - ); + // Run benchmarks for each combination of context length, depth, and chunk size + for chunk_size in &chunk_sizes { + for context_length in &context_lengths { + for depth_percent in &depth_percentages { + let accuracy_values = 
Arc::new(Mutex::new(Vec::new())); + let accuracy_clone = Arc::clone(&accuracy_values); + + // Create progress bar + let base_msg = format!( + "chunk={} len={} depth={}%", + chunk_size, context_length, depth_percent + ); + let pb = + bench_progress::create_benchmark_progress(sample_size as u64, base_msg.clone()); + let pb_clone = Arc::clone(&pb); + let base_msg = Arc::new(base_msg); + let base_msg_clone = Arc::clone(&base_msg); + + let bench_id = format!( + "chunk={}/len={}/depth={}", + chunk_size, context_length, depth_percent + ); - // Finish progress bar - bench_progress::finish_benchmark_progress( - &pb, - format!("✓ len={} depth={}%", context_length, depth_percent), - ); + group.bench_with_input( + BenchmarkId::new("retrieval", bench_id), + &(*context_length, *depth_percent, *chunk_size), + |b, &(len, depth, chunk)| { + b.to_async(&runtime).iter(|| { + let accuracy_clone = Arc::clone(&accuracy_clone); + let pb = Arc::clone(&pb_clone); + let base_msg = Arc::clone(&base_msg_clone); + let model = model.clone(); + let tokenizer = tokenizer.clone(); + let debug_counter = Arc::clone(&debug_counter); + + async move { + // Only debug first sample + let mut counter = debug_counter.lock().unwrap(); + let should_debug = debug && *counter == 0; + *counter += 1; + drop(counter); + let config = NeedleConfig { + context_length: len, + depth_percent: depth, + final_context_length_buffer, + ..Default::default() + }; + + let accuracy = run_niah_test( + &config, + &model, + temperature, + &tokenizer, + max_context_length, + should_debug, + chunk, + ) + .await + .unwrap_or(0.0); + + // Collect metrics + accuracy_clone.lock().unwrap().push(accuracy); + + // Update progress bar + let accuracies = accuracy_clone.lock().unwrap(); + let total_count = accuracies.len(); + let avg_acc = accuracies.iter().sum::() / total_count as f64; + let perfect_count = + accuracies.iter().filter(|&&a| a >= 1.0).count(); + drop(accuracies); + + pb.set_message(format!( + "{} | n={} | Acc={:.1}% | 
Perfect={}/{}", + base_msg, + total_count, + avg_acc * 100.0, + perfect_count, + total_count + )); + pb.inc(1); + + accuracy + } + }); + }, + ); - // Print statistics - let accuracies = accuracy_values.lock().unwrap(); - if !accuracies.is_empty() { - let (min, p25, p50, p75, p90, p99, max) = compute_quantiles(&accuracies); - let avg = accuracies.iter().sum::() / accuracies.len() as f64; - let perfect_count = accuracies.iter().filter(|&&a| a >= 1.0).count(); - - eprintln!( - "\n=== Accuracy Stats: len={} depth={}% (n={}) ===", - context_length, - depth_percent, - accuracies.len() + // Finish progress bar + let finish_msg = format!( + "✓ chunk={} len={} depth={}%", + chunk_size, context_length, depth_percent ); - eprintln!(" avg: {:.1}%", avg * 100.0); - eprintln!(" min: {:.1}%", min * 100.0); - eprintln!(" p25: {:.1}%", p25 * 100.0); - eprintln!(" p50: {:.1}%", p50 * 100.0); - eprintln!(" p75: {:.1}%", p75 * 100.0); - eprintln!(" p90: {:.1}%", p90 * 100.0); - eprintln!(" p99: {:.1}%", p99 * 100.0); - eprintln!(" max: {:.1}%", max * 100.0); - eprintln!(" perfect: {}/{}\n", perfect_count, accuracies.len()); + bench_progress::finish_benchmark_progress(&pb, finish_msg); + + // Print statistics + let accuracies = accuracy_values.lock().unwrap(); + if !accuracies.is_empty() { + let (min, p25, p50, p75, p90, p99, max) = compute_quantiles(&accuracies); + let avg = accuracies.iter().sum::() / accuracies.len() as f64; + let perfect_count = accuracies.iter().filter(|&&a| a >= 1.0).count(); + + eprintln!( + "\n=== Accuracy Stats: chunk={} len={} depth={}% (n={}) ===", + chunk_size, + context_length, + depth_percent, + accuracies.len() + ); + eprintln!(" avg: {:.1}%", avg * 100.0); + eprintln!(" min: {:.1}%", min * 100.0); + eprintln!(" p25: {:.1}%", p25 * 100.0); + eprintln!(" p50: {:.1}%", p50 * 100.0); + eprintln!(" p75: {:.1}%", p75 * 100.0); + eprintln!(" p90: {:.1}%", p90 * 100.0); + eprintln!(" p99: {:.1}%", p99 * 100.0); + eprintln!(" max: {:.1}%", max * 100.0); + 
eprintln!(" perfect: {}/{}\n", perfect_count, accuracies.len()); + } } } }