From aa6077cd04401ce7e80b9d0cb62c5b0b93070b48 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 17 Sep 2025 15:42:29 -0700 Subject: [PATCH] v2 proposal --- development/notes/eval_protocol_v2.md | 277 ++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 development/notes/eval_protocol_v2.md diff --git a/development/notes/eval_protocol_v2.md b/development/notes/eval_protocol_v2.md new file mode 100644 index 00000000..a8b8b130 --- /dev/null +++ b/development/notes/eval_protocol_v2.md @@ -0,0 +1,277 @@ +Perfect! Let me give you complete examples for both approaches - pointwise (100 separate tests) and batch (entire dataset passed to evaluator). + +## Complete Examples + +### 1. Pointwise Evaluator (100 separate tests) + +```python +# conftest.py +import pytest +from eval_protocol.framework import EvaluationFramework, EvaluationRow + +MATH_DATASET = [ + {"problem": "What is 2+2?", "answer": "4"}, + {"problem": "What is 3*3?", "answer": "9"}, + {"problem": "What is 10/2?", "answer": "5"}, + {"problem": "What is 15-7?", "answer": "8"}, + {"problem": "What is 6*7?", "answer": "42"}, + # ... 
95 more rows +] + +@pytest.fixture +def math_dataset(): + """Raw math dataset fixture""" + return MATH_DATASET + +@pytest.fixture +def preprocess_fn(): + """Preprocessing function for the dataset""" + def _preprocess(item): + return { + "messages": [{"role": "user", "content": item["problem"]}], + "expected_answer": item["answer"] + } + return _preprocess + +@pytest.fixture(params=[ + {"model": "gpt-4", "temperature": 0.7, "max_tokens": 100}, + {"model": "gpt-3.5-turbo", "temperature": 0.5, "max_tokens": 100}, + {"model": "claude-3", "temperature": 0.3, "max_tokens": 100} +]) +def completion_params(request): + """Completion parameters - parametrized across different models""" + return request.param + +# Pointwise fixture - parametrized across BOTH completion params AND dataset rows +@pytest.fixture(params=range(len(MATH_DATASET))) +async def evaluation_row_pointwise(math_dataset, preprocess_fn, completion_params, request): + """Single evaluation row - parametrized across completion params AND dataset rows""" + framework = EvaluationFramework() + + # Get the specific row based on parametrization + row_index = request.param + raw_item = math_dataset[row_index] + processed_item = preprocess_fn(raw_item) + + # Run the completion + result = await framework.run_completion(processed_item, completion_params) + + return EvaluationRow( + input_data=processed_item, + completion_params=completion_params, + completion_response=result + ) + +# Batch fixture - parametrized across completion params only +@pytest.fixture +async def evaluation_rows_batch(math_dataset, preprocess_fn, completion_params): + """All evaluation rows - parametrized across completion params only""" + framework = EvaluationFramework() + + # Process all rows + processed_items = [preprocess_fn(item) for item in math_dataset] + + # Run completions for all rows + results = [] + for item in processed_items: + result = await framework.run_completion(item, completion_params) + results.append(EvaluationRow( + 
input_data=item, + completion_params=completion_params, + completion_response=result + )) + + return results +``` + +```python +# test_math_evaluation.py +import pytest +import re +from eval_protocol.framework import EvaluationRow + +# POINTWISE EVALUATOR - 100 separate tests (one per row per model) +def test_math_accuracy_pointwise(evaluation_row_pointwise): + """Pointwise evaluator - runs once per row per completion param""" + response = evaluation_row_pointwise.completion_response + expected = evaluation_row_pointwise.input_data["expected_answer"] + + # Extract numeric answer from response + numbers = re.findall(r'-?\d+\.?\d*', response) + if not numbers: + pytest.fail(f"Could not extract number from response: {response}") + + predicted = float(numbers[0]) + expected_num = float(expected) + + # Assert the answer is correct + assert abs(predicted - expected_num) < 0.01, \ + f"Expected {expected_num}, got {predicted} in response: {response}" + +# BATCH EVALUATOR - 3 tests total (one per model) +def test_math_accuracy_batch(evaluation_rows_batch): + """Batch evaluator - runs once per completion param with all rows""" + total_correct = 0 + total_samples = len(evaluation_rows_batch) + failed_rows = [] + + for i, row in enumerate(evaluation_rows_batch): + response = row.completion_response + expected = row.input_data["expected_answer"] + + # Extract numeric answer + numbers = re.findall(r'-?\d+\.?\d*', response) + if not numbers: + failed_rows.append({ + "index": i, + "problem": row.input_data["messages"][0]["content"], + "expected": expected, + "response": response, + "error": "Could not extract number" + }) + continue + + predicted = float(numbers[0]) + expected_num = float(expected) + + if abs(predicted - expected_num) < 0.01: + total_correct += 1 + else: + failed_rows.append({ + "index": i, + "problem": row.input_data["messages"][0]["content"], + "expected": expected, + "predicted": predicted, + "response": response, + "error": f"Expected {expected_num}, got 
{predicted}" + }) + + # Calculate accuracy + accuracy = total_correct / total_samples + + # Print detailed results for debugging + print(f"\nBatch Evaluation Results:") + print(f"Total samples: {total_samples}") + print(f"Correct: {total_correct}") + print(f"Accuracy: {accuracy:.2f}") + + if failed_rows: + print(f"\nFailed rows ({len(failed_rows)}):") + for row in failed_rows[:10]: # Show first 10 failures + print(f" Row {row['index']}: {row['problem']} -> {row.get('predicted', 'N/A')} (expected: {row['expected']})") + if len(failed_rows) > 10: + print(f" ... and {len(failed_rows) - 10} more failures") + + # Assertions + assert accuracy > 0.8, f"Accuracy {accuracy:.2f} is too low, expected > 0.8" + assert total_correct > 0, "No correct answers found" + +# Additional batch evaluator with model-specific assertions +def test_math_accuracy_with_model_info(evaluation_rows_batch): + """Batch evaluator with model-specific assertions""" + model = evaluation_rows_batch[0].completion_params["model"] + temperature = evaluation_rows_batch[0].completion_params["temperature"] + + total_correct = 0 + for row in evaluation_rows_batch: + response = row.completion_response + expected = row.input_data["expected_answer"] + + numbers = re.findall(r'-?\d+\.?\d*', response) + if numbers: + predicted = float(numbers[0]) + expected_num = float(expected) + if abs(predicted - expected_num) < 0.01: + total_correct += 1 + + accuracy = total_correct / len(evaluation_rows_batch) + + # Model-specific assertions + if model == "gpt-4": + assert accuracy > 0.9, f"GPT-4 accuracy {accuracy:.2f} is too low" + elif model == "gpt-3.5-turbo": + assert accuracy > 0.8, f"GPT-3.5 accuracy {accuracy:.2f} is too low" + elif model == "claude-3": + assert accuracy > 0.85, f"Claude-3 accuracy {accuracy:.2f} is too low" + + print(f"Model: {model}, Temperature: {temperature}, Accuracy: {accuracy:.2f}") + +# Optional: Debug function for specific rows +def 
test_math_accuracy_debug_specific_rows(evaluation_rows_batch): + """Debug function to test specific rows - only runs on first few rows""" + # Only test first 5 rows for debugging + debug_rows = evaluation_rows_batch[:5] + + for i, row in enumerate(debug_rows): + response = row.completion_response + expected = row.input_data["expected_answer"] + + numbers = re.findall(r'-?\d+\.?\d*', response) + if not numbers: + pytest.fail(f"Row {i}: Could not extract number from response: {response}") + + predicted = float(numbers[0]) + expected_num = float(expected) + + assert abs(predicted - expected_num) < 0.01, \ + f"Row {i}: Expected {expected_num}, got {predicted} in response: {response}" +``` + +### 2. Running the Tests + +```bash +# Run pointwise evaluator (100 rows × 3 models = 300 tests) +pytest test_math_evaluation.py::test_math_accuracy_pointwise -v + +# Run batch evaluator (3 models = 3 tests) +pytest test_math_evaluation.py::test_math_accuracy_batch -v + +# Run all tests (300 + 3 = 303 tests total) +pytest test_math_evaluation.py -v + +# Run with specific model +pytest test_math_evaluation.py -k "gpt-4" -v + +# Run only batch tests +pytest test_math_evaluation.py -k "batch" -v + +# Run only pointwise tests +pytest test_math_evaluation.py -k "pointwise" -v +``` + +### 3. Expected Output + +**Pointwise evaluator output:** +``` +test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-0] PASSED +test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-1] PASSED +test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-2] PASSED +# ... 97 more tests for completion_params0 +test_math_evaluation.py::test_math_accuracy_pointwise[completion_params1-0] PASSED +# ... 100 tests for completion_params1 +test_math_evaluation.py::test_math_accuracy_pointwise[completion_params2-0] PASSED +# ... 
100 tests for completion_params2 +``` + +**Batch evaluator output:** +``` +test_math_evaluation.py::test_math_accuracy_batch[completion_params0] PASSED +test_math_evaluation.py::test_math_accuracy_batch[completion_params1] PASSED +test_math_evaluation.py::test_math_accuracy_batch[completion_params2] PASSED +``` + +### 4. Key Differences + +**Pointwise Evaluator:** +- **Test count**: 100 rows × 3 models = 300 tests +- **Benefits**: Easy to debug individual rows, clear failure reporting per row +- **Use case**: When you want to see exactly which rows fail and why +- **Pytest output**: Each row gets its own test result + +**Batch Evaluator:** +- **Test count**: 3 models = 3 tests +- **Benefits**: Faster execution, easier to manage, good for overall accuracy +- **Use case**: When you care about overall performance across the dataset +- **Pytest output**: One test result per model with detailed internal reporting + +Both approaches give you the flexibility to choose the right evaluation strategy for your use case while maintaining the pytest-native approach!