|
| 1 | +Perfect! Let me give you complete examples for both approaches - pointwise (one separate test per dataset row per model) and batch (entire dataset passed to the evaluator at once).
| 2 | + |
| 3 | +## Complete Examples |
| 4 | + |
| 5 | +### 1. Pointwise Evaluator (100 tests per model)
| 6 | + |
| 7 | +```python |
| 8 | +# conftest.py |
| 9 | +import pytest |
| 10 | +from eval_protocol.framework import EvaluationFramework, EvaluationRow |
| 11 | + |
| 12 | +MATH_DATASET = [ |
| 13 | + {"problem": "What is 2+2?", "answer": "4"}, |
| 14 | + {"problem": "What is 3*3?", "answer": "9"}, |
| 15 | + {"problem": "What is 10/2?", "answer": "5"}, |
| 16 | + {"problem": "What is 15-7?", "answer": "8"}, |
| 17 | + {"problem": "What is 6*7?", "answer": "42"}, |
| 18 | + # ... 95 more rows |
| 19 | +] |
| 20 | + |
| 21 | +@pytest.fixture |
| 22 | +def math_dataset(): |
| 23 | + """Raw math dataset fixture""" |
| 24 | + return MATH_DATASET |
| 25 | + |
| 26 | +@pytest.fixture |
| 27 | +def preprocess_fn(): |
| 28 | + """Preprocessing function for the dataset""" |
| 29 | + def _preprocess(item): |
| 30 | + return { |
| 31 | + "messages": [{"role": "user", "content": item["problem"]}], |
| 32 | + "expected_answer": item["answer"] |
| 33 | + } |
| 34 | + return _preprocess |
| 35 | + |
| 36 | +@pytest.fixture(params=[ |
| 37 | + {"model": "gpt-4", "temperature": 0.7, "max_tokens": 100}, |
| 38 | + {"model": "gpt-3.5-turbo", "temperature": 0.5, "max_tokens": 100}, |
| 39 | + {"model": "claude-3", "temperature": 0.3, "max_tokens": 100} |
| 40 | +]) |
| 41 | +def completion_params(request): |
| 42 | + """Completion parameters - parametrized across different models""" |
| 43 | + return request.param |
| 44 | + |
| 45 | +# Pointwise fixture - parametrized across BOTH completion params AND dataset rows |
| 46 | +@pytest.fixture(params=range(len(MATH_DATASET))) |
| 47 | +async def evaluation_row_pointwise(math_dataset, preprocess_fn, completion_params, request):
| 48 | + """Single evaluation row - parametrized across completion params AND dataset rows""" |
| 49 | + framework = EvaluationFramework() |
| 50 | + |
| 51 | + # Get the specific row based on parametrization |
| 52 | + row_index = request.param |
| 53 | + raw_item = math_dataset[row_index] |
| 54 | + processed_item = preprocess_fn(raw_item) |
| 55 | + |
| 56 | + # Run the completion |
| 57 | + result = await framework.run_completion(processed_item, completion_params) |
| 58 | + |
| 59 | + return EvaluationRow( |
| 60 | + input_data=processed_item, |
| 61 | + completion_params=completion_params, |
| 62 | + completion_response=result |
| 63 | + ) |
| 64 | + |
| 65 | +# Batch fixture - parametrized across completion params only |
| 66 | +@pytest.fixture |
| 67 | +async def evaluation_rows_batch(math_dataset, preprocess_fn, completion_params): |
| 68 | + """All evaluation rows - parametrized across completion params only""" |
| 69 | + framework = EvaluationFramework() |
| 70 | + |
| 71 | + # Process all rows |
| 72 | + processed_items = [preprocess_fn(item) for item in math_dataset] |
| 73 | + |
| 74 | + # Run completions for all rows |
| 75 | + results = [] |
| 76 | + for item in processed_items: |
| 77 | + result = await framework.run_completion(item, completion_params) |
| 78 | + results.append(EvaluationRow( |
| 79 | + input_data=item, |
| 80 | + completion_params=completion_params, |
| 81 | + completion_response=result |
| 82 | + )) |
| 83 | + |
| 84 | + return results |
| 85 | +``` |
| 86 | + |
| 87 | +```python |
| 88 | +# test_math_evaluation.py |
| 89 | +import pytest |
| 90 | +import re |
| 91 | +from eval_protocol.framework import EvaluationRow |
| 92 | + |
| 93 | +# POINTWISE EVALUATOR - 100 separate tests (one per row per model) |
| 94 | +def test_math_accuracy_pointwise(evaluation_row_pointwise): |
| 95 | + """Pointwise evaluator - runs once per row per completion param""" |
| 96 | + response = evaluation_row_pointwise.completion_response |
| 97 | + expected = evaluation_row_pointwise.input_data["expected_answer"] |
| 98 | + |
| 99 | + # Extract numeric answer from response |
| 100 | + numbers = re.findall(r'-?\d+\.?\d*', response) |
| 101 | + if not numbers: |
| 102 | + pytest.fail(f"Could not extract number from response: {response}") |
| 103 | + |
| 104 | + predicted = float(numbers[0]) |
| 105 | + expected_num = float(expected) |
| 106 | + |
| 107 | + # Assert the answer is correct |
| 108 | + assert abs(predicted - expected_num) < 0.01, \ |
| 109 | + f"Expected {expected_num}, got {predicted} in response: {response}" |
| 110 | + |
| 111 | +# BATCH EVALUATOR - 3 tests total (one per model) |
| 112 | +def test_math_accuracy_batch(evaluation_rows_batch): |
| 113 | + """Batch evaluator - runs once per completion param with all rows""" |
| 114 | + total_correct = 0 |
| 115 | + total_samples = len(evaluation_rows_batch) |
| 116 | + failed_rows = [] |
| 117 | + |
| 118 | + for i, row in enumerate(evaluation_rows_batch): |
| 119 | + response = row.completion_response |
| 120 | + expected = row.input_data["expected_answer"] |
| 121 | + |
| 122 | + # Extract numeric answer |
| 123 | + numbers = re.findall(r'-?\d+\.?\d*', response) |
| 124 | + if not numbers: |
| 125 | + failed_rows.append({ |
| 126 | + "index": i, |
| 127 | + "problem": row.input_data["messages"][0]["content"], |
| 128 | + "expected": expected, |
| 129 | + "response": response, |
| 130 | + "error": "Could not extract number" |
| 131 | + }) |
| 132 | + continue |
| 133 | + |
| 134 | + predicted = float(numbers[0]) |
| 135 | + expected_num = float(expected) |
| 136 | + |
| 137 | + if abs(predicted - expected_num) < 0.01: |
| 138 | + total_correct += 1 |
| 139 | + else: |
| 140 | + failed_rows.append({ |
| 141 | + "index": i, |
| 142 | + "problem": row.input_data["messages"][0]["content"], |
| 143 | + "expected": expected, |
| 144 | + "predicted": predicted, |
| 145 | + "response": response, |
| 146 | + "error": f"Expected {expected_num}, got {predicted}" |
| 147 | + }) |
| 148 | + |
| 149 | + # Calculate accuracy |
| 150 | + accuracy = total_correct / total_samples |
| 151 | + |
| 152 | + # Print detailed results for debugging |
| 153 | + print(f"\nBatch Evaluation Results:") |
| 154 | + print(f"Total samples: {total_samples}") |
| 155 | + print(f"Correct: {total_correct}") |
| 156 | + print(f"Accuracy: {accuracy:.2f}") |
| 157 | + |
| 158 | + if failed_rows: |
| 159 | + print(f"\nFailed rows ({len(failed_rows)}):") |
| 160 | + for row in failed_rows[:10]: # Show first 10 failures |
| 161 | + print(f" Row {row['index']}: {row['problem']} -> {row.get('predicted', 'N/A')} (expected: {row['expected']})") |
| 162 | + if len(failed_rows) > 10: |
| 163 | + print(f" ... and {len(failed_rows) - 10} more failures") |
| 164 | + |
| 165 | + # Assertions |
| 166 | + assert accuracy > 0.8, f"Accuracy {accuracy:.2f} is too low, expected > 0.8" |
| 167 | + assert total_correct > 0, "No correct answers found" |
| 168 | + |
| 169 | +# Additional batch evaluator with model-specific assertions |
| 170 | +def test_math_accuracy_with_model_info(evaluation_rows_batch): |
| 171 | + """Batch evaluator with model-specific assertions""" |
| 172 | + model = evaluation_rows_batch[0].completion_params["model"] |
| 173 | + temperature = evaluation_rows_batch[0].completion_params["temperature"] |
| 174 | + |
| 175 | + total_correct = 0 |
| 176 | + for row in evaluation_rows_batch: |
| 177 | + response = row.completion_response |
| 178 | + expected = row.input_data["expected_answer"] |
| 179 | + |
| 180 | + numbers = re.findall(r'-?\d+\.?\d*', response) |
| 181 | + if numbers: |
| 182 | + predicted = float(numbers[0]) |
| 183 | + expected_num = float(expected) |
| 184 | + if abs(predicted - expected_num) < 0.01: |
| 185 | + total_correct += 1 |
| 186 | + |
| 187 | + accuracy = total_correct / len(evaluation_rows_batch) |
| 188 | + |
| 189 | + # Model-specific assertions |
| 190 | + if model == "gpt-4": |
| 191 | + assert accuracy > 0.9, f"GPT-4 accuracy {accuracy:.2f} is too low" |
| 192 | + elif model == "gpt-3.5-turbo": |
| 193 | + assert accuracy > 0.8, f"GPT-3.5 accuracy {accuracy:.2f} is too low" |
| 194 | + elif model == "claude-3": |
| 195 | + assert accuracy > 0.85, f"Claude-3 accuracy {accuracy:.2f} is too low" |
| 196 | + |
| 197 | + print(f"Model: {model}, Temperature: {temperature}, Accuracy: {accuracy:.2f}") |
| 198 | + |
| 199 | +# Optional: Debug function for specific rows |
| 200 | +def test_math_accuracy_debug_specific_rows(evaluation_rows_batch): |
| 201 | + """Debug function to test specific rows - only runs on first few rows""" |
| 202 | + # Only test first 5 rows for debugging |
| 203 | + debug_rows = evaluation_rows_batch[:5] |
| 204 | + |
| 205 | + for i, row in enumerate(debug_rows): |
| 206 | + response = row.completion_response |
| 207 | + expected = row.input_data["expected_answer"] |
| 208 | + |
| 209 | + numbers = re.findall(r'-?\d+\.?\d*', response) |
| 210 | + if not numbers: |
| 211 | + pytest.fail(f"Row {i}: Could not extract number from response: {response}") |
| 212 | + |
| 213 | + predicted = float(numbers[0]) |
| 214 | + expected_num = float(expected) |
| 215 | + |
| 216 | + assert abs(predicted - expected_num) < 0.01, \ |
| 217 | + f"Row {i}: Expected {expected_num}, got {predicted} in response: {response}" |
| 218 | +``` |
| 219 | + |
| 220 | +### 2. Running the Tests |
| 221 | + |
| 222 | +```bash |
| 223 | +# Run pointwise evaluator (100 rows × 3 models = 300 tests) |
| 224 | +pytest test_math_evaluation.py::test_math_accuracy_pointwise -v |
| 225 | + |
| 226 | +# Run batch evaluator (3 models = 3 tests) |
| 227 | +pytest test_math_evaluation.py::test_math_accuracy_batch -v |
| 228 | + |
| 229 | +# Run all tests (300 + 3 = 303 tests total) |
| 230 | +pytest test_math_evaluation.py -v |
| 231 | + |
| 232 | +# Run with specific model |
| 233 | +pytest test_math_evaluation.py -k "gpt-4" -v |
| 234 | + |
| 235 | +# Run only batch tests |
| 236 | +pytest test_math_evaluation.py -k "batch" -v |
| 237 | + |
| 238 | +# Run only pointwise tests |
| 239 | +pytest test_math_evaluation.py -k "pointwise" -v |
| 240 | +``` |
| 241 | + |
| 242 | +### 3. Expected Output |
| 243 | + |
| 244 | +**Pointwise evaluator output:** |
| 245 | +``` |
| 246 | +test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-0] PASSED |
| 247 | +test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-1] PASSED |
| 248 | +test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-2] PASSED |
| 249 | +# ... 97 more tests for completion_params0 |
| 250 | +test_math_evaluation.py::test_math_accuracy_pointwise[completion_params1-0] PASSED |
| 251 | +# ... 100 tests for completion_params1 |
| 252 | +test_math_evaluation.py::test_math_accuracy_pointwise[completion_params2-0] PASSED |
| 253 | +# ... 100 tests for completion_params2 |
| 254 | +``` |
| 255 | + |
| 256 | +**Batch evaluator output:** |
| 257 | +``` |
| 258 | +test_math_evaluation.py::test_math_accuracy_batch[completion_params0] PASSED |
| 259 | +test_math_evaluation.py::test_math_accuracy_batch[completion_params1] PASSED |
| 260 | +test_math_evaluation.py::test_math_accuracy_batch[completion_params2] PASSED |
| 261 | +``` |
| 262 | + |
| 263 | +### 4. Key Differences |
| 264 | + |
| 265 | +**Pointwise Evaluator:** |
| 266 | +- **Test count**: 100 rows × 3 models = 300 tests |
| 267 | +- **Benefits**: Easy to debug individual rows, clear failure reporting per row |
| 268 | +- **Use case**: When you want to see exactly which rows fail and why |
| 269 | +- **Pytest output**: Each row gets its own test result |
| 270 | + |
| 271 | +**Batch Evaluator:** |
| 272 | +- **Test count**: 3 models = 3 tests |
| 273 | +- **Benefits**: Faster execution, easier to manage, good for overall accuracy |
| 274 | +- **Use case**: When you care about overall performance across the dataset |
| 275 | +- **Pytest output**: One test result per model with detailed internal reporting |
| 276 | + |
| 277 | +Both approaches give you the flexibility to choose the right evaluation strategy for your use case while maintaining the pytest-native approach! |
0 commit comments