Skip to content

Commit aa6077c

Browse files
author
Dylan Huang
committed
v2 proposal
1 parent fc15602 commit aa6077c

File tree

1 file changed

+277
-0
lines changed

1 file changed

+277
-0
lines changed
Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
Perfect! Let me give you complete examples for both approaches - pointwise (100 separate tests) and batch (the entire dataset passed to the evaluator at once).

## Complete Examples

### 1. Pointwise Evaluator (100 separate tests)
```python
# conftest.py
import pytest

from eval_protocol.framework import EvaluationFramework, EvaluationRow

# Math QA dataset: each row pairs a natural-language problem with the exact
# expected answer string. Only the first 5 rows are shown here; the full
# dataset contains 100 rows.
MATH_DATASET = [
    {"problem": "What is 2+2?", "answer": "4"},
    {"problem": "What is 3*3?", "answer": "9"},
    {"problem": "What is 10/2?", "answer": "5"},
    {"problem": "What is 15-7?", "answer": "8"},
    {"problem": "What is 6*7?", "answer": "42"},
    # ... 95 more rows
]
@pytest.fixture
def math_dataset():
    """Raw math dataset fixture: the list of problem/answer dicts."""
    return MATH_DATASET
@pytest.fixture
def preprocess_fn():
    """Preprocessing function fixture.

    Returns a callable that converts one raw dataset row into the
    chat-message shape consumed by the completion framework, carrying the
    expected answer alongside for later scoring.
    """
    def _preprocess(item):
        return {
            "messages": [{"role": "user", "content": item["problem"]}],
            "expected_answer": item["answer"],
        }
    return _preprocess
@pytest.fixture(params=[
    {"model": "gpt-4", "temperature": 0.7, "max_tokens": 100},
    {"model": "gpt-3.5-turbo", "temperature": 0.5, "max_tokens": 100},
    {"model": "claude-3", "temperature": 0.3, "max_tokens": 100},
])
def completion_params(request):
    """Completion parameters - parametrized across different models."""
    return request.param
# Pointwise fixture - parametrized across BOTH completion params AND dataset rows
@pytest.fixture(params=range(len(MATH_DATASET)))
async def evaluation_row_pointwise(math_dataset, preprocess_fn, completion_params, request):
    """Single evaluation row - parametrized across completion params AND dataset rows.

    FIX(review): the original declared this as a plain ``def`` while using
    ``await`` in its body, which is a SyntaxError. It is now ``async def``
    (to be run under pytest-asyncio), consistent with the batch fixture.
    """
    framework = EvaluationFramework()

    # request.param is the dataset row index supplied by the fixture's params.
    row_index = request.param
    raw_item = math_dataset[row_index]
    processed_item = preprocess_fn(raw_item)

    # Run the completion for this single row.
    result = await framework.run_completion(processed_item, completion_params)

    return EvaluationRow(
        input_data=processed_item,
        completion_params=completion_params,
        completion_response=result,
    )
# Batch fixture - parametrized across completion params only
@pytest.fixture
async def evaluation_rows_batch(math_dataset, preprocess_fn, completion_params):
    """All evaluation rows - parametrized across completion params only.

    Runs one completion per dataset row and returns the full list of
    ``EvaluationRow`` objects, preserving dataset order.
    """
    framework = EvaluationFramework()

    # Preprocess every row up front.
    processed_items = [preprocess_fn(item) for item in math_dataset]

    # Completions are awaited sequentially so row order (and any rate-limit
    # behavior) matches the dataset order.
    results = []
    for item in processed_items:
        result = await framework.run_completion(item, completion_params)
        results.append(EvaluationRow(
            input_data=item,
            completion_params=completion_params,
            completion_response=result,
        ))

    return results
```
86+
87+
```python
# test_math_evaluation.py
import re

import pytest

from eval_protocol.framework import EvaluationRow
# POINTWISE EVALUATOR - 100 separate tests (one per row per model)
94+
def test_math_accuracy_pointwise(evaluation_row_pointwise):
95+
"""Pointwise evaluator - runs once per row per completion param"""
96+
response = evaluation_row_pointwise.completion_response
97+
expected = evaluation_row_pointwise.input_data["expected_answer"]
98+
99+
# Extract numeric answer from response
100+
numbers = re.findall(r'-?\d+\.?\d*', response)
101+
if not numbers:
102+
pytest.fail(f"Could not extract number from response: {response}")
103+
104+
predicted = float(numbers[0])
105+
expected_num = float(expected)
106+
107+
# Assert the answer is correct
108+
assert abs(predicted - expected_num) < 0.01, \
109+
f"Expected {expected_num}, got {predicted} in response: {response}"
110+
111+
# BATCH EVALUATOR - 3 tests total (one per model)
112+
def test_math_accuracy_batch(evaluation_rows_batch):
113+
"""Batch evaluator - runs once per completion param with all rows"""
114+
total_correct = 0
115+
total_samples = len(evaluation_rows_batch)
116+
failed_rows = []
117+
118+
for i, row in enumerate(evaluation_rows_batch):
119+
response = row.completion_response
120+
expected = row.input_data["expected_answer"]
121+
122+
# Extract numeric answer
123+
numbers = re.findall(r'-?\d+\.?\d*', response)
124+
if not numbers:
125+
failed_rows.append({
126+
"index": i,
127+
"problem": row.input_data["messages"][0]["content"],
128+
"expected": expected,
129+
"response": response,
130+
"error": "Could not extract number"
131+
})
132+
continue
133+
134+
predicted = float(numbers[0])
135+
expected_num = float(expected)
136+
137+
if abs(predicted - expected_num) < 0.01:
138+
total_correct += 1
139+
else:
140+
failed_rows.append({
141+
"index": i,
142+
"problem": row.input_data["messages"][0]["content"],
143+
"expected": expected,
144+
"predicted": predicted,
145+
"response": response,
146+
"error": f"Expected {expected_num}, got {predicted}"
147+
})
148+
149+
# Calculate accuracy
150+
accuracy = total_correct / total_samples
151+
152+
# Print detailed results for debugging
153+
print(f"\nBatch Evaluation Results:")
154+
print(f"Total samples: {total_samples}")
155+
print(f"Correct: {total_correct}")
156+
print(f"Accuracy: {accuracy:.2f}")
157+
158+
if failed_rows:
159+
print(f"\nFailed rows ({len(failed_rows)}):")
160+
for row in failed_rows[:10]: # Show first 10 failures
161+
print(f" Row {row['index']}: {row['problem']} -> {row.get('predicted', 'N/A')} (expected: {row['expected']})")
162+
if len(failed_rows) > 10:
163+
print(f" ... and {len(failed_rows) - 10} more failures")
164+
165+
# Assertions
166+
assert accuracy > 0.8, f"Accuracy {accuracy:.2f} is too low, expected > 0.8"
167+
assert total_correct > 0, "No correct answers found"
168+
169+
# Additional batch evaluator with model-specific assertions
170+
def test_math_accuracy_with_model_info(evaluation_rows_batch):
171+
"""Batch evaluator with model-specific assertions"""
172+
model = evaluation_rows_batch[0].completion_params["model"]
173+
temperature = evaluation_rows_batch[0].completion_params["temperature"]
174+
175+
total_correct = 0
176+
for row in evaluation_rows_batch:
177+
response = row.completion_response
178+
expected = row.input_data["expected_answer"]
179+
180+
numbers = re.findall(r'-?\d+\.?\d*', response)
181+
if numbers:
182+
predicted = float(numbers[0])
183+
expected_num = float(expected)
184+
if abs(predicted - expected_num) < 0.01:
185+
total_correct += 1
186+
187+
accuracy = total_correct / len(evaluation_rows_batch)
188+
189+
# Model-specific assertions
190+
if model == "gpt-4":
191+
assert accuracy > 0.9, f"GPT-4 accuracy {accuracy:.2f} is too low"
192+
elif model == "gpt-3.5-turbo":
193+
assert accuracy > 0.8, f"GPT-3.5 accuracy {accuracy:.2f} is too low"
194+
elif model == "claude-3":
195+
assert accuracy > 0.85, f"Claude-3 accuracy {accuracy:.2f} is too low"
196+
197+
print(f"Model: {model}, Temperature: {temperature}, Accuracy: {accuracy:.2f}")
198+
199+
# Optional: Debug function for specific rows
200+
def test_math_accuracy_debug_specific_rows(evaluation_rows_batch):
201+
"""Debug function to test specific rows - only runs on first few rows"""
202+
# Only test first 5 rows for debugging
203+
debug_rows = evaluation_rows_batch[:5]
204+
205+
for i, row in enumerate(debug_rows):
206+
response = row.completion_response
207+
expected = row.input_data["expected_answer"]
208+
209+
numbers = re.findall(r'-?\d+\.?\d*', response)
210+
if not numbers:
211+
pytest.fail(f"Row {i}: Could not extract number from response: {response}")
212+
213+
predicted = float(numbers[0])
214+
expected_num = float(expected)
215+
216+
assert abs(predicted - expected_num) < 0.01, \
217+
f"Row {i}: Expected {expected_num}, got {predicted} in response: {response}"
218+
```
### 2. Running the Tests

```bash
# Run pointwise evaluator (100 rows × 3 models = 300 tests)
pytest test_math_evaluation.py::test_math_accuracy_pointwise -v

# Run batch evaluator (3 models = 3 tests)
pytest test_math_evaluation.py::test_math_accuracy_batch -v

# Run all tests (300 + 3 = 303 tests total)
pytest test_math_evaluation.py -v

# Run a specific model config (-k matches the generated test ID,
# e.g. completion_params0 is the first entry: gpt-4)
pytest test_math_evaluation.py -k "completion_params0" -v

# Run only batch tests
pytest test_math_evaluation.py -k "batch" -v

# Run only pointwise tests
pytest test_math_evaluation.py -k "pointwise" -v
```
### 3. Expected Output

**Pointwise evaluator output:**
```
test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-0] PASSED
test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-1] PASSED
test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-2] PASSED
# ... 97 more tests for completion_params0
test_math_evaluation.py::test_math_accuracy_pointwise[completion_params1-0] PASSED
# ... 100 tests for completion_params1
test_math_evaluation.py::test_math_accuracy_pointwise[completion_params2-0] PASSED
# ... 100 tests for completion_params2
```

**Batch evaluator output:**
```
test_math_evaluation.py::test_math_accuracy_batch[completion_params0] PASSED
test_math_evaluation.py::test_math_accuracy_batch[completion_params1] PASSED
test_math_evaluation.py::test_math_accuracy_batch[completion_params2] PASSED
```
### 4. Key Differences

**Pointwise Evaluator:**
- **Test count**: 100 rows × 3 models = 300 tests
- **Benefits**: Easy to debug individual rows, clear failure reporting per row
- **Use case**: When you want to see exactly which rows fail and why
- **Pytest output**: Each row gets its own test result

**Batch Evaluator:**
- **Test count**: 3 models = 3 tests
- **Benefits**: Faster execution, easier to manage, good for overall accuracy
- **Use case**: When you care about overall performance across the dataset
- **Pytest output**: One test result per model with detailed internal reporting

Both approaches give you the flexibility to choose the right evaluation strategy for your use case while maintaining the pytest-native approach!

0 commit comments

Comments
 (0)