-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathtest_apps_coding.py
More file actions
60 lines (49 loc) · 1.86 KB
/
test_apps_coding.py
File metadata and controls
60 lines (49 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
Pytest test for APPS coding evaluation using the evaluation_test decorator.
This test demonstrates how to evaluate code correctness for competitive programming problems
using the actual evaluate_apps_solution function from apps_coding_reward.py.
"""
import json
from typing import Any, Dict, List
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
from eval_protocol.rewards.apps_coding_reward import evaluate_apps_solution
def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Convert entries from the APPS dataset to EvaluationRow objects.

    Args:
        data: Dataset entries; each one must provide a "question" key (used as the
            user prompt) and an "input_output" key (used as the ground truth).

    Returns:
        One EvaluationRow per entry, in input order. Each row holds a single user
        message containing the question, and carries the entry's "input_output"
        value, unmodified, as its ground_truth.

    Raises:
        KeyError: If an entry lacks the "question" or "input_output" key.
    """
    return [
        EvaluationRow(
            messages=[Message(role="user", content=row["question"])],
            ground_truth=row["input_output"],
        )
        for row in data
    ]
@evaluation_test(
    input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"],
    dataset_adapter=apps_dataset_to_evaluation_row,
    completion_params=[
        {
            "temperature": 0.0,
            "max_tokens": 4096,
            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
        }
    ],
    passed_threshold=0.33,
    rollout_processor=SingleTurnRolloutProcessor(),
    num_runs=1,
    mode="pointwise",
)
def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
    """
    Evaluation function that tests APPS coding problems using evaluate_apps_solution.

    Args:
        row: EvaluationRow containing the conversation messages and the
            ground_truth (expected to be a JSON string).

    Returns:
        The same EvaluationRow, with evaluation_result set to the value returned
        by evaluate_apps_solution.
    """
    raw_ground_truth = row.ground_truth
    if isinstance(raw_ground_truth, (dict, list)):
        # str() on a container yields a Python repr (single quotes, None/True),
        # which is not valid JSON; serialize it properly instead.
        # NOTE(review): assumes evaluate_apps_solution parses a JSON string - confirm.
        ground_truth = json.dumps(raw_ground_truth)
    else:
        # Strings pass through unchanged; other values keep the str() coercion.
        ground_truth = str(raw_ground_truth)

    # Score the rollout and attach the result to the row.
    row.evaluation_result = evaluate_apps_solution(
        messages=row.messages,
        ground_truth=ground_truth,
    )
    return row