# test_pytest_math_example.py

import re
from typing import Optional

from eval_protocol.models import EvaluateResult, EvaluationRow
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test


def extract_answer_digits(text: str) -> Optional[str]:
    """
    Extract the digits inside the <answer>...</answer> tags, or None if the
    tags (or any digits within them) are missing.
    """
    if "<answer>" not in text or "</answer>" not in text:
        return None
    answer_string = text.split("<answer>")[1].split("</answer>")[0]
    match = re.search(r"(\d+)", answer_string)
    return match.group(1) if match else None
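
# A quick sanity check of the parser (hypothetical reply strings, not taken
# from the dataset):
#   extract_answer_digits("<think>6 * 7 = 42</think><answer>42</answer>")  # -> "42"
#   extract_answer_digits("no tags here")                                  # -> None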


@evaluation_test(
    input_dataset=["development/gsm8k_sample.jsonl"],
    completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
    max_dataset_rows=5,  # Evaluate only the first 5 rows of the dataset
    passed_threshold=0.0,
    rollout_processor=SingleTurnRolloutProcessor(),
    mode="pointwise",  # Score each row independently
    evaluation_test_kwargs=[
        {"math_reward_kwargs": {"tolerance": 0.001, "absolute_tolerance": 1e-8, "require_units": False}}
    ],
)
def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow:
    """
    Evaluate math problem solving by exact match on the extracted answer.

    The model is expected to reply in a
    <think>...</think><answer>...</answer> format; the digits inside the
    answer tags of the reply and of the ground truth are compared.

    Args:
        row: EvaluationRow containing the conversation messages and ground truth
        **kwargs: Additional parameters (such as math_reward_kwargs from the
            decorator above)

    Returns:
        EvaluationRow with the evaluation result attached
    """
    # Extract the answer digits from the model reply (assumed to be the third
    # message in the rollout: system, user, assistant) and the ground truth.
    prediction = extract_answer_digits(str(row.messages[2].content))
    gt = extract_answer_digits(str(row.ground_truth))

    # Score 1.0 for an exact match, 0.0 otherwise.
    if prediction is None or gt is None:
        score = 0.0
        reason = "Missing answer tags in prediction or ground truth."
    elif gt == prediction:
        score = 1.0
        reason = "Model answer is correct."
    else:
        score = 0.0
        reason = "Model answer is not correct."
    reason += f" Prediction: {prediction}, Ground Truth: {gt}"

    evaluation_result = EvaluateResult(
        score=score,  # Required: the final evaluation score
        is_score_valid=True,  # Optional: whether the score is valid (True by default)
        reason=reason,  # Optional: human-readable explanation of the score
    )
    row.evaluation_result = evaluation_result
    return row
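

# A minimal sketch of how to run this evaluation locally, assuming
# eval_protocol and pytest are installed and your Fireworks credentials are
# configured (the exact setup depends on your environment):
#
#   pytest test_pytest_math_example.py -v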