|
8 | 8 |
|
9 | 9 | import pytest |
10 | 10 |
|
11 | | -from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult |
| 11 | +from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult, Message |
12 | 12 | from eval_protocol.pytest import evaluation_test |
13 | 13 | from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor |
14 | 14 | from eval_protocol.quickstart.utils import ( |
|
23 | 23 | from openai import AsyncOpenAI |
24 | 24 |
|
25 | 25 |
|
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Adapt raw AIME2025 JSONL records into ``EvaluationRow`` objects.

    Each input record is expected to expose a ``question`` field (rendered as
    the user message) and an ``answer`` field (stringified into
    ``ground_truth``; left as ``None`` when absent).

    Args:
        rows: Raw dataset records as parsed JSONL dictionaries.

    Returns:
        One ``EvaluationRow`` per input record, each carrying a fixed system
        message followed by the question as the user turn.
    """

    def _to_row(record: Dict[str, Any]) -> EvaluationRow:
        # Build a single evaluation row from one raw dataset record.
        raw_answer = record.get("answer")
        # NOTE(review): the system prompt "hi" looks like a placeholder —
        # confirm whether a real instruction prompt was intended here.
        return EvaluationRow(
            messages=[
                Message(role="system", content="hi"),
                Message(role="user", content=str(record.get("question", ""))),
            ],
            ground_truth=None if raw_answer is None else str(raw_answer),
        )

    return [_to_row(record) for record in rows]
| 37 | + |
| 38 | + |
26 | 39 | @pytest.mark.asyncio |
27 | 40 | @evaluation_test( |
28 | | - input_rows=[ |
29 | | - fetch_langfuse_traces_as_evaluation_rows( |
30 | | - hours_back=24, |
31 | | - limit=1, |
32 | | - page_size=10, |
33 | | - sleep_between_gets=3.0, |
34 | | - max_retries=5, |
35 | | - ) |
| 41 | + input_dataset=[ |
| 42 | + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", |
| 43 | + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", |
36 | 44 | ], |
| 45 | + dataset_adapter=aime2025_dataset_adapter, |
37 | 46 | completion_params=[ |
38 | | - # { |
39 | | - # "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507", |
40 | | - # }, |
41 | | - {"model": "gpt-4.1"}, |
42 | 47 | { |
43 | 48 | "max_tokens": 131000, |
44 | | - "extra_body": {"reasoning_effort": "medium"}, |
| 49 | + "extra_body": {"reasoning_effort": "low"}, |
45 | 50 | "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", |
46 | 51 | }, |
47 | 52 | { |
48 | 53 | "max_tokens": 131000, |
49 | | - "extra_body": {"reasoning_effort": "low"}, |
50 | | - "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b", |
| 54 | + "extra_body": {"reasoning_effort": "medium"}, |
| 55 | + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", |
51 | 56 | }, |
52 | 57 | ], |
53 | 58 | rollout_processor=SingleTurnRolloutProcessor(), |
54 | | - # preprocess_fn=split_multi_turn_rows, |
55 | | - max_concurrent_rollouts=64, |
| 59 | + aggregation_method="mean", |
| 60 | + passed_threshold=0.8, |
| 61 | + num_runs=1, |
| 62 | + max_dataset_rows=1, |
| 63 | + max_concurrent_rollouts=4, |
56 | 64 | mode="pointwise", |
57 | 65 | ) |
58 | 66 | async def test_llm_judge(row: EvaluationRow) -> EvaluationRow: |
|
0 commit comments