Skip to content

Commit 21fdb2b

Browse files
committed
undo weird changes i made
1 parent ba6ff32 commit 21fdb2b

File tree

3 files changed

+15
-24
lines changed

3 files changed

+15
-24
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -82,28 +82,13 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
8282
"max_tokens": 131000,
8383
"extra_body": {"reasoning_effort": "low"},
8484
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
85-
},
86-
{
87-
"max_tokens": 131000,
88-
"extra_body": {"reasoning_effort": "medium"},
89-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
90-
},
91-
{
92-
"max_tokens": 131000,
93-
"extra_body": {"reasoning_effort": "low"},
94-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
95-
},
96-
{
97-
"max_tokens": 131000,
98-
"extra_body": {"reasoning_effort": "medium"},
99-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
100-
},
85+
}
10186
],
10287
rollout_processor=SingleTurnRolloutProcessor(),
10388
aggregation_method="mean",
10489
passed_threshold=0.8,
105-
num_runs=1,
106-
max_dataset_rows=1,
90+
num_runs=8,
91+
max_dataset_rows=2,
10792
max_concurrent_rollouts=4,
10893
mode="pointwise",
10994
)

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import List
66

77
from litellm import acompletion
8+
from typing import Dict
89

910
from eval_protocol.dataset_logger import default_logger
1011
from eval_protocol.models import EvaluationRow, Message
@@ -61,10 +62,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
6162
if row.tools is not None:
6263
request_params["tools"] = row.tools
6364

65+
# Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
66+
import importlib
67+
68+
_litellm = importlib.import_module("litellm")
69+
acompletion = getattr(_litellm, "acompletion")
6470
response = await acompletion(**request_params)
6571

66-
assistant_content = response.choices[0].message.content or "" # pyright: ignore[reportAttributeAccessIssue]
67-
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None # pyright: ignore[reportAttributeAccessIssue]
72+
assistant_content = response.choices[0].message.content or ""
73+
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
6874

6975
converted_tool_calls = None
7076
if tool_calls:
@@ -106,9 +112,9 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
106112
]
107113

108114
row.execution_metadata.usage = CompletionUsage(
109-
prompt_tokens=response.usage.prompt_tokens, # pyright: ignore[reportAttributeAccessIssue]
110-
completion_tokens=response.usage.completion_tokens, # pyright: ignore[reportAttributeAccessIssue]
111-
total_tokens=response.usage.total_tokens, # pyright: ignore[reportAttributeAccessIssue]
115+
prompt_tokens=response.usage.prompt_tokens,
116+
completion_tokens=response.usage.completion_tokens,
117+
total_tokens=response.usage.total_tokens,
112118
)
113119

114120
row.messages = messages

eval_protocol/quickstart/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def split_multi_turn_rows(data: list[EvaluationRow]) -> list[EvaluationRow]:
130130
)
131131
)
132132

133-
return [expanded_rows[0]]
133+
return expanded_rows
134134

135135

136136
async def pairwise_judgment_async(question_text, answer_a, answer_b, tools, judge_config, shared_client):

0 commit comments

Comments
 (0)