Skip to content

Commit ee7e415

Browse files
committed
try all
1 parent e36383f commit ee7e415

File tree

2 files changed

+15
-18
lines changed

2 files changed

+15
-18
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -79,25 +79,26 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
7979

8080
@evaluation_test(
8181
input_dataset=[
82-
_get_aime_dataset_path(),
83-
# "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
84-
# "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
82+
# _get_aime_dataset_path(),
83+
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
84+
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
8585
],
8686
dataset_adapter=aime2025_dataset_adapter,
8787
completion_params=[
8888
{
89-
"max_tokens": 65536,
89+
"max_tokens": 131000,
9090
# "extra_body": {"reasoning_effort": "low"},
9191
"model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
9292
# "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
93+
"stream": True,
9394
}
9495
],
9596
rollout_processor=SingleTurnRolloutProcessor(),
9697
aggregation_method="mean",
9798
passed_threshold=0.8,
9899
num_runs=1,
99-
max_dataset_rows=1,
100-
max_concurrent_rollouts=1,
100+
max_dataset_rows=30,
101+
max_concurrent_rollouts=8,
101102
mode="pointwise",
102103
)
103104
def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 8 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66

77
import litellm
88
from litellm import acompletion
9-
from typing import Dict
109

1110
from eval_protocol.dataset_logger import default_logger
1211
from eval_protocol.models import EvaluationRow, Message
@@ -36,7 +35,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
3635
request_params = {"messages": messages_payload, **config.completion_params}
3736
# Ensure caching is disabled only for this request (review feedback)
3837
request_params["cache"] = {"no-cache": True}
39-
request_params["stream"] = True # Enable streaming
4038
# Single-level reasoning effort: expect `reasoning_effort` only
4139
effort_val = None
4240

@@ -64,16 +62,14 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
6462
if row.tools is not None:
6563
request_params["tools"] = row.tools
6664

67-
chunks = []
68-
print("time: ", time.time())
69-
70-
stream = await acompletion(**request_params)
71-
async for chunk in stream:
72-
# print("chunk added at time: ", time.time())
73-
# print("chunk: ", chunk)
74-
chunks.append(chunk)
75-
76-
response = litellm.stream_chunk_builder(chunks, messages_payload)
65+
if request_params.get("stream") is True:
66+
chunks = []
67+
stream = await acompletion(**request_params)
68+
async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues]
69+
chunks.append(chunk)
70+
response = litellm.stream_chunk_builder(chunks, messages_payload)
71+
else:
72+
response = await acompletion(**request_params)
7773

7874
if response is None:
7975
raise ValueError("Response is None")

0 commit comments

Comments
 (0)