Skip to content

Commit 352297c

Browse files
committed
try proper streaming helper
1 parent fce442b commit 352297c

File tree

2 files changed

+27
-39
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
5959
return None
6060

6161

62+
def _get_aime_dataset_path() -> str:
63+
"""Get the AIME dataset file path."""
64+
return str(Path(__file__).parent / "data" / "aime.jsonl")
65+
66+
6267
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
6368
converted: List[EvaluationRow] = []
6469
for r in rows:
@@ -74,24 +79,25 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
7479

7580
@evaluation_test(
7681
input_dataset=[
82+
# _get_aime_dataset_path(),
7783
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
7884
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
7985
],
8086
dataset_adapter=aime2025_dataset_adapter,
8187
completion_params=[
8288
{
8389
"max_tokens": 131000,
84-
# "extra_body": {"reasoning_effort": "low"},
85-
"model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
86-
# "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
90+
"extra_body": {"reasoning_effort": "low"},
91+
# "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
92+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
8793
}
8894
],
8995
rollout_processor=SingleTurnRolloutProcessor(),
9096
aggregation_method="mean",
9197
passed_threshold=0.8,
9298
num_runs=1,
93-
max_dataset_rows=30,
94-
max_concurrent_rollouts=8,
99+
max_dataset_rows=1,
100+
max_concurrent_rollouts=1,
95101
mode="pointwise",
96102
)
97103
def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 16 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616

1717
logger = logging.getLogger(__name__)
1818

19-
litellm._turn_on_debug() # pyright: ignore[reportPrivateImportUsage]
20-
2119

2220
class SingleTurnRolloutProcessor(RolloutProcessor):
2321
"""Single turn rollout processor for direct LLM calls."""
@@ -66,26 +64,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
6664
if row.tools is not None:
6765
request_params["tools"] = row.tools
6866

69-
# _litellm = importlib.import_module("litellm")
70-
# acompletion = getattr(_litellm, "acompletion")
71-
72-
# Handle streaming response
73-
assistant_content = ""
74-
tool_calls = None
75-
usage_info = None
67+
chunks = []
7668

7769
stream = await acompletion(**request_params)
78-
async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues]
79-
if chunk.choices and len(chunk.choices) > 0:
80-
delta = chunk.choices[0].delta
81-
if hasattr(delta, "content") and delta.content:
82-
assistant_content += delta.content
83-
if hasattr(delta, "tool_calls") and delta.tool_calls:
84-
tool_calls = delta.tool_calls
85-
86-
# Capture usage info from the final chunk
87-
if hasattr(chunk, "usage") and chunk.usage:
88-
usage_info = chunk.usage
70+
async for chunk in stream:
71+
chunks.append(chunk)
72+
73+
response = litellm.stream_chunk_builder(chunks, messages_payload)
74+
75+
if response is None:
76+
raise ValueError("Response is None")
77+
78+
assistant_content = response.choices[0].message.content or ""
79+
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
8980

9081
converted_tool_calls = None
9182
if tool_calls:
@@ -125,20 +116,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
125116
tool_calls=converted_tool_calls,
126117
)
127118
]
128-
129-
if usage_info:
130-
row.execution_metadata.usage = CompletionUsage(
131-
prompt_tokens=usage_info.prompt_tokens,
132-
completion_tokens=usage_info.completion_tokens,
133-
total_tokens=usage_info.total_tokens,
134-
)
135-
else:
136-
# Fallback if usage info not available from streaming
137-
row.execution_metadata.usage = CompletionUsage(
138-
prompt_tokens=0,
139-
completion_tokens=0,
140-
total_tokens=0,
141-
)
119+
row.execution_metadata.usage = CompletionUsage(
120+
prompt_tokens=response.usage.prompt_tokens,
121+
completion_tokens=response.usage.completion_tokens,
122+
total_tokens=response.usage.total_tokens,
123+
)
142124

143125
row.messages = messages
144126

Comments (0)