Skip to content

Commit af137b3

Browse files
committed
test w streaming
1 parent 66278e4 commit af137b3

File tree

2 files changed

+32
-11
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
8787
completion_params=[
8888
{
8989
"max_tokens": 131000,
90-
# "extra_body": {"reasoning_effort": "low"},
90+
"extra_body": {"reasoning_effort": "low"},
9191
"model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
92-
"request_timeout": 30,
92+
# "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
9393
}
9494
],
9595
rollout_processor=SingleTurnRolloutProcessor(),

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
3535
request_params = {"messages": messages_payload, **config.completion_params}
3636
# Ensure caching is disabled only for this request (review feedback)
3737
request_params["cache"] = {"no-cache": True}
38-
request_params["timeout"] = 1200 # 20 minutes timeout
38+
request_params["stream"] = True # Enable streaming
3939
# Single-level reasoning effort: expect `reasoning_effort` only
4040
effort_val = None
4141

@@ -68,10 +68,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
6868

6969
_litellm = importlib.import_module("litellm")
7070
acompletion = getattr(_litellm, "acompletion")
71-
response = await acompletion(**request_params)
7271

73-
assistant_content = response.choices[0].message.content or ""
74-
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
72+
# Handle streaming response
73+
assistant_content = ""
74+
tool_calls = None
75+
usage_info = None
76+
77+
async for chunk in await acompletion(**request_params):
78+
if chunk.choices and len(chunk.choices) > 0:
79+
delta = chunk.choices[0].delta
80+
if hasattr(delta, "content") and delta.content:
81+
assistant_content += delta.content
82+
if hasattr(delta, "tool_calls") and delta.tool_calls:
83+
tool_calls = delta.tool_calls
84+
85+
# Capture usage info from the final chunk
86+
if hasattr(chunk, "usage") and chunk.usage:
87+
usage_info = chunk.usage
7588

7689
converted_tool_calls = None
7790
if tool_calls:
@@ -112,11 +125,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
112125
)
113126
]
114127

115-
row.execution_metadata.usage = CompletionUsage(
116-
prompt_tokens=response.usage.prompt_tokens,
117-
completion_tokens=response.usage.completion_tokens,
118-
total_tokens=response.usage.total_tokens,
119-
)
128+
if usage_info:
129+
row.execution_metadata.usage = CompletionUsage(
130+
prompt_tokens=usage_info.prompt_tokens,
131+
completion_tokens=usage_info.completion_tokens,
132+
total_tokens=usage_info.total_tokens,
133+
)
134+
else:
135+
# Fallback if usage info not available from streaming
136+
row.execution_metadata.usage = CompletionUsage(
137+
prompt_tokens=0,
138+
completion_tokens=0,
139+
total_tokens=0,
140+
)
120141

121142
row.messages = messages
122143

0 commit comments

Comments (0)