Skip to content

Commit 352297c

Browse files
committed
try proper streaming helper
1 parent fce442b commit 352297c

File tree

2 files changed

+27
-39
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
5959
return None
6060

6161

62+
def _get_aime_dataset_path() -> str:
63+
"""Get the AIME dataset file path."""
64+
return str(Path(__file__).parent / "data" / "aime.jsonl")
65+
66+
6267
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
6368
converted: List[EvaluationRow] = []
6469
for r in rows:
@@ -74,24 +79,25 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
7479

7580
@evaluation_test(
7681
input_dataset=[
82+
# _get_aime_dataset_path(),
7783
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
7884
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
7985
],
8086
dataset_adapter=aime2025_dataset_adapter,
8187
completion_params=[
8288
{
8389
"max_tokens": 131000,
84-
# "extra_body": {"reasoning_effort": "low"},
85-
"model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
86-
# "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
90+
"extra_body": {"reasoning_effort": "low"},
91+
# "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
92+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
8793
}
8894
],
8995
rollout_processor=SingleTurnRolloutProcessor(),
9096
aggregation_method="mean",
9197
passed_threshold=0.8,
9298
num_runs=1,
93-
max_dataset_rows=30,
94-
max_concurrent_rollouts=8,
99+
max_dataset_rows=1,
100+
max_concurrent_rollouts=1,
95101
mode="pointwise",
96102
)
97103
def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 16 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616

1717
logger = logging.getLogger(__name__)
1818

19-
litellm._turn_on_debug() # pyright: ignore[reportPrivateImportUsage]
20-
2119

2220
class SingleTurnRolloutProcessor(RolloutProcessor):
2321
"""Single turn rollout processor for direct LLM calls."""
@@ -66,26 +64,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
6664
if row.tools is not None:
6765
request_params["tools"] = row.tools
6866

69-
# _litellm = importlib.import_module("litellm")
70-
# acompletion = getattr(_litellm, "acompletion")
71-
72-
# Handle streaming response
73-
assistant_content = ""
74-
tool_calls = None
75-
usage_info = None
67+
chunks = []
7668

7769
stream = await acompletion(**request_params)
78-
async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues]
79-
if chunk.choices and len(chunk.choices) > 0:
80-
delta = chunk.choices[0].delta
81-
if hasattr(delta, "content") and delta.content:
82-
assistant_content += delta.content
83-
if hasattr(delta, "tool_calls") and delta.tool_calls:
84-
tool_calls = delta.tool_calls
85-
86-
# Capture usage info from the final chunk
87-
if hasattr(chunk, "usage") and chunk.usage:
88-
usage_info = chunk.usage
70+
async for chunk in stream:
71+
chunks.append(chunk)
72+
73+
response = litellm.stream_chunk_builder(chunks, messages_payload)
74+
75+
if response is None:
76+
raise ValueError("Response is None")
77+
78+
assistant_content = response.choices[0].message.content or ""
79+
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
8980

9081
converted_tool_calls = None
9182
if tool_calls:
@@ -125,20 +116,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
125116
tool_calls=converted_tool_calls,
126117
)
127118
]
128-
129-
if usage_info:
130-
row.execution_metadata.usage = CompletionUsage(
131-
prompt_tokens=usage_info.prompt_tokens,
132-
completion_tokens=usage_info.completion_tokens,
133-
total_tokens=usage_info.total_tokens,
134-
)
135-
else:
136-
# Fallback if usage info not available from streaming
137-
row.execution_metadata.usage = CompletionUsage(
138-
prompt_tokens=0,
139-
completion_tokens=0,
140-
total_tokens=0,
141-
)
119+
row.execution_metadata.usage = CompletionUsage(
120+
prompt_tokens=response.usage.prompt_tokens,
121+
completion_tokens=response.usage.completion_tokens,
122+
total_tokens=response.usage.total_tokens,
123+
)
142124

143125
row.messages = messages
144126

Comments (0)