try again

xzrderek · xzrderek · commit fce442b80c2b · 2025-10-14T15:08:01.000-07:00
diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
@@ -81,17 +81,17 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     completion_params=[
         {
             "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "low"},
+            # "extra_body": {"reasoning_effort": "low"},
             "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
             # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
     passed_threshold=0.8,
-    num_runs=8,
-    max_dataset_rows=2,
-    max_concurrent_rollouts=4,
+    num_runs=1,
+    max_dataset_rows=30,
+    max_concurrent_rollouts=8,
     mode="pointwise",
 )
 def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -4,6 +4,7 @@
 import time
 from typing import List
 
+import litellm
 from litellm import acompletion
 from typing import Dict
 
@@ -15,6 +16,8 @@
 
 logger = logging.getLogger(__name__)
 
+litellm._turn_on_debug()  # pyright: ignore[reportPrivateImportUsage]
+
 
 class SingleTurnRolloutProcessor(RolloutProcessor):
     """Single turn rollout processor for direct LLM calls."""
@@ -35,7 +38,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             request_params = {"messages": messages_payload, **config.completion_params}
             # Ensure caching is disabled only for this request (review feedback)
             request_params["cache"] = {"no-cache": True}
-            # request_params["timeout"] = 1200  # 20 minutes timeout
             request_params["stream"] = True  # Enable streaming
             # Single-level reasoning effort: expect `reasoning_effort` only
             effort_val = None
@@ -64,30 +66,27 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             if row.tools is not None:
                 request_params["tools"] = row.tools
 
-            # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
-            import importlib
-
-            _litellm = importlib.import_module("litellm")
-            acompletion = getattr(_litellm, "acompletion")
+            # _litellm = importlib.import_module("litellm")
+            # acompletion = getattr(_litellm, "acompletion")
 
-            # Handle streaming response - following LiteLLM docs pattern
+            # Handle streaming response
             assistant_content = ""
             tool_calls = None
-            chunks = []
-
-            response = await acompletion(**request_params)
-
-            # Process streaming chunks
-            async for chunk in response:
-                chunks.append(chunk)  # Collect chunks for potential use with stream_chunk_builder
+            usage_info = None
 
+            stream = await acompletion(**request_params)
+            async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]
                 if chunk.choices and len(chunk.choices) > 0:
                     delta = chunk.choices[0].delta
                     if hasattr(delta, "content") and delta.content:
                         assistant_content += delta.content
                     if hasattr(delta, "tool_calls") and delta.tool_calls:
                         tool_calls = delta.tool_calls
 
+                # Keep the latest usage info reported by any chunk (expected on the final chunk)
+                if hasattr(chunk, "usage") and chunk.usage:
+                    usage_info = chunk.usage
+
             converted_tool_calls = None
             if tool_calls:
                 converted_tool_calls = []
@@ -127,26 +126,18 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                 )
             ]
 
-            # Try to get usage info from chunks, fallback to estimates
-            usage_info = None
-            for chunk in reversed(chunks):  # Check last chunks first for usage info
-                if hasattr(chunk, "usage") and chunk.usage:
-                    usage_info = chunk.usage
-                    break
-
             if usage_info:
                 row.execution_metadata.usage = CompletionUsage(
                     prompt_tokens=usage_info.prompt_tokens,
                     completion_tokens=usage_info.completion_tokens,
                     total_tokens=usage_info.total_tokens,
                 )
             else:
-                # Fallback estimates when streaming doesn't provide usage
-                estimated_completion_tokens = len(assistant_content.split()) if assistant_content else 0
+                # Fallback: record zero token counts when the stream reported no usage info
                 row.execution_metadata.usage = CompletionUsage(
                     prompt_tokens=0,
-                    completion_tokens=estimated_completion_tokens,
-                    total_tokens=estimated_completion_tokens,
+                    completion_tokens=0,
+                    total_tokens=0,
                 )
 
             row.messages = messages