Skip to content

Commit ee7e415

Browse files
committed
try all
1 parent e36383f commit ee7e415

File tree

2 files changed

+15
-18
lines changed

2 files changed

+15
-18
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -79,25 +79,26 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
7979

8080
@evaluation_test(
8181
input_dataset=[
82-
_get_aime_dataset_path(),
83-
# "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
84-
# "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
82+
# _get_aime_dataset_path(),
83+
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
84+
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
8585
],
8686
dataset_adapter=aime2025_dataset_adapter,
8787
completion_params=[
8888
{
89-
"max_tokens": 65536,
89+
"max_tokens": 131000,
9090
# "extra_body": {"reasoning_effort": "low"},
9191
"model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
9292
# "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
93+
"stream": True,
9394
}
9495
],
9596
rollout_processor=SingleTurnRolloutProcessor(),
9697
aggregation_method="mean",
9798
passed_threshold=0.8,
9899
num_runs=1,
99-
max_dataset_rows=1,
100-
max_concurrent_rollouts=1,
100+
max_dataset_rows=30,
101+
max_concurrent_rollouts=8,
101102
mode="pointwise",
102103
)
103104
def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 8 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66

77
import litellm
88
from litellm import acompletion
9-
from typing import Dict
109

1110
from eval_protocol.dataset_logger import default_logger
1211
from eval_protocol.models import EvaluationRow, Message
@@ -36,7 +35,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
3635
request_params = {"messages": messages_payload, **config.completion_params}
3736
# Ensure caching is disabled only for this request (review feedback)
3837
request_params["cache"] = {"no-cache": True}
39-
request_params["stream"] = True # Enable streaming
4038
# Single-level reasoning effort: expect `reasoning_effort` only
4139
effort_val = None
4240

@@ -64,16 +62,14 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
6462
if row.tools is not None:
6563
request_params["tools"] = row.tools
6664

67-
chunks = []
68-
print("time: ", time.time())
69-
70-
stream = await acompletion(**request_params)
71-
async for chunk in stream:
72-
# print("chunk added at time: ", time.time())
73-
# print("chunk: ", chunk)
74-
chunks.append(chunk)
75-
76-
response = litellm.stream_chunk_builder(chunks, messages_payload)
65+
if request_params.get("stream") is True:
66+
chunks = []
67+
stream = await acompletion(**request_params)
68+
async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues]
69+
chunks.append(chunk)
70+
response = litellm.stream_chunk_builder(chunks, messages_payload)
71+
else:
72+
response = await acompletion(**request_params)
7773

7874
if response is None:
7975
raise ValueError("Response is None")

0 commit comments

Comments
 (0)