From 514ff96653536babd6541b2e09d47262a917a23c Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 11:42:07 -0700
Subject: [PATCH 01/24] Test AIME25 on a single-row local dataset

---
 eval_protocol/benchmarks/data/aime.jsonl | 1 +
 eval_protocol/benchmarks/test_aime25.py  | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)
 create mode 100644 eval_protocol/benchmarks/data/aime.jsonl

diff --git a/eval_protocol/benchmarks/data/aime.jsonl b/eval_protocol/benchmarks/data/aime.jsonl
new file mode 100644
index 00000000..5869edb6
--- /dev/null
+++ b/eval_protocol/benchmarks/data/aime.jsonl
@@ -0,0 +1 @@
+{"question": "On $\\triangle ABC$ points $A,D,E$, and $B$ lie that order on side $\\overline{AB}$ with $AD=4, DE=16$, and $EB=8$. Points $A,F,G$, and $C$ lie in that order on side $\\overline{AC}$ with $AF=13, FG=52$, and $GC=26$. Let $M$ be the reflection of $D$ through $F$, and let $N$ be the reflection of $G$ through $E$. Quadrilateral $DEGF$ has area 288. Find the area of heptagon $AFNBCEM$.", "answer": "588"}
diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 91a67f77..5898750b 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -73,8 +73,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 @evaluation_test(
     input_dataset=[
-        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
-        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
+        "eval_protocol/benchmarks/data/aime.jsonl",
+        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
+        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],
     dataset_adapter=aime2025_dataset_adapter,
     completion_params=[
@@ -87,8 +88,8 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
     passed_threshold=0.8,
-    num_runs=8,
-    max_dataset_rows=2,
+    num_runs=1,
+    max_dataset_rows=1,
     max_concurrent_rollouts=4,
     mode="pointwise",
 )

From c1fdb95262f658ca118b4b0ff263f4e10ca77445 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 11:56:18 -0700
Subject: [PATCH 02/24] Resolve AIME dataset path relative to the test file

---
 eval_protocol/benchmarks/test_aime25.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 5898750b..59352043 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -1,4 +1,5 @@
 from typing import Any, Dict, List, Optional
+from pathlib import Path
 
 from eval_protocol.models import (
     EvaluateResult,
@@ -58,6 +59,11 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
         return None
 
 
+def _get_aime_dataset_path() -> str:
+    """Get the AIME dataset file path."""
+    return str(Path(__file__).parent / "data" / "aime.jsonl")
+
+
 def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     converted: List[EvaluationRow] = []
     for r in rows:
@@ -73,7 +79,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 @evaluation_test(
     input_dataset=[
-        "eval_protocol/benchmarks/data/aime.jsonl",
+        _get_aime_dataset_path(),
         # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
         # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],

From 883797656c47a057fd31ad502ed4119e7447407d Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 13:01:17 -0700
Subject: [PATCH 03/24] test 20 min timeout

---
 eval_protocol/pytest/default_single_turn_rollout_process.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index 2b4bf893..dfae6e29 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -35,6 +35,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             request_params = {"messages": messages_payload, **config.completion_params}
             # Ensure caching is disabled only for this request (review feedback)
             request_params["cache"] = {"no-cache": True}
+            request_params["timeout"] = 1200  # 20 minutes timeout
             # Single-level reasoning effort: expect `reasoning_effort` only
             effort_val = None
 

From d998db433f4483f01d1190e9e2562a36061cd982 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 13:18:06 -0700
Subject: [PATCH 04/24] Test request_timeout of 1200s in AIME25 completion params

---
 eval_protocol/benchmarks/test_aime25.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 59352043..8e1c62b2 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -89,6 +89,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
             "max_tokens": 131000,
             "extra_body": {"reasoning_effort": "low"},
             "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+            "request_timeout": 1200,  # 20 minutes Fireworks timeout
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),

From 66278e4034d48121c7bf53efeed0d1f1f49e5053 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 13:21:06 -0700
Subject: [PATCH 05/24] Test glm-4p6 deployment with 30s request_timeout

---
 eval_protocol/benchmarks/test_aime25.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 8e1c62b2..2bdeef2e 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -87,9 +87,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     completion_params=[
         {
             "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "low"},
-            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
-            "request_timeout": 1200,  # 20 minutes Fireworks timeout
+            # "extra_body": {"reasoning_effort": "low"},
+            "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
+            "request_timeout": 30,
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
@@ -97,7 +97,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     passed_threshold=0.8,
     num_runs=1,
     max_dataset_rows=1,
-    max_concurrent_rollouts=4,
+    max_concurrent_rollouts=1,
     mode="pointwise",
 )
 def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

From af137b3f845a9079a3110e424bbfa46114ac287c Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 13:47:47 -0700
Subject: [PATCH 06/24] test w streaming

---
 eval_protocol/benchmarks/test_aime25.py       |  4 +-
 .../default_single_turn_rollout_process.py    | 39 ++++++++++++++-----
 2 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 2bdeef2e..700be097 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -87,9 +87,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     completion_params=[
         {
             "max_tokens": 131000,
-            # "extra_body": {"reasoning_effort": "low"},
+            "extra_body": {"reasoning_effort": "low"},
             "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
-            "request_timeout": 30,
+            # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index dfae6e29..1b1d8c3d 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -35,7 +35,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             request_params = {"messages": messages_payload, **config.completion_params}
             # Ensure caching is disabled only for this request (review feedback)
             request_params["cache"] = {"no-cache": True}
-            request_params["timeout"] = 1200  # 20 minutes timeout
+            request_params["stream"] = True  # Enable streaming
             # Single-level reasoning effort: expect `reasoning_effort` only
             effort_val = None
 
@@ -68,10 +68,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
             _litellm = importlib.import_module("litellm")
             acompletion = getattr(_litellm, "acompletion")
-            response = await acompletion(**request_params)
 
-            assistant_content = response.choices[0].message.content or ""
-            tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
+            # Handle streaming response
+            assistant_content = ""
+            tool_calls = None
+            usage_info = None
+
+            async for chunk in await acompletion(**request_params):
+                if chunk.choices and len(chunk.choices) > 0:
+                    delta = chunk.choices[0].delta
+                    if hasattr(delta, "content") and delta.content:
+                        assistant_content += delta.content
+                    if hasattr(delta, "tool_calls") and delta.tool_calls:
+                        tool_calls = delta.tool_calls
+
+                # Capture usage info from the final chunk
+                if hasattr(chunk, "usage") and chunk.usage:
+                    usage_info = chunk.usage
 
             converted_tool_calls = None
             if tool_calls:
@@ -112,11 +125,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                 )
             ]
 
-            row.execution_metadata.usage = CompletionUsage(
-                prompt_tokens=response.usage.prompt_tokens,
-                completion_tokens=response.usage.completion_tokens,
-                total_tokens=response.usage.total_tokens,
-            )
+            if usage_info:
+                row.execution_metadata.usage = CompletionUsage(
+                    prompt_tokens=usage_info.prompt_tokens,
+                    completion_tokens=usage_info.completion_tokens,
+                    total_tokens=usage_info.total_tokens,
+                )
+            else:
+                # Fallback if usage info not available from streaming
+                row.execution_metadata.usage = CompletionUsage(
+                    prompt_tokens=0,
+                    completion_tokens=0,
+                    total_tokens=0,
+                )
 
             row.messages = messages
 

From a3e79418abae119814d46308bb53739833808e4e Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 13:56:55 -0700
Subject: [PATCH 07/24] test on full dataset

---
 eval_protocol/benchmarks/test_aime25.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 700be097..4e7a336f 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -59,11 +59,6 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
         return None
 
 
-def _get_aime_dataset_path() -> str:
-    """Get the AIME dataset file path."""
-    return str(Path(__file__).parent / "data" / "aime.jsonl")
-
-
 def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     converted: List[EvaluationRow] = []
     for r in rows:
@@ -79,9 +74,8 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 @evaluation_test(
     input_dataset=[
-        _get_aime_dataset_path(),
-        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
-        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
+        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
+        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],
     dataset_adapter=aime2025_dataset_adapter,
     completion_params=[
@@ -95,9 +89,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
     passed_threshold=0.8,
-    num_runs=1,
-    max_dataset_rows=1,
-    max_concurrent_rollouts=1,
+    num_runs=8,
+    max_dataset_rows=2,
+    max_concurrent_rollouts=4,
     mode="pointwise",
 )
 def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

From e71c5d8c57a702894693878d3e8fa560feb7e6e1 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 14:26:24 -0700
Subject: [PATCH 08/24] Collect stream chunks and estimate usage when absent

---
 .../default_single_turn_rollout_process.py    | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index 1b1d8c3d..07de69a0 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -35,6 +35,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             request_params = {"messages": messages_payload, **config.completion_params}
             # Ensure caching is disabled only for this request (review feedback)
             request_params["cache"] = {"no-cache": True}
+            # request_params["timeout"] = 1200  # 20 minutes timeout
             request_params["stream"] = True  # Enable streaming
             # Single-level reasoning effort: expect `reasoning_effort` only
             effort_val = None
@@ -69,12 +70,17 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             _litellm = importlib.import_module("litellm")
             acompletion = getattr(_litellm, "acompletion")
 
-            # Handle streaming response
+            # Handle streaming response - following LiteLLM docs pattern
             assistant_content = ""
             tool_calls = None
-            usage_info = None
+            chunks = []
+
+            response = await acompletion(**request_params)
+
+            # Process streaming chunks
+            async for chunk in response:
+                chunks.append(chunk)  # Collect chunks for potential use with stream_chunk_builder
 
-            async for chunk in await acompletion(**request_params):
                 if chunk.choices and len(chunk.choices) > 0:
                     delta = chunk.choices[0].delta
                     if hasattr(delta, "content") and delta.content:
@@ -82,10 +88,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                     if hasattr(delta, "tool_calls") and delta.tool_calls:
                         tool_calls = delta.tool_calls
 
-                # Capture usage info from the final chunk
-                if hasattr(chunk, "usage") and chunk.usage:
-                    usage_info = chunk.usage
-
             converted_tool_calls = None
             if tool_calls:
                 converted_tool_calls = []
@@ -125,6 +127,13 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                 )
             ]
 
+            # Try to get usage info from chunks, fallback to estimates
+            usage_info = None
+            for chunk in reversed(chunks):  # Check last chunks first for usage info
+                if hasattr(chunk, "usage") and chunk.usage:
+                    usage_info = chunk.usage
+                    break
+
             if usage_info:
                 row.execution_metadata.usage = CompletionUsage(
                     prompt_tokens=usage_info.prompt_tokens,
@@ -132,11 +141,12 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                     total_tokens=usage_info.total_tokens,
                 )
             else:
-                # Fallback if usage info not available from streaming
+                # Fallback estimates when streaming doesn't provide usage
+                estimated_completion_tokens = len(assistant_content.split()) if assistant_content else 0
                 row.execution_metadata.usage = CompletionUsage(
                     prompt_tokens=0,
-                    completion_tokens=0,
-                    total_tokens=0,
+                    completion_tokens=estimated_completion_tokens,
+                    total_tokens=estimated_completion_tokens,
                 )
 
             row.messages = messages

From fce442b80c2b5f7fef756436e9c5b57cc7f6c581 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 15:08:01 -0700
Subject: [PATCH 09/24] Enable litellm debug and capture usage inline while streaming

---
 eval_protocol/benchmarks/test_aime25.py       |  8 ++--
 .../default_single_turn_rollout_process.py    | 41 ++++++++-----------
 2 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 4e7a336f..6e1c7852 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -81,7 +81,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     completion_params=[
         {
             "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "low"},
+            # "extra_body": {"reasoning_effort": "low"},
             "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
             # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
         }
@@ -89,9 +89,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
     passed_threshold=0.8,
-    num_runs=8,
-    max_dataset_rows=2,
-    max_concurrent_rollouts=4,
+    num_runs=1,
+    max_dataset_rows=30,
+    max_concurrent_rollouts=8,
     mode="pointwise",
 )
 def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index 07de69a0..64b66283 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -4,6 +4,7 @@
 import time
 from typing import List
 
+import litellm
 from litellm import acompletion
 from typing import Dict
 
@@ -15,6 +16,8 @@
 
 logger = logging.getLogger(__name__)
 
+litellm._turn_on_debug()  # pyright: ignore[reportPrivateImportUsage]
+
 
 class SingleTurnRolloutProcessor(RolloutProcessor):
     """Single turn rollout processor for direct LLM calls."""
@@ -35,7 +38,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             request_params = {"messages": messages_payload, **config.completion_params}
             # Ensure caching is disabled only for this request (review feedback)
             request_params["cache"] = {"no-cache": True}
-            # request_params["timeout"] = 1200  # 20 minutes timeout
             request_params["stream"] = True  # Enable streaming
             # Single-level reasoning effort: expect `reasoning_effort` only
             effort_val = None
@@ -64,23 +66,16 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             if row.tools is not None:
                 request_params["tools"] = row.tools
 
-            # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
-            import importlib
-
-            _litellm = importlib.import_module("litellm")
-            acompletion = getattr(_litellm, "acompletion")
+            # _litellm = importlib.import_module("litellm")
+            # acompletion = getattr(_litellm, "acompletion")
 
-            # Handle streaming response - following LiteLLM docs pattern
+            # Handle streaming response
             assistant_content = ""
             tool_calls = None
-            chunks = []
-
-            response = await acompletion(**request_params)
-
-            # Process streaming chunks
-            async for chunk in response:
-                chunks.append(chunk)  # Collect chunks for potential use with stream_chunk_builder
+            usage_info = None
 
+            stream = await acompletion(**request_params)
+            async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]
                 if chunk.choices and len(chunk.choices) > 0:
                     delta = chunk.choices[0].delta
                     if hasattr(delta, "content") and delta.content:
@@ -88,6 +83,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                     if hasattr(delta, "tool_calls") and delta.tool_calls:
                         tool_calls = delta.tool_calls
 
+                # Capture usage info from the final chunk
+                if hasattr(chunk, "usage") and chunk.usage:
+                    usage_info = chunk.usage
+
             converted_tool_calls = None
             if tool_calls:
                 converted_tool_calls = []
@@ -127,13 +126,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                 )
             ]
 
-            # Try to get usage info from chunks, fallback to estimates
-            usage_info = None
-            for chunk in reversed(chunks):  # Check last chunks first for usage info
-                if hasattr(chunk, "usage") and chunk.usage:
-                    usage_info = chunk.usage
-                    break
-
             if usage_info:
                 row.execution_metadata.usage = CompletionUsage(
                     prompt_tokens=usage_info.prompt_tokens,
@@ -141,12 +133,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                     total_tokens=usage_info.total_tokens,
                 )
             else:
-                # Fallback estimates when streaming doesn't provide usage
-                estimated_completion_tokens = len(assistant_content.split()) if assistant_content else 0
+                # Fallback if usage info not available from streaming
                 row.execution_metadata.usage = CompletionUsage(
                     prompt_tokens=0,
-                    completion_tokens=estimated_completion_tokens,
-                    total_tokens=estimated_completion_tokens,
+                    completion_tokens=0,
+                    total_tokens=0,
                 )
 
             row.messages = messages

From 352297cf8ff683c4158ae67b01d12ffb72ae1a71 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 15:39:17 -0700
Subject: [PATCH 10/24] try proper streaming helper

---
 eval_protocol/benchmarks/test_aime25.py       | 16 ++++--
 .../default_single_turn_rollout_process.py    | 50 ++++++-------------
 2 files changed, 27 insertions(+), 39 deletions(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 6e1c7852..c90b80ac 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -59,6 +59,11 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
         return None
 
 
+def _get_aime_dataset_path() -> str:
+    """Get the AIME dataset file path."""
+    return str(Path(__file__).parent / "data" / "aime.jsonl")
+
+
 def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     converted: List[EvaluationRow] = []
     for r in rows:
@@ -74,6 +79,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 @evaluation_test(
     input_dataset=[
+        # _get_aime_dataset_path(),
         "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
         "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],
@@ -81,17 +87,17 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     completion_params=[
         {
             "max_tokens": 131000,
-            # "extra_body": {"reasoning_effort": "low"},
-            "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
-            # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+            "extra_body": {"reasoning_effort": "low"},
+            # "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
     passed_threshold=0.8,
     num_runs=1,
-    max_dataset_rows=30,
-    max_concurrent_rollouts=8,
+    max_dataset_rows=1,
+    max_concurrent_rollouts=1,
     mode="pointwise",
 )
 def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index 64b66283..47618974 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -16,8 +16,6 @@
 
 logger = logging.getLogger(__name__)
 
-litellm._turn_on_debug()  # pyright: ignore[reportPrivateImportUsage]
-
 
 class SingleTurnRolloutProcessor(RolloutProcessor):
     """Single turn rollout processor for direct LLM calls."""
@@ -66,26 +64,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             if row.tools is not None:
                 request_params["tools"] = row.tools
 
-            # _litellm = importlib.import_module("litellm")
-            # acompletion = getattr(_litellm, "acompletion")
-
-            # Handle streaming response
-            assistant_content = ""
-            tool_calls = None
-            usage_info = None
+            chunks = []
 
             stream = await acompletion(**request_params)
-            async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]
-                if chunk.choices and len(chunk.choices) > 0:
-                    delta = chunk.choices[0].delta
-                    if hasattr(delta, "content") and delta.content:
-                        assistant_content += delta.content
-                    if hasattr(delta, "tool_calls") and delta.tool_calls:
-                        tool_calls = delta.tool_calls
-
-                # Capture usage info from the final chunk
-                if hasattr(chunk, "usage") and chunk.usage:
-                    usage_info = chunk.usage
+            async for chunk in stream:
+                chunks.append(chunk)
+
+            response = litellm.stream_chunk_builder(chunks, messages_payload)
+
+            if response is None:
+                raise ValueError("Response is None")
+
+            assistant_content = response.choices[0].message.content or ""
+            tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
 
             converted_tool_calls = None
             if tool_calls:
@@ -125,20 +116,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                     tool_calls=converted_tool_calls,
                 )
             ]
-
-            if usage_info:
-                row.execution_metadata.usage = CompletionUsage(
-                    prompt_tokens=usage_info.prompt_tokens,
-                    completion_tokens=usage_info.completion_tokens,
-                    total_tokens=usage_info.total_tokens,
-                )
-            else:
-                # Fallback if usage info not available from streaming
-                row.execution_metadata.usage = CompletionUsage(
-                    prompt_tokens=0,
-                    completion_tokens=0,
-                    total_tokens=0,
-                )
+            row.execution_metadata.usage = CompletionUsage(
+                prompt_tokens=response.usage.prompt_tokens,
+                completion_tokens=response.usage.completion_tokens,
+                total_tokens=response.usage.total_tokens,
+            )
 
             row.messages = messages
 

From 5baebc587db0084997a9e18ce10a802691766859 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 15:51:27 -0700
Subject: [PATCH 11/24] test failing

---
 eval_protocol/benchmarks/data/aime.jsonl | 2 +-
 eval_protocol/benchmarks/test_aime25.py  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/eval_protocol/benchmarks/data/aime.jsonl b/eval_protocol/benchmarks/data/aime.jsonl
index 5869edb6..a09508ce 100644
--- a/eval_protocol/benchmarks/data/aime.jsonl
+++ b/eval_protocol/benchmarks/data/aime.jsonl
@@ -1 +1 @@
-{"question": "On $\\triangle ABC$ points $A,D,E$, and $B$ lie that order on side $\\overline{AB}$ with $AD=4, DE=16$, and $EB=8$. Points $A,F,G$, and $C$ lie in that order on side $\\overline{AC}$ with $AF=13, FG=52$, and $GC=26$. Let $M$ be the reflection of $D$ through $F$, and let $N$ be the reflection of $G$ through $E$. Quadrilateral $DEGF$ has area 288. Find the area of heptagon $AFNBCEM$.", "answer": "588"}
+{"question": "Let $N$ denote the number of ordered triples of positive integers $(a,b,c)$ such that $a,b,c\\leq3^6$ and $a^3+b^3+c^3$ is a multiple of $3^7$. Find the remainder when $N$ is divided by $1000$.", "answer": "735"}
diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index c90b80ac..98e49499 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -79,9 +79,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 @evaluation_test(
     input_dataset=[
-        # _get_aime_dataset_path(),
-        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
-        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
+        _get_aime_dataset_path(),
+        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
+        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],
     dataset_adapter=aime2025_dataset_adapter,
     completion_params=[

From 706e42494e7f7a4735cbfe9a64d711f14f085c2c Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 16:21:11 -0700
Subject: [PATCH 12/24] Test glm-4p6 with max_tokens 65536 and per-chunk timing prints

---
 eval_protocol/benchmarks/test_aime25.py                   | 8 ++++----
 .../pytest/default_single_turn_rollout_process.py         | 3 +++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 98e49499..a430cc19 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -86,10 +86,10 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     dataset_adapter=aime2025_dataset_adapter,
     completion_params=[
         {
-            "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "low"},
-            # "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
-            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+            "max_tokens": 65536,
+            # "extra_body": {"reasoning_effort": "low"},
+            "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
+            # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index 47618974..ae1b0db2 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -65,9 +65,12 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                 request_params["tools"] = row.tools
 
             chunks = []
+            print("time: ", time.time())
 
             stream = await acompletion(**request_params)
             async for chunk in stream:
+                print("chunk added at time: ", time.time())
+                print("chunk: ", chunk)
                 chunks.append(chunk)
 
             response = litellm.stream_chunk_builder(chunks, messages_payload)

From e36383f6a85a18f3afb06e5d7b4e32acb69aa20b Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 16:34:41 -0700
Subject: [PATCH 13/24] Comment out per-chunk debug prints in single-turn rollout

---
 eval_protocol/pytest/default_single_turn_rollout_process.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index ae1b0db2..9b698bb1 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -69,8 +69,8 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
             stream = await acompletion(**request_params)
             async for chunk in stream:
-                print("chunk added at time: ", time.time())
-                print("chunk: ", chunk)
+                # print("chunk added at time: ", time.time())
+                # print("chunk: ", chunk)
                 chunks.append(chunk)
 
             response = litellm.stream_chunk_builder(chunks, messages_payload)

From ee7e415eda62edeca7f1eb6b4454551be7f4f418 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 17:17:49 -0700
Subject: [PATCH 14/24] Make streaming opt-in via completion_params; run all 30 AIME rows

---
 eval_protocol/benchmarks/test_aime25.py       | 13 ++++++------
 .../default_single_turn_rollout_process.py    | 20 ++++++++-----------
 2 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index a430cc19..98dba3c9 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -79,25 +79,26 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 @evaluation_test(
     input_dataset=[
-        _get_aime_dataset_path(),
-        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
-        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
+        # _get_aime_dataset_path(),
+        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
+        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],
     dataset_adapter=aime2025_dataset_adapter,
     completion_params=[
         {
-            "max_tokens": 65536,
+            "max_tokens": 131000,
             # "extra_body": {"reasoning_effort": "low"},
             "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
             # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+            "stream": True,
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
     passed_threshold=0.8,
     num_runs=1,
-    max_dataset_rows=1,
-    max_concurrent_rollouts=1,
+    max_dataset_rows=30,
+    max_concurrent_rollouts=8,
     mode="pointwise",
 )
 def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index 9b698bb1..b4f9e9f8 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -6,7 +6,6 @@
 
 import litellm
 from litellm import acompletion
-from typing import Dict
 
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.models import EvaluationRow, Message
@@ -36,7 +35,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             request_params = {"messages": messages_payload, **config.completion_params}
             # Ensure caching is disabled only for this request (review feedback)
             request_params["cache"] = {"no-cache": True}
-            request_params["stream"] = True  # Enable streaming
             # Single-level reasoning effort: expect `reasoning_effort` only
             effort_val = None
 
@@ -64,16 +62,14 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             if row.tools is not None:
                 request_params["tools"] = row.tools
 
-            chunks = []
-            print("time: ", time.time())
-
-            stream = await acompletion(**request_params)
-            async for chunk in stream:
-                # print("chunk added at time: ", time.time())
-                # print("chunk: ", chunk)
-                chunks.append(chunk)
-
-            response = litellm.stream_chunk_builder(chunks, messages_payload)
+            if request_params.get("stream") is True:
+                chunks = []
+                stream = await acompletion(**request_params)
+                async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]
+                    chunks.append(chunk)
+                response = litellm.stream_chunk_builder(chunks, messages_payload)
+            else:
+                response = await acompletion(**request_params)
 
             if response is None:
                 raise ValueError("Response is None")

From d231ba82c5aa28da5ae931bc0f10bfd09f916d23 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 17:33:03 -0700
Subject: [PATCH 15/24] test low concurrency

---
 eval_protocol/benchmarks/test_aime25.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 98dba3c9..fbc8bd00 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -98,7 +98,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     passed_threshold=0.8,
     num_runs=1,
     max_dataset_rows=30,
-    max_concurrent_rollouts=8,
+    max_concurrent_rollouts=2,
     mode="pointwise",
 )
 def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

From fc827678c33e3900d8fade1b935fc1229897d2b6 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 22:00:50 -0700
Subject: [PATCH 16/24] Add response type assertions to single-turn rollout; use local AIME row

---
 eval_protocol/benchmarks/test_aime25.py       |  7 ++++---
 .../default_single_turn_rollout_process.py    | 20 +++++++++++++------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index fbc8bd00..6e11d834 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -79,9 +79,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 @evaluation_test(
     input_dataset=[
-        # _get_aime_dataset_path(),
-        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
-        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
+        _get_aime_dataset_path(),
+        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
+        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],
     dataset_adapter=aime2025_dataset_adapter,
     completion_params=[
@@ -91,6 +91,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
             "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
             # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
             "stream": True,
+            # "timeout": 2400,
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index b4f9e9f8..d98ab042 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -6,6 +6,8 @@
 
 import litellm
 from litellm import acompletion
+from litellm.types.utils import ModelResponse, Choices
+from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
 
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.models import EvaluationRow, Message
@@ -65,14 +67,18 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             if request_params.get("stream") is True:
                 chunks = []
                 stream = await acompletion(**request_params)
+
+                assert isinstance(stream, CustomStreamWrapper), "Stream should be a CustomStreamWrapper"
+
                 async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]
                     chunks.append(chunk)
                 response = litellm.stream_chunk_builder(chunks, messages_payload)
             else:
                 response = await acompletion(**request_params)
 
-            if response is None:
-                raise ValueError("Response is None")
+            assert response is not None, "Response is None"
+            assert isinstance(response, ModelResponse), "Response should be ModelResponse"
+            assert isinstance(response.choices[0], Choices), "Response choice should be a Choices"
 
             assistant_content = response.choices[0].message.content or ""
             tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
@@ -115,10 +121,12 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                     tool_calls=converted_tool_calls,
                 )
             ]
-            row.execution_metadata.usage = CompletionUsage(
-                prompt_tokens=response.usage.prompt_tokens,
-                completion_tokens=response.usage.completion_tokens,
-                total_tokens=response.usage.total_tokens,
+            row.execution_metadata.usage = (
+                CompletionUsage(  # Note: LiteLLM sets usage dynamically via setattr(), not as a typed field
+                    prompt_tokens=response.usage.prompt_tokens,  # pyright: ignore[reportAttributeAccessIssue]
+                    completion_tokens=response.usage.completion_tokens,  # pyright: ignore[reportAttributeAccessIssue]
+                    total_tokens=response.usage.total_tokens,  # pyright: ignore[reportAttributeAccessIssue]
+                )
             )
 
             row.messages = messages

From c091e5be66bee81bb854b5d187ccb1f9fc385869 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 22:51:29 -0700
Subject: [PATCH 17/24] try 1 concurrent rollout

---
 eval_protocol/benchmarks/test_aime25.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 6e11d834..96d9ee29 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -99,7 +99,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     passed_threshold=0.8,
     num_runs=1,
     max_dataset_rows=30,
-    max_concurrent_rollouts=2,
+    max_concurrent_rollouts=1,
     mode="pointwise",
 )
 def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

From 74187811d989eff4b5fba572623bd14755fe2339 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 22:56:26 -0700
Subject: [PATCH 18/24] print out reasoning tokens to debug

---
 .../pytest/default_single_turn_rollout_process.py        | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index d98ab042..65ac7db4 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -73,6 +73,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                 async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]
                     chunks.append(chunk)
                 response = litellm.stream_chunk_builder(chunks, messages_payload)
+
+                # Check for reasoning content
+                print("DEBUG: ", messages_payload)
+                if hasattr(response.choices[0].message, "reasoning_content"):
+                    print(f"Reasoning: {response.choices[0].message.reasoning_content}")
+
+                # Check for thinking blocks
+                if hasattr(response.choices[0].message, "thinking_blocks"):
+                    print(f"Thinking: {response.choices[0].message.thinking_blocks}")
             else:
                 response = await acompletion(**request_params)
 

From b4d149c6ea5c5badfad9a06653f2cf35240fc29b Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 14 Oct 2025 23:13:46 -0700
Subject: [PATCH 19/24] full test

---
 eval_protocol/benchmarks/test_aime25.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 96d9ee29..fcfb0ac1 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -79,9 +79,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 @evaluation_test(
     input_dataset=[
-        _get_aime_dataset_path(),
-        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
-        # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
+        # _get_aime_dataset_path(),
+        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
+        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],
     dataset_adapter=aime2025_dataset_adapter,
     completion_params=[

From 1221842549908dfdd750bcda100f85d1af96d9f0 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 15 Oct 2025 00:41:40 -0700
Subject: [PATCH 20/24] Support stream=True in LiteLLMPolicy and MCP gym rollout processor

---
 eval_protocol/mcp/execution/policy.py         | 24 +++++++++++++-----
 .../default_mcp_gym_rollout_processor.py      | 25 ++++++++++++++-----
 2 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
index 1adb9b95..777c4f7e 100644
--- a/eval_protocol/mcp/execution/policy.py
+++ b/eval_protocol/mcp/execution/policy.py
@@ -5,15 +5,14 @@
 Rewritten to use LiteLLM for unified retry logic, caching, and provider support.
 """
 
-import asyncio
-import json
 import logging
 import os
-from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, List, Literal, Optional
 
 import litellm
-from litellm import acompletion, completion
+from litellm import acompletion
+from litellm.types.utils import ModelResponse
+from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
 from litellm.caching.caching import Cache
 from litellm.caching.dual_cache import DualCache
 from litellm.caching.in_memory_cache import InMemoryCache
@@ -194,7 +193,20 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
             request_params["tools"] = tools
 
         try:
-            response = await acompletion(model=self.model_id, **request_params)
+            if request_params.get("stream") is True:
+                chunks = []
+                stream = await acompletion(model=self.model_id, **request_params)
+
+                assert isinstance(stream, CustomStreamWrapper), "Stream should be a CustomStreamWrapper"
+
+                async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]
+                    chunks.append(chunk)
+                response = litellm.stream_chunk_builder(chunks, messages)
+            else:
+                response = await acompletion(model=self.model_id, **request_params)
+
+            assert response is not None, "Response is None"
+            assert isinstance(response, ModelResponse), "Response should be ModelResponse"
 
             # Log cache hit/miss for monitoring
             hidden = getattr(response, "_hidden_params", {})
diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
index 9173e6f9..2d01b6c1 100644
--- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
+++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -223,13 +223,26 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
             try:
                 self.server.start()
 
+                model_id = str(
+                    (config.completion_params.get("model") if config.completion_params else None) or "gpt-4o-mini"
+                )
+                temperature = config.completion_params.get("temperature", 0.0)
+                max_tokens = config.completion_params.get("max_tokens", 4096)
+
+                # Pass all other completion_params (e.g. stream=True) via kwargs
+                other_params = {
+                    k: v
+                    for k, v in (config.completion_params or {}).items()
+                    if k not in ["model", "temperature", "max_tokens", "extra_body"]
+                }
+                extra_body = config.completion_params.get("extra_body", {}) or {}
+
                 self.policy = ep.LiteLLMPolicy(
-                    model_id=str(
-                        (config.completion_params.get("model") if config.completion_params else None) or "gpt-4o-mini"
-                    ),
-                    temperature=config.completion_params.get("temperature", 0.0),
-                    max_tokens=config.completion_params.get("max_tokens", 4096),
-                    **(config.completion_params.get("extra_body", {}) or {}),
+                    model_id=model_id,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                    **extra_body,
+                    **other_params,
                 )
 
             except Exception as e:

From 0c8642dc05c7f86b5b33f7b12aedcb52ebbee376 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 15 Oct 2025 09:07:06 -0700
Subject: [PATCH 21/24] remove debug

---
 .../pytest/default_single_turn_rollout_process.py        | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index 65ac7db4..d98ab042 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -73,15 +73,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                 async for chunk in stream:  # pyright: ignore[reportGeneralTypeIssues]
                     chunks.append(chunk)
                 response = litellm.stream_chunk_builder(chunks, messages_payload)
-
-                # Check for reasoning content
-                print("DEBUG: ", messages_payload)
-                if hasattr(response.choices[0].message, "reasoning_content"):
-                    print(f"Reasoning: {response.choices[0].message.reasoning_content}")
-
-                # Check for thinking blocks
-                if hasattr(response.choices[0].message, "thinking_blocks"):
-                    print(f"Thinking: {response.choices[0].message.thinking_blocks}")
             else:
                 response = await acompletion(**request_params)
 

From a4f599f17f8fda5ebda71bab9722ca715134ff41 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 15 Oct 2025 09:08:14 -0700
Subject: [PATCH 22/24] revert test to original

---
 eval_protocol/benchmarks/data/aime.jsonl |  1 -
 eval_protocol/benchmarks/test_aime25.py  | 14 +++++---------
 2 files changed, 5 insertions(+), 10 deletions(-)
 delete mode 100644 eval_protocol/benchmarks/data/aime.jsonl

diff --git a/eval_protocol/benchmarks/data/aime.jsonl b/eval_protocol/benchmarks/data/aime.jsonl
deleted file mode 100644
index a09508ce..00000000
--- a/eval_protocol/benchmarks/data/aime.jsonl
+++ /dev/null
@@ -1 +0,0 @@
-{"question": "Let $N$ denote the number of ordered triples of positive integers $(a,b,c)$ such that $a,b,c\\leq3^6$ and $a^3+b^3+c^3$ is a multiple of $3^7$. Find the remainder when $N$ is divided by $1000$.", "answer": "735"}
diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index fcfb0ac1..26ac635a 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -79,7 +79,6 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 @evaluation_test(
     input_dataset=[
-        # _get_aime_dataset_path(),
         "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
         "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],
@@ -87,19 +86,16 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     completion_params=[
         {
             "max_tokens": 131000,
-            # "extra_body": {"reasoning_effort": "low"},
-            "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
-            # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
-            "stream": True,
-            # "timeout": 2400,
+            "extra_body": {"reasoning_effort": "low"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
         }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
     passed_threshold=0.8,
-    num_runs=1,
-    max_dataset_rows=30,
-    max_concurrent_rollouts=1,
+    num_runs=8,
+    max_dataset_rows=2,
+    max_concurrent_rollouts=4,
     mode="pointwise",
 )
 def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:

From 41bbea1c39c328ea72d7d824dde2fdfd5a431790 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 15 Oct 2025 09:08:32 -0700
Subject: [PATCH 23/24] Remove unused _get_aime_dataset_path helper

---
 eval_protocol/benchmarks/test_aime25.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index 26ac635a..e059e265 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -59,11 +59,6 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
         return None
 
 
-def _get_aime_dataset_path() -> str:
-    """Get the AIME dataset file path."""
-    return str(Path(__file__).parent / "data" / "aime.jsonl")
-
-
 def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
     converted: List[EvaluationRow] = []
     for r in rows:

From 4652db7a04fcc0e02eb31559ead6fe99c6bb4350 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Wed, 15 Oct 2025 09:11:44 -0700
Subject: [PATCH 24/24] Remove unused pathlib import from test_aime25

---
 eval_protocol/benchmarks/test_aime25.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
index e059e265..91a67f77 100644
--- a/eval_protocol/benchmarks/test_aime25.py
+++ b/eval_protocol/benchmarks/test_aime25.py
@@ -1,5 +1,4 @@
 from typing import Any, Dict, List, Optional
-from pathlib import Path
 
 from eval_protocol.models import (
     EvaluateResult,