From 514ff96653536babd6541b2e09d47262a917a23c Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 11:42:07 -0700 Subject: [PATCH 01/24] Test --- eval_protocol/benchmarks/data/aime.jsonl | 1 + eval_protocol/benchmarks/test_aime25.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) create mode 100644 eval_protocol/benchmarks/data/aime.jsonl diff --git a/eval_protocol/benchmarks/data/aime.jsonl b/eval_protocol/benchmarks/data/aime.jsonl new file mode 100644 index 00000000..5869edb6 --- /dev/null +++ b/eval_protocol/benchmarks/data/aime.jsonl @@ -0,0 +1 @@ +{"question": "On $\\triangle ABC$ points $A,D,E$, and $B$ lie that order on side $\\overline{AB}$ with $AD=4, DE=16$, and $EB=8$. Points $A,F,G$, and $C$ lie in that order on side $\\overline{AC}$ with $AF=13, FG=52$, and $GC=26$. Let $M$ be the reflection of $D$ through $F$, and let $N$ be the reflection of $G$ through $E$. Quadrilateral $DEGF$ has area 288. Find the area of heptagon $AFNBCEM$.", "answer": "588"} diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 91a67f77..5898750b 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -73,8 +73,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: @evaluation_test( input_dataset=[ - "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", - "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", + "eval_protocol/benchmarks/data/aime.jsonl", + # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", + # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], dataset_adapter=aime2025_dataset_adapter, completion_params=[ @@ -87,8 +88,8 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=0.8, - num_runs=8, - max_dataset_rows=2, + num_runs=1, + max_dataset_rows=1, max_concurrent_rollouts=4, mode="pointwise", ) From c1fdb95262f658ca118b4b0ff263f4e10ca77445 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 11:56:18 -0700 Subject: [PATCH 02/24] fix path --- eval_protocol/benchmarks/test_aime25.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 5898750b..59352043 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -1,4 +1,5 @@ from typing import Any, Dict, List, Optional +from pathlib import Path from eval_protocol.models import ( EvaluateResult, @@ -58,6 +59,11 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]: return None +def _get_aime_dataset_path() -> str: + """Get the AIME dataset file path.""" + return str(Path(__file__).parent / "data" / "aime.jsonl") + + def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: converted: List[EvaluationRow] = [] for r in rows: @@ -73,7 +79,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: @evaluation_test( input_dataset=[ - "eval_protocol/benchmarks/data/aime.jsonl", + _get_aime_dataset_path(), # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], From 883797656c47a057fd31ad502ed4119e7447407d Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 13:01:17 -0700 Subject: [PATCH 03/24] test 20 min timeout --- eval_protocol/pytest/default_single_turn_rollout_process.py | 1 + 1 file changed, 1 insertion(+) diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 2b4bf893..dfae6e29 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -35,6 +35,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: request_params = {"messages": messages_payload, **config.completion_params} # Ensure caching is disabled only for this request (review feedback) request_params["cache"] = {"no-cache": True} + request_params["timeout"] = 1200 # 20 minutes timeout # Single-level reasoning effort: expect `reasoning_effort` only effort_val = None From d998db433f4483f01d1190e9e2562a36061cd982 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 13:18:06 -0700 Subject: [PATCH 04/24] test --- eval_protocol/benchmarks/test_aime25.py | 1 + 1 file changed, 1 insertion(+) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 59352043..8e1c62b2 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -89,6 +89,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: "max_tokens": 131000, "extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + "request_timeout": 1200, # 20 minutes Fireworks timeout } ], rollout_processor=SingleTurnRolloutProcessor(), From 66278e4034d48121c7bf53efeed0d1f1f49e5053 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 13:21:06 -0700 Subject: [PATCH 05/24] test --- eval_protocol/benchmarks/test_aime25.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 8e1c62b2..2bdeef2e 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -87,9 +87,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: completion_params=[ { "max_tokens": 131000, - "extra_body": {"reasoning_effort": "low"}, - "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", - "request_timeout": 1200, # 20 minutes Fireworks timeout + # "extra_body": {"reasoning_effort": "low"}, + "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne", + "request_timeout": 30, } ], rollout_processor=SingleTurnRolloutProcessor(), @@ -97,7 +97,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: passed_threshold=0.8, num_runs=1, max_dataset_rows=1, - max_concurrent_rollouts=4, + max_concurrent_rollouts=1, mode="pointwise", ) def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: From af137b3f845a9079a3110e424bbfa46114ac287c Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 13:47:47 -0700 Subject: [PATCH 06/24] test w streaming --- eval_protocol/benchmarks/test_aime25.py | 4 +- .../default_single_turn_rollout_process.py | 39 ++++++++++++++----- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 2bdeef2e..700be097 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -87,9 +87,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: completion_params=[ { "max_tokens": 131000, - # "extra_body": {"reasoning_effort": "low"}, + "extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne", - "request_timeout": 30, + # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", } ], rollout_processor=SingleTurnRolloutProcessor(), diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index dfae6e29..1b1d8c3d 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -35,7 +35,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: request_params = {"messages": messages_payload, **config.completion_params} # Ensure caching is disabled only for this request (review feedback) request_params["cache"] = {"no-cache": True} - request_params["timeout"] = 1200 # 20 minutes timeout + request_params["stream"] = True # Enable streaming # Single-level reasoning effort: expect `reasoning_effort` only effort_val = None @@ -68,10 +68,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: _litellm = importlib.import_module("litellm") acompletion = getattr(_litellm, "acompletion") - response = await acompletion(**request_params) - assistant_content = response.choices[0].message.content or "" - tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None + # Handle streaming response + assistant_content = "" + tool_calls = None + usage_info = None + + async for chunk in await acompletion(**request_params): + if chunk.choices and len(chunk.choices) > 0: + delta = chunk.choices[0].delta + if hasattr(delta, "content") and delta.content: + assistant_content += delta.content + if hasattr(delta, "tool_calls") and delta.tool_calls: + tool_calls = delta.tool_calls + + # Capture usage info from the final chunk + if hasattr(chunk, "usage") and chunk.usage: + usage_info = chunk.usage converted_tool_calls = None if tool_calls: @@ -112,11 +125,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: ) ] - row.execution_metadata.usage = CompletionUsage( - prompt_tokens=response.usage.prompt_tokens, - completion_tokens=response.usage.completion_tokens, - total_tokens=response.usage.total_tokens, - ) + if usage_info: + row.execution_metadata.usage = CompletionUsage( + prompt_tokens=usage_info.prompt_tokens, + completion_tokens=usage_info.completion_tokens, + total_tokens=usage_info.total_tokens, + ) + else: + # Fallback if usage info not available from streaming + row.execution_metadata.usage = CompletionUsage( + prompt_tokens=0, + completion_tokens=0, + total_tokens=0, + ) row.messages = messages From a3e79418abae119814d46308bb53739833808e4e Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 13:56:55 -0700 Subject: [PATCH 07/24] test on full dataset --- eval_protocol/benchmarks/test_aime25.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 700be097..4e7a336f 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -59,11 +59,6 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]: return None -def _get_aime_dataset_path() -> str: - """Get the AIME dataset file path.""" - return str(Path(__file__).parent / "data" / "aime.jsonl") - - def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: converted: List[EvaluationRow] = [] for r in rows: @@ -79,9 +74,8 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: @evaluation_test( input_dataset=[ - _get_aime_dataset_path(), - # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", - # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], dataset_adapter=aime2025_dataset_adapter, completion_params=[ @@ -95,9 +89,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=0.8, - num_runs=1, - max_dataset_rows=1, - max_concurrent_rollouts=1, + num_runs=8, + max_dataset_rows=2, + max_concurrent_rollouts=4, mode="pointwise", ) def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: From e71c5d8c57a702894693878d3e8fa560feb7e6e1 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 14:26:24 -0700 Subject: [PATCH 08/24] try again --- .../default_single_turn_rollout_process.py | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 1b1d8c3d..07de69a0 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -35,6 +35,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: request_params = {"messages": messages_payload, **config.completion_params} # Ensure caching is disabled only for this request (review feedback) request_params["cache"] = {"no-cache": True} + # request_params["timeout"] = 1200 # 20 minutes timeout request_params["stream"] = True # Enable streaming # Single-level reasoning effort: expect `reasoning_effort` only effort_val = None @@ -69,12 +70,17 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: _litellm = importlib.import_module("litellm") acompletion = getattr(_litellm, "acompletion") - # Handle streaming response + # Handle streaming response - following LiteLLM docs pattern assistant_content = "" tool_calls = None - usage_info = None + chunks = [] + + response = await acompletion(**request_params) + + # Process streaming chunks + async for chunk in response: + chunks.append(chunk) # Collect chunks for potential use with stream_chunk_builder - async for chunk in await acompletion(**request_params): if chunk.choices and len(chunk.choices) > 0: delta = chunk.choices[0].delta if hasattr(delta, "content") and delta.content: @@ -82,10 +88,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: if hasattr(delta, "tool_calls") and delta.tool_calls: tool_calls = delta.tool_calls - # Capture usage info from the final chunk - if hasattr(chunk, "usage") and chunk.usage: - usage_info = chunk.usage - converted_tool_calls = None if tool_calls: converted_tool_calls = [] @@ -125,6 +127,13 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: ) ] + # Try to get usage info from chunks, fallback to estimates + usage_info = None + for chunk in reversed(chunks): # Check last chunks first for usage info + if hasattr(chunk, "usage") and chunk.usage: + usage_info = chunk.usage + break + if usage_info: row.execution_metadata.usage = CompletionUsage( prompt_tokens=usage_info.prompt_tokens, @@ -132,11 +141,12 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: total_tokens=usage_info.total_tokens, ) else: - # Fallback if usage info not available from streaming + # Fallback estimates when streaming doesn't provide usage + estimated_completion_tokens = len(assistant_content.split()) if assistant_content else 0 row.execution_metadata.usage = CompletionUsage( prompt_tokens=0, - completion_tokens=0, - total_tokens=0, + completion_tokens=estimated_completion_tokens, + total_tokens=estimated_completion_tokens, ) row.messages = messages From fce442b80c2b5f7fef756436e9c5b57cc7f6c581 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 15:08:01 -0700 Subject: [PATCH 09/24] try again --- eval_protocol/benchmarks/test_aime25.py | 8 ++-- .../default_single_turn_rollout_process.py | 41 ++++++++----------- 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 4e7a336f..6e1c7852 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -81,7 +81,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: completion_params=[ { "max_tokens": 131000, - "extra_body": {"reasoning_effort": "low"}, + # "extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne", # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", } @@ -89,9 +89,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=0.8, - num_runs=8, - max_dataset_rows=2, - max_concurrent_rollouts=4, + num_runs=1, + max_dataset_rows=30, + max_concurrent_rollouts=8, mode="pointwise", ) def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 07de69a0..64b66283 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -4,6 +4,7 @@ import time from typing import List +import litellm from litellm import acompletion from typing import Dict @@ -15,6 +16,8 @@ logger = logging.getLogger(__name__) +litellm._turn_on_debug() # pyright: ignore[reportPrivateImportUsage] + class SingleTurnRolloutProcessor(RolloutProcessor): """Single turn rollout processor for direct LLM calls.""" @@ -35,7 +38,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: request_params = {"messages": messages_payload, **config.completion_params} # Ensure caching is disabled only for this request (review feedback) request_params["cache"] = {"no-cache": True} - # request_params["timeout"] = 1200 # 20 minutes timeout request_params["stream"] = True # Enable streaming # Single-level reasoning effort: expect `reasoning_effort` only effort_val = None @@ -64,23 +66,16 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: if row.tools is not None: request_params["tools"] = row.tools - # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet - import importlib - - _litellm = importlib.import_module("litellm") - acompletion = getattr(_litellm, "acompletion") + # _litellm = importlib.import_module("litellm") + # acompletion = getattr(_litellm, "acompletion") - # Handle streaming response - following LiteLLM docs pattern + # Handle streaming response assistant_content = "" tool_calls = None - chunks = [] - - response = await acompletion(**request_params) - - # Process streaming chunks - async for chunk in response: - chunks.append(chunk) # Collect chunks for potential use with stream_chunk_builder + usage_info = None + stream = await acompletion(**request_params) + async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues] if chunk.choices and len(chunk.choices) > 0: delta = chunk.choices[0].delta if hasattr(delta, "content") and delta.content: @@ -88,6 +83,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: if hasattr(delta, "tool_calls") and delta.tool_calls: tool_calls = delta.tool_calls + # Capture usage info from the final chunk + if hasattr(chunk, "usage") and chunk.usage: + usage_info = chunk.usage + converted_tool_calls = None if tool_calls: converted_tool_calls = [] @@ -127,13 +126,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: ) ] - # Try to get usage info from chunks, fallback to estimates - usage_info = None - for chunk in reversed(chunks): # Check last chunks first for usage info - if hasattr(chunk, "usage") and chunk.usage: - usage_info = chunk.usage - break - if usage_info: row.execution_metadata.usage = CompletionUsage( prompt_tokens=usage_info.prompt_tokens, @@ -141,12 +133,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: total_tokens=usage_info.total_tokens, ) else: - # Fallback estimates when streaming doesn't provide usage - estimated_completion_tokens = len(assistant_content.split()) if assistant_content else 0 + # Fallback if usage info not available from streaming row.execution_metadata.usage = CompletionUsage( prompt_tokens=0, - completion_tokens=estimated_completion_tokens, - total_tokens=estimated_completion_tokens, + completion_tokens=0, + total_tokens=0, ) row.messages = messages From 352297cf8ff683c4158ae67b01d12ffb72ae1a71 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 15:39:17 -0700 Subject: [PATCH 10/24] try proper streaming helper --- eval_protocol/benchmarks/test_aime25.py | 16 ++++-- .../default_single_turn_rollout_process.py | 50 ++++++------------- 2 files changed, 27 insertions(+), 39 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 6e1c7852..c90b80ac 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -59,6 +59,11 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]: return None +def _get_aime_dataset_path() -> str: + """Get the AIME dataset file path.""" + return str(Path(__file__).parent / "data" / "aime.jsonl") + + def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: converted: List[EvaluationRow] = [] for r in rows: @@ -74,6 +79,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: @evaluation_test( input_dataset=[ + # _get_aime_dataset_path(), "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], @@ -81,17 +87,17 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: completion_params=[ { "max_tokens": 131000, - # "extra_body": {"reasoning_effort": "low"}, - "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne", - # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + "extra_body": {"reasoning_effort": "low"}, + # "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne", + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", } ], rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=0.8, num_runs=1, - max_dataset_rows=30, - max_concurrent_rollouts=8, + max_dataset_rows=1, + max_concurrent_rollouts=1, mode="pointwise", ) def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 64b66283..47618974 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -16,8 +16,6 @@ logger = logging.getLogger(__name__) -litellm._turn_on_debug() # pyright: ignore[reportPrivateImportUsage] - class SingleTurnRolloutProcessor(RolloutProcessor): """Single turn rollout processor for direct LLM calls.""" @@ -66,26 +64,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: if row.tools is not None: request_params["tools"] = row.tools - # _litellm = importlib.import_module("litellm") - # acompletion = getattr(_litellm, "acompletion") - - # Handle streaming response - assistant_content = "" - tool_calls = None - usage_info = None + chunks = [] stream = await acompletion(**request_params) - async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues] - if chunk.choices and len(chunk.choices) > 0: - delta = chunk.choices[0].delta - if hasattr(delta, "content") and delta.content: - assistant_content += delta.content - if hasattr(delta, "tool_calls") and delta.tool_calls: - tool_calls = delta.tool_calls - - # Capture usage info from the final chunk - if hasattr(chunk, "usage") and chunk.usage: - usage_info = chunk.usage + async for chunk in stream: + chunks.append(chunk) + + response = litellm.stream_chunk_builder(chunks, messages_payload) + + if response is None: + raise ValueError("Response is None") + + assistant_content = response.choices[0].message.content or "" + tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None converted_tool_calls = None if tool_calls: @@ -125,20 +116,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: tool_calls=converted_tool_calls, ) ] - - if usage_info: - row.execution_metadata.usage = CompletionUsage( - prompt_tokens=usage_info.prompt_tokens, - completion_tokens=usage_info.completion_tokens, - total_tokens=usage_info.total_tokens, - ) - else: - # Fallback if usage info not available from streaming - row.execution_metadata.usage = CompletionUsage( - prompt_tokens=0, - completion_tokens=0, - total_tokens=0, - ) + row.execution_metadata.usage = CompletionUsage( + prompt_tokens=response.usage.prompt_tokens, + completion_tokens=response.usage.completion_tokens, + total_tokens=response.usage.total_tokens, + ) row.messages = messages From 5baebc587db0084997a9e18ce10a802691766859 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 15:51:27 -0700 Subject: [PATCH 11/24] test failing --- eval_protocol/benchmarks/data/aime.jsonl | 2 +- eval_protocol/benchmarks/test_aime25.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/eval_protocol/benchmarks/data/aime.jsonl b/eval_protocol/benchmarks/data/aime.jsonl index 5869edb6..a09508ce 100644 --- a/eval_protocol/benchmarks/data/aime.jsonl +++ b/eval_protocol/benchmarks/data/aime.jsonl @@ -1 +1 @@ -{"question": "On $\\triangle ABC$ points $A,D,E$, and $B$ lie that order on side $\\overline{AB}$ with $AD=4, DE=16$, and $EB=8$. Points $A,F,G$, and $C$ lie in that order on side $\\overline{AC}$ with $AF=13, FG=52$, and $GC=26$. Let $M$ be the reflection of $D$ through $F$, and let $N$ be the reflection of $G$ through $E$. Quadrilateral $DEGF$ has area 288. Find the area of heptagon $AFNBCEM$.", "answer": "588"} +{"question": "Let $N$ denote the number of ordered triples of positive integers $(a,b,c)$ such that $a,b,c\\leq3^6$ and $a^3+b^3+c^3$ is a multiple of $3^7$. Find the remainder when $N$ is divided by $1000$.", "answer": "735"} diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index c90b80ac..98e49499 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -79,9 +79,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: @evaluation_test( input_dataset=[ - # _get_aime_dataset_path(), - "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", - "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", + _get_aime_dataset_path(), + # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", + # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], dataset_adapter=aime2025_dataset_adapter, completion_params=[ From 706e42494e7f7a4735cbfe9a64d711f14f085c2c Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 16:21:11 -0700 Subject: [PATCH 12/24] test --- eval_protocol/benchmarks/test_aime25.py | 8 ++++---- .../pytest/default_single_turn_rollout_process.py | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 98e49499..a430cc19 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -86,10 +86,10 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: dataset_adapter=aime2025_dataset_adapter, completion_params=[ { - "max_tokens": 131000, - "extra_body": {"reasoning_effort": "low"}, - # "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne", - "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + "max_tokens": 65536, + # "extra_body": {"reasoning_effort": "low"}, + "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne", + # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", } ], rollout_processor=SingleTurnRolloutProcessor(), diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 47618974..ae1b0db2 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -65,9 +65,12 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: request_params["tools"] = row.tools chunks = [] + print("time: ", time.time()) stream = await acompletion(**request_params) async for chunk in stream: + print("chunk added at time: ", time.time()) + print("chunk: ", chunk) chunks.append(chunk) response = litellm.stream_chunk_builder(chunks, messages_payload) From e36383f6a85a18f3afb06e5d7b4e32acb69aa20b Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 16:34:41 -0700 Subject: [PATCH 13/24] test --- eval_protocol/pytest/default_single_turn_rollout_process.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index ae1b0db2..9b698bb1 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -69,8 +69,8 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: stream = await acompletion(**request_params) async for chunk in stream: - print("chunk added at time: ", time.time()) - print("chunk: ", chunk) + # print("chunk added at time: ", time.time()) + # print("chunk: ", chunk) chunks.append(chunk) response = litellm.stream_chunk_builder(chunks, messages_payload) From ee7e415eda62edeca7f1eb6b4454551be7f4f418 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 17:17:49 -0700 Subject: [PATCH 14/24] try all --- eval_protocol/benchmarks/test_aime25.py | 13 ++++++------ .../default_single_turn_rollout_process.py | 20 ++++++++----------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index a430cc19..98dba3c9 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -79,25 +79,26 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: @evaluation_test( input_dataset=[ - _get_aime_dataset_path(), - # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", - # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", + # _get_aime_dataset_path(), + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], dataset_adapter=aime2025_dataset_adapter, completion_params=[ { - "max_tokens": 65536, + "max_tokens": 131000, # "extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne", # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + "stream": True, } ], rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=0.8, num_runs=1, - max_dataset_rows=1, - max_concurrent_rollouts=1, + max_dataset_rows=30, + max_concurrent_rollouts=8, mode="pointwise", ) def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 9b698bb1..b4f9e9f8 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -6,7 +6,6 @@ import litellm from litellm import acompletion -from typing import Dict from eval_protocol.dataset_logger import default_logger from eval_protocol.models import EvaluationRow, Message @@ -36,7 +35,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: request_params = {"messages": messages_payload, **config.completion_params} # Ensure caching is disabled only for this request (review feedback) request_params["cache"] = {"no-cache": True} - request_params["stream"] = True # Enable streaming # Single-level reasoning effort: expect `reasoning_effort` only effort_val = None @@ -64,16 +62,14 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: if row.tools is not None: request_params["tools"] = row.tools - chunks = [] - print("time: ", time.time()) - - stream = await acompletion(**request_params) - async for chunk in stream: - # print("chunk added at time: ", time.time()) - # print("chunk: ", chunk) - chunks.append(chunk) - - response = litellm.stream_chunk_builder(chunks, messages_payload) + if request_params.get("stream") is True: + chunks = [] + stream = await acompletion(**request_params) + async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues] + chunks.append(chunk) + response = litellm.stream_chunk_builder(chunks, messages_payload) + else: + response = await acompletion(**request_params) if response is None: raise ValueError("Response is None") From d231ba82c5aa28da5ae931bc0f10bfd09f916d23 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 17:33:03 -0700 Subject: [PATCH 15/24] test low concurrency --- eval_protocol/benchmarks/test_aime25.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 98dba3c9..fbc8bd00 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -98,7 +98,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: passed_threshold=0.8, num_runs=1, max_dataset_rows=30, - max_concurrent_rollouts=8, + max_concurrent_rollouts=2, mode="pointwise", ) def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: From fc827678c33e3900d8fade1b935fc1229897d2b6 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 22:00:50 -0700 Subject: [PATCH 16/24] test --- eval_protocol/benchmarks/test_aime25.py | 7 ++++--- .../default_single_turn_rollout_process.py | 20 +++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index fbc8bd00..6e11d834 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -79,9 +79,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: @evaluation_test( input_dataset=[ - # _get_aime_dataset_path(), - "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", - "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", + _get_aime_dataset_path(), + # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", + # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], dataset_adapter=aime2025_dataset_adapter, completion_params=[ @@ -91,6 +91,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne", # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", "stream": True, + # "timeout": 2400, } ], rollout_processor=SingleTurnRolloutProcessor(), diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index b4f9e9f8..d98ab042 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -6,6 +6,8 @@ import litellm from litellm import acompletion +from litellm.types.utils import ModelResponse, Choices +from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper from eval_protocol.dataset_logger import default_logger from eval_protocol.models import EvaluationRow, Message @@ -65,14 +67,18 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: if request_params.get("stream") is True: chunks = [] stream = await acompletion(**request_params) + + assert isinstance(stream, CustomStreamWrapper), "Stream should be a CustomStreamWrapper" + async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues] chunks.append(chunk) response = litellm.stream_chunk_builder(chunks, messages_payload) else: response = await acompletion(**request_params) - if response is None: - raise ValueError("Response is None") + assert response is not None, "Response is None" + assert isinstance(response, ModelResponse), "Response should be ModelResponse" + assert isinstance(response.choices[0], Choices), "Response choice should be a Choices" assistant_content = response.choices[0].message.content or "" tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None @@ -115,10 +121,12 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: tool_calls=converted_tool_calls, ) ] - row.execution_metadata.usage = CompletionUsage( - prompt_tokens=response.usage.prompt_tokens, - completion_tokens=response.usage.completion_tokens, - total_tokens=response.usage.total_tokens, + row.execution_metadata.usage = ( + CompletionUsage( # Note: LiteLLM sets usage dynamically via setattr(), not as a typed field + prompt_tokens=response.usage.prompt_tokens, # pyright: ignore[reportAttributeAccessIssue] + completion_tokens=response.usage.completion_tokens, # pyright: ignore[reportAttributeAccessIssue] + total_tokens=response.usage.total_tokens, # pyright: ignore[reportAttributeAccessIssue] + ) ) row.messages = messages From c091e5be66bee81bb854b5d187ccb1f9fc385869 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 22:51:29 -0700 Subject: [PATCH 17/24] try 1 concurrent rollout --- eval_protocol/benchmarks/test_aime25.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 6e11d834..96d9ee29 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -99,7 +99,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: passed_threshold=0.8, num_runs=1, max_dataset_rows=30, - max_concurrent_rollouts=2, + max_concurrent_rollouts=1, mode="pointwise", ) def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: From 74187811d989eff4b5fba572623bd14755fe2339 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 22:56:26 -0700 Subject: [PATCH 18/24] print out reasoning tokens to debug --- .../pytest/default_single_turn_rollout_process.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index d98ab042..65ac7db4 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -73,6 +73,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues] chunks.append(chunk) response = litellm.stream_chunk_builder(chunks, messages_payload) + + # Check for reasoning content + print("DEBUG: ", messages_payload) + if hasattr(response.choices[0].message, "reasoning_content"): + print(f"Reasoning: {response.choices[0].message.reasoning_content}") + + # Check for thinking blocks + if hasattr(response.choices[0].message, "thinking_blocks"): + print(f"Thinking: {response.choices[0].message.thinking_blocks}") else: response = await acompletion(**request_params) From b4d149c6ea5c5badfad9a06653f2cf35240fc29b Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 14 Oct 2025 23:13:46 -0700 Subject: [PATCH 19/24] full test --- eval_protocol/benchmarks/test_aime25.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 96d9ee29..fcfb0ac1 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -79,9 +79,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: @evaluation_test( input_dataset=[ - _get_aime_dataset_path(), - # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", - # "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", + # _get_aime_dataset_path(), + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], dataset_adapter=aime2025_dataset_adapter, completion_params=[ From 1221842549908dfdd750bcda100f85d1af96d9f0 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 15 Oct 2025 00:41:40 -0700 Subject: [PATCH 20/24] enable other stream --- eval_protocol/mcp/execution/policy.py | 24 +++++++++++++----- .../default_mcp_gym_rollout_processor.py | 25 ++++++++++++++----- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py index 1adb9b95..777c4f7e 100644 --- a/eval_protocol/mcp/execution/policy.py +++ b/eval_protocol/mcp/execution/policy.py @@ -5,15 +5,14 @@ Rewritten to use LiteLLM for unified retry logic, caching, and provider support. """ -import asyncio -import json import logging import os -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Literal, Optional, Tuple, Union +from typing import Any, Dict, List, Literal, Optional import litellm -from litellm import acompletion, completion +from litellm import acompletion +from litellm.types.utils import ModelResponse +from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper from litellm.caching.caching import Cache from litellm.caching.dual_cache import DualCache from litellm.caching.in_memory_cache import InMemoryCache @@ -194,7 +193,20 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[ request_params["tools"] = tools try: - response = await acompletion(model=self.model_id, **request_params) + if request_params.get("stream") is True: + chunks = [] + stream = await acompletion(model=self.model_id, **request_params) + + assert isinstance(stream, CustomStreamWrapper), "Stream should be a CustomStreamWrapper" + + async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues] + chunks.append(chunk) + response = litellm.stream_chunk_builder(chunks, messages) + else: + response = await acompletion(model=self.model_id, **request_params) + + assert response is not None, "Response is None" + assert isinstance(response, ModelResponse), "Response should be ModelResponse" # Log cache hit/miss for monitoring hidden = getattr(response, "_hidden_params", {}) diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py index 9173e6f9..2d01b6c1 100644 --- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py @@ -223,13 +223,26 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> try: self.server.start() + model_id = str( + (config.completion_params.get("model") if config.completion_params else None) or "gpt-4o-mini" + ) + temperature = config.completion_params.get("temperature", 0.0) + max_tokens = config.completion_params.get("max_tokens", 4096) + + # Pass all other completion_params (e.g. stream=True) via kwargs + other_params = { + k: v + for k, v in (config.completion_params or {}).items() + if k not in ["model", "temperature", "max_tokens", "extra_body"] + } + extra_body = config.completion_params.get("extra_body", {}) or {} + self.policy = ep.LiteLLMPolicy( - model_id=str( - (config.completion_params.get("model") if config.completion_params else None) or "gpt-4o-mini" - ), - temperature=config.completion_params.get("temperature", 0.0), - max_tokens=config.completion_params.get("max_tokens", 4096), - **(config.completion_params.get("extra_body", {}) or {}), + model_id=model_id, + temperature=temperature, + max_tokens=max_tokens, + **extra_body, + **other_params, ) except Exception as e: From 0c8642dc05c7f86b5b33f7b12aedcb52ebbee376 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 15 Oct 2025 09:07:06 -0700 Subject: [PATCH 21/24] remove debug --- .../pytest/default_single_turn_rollout_process.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 65ac7db4..d98ab042 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -73,15 +73,6 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues] chunks.append(chunk) response = litellm.stream_chunk_builder(chunks, messages_payload) - - # Check for reasoning content - print("DEBUG: ", messages_payload) - if hasattr(response.choices[0].message, "reasoning_content"): - print(f"Reasoning: {response.choices[0].message.reasoning_content}") - - # Check for thinking blocks - if hasattr(response.choices[0].message, "thinking_blocks"): - print(f"Thinking: {response.choices[0].message.thinking_blocks}") else: response = await acompletion(**request_params) From a4f599f17f8fda5ebda71bab9722ca715134ff41 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 15 Oct 2025 09:08:14 -0700 Subject: [PATCH 22/24] revert test to original --- eval_protocol/benchmarks/data/aime.jsonl | 1 - eval_protocol/benchmarks/test_aime25.py | 14 +++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) delete mode 100644 eval_protocol/benchmarks/data/aime.jsonl diff --git a/eval_protocol/benchmarks/data/aime.jsonl b/eval_protocol/benchmarks/data/aime.jsonl deleted file mode 100644 index a09508ce..00000000 --- a/eval_protocol/benchmarks/data/aime.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"question": "Let $N$ denote the number of ordered triples of positive integers $(a,b,c)$ such that $a,b,c\\leq3^6$ and $a^3+b^3+c^3$ is a multiple of $3^7$. Find the remainder when $N$ is divided by $1000$.", "answer": "735"} diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index fcfb0ac1..26ac635a 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -79,7 +79,6 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: @evaluation_test( input_dataset=[ - # _get_aime_dataset_path(), "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], @@ -87,19 +86,16 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: completion_params=[ { "max_tokens": 131000, - # "extra_body": {"reasoning_effort": "low"}, - "model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne", - # "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", - "stream": True, - # "timeout": 2400, + "extra_body": {"reasoning_effort": "low"}, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", } ], rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=0.8, - num_runs=1, - max_dataset_rows=30, - max_concurrent_rollouts=1, + num_runs=8, + max_dataset_rows=2, + max_concurrent_rollouts=4, mode="pointwise", ) def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow: From 41bbea1c39c328ea72d7d824dde2fdfd5a431790 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 15 Oct 2025 09:08:32 -0700 Subject: [PATCH 23/24] clean --- eval_protocol/benchmarks/test_aime25.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 26ac635a..e059e265 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -59,11 +59,6 @@ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]: return None -def _get_aime_dataset_path() -> str: - """Get the AIME dataset file path.""" - return str(Path(__file__).parent / "data" / "aime.jsonl") - - def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: converted: List[EvaluationRow] = [] for r in rows: From 4652db7a04fcc0e02eb31559ead6fe99c6bb4350 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Wed, 15 Oct 2025 09:11:44 -0700 Subject: [PATCH 24/24] clean --- eval_protocol/benchmarks/test_aime25.py | 1 - 1 file changed, 1 deletion(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index e059e265..91a67f77 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -1,5 +1,4 @@ from typing import Any, Dict, List, Optional -from pathlib import Path from eval_protocol.models import ( EvaluateResult,