Undo weird changes I made

xzrderek · xzrderek · commit 21fdb2b80c42 · 2025-09-12T16:20:22.000-07:00
diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py
@@ -82,28 +82,13 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
             "max_tokens": 131000,
             "extra_body": {"reasoning_effort": "low"},
             "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
-        },
-        {
-            "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "medium"},
-            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
-        },
-        {
-            "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "low"},
-            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
-        },
-        {
-            "max_tokens": 131000,
-            "extra_body": {"reasoning_effort": "medium"},
-            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
-        },
+        }
     ],
     rollout_processor=SingleTurnRolloutProcessor(),
     aggregation_method="mean",
     passed_threshold=0.8,
-    num_runs=1,
-    max_dataset_rows=1,
+    num_runs=8,
+    max_dataset_rows=2,
     max_concurrent_rollouts=4,
     mode="pointwise",
 )
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -5,6 +5,7 @@
 from typing import List
 
 from litellm import acompletion
+from typing import Dict
 
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.models import EvaluationRow, Message
@@ -61,10 +62,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             if row.tools is not None:
                 request_params["tools"] = row.tools
 
+            # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
+            import importlib
+
+            _litellm = importlib.import_module("litellm")
+            acompletion = getattr(_litellm, "acompletion")
             response = await acompletion(**request_params)
 
-            assistant_content = response.choices[0].message.content or ""  # pyright: ignore[reportAttributeAccessIssue]
-            tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None  # pyright: ignore[reportAttributeAccessIssue]
+            assistant_content = response.choices[0].message.content or ""
+            tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
 
             converted_tool_calls = None
             if tool_calls:
@@ -106,9 +112,9 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
             ]
 
             row.execution_metadata.usage = CompletionUsage(
-                prompt_tokens=response.usage.prompt_tokens,  # pyright: ignore[reportAttributeAccessIssue]
-                completion_tokens=response.usage.completion_tokens,  # pyright: ignore[reportAttributeAccessIssue]
-                total_tokens=response.usage.total_tokens,  # pyright: ignore[reportAttributeAccessIssue]
+                prompt_tokens=response.usage.prompt_tokens,
+                completion_tokens=response.usage.completion_tokens,
+                total_tokens=response.usage.total_tokens,
             )
 
             row.messages = messages
diff --git a/eval_protocol/quickstart/utils.py b/eval_protocol/quickstart/utils.py
@@ -130,7 +130,7 @@ def split_multi_turn_rows(data: list[EvaluationRow]) -> list[EvaluationRow]:
                 )
             )
 
-    return [expanded_rows[0]]
+    return expanded_rows
 
 
 async def pairwise_judgment_async(question_text, answer_a, answer_b, tools, judge_config, shared_client):

Original file line number	Diff line number	Diff line change
`@@ -130,7 +130,7 @@ def split_multi_turn_rows(data: list[EvaluationRow]) -> list[EvaluationRow]:`
`130`	`130`	`)`
`131`	`131`	`)`
`132`	`132`
`133`		`- return [expanded_rows[0]]`
	`133`	`+ return expanded_rows`
`134`	`134`
`135`	`135`
`136`	`136`	`async def pairwise_judgment_async(question_text, answer_a, answer_b, tools, judge_config, shared_client):`