Fix test_math_dataset: forward evaluation_test_kwargs to the test function

Dylan Huang · Dylan Huang · commit 2f2bf26d6c59 · 2025-08-29T10:43:10.000-07:00
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -322,10 +322,11 @@ async def _execute_pointwise_eval_with_semaphore(
                             row: EvaluationRow,
                         ) -> EvaluationRow:
                             async with semaphore:
+                                evaluation_test_kwargs = kwargs.get("evaluation_test_kwargs") or {}
                                 result = await execute_pytest(
                                     test_func,
                                     processed_row=row,
-                                    evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
+                                    evaluation_test_kwargs=evaluation_test_kwargs,
                                 )
                                 if not isinstance(result, EvaluationRow):
                                     raise ValueError(
@@ -337,10 +338,11 @@ async def _execute_groupwise_eval_with_semaphore(
                             rows: list[EvaluationRow],
                         ) -> list[EvaluationRow]:
                             async with semaphore:
+                                evaluation_test_kwargs = kwargs.get("evaluation_test_kwargs") or {}
                                 results = await execute_pytest(
                                     test_func,
                                     processed_dataset=rows,
-                                    evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
+                                    evaluation_test_kwargs=evaluation_test_kwargs,
                                 )
                                 if not isinstance(results, list):
                                     raise ValueError(
diff --git a/eval_protocol/pytest/execution.py b/eval_protocol/pytest/execution.py
@@ -19,23 +19,25 @@ async def execute_pytest(
             raise ValueError("'row' is a reserved parameter for the evaluation function")
         if "rows" in evaluation_test_kwargs:
             raise ValueError("'rows' is a reserved parameter for the evaluation function")
+    else:
+        evaluation_test_kwargs = {}
 
     # Handle both sync and async test functions
     if asyncio.iscoroutinefunction(test_func):
         if processed_row is not None:
             test_func = cast(Callable[[EvaluationRow], Awaitable[EvaluationRow]], test_func)
-            return await test_func(processed_row)
+            return await test_func(processed_row, **evaluation_test_kwargs)
         if processed_dataset is not None:
             test_func = cast(Callable[[list[EvaluationRow]], Awaitable[list[EvaluationRow]]], test_func)
-            return await test_func(processed_dataset)
+            return await test_func(processed_dataset, **evaluation_test_kwargs)
         test_func = cast(Callable[[], Awaitable[EvaluationRow]], test_func)
-        return await test_func()
+        return await test_func(**evaluation_test_kwargs)
     else:
         if processed_row is not None:
             test_func = cast(Callable[[EvaluationRow], EvaluationRow], test_func)
-            return test_func(processed_row)
+            return test_func(processed_row, **evaluation_test_kwargs)
         if processed_dataset is not None:
             test_func = cast(Callable[[Dataset], Dataset], test_func)
-            return test_func(processed_dataset)
+            return test_func(processed_dataset, **evaluation_test_kwargs)
         test_func = cast(Callable[[], EvaluationRow], test_func)
-        return test_func()
+        return test_func(**evaluation_test_kwargs)