eval-protocol
diff --git a/‎eval_protocol/evaluation.py‎
Lines changed: 2 additions & 3 deletions b/‎eval_protocol/evaluation.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎eval_protocol/pytest/evaluation_test.py‎
Lines changed: 27 additions & 24 deletions b/‎eval_protocol/pytest/evaluation_test.py‎
Lines changed: 27 additions & 24 deletions
diff --git a/‎eval_protocol/pytest/evaluation_test_utils.py‎
Lines changed: 22 additions & 2 deletions b/‎eval_protocol/pytest/evaluation_test_utils.py‎
Lines changed: 22 additions & 2 deletions
diff --git a/‎eval_protocol/pytest/remote_rollout_processor.py‎
Lines changed: 57 additions & 45 deletions b/‎eval_protocol/pytest/remote_rollout_processor.py‎
Lines changed: 57 additions & 45 deletions
diff --git a/‎eval_protocol/pytest/rollout_processor.py‎
Lines changed: 4 additions & 0 deletions b/‎eval_protocol/pytest/rollout_processor.py‎
Lines changed: 4 additions & 0 deletions
@@ -34,7 +34,7 @@ def __init__(
 
     @staticmethod
     def _parse_ignore_file(ignore_path: str) -> List[str]:
-        """Parse .gitignore or .dockerignore and return patterns."""
+        """Parse .gitignore and return patterns."""
         patterns = []
         if not os.path.exists(ignore_path):
             return patterns
@@ -129,8 +129,7 @@ def _create_tar_gz_with_ignores(output_path: str, source_dir: str) -> int:
 
         source_path = Path(source_dir)
         gitignore_patterns = Evaluator._parse_ignore_file(str(source_path / ".gitignore"))
-        dockerignore_patterns = Evaluator._parse_ignore_file(str(source_path / ".dockerignore"))
-        all_ignore_patterns = gitignore_patterns + dockerignore_patterns
+        all_ignore_patterns = gitignore_patterns
 
         logger.info(f"Creating tar.gz with {len(all_ignore_patterns)} ignore patterns")
 
 
@@ -20,13 +20,12 @@
     EvaluationRow,
     EvaluationThreshold,
     EvaluationThresholdDict,
-    EvaluateResult,
     Status,
     EPParameters,
 )
 from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
 from eval_protocol.pytest.evaluation_test_postprocess import postprocess
-from eval_protocol.pytest.execution import execute_pytest, execute_pytest_with_exception_handling
+from eval_protocol.pytest.execution import execute_pytest_with_exception_handling
 from eval_protocol.pytest.priority_scheduler import execute_priority_rollouts
 from eval_protocol.pytest.generate_parameter_combinations import (
     ParameterizedTestKwargs,
@@ -56,6 +55,7 @@
     AggregationMethod,
     add_cost_metrics,
     log_eval_status_and_rows,
+    normalize_fireworks_model,
     parse_ep_completion_params,
     parse_ep_completion_params_overwrite,
     parse_ep_max_concurrent_rollouts,
@@ -93,8 +93,8 @@ def evaluation_test(
     filtered_row_ids: Sequence[str] | None = None,
     max_dataset_rows: int | None = None,
     mcp_config_path: str | None = None,
-    max_concurrent_rollouts: int = 8,
-    max_concurrent_evaluations: int = 64,
+    max_concurrent_rollouts: int = 96,
+    max_concurrent_evaluations: int = 96,
     server_script_path: str | None = None,
     steps: int = 30,
     mode: EvaluationTestMode = "pointwise",
@@ -205,6 +205,7 @@ def evaluation_test(
     max_dataset_rows = parse_ep_max_rows(max_dataset_rows)
     completion_params = parse_ep_completion_params(completion_params)
     completion_params = parse_ep_completion_params_overwrite(completion_params)
+    completion_params = [normalize_fireworks_model(cp) for cp in completion_params]
     original_completion_params = completion_params
     passed_threshold = parse_ep_passed_threshold(passed_threshold)
     data_loaders = parse_ep_dataloaders(data_loaders)
@@ -365,6 +366,7 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
                             row.input_metadata.row_id = generate_id(seed=0, index=index)
 
                     completion_params = kwargs["completion_params"] if "completion_params" in kwargs else None
+                    completion_params = normalize_fireworks_model(completion_params)
                     # Create eval metadata with test function info and current commit hash
                     eval_metadata = EvalMetadata(
                         name=test_func.__name__,
@@ -409,21 +411,22 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
 
                     rollout_processor.setup()
 
-                    use_priority_scheduler = (
-                        (
-                            os.environ.get("EP_USE_PRIORITY_SCHEDULER", "0") == "1"
-                            and not isinstance(rollout_processor, MCPGymRolloutProcessor)
-                        )
-                    )
+                    use_priority_scheduler = os.environ.get(
+                        "EP_USE_PRIORITY_SCHEDULER", "0"
+                    ) == "1" and not isinstance(rollout_processor, MCPGymRolloutProcessor)
 
                     if use_priority_scheduler:
                         microbatch_output_size = os.environ.get("EP_MICRO_BATCH_OUTPUT_SIZE", None)
                         output_dir = os.environ.get("EP_OUTPUT_DIR", None)
                         if microbatch_output_size and output_dir:
-                            output_buffer = MicroBatchDataBuffer(num_runs=num_runs, batch_size=int(microbatch_output_size), output_path_template=os.path.join(output_dir, "buffer_{index}.jsonl"))
+                            output_buffer = MicroBatchDataBuffer(
+                                num_runs=num_runs,
+                                batch_size=int(microbatch_output_size),
+                                output_path_template=os.path.join(output_dir, "buffer_{index}.jsonl"),
+                            )
                         else:
                             output_buffer = None
-                        
+
                         try:
                             priority_results = await execute_priority_rollouts(
                                 dataset=data,
@@ -441,12 +444,12 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
                         finally:
                             if output_buffer:
                                 await output_buffer.close()
-                        
+
                         for res in priority_results:
                             run_idx = (res.execution_metadata.extra or {}).get("run_index", 0)
                             if run_idx < len(all_results):
                                 all_results[run_idx].append(res)
-                            
+
                             processed_rows_in_run.append(res)
 
                         postprocess(
@@ -462,6 +465,7 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
                         )
 
                     else:
+
                         async def execute_run(run_idx: int, config: RolloutProcessorConfig):
                             nonlocal all_results
 
@@ -506,9 +510,7 @@ async def _execute_pointwise_eval_with_semaphore(
                                         raise ValueError(
                                             f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
                                         )
-                                    result.execution_metadata.eval_duration_seconds = (
-                                        time.perf_counter() - start_time
-                                    )
+                                    result.execution_metadata.eval_duration_seconds = time.perf_counter() - start_time
                                     return result
 
                             async def _execute_groupwise_eval_with_semaphore(
@@ -519,7 +521,9 @@ async def _execute_groupwise_eval_with_semaphore(
                                     evaluation_test_kwargs = kwargs.get("evaluation_test_kwargs") or {}
                                     primary_rollout_id = rows[0].execution_metadata.rollout_id if rows else None
                                     group_rollout_ids = [
-                                        r.execution_metadata.rollout_id for r in rows if r.execution_metadata.rollout_id
+                                        r.execution_metadata.rollout_id
+                                        for r in rows
+                                        if r.execution_metadata.rollout_id
                                     ]
                                     async with rollout_logging_context(
                                         primary_rollout_id or "",
@@ -596,7 +600,9 @@ async def _collect_result(config, lst):
                                         row_groups[row.input_metadata.row_id].append(row)
                                 tasks = []
                                 for _, rows in row_groups.items():
-                                    tasks.append(asyncio.create_task(_execute_groupwise_eval_with_semaphore(rows=rows)))
+                                    tasks.append(
+                                        asyncio.create_task(_execute_groupwise_eval_with_semaphore(rows=rows))
+                                    )
                                 results = []
                                 for task in tasks:
                                     res = await task
@@ -692,9 +698,9 @@ async def _collect_result(config, lst):
                             # For other processors, create all tasks at once and run in parallel
                             # Concurrency is now controlled by the shared semaphore in each rollout processor
                             await run_tasks_with_run_progress(execute_run, num_runs, config)
-                        
+
                         experiment_duration_seconds = time.perf_counter() - experiment_start_time
-                        
+
                         # for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
                         # rollout_id is used to differentiate the result from different completion_params
                         if mode == "groupwise":
@@ -730,15 +736,12 @@ async def _collect_result(config, lst):
                                 experiment_duration_seconds,
                             )
 
-
-                
                     if not all(r.evaluation_result is not None for run_results in all_results for r in run_results):
                         raise AssertionError(
                             "Some EvaluationRow instances are missing evaluation_result. "
                             "Your @evaluation_test function must set `row.evaluation_result`"
                         )
 
-                    
                 except AssertionError:
                     _log_eval_error(
                         Status.eval_finished(),
 
@@ -371,7 +371,7 @@ async def execute_row_with_backoff_retry(row: EvaluationRow) -> EvaluationRow:
             retry_config = replace(config, kwargs={**(config.kwargs or {}), "start_server": False})
             retry_tasks = rollout_processor([row], retry_config)
             result = await retry_tasks[0]
-            
+
             # Apply post-processing quality checks if configured
             # This must be inside the retry function so ResponseQualityError can trigger retries
             if config.post_processor is not None:
@@ -380,7 +380,7 @@ async def execute_row_with_backoff_retry(row: EvaluationRow) -> EvaluationRow:
                 except ResponseQualityError as quality_error:
                     # Re-raise ResponseQualityError to trigger retry logic
                     raise quality_error
-            
+
             return result
 
         async def execute_row_with_backoff(task: asyncio.Task[EvaluationRow], row: EvaluationRow) -> EvaluationRow:
@@ -464,6 +464,7 @@ async def execute_row_with_backoff_and_log(
                 yield result
 
     finally:
+        await rollout_processor.acleanup()
         rollout_processor.cleanup()
 
 
@@ -618,3 +619,22 @@ def build_rollout_processor_config(
         server_script_path=None,
         kwargs=rollout_processor_kwargs,
     )
+
+
+def normalize_fireworks_model(completion_params: CompletionParams | None) -> CompletionParams | None:
+    """Ensure a Fireworks model name carries the ``fireworks_ai/`` prefix.
+
+    Model names shaped like ``accounts/<org>/models/<model>`` need that prefix when
+    routing through LiteLLM. If the prefix has to be added, a copy of the params is
+    returned and the argument is left unmodified; in every other case (``None``,
+    missing or non-string model, already prefixed, different shape) the argument
+    is returned as given.
+    """
+    if completion_params is None:
+        return None
+
+    model = completion_params.get("model")
+    # Guard clauses: bail out early whenever no rewrite is required.
+    if not model or not isinstance(model, str):
+        return completion_params
+    if model.startswith("fireworks_ai/"):
+        return completion_params
+    if re.match(r"^accounts/[^/]+/models/.+", model) is None:
+        return completion_params
+
+    prefixed_params = completion_params.copy()
+    prefixed_params["model"] = f"fireworks_ai/{model}"
+    return prefixed_params
@@ -1,14 +1,10 @@
 import asyncio
 import time
-from typing import Any, Dict, List, Optional
+from typing import List, Optional
 
-import requests
+import aiohttp
 
 from eval_protocol.models import EvaluationRow, Status
-from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
-from eval_protocol.types.remote_rollout_processor import (
-    DataLoaderConfig,
-)
 from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter
 from eval_protocol.exceptions import exception_for_status_code
 
@@ -51,6 +47,12 @@ def __init__(
         self._poll_interval = poll_interval
         self._timeout_seconds = timeout_seconds
         self._tracing_adapter = FireworksTracingAdapter(base_url=self._model_base_url)
+        self._session: Optional[aiohttp.ClientSession] = None
+
+    def _get_or_create_session(self) -> aiohttp.ClientSession:
+        """Return the cached HTTP session, creating a new one if it is missing or closed."""
+        current = self._session
+        if current is not None and not current.closed:
+            return current
+        self._session = aiohttp.ClientSession()
+        return self._session
 
     def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
         tasks: List[asyncio.Task[EvaluationRow]] = []
@@ -88,48 +90,26 @@ async def _process_row(row: EvaluationRow) -> EvaluationRow:
             init_payload = build_init_request(row, config, model_base_url)
 
             # Fire-and-poll
-            def _post_init() -> None:
-                url = f"{remote_base_url}/init"
-                try:
-                    r = requests.post(url, json=init_payload.model_dump(), timeout=300)
-                    r.raise_for_status()
-                except requests.exceptions.Timeout:
-                    raise TimeoutError(
-                        f"The /init endpoint tried {url} with {init_payload.model_dump()} but timed out after 300 seconds."
-                    )
-
-            await asyncio.to_thread(_post_init)
+            init_url = f"{remote_base_url}/init"
+
+            timeout_init = aiohttp.ClientTimeout(total=300)
+
+            try:
+                session = self._get_or_create_session()
+                async with session.post(init_url, json=init_payload.model_dump(), timeout=timeout_init) as resp:
+                    if resp.status >= 400:
+                        body = await resp.text()
+                        raise RuntimeError(f"Remote /init failed (HTTP {resp.status}): {body}")
+                    resp.raise_for_status()
+                    await resp.read()  # Drain the response body and release the connection back to the pool
+            except asyncio.TimeoutError:
+                raise TimeoutError(
+                    f"The /init endpoint tried {init_url} with {init_payload.model_dump()} but timed out after 300 seconds."
+                )
 
-            terminated = False
             deadline = time.time() + timeout_seconds
 
-            def _get_status() -> Dict[str, Any]:
-                url = f"{remote_base_url}/status"
-                r = requests.get(url, params={"rollout_id": row.execution_metadata.rollout_id}, timeout=15)
-                r.raise_for_status()
-                return r.json()
-
-            continue_polling_status = True
             while time.time() < deadline:
-                try:
-                    if continue_polling_status:
-                        status = await asyncio.to_thread(_get_status)
-                        terminated = bool(status.get("terminated", False))
-                        if terminated:
-                            break
-                except requests.exceptions.HTTPError as e:
-                    if e.response is not None and e.response.status_code == 404:
-                        # 404 means server doesn't implement /status endpoint, stop polling
-                        logger.debug(
-                            f"Server doesn't implement /status endpoint (404), stopping status polling for rollout {row.execution_metadata.rollout_id}"
-                        )
-                        continue_polling_status = False
-                    else:
-                        raise
-                except Exception:
-                    # For all other exceptions, raise them
-                    raise
-
                 # Search Fireworks tracing logs for completion (run in thread to avoid blocking event loop)
                 completed_logs = await asyncio.to_thread(
                     self._tracing_adapter.search_logs, tags=[f"rollout_id:{row.execution_metadata.rollout_id}"]
@@ -142,9 +122,20 @@ def _get_status() -> Dict[str, Any]:
                         status_logs.append(log)
 
                 if status_logs:
+                    if len(status_logs) > 1:
+                        logger.warning(
+                            "Found %s status logs for rollout %s; expected at most 1. Using the first one: %s",
+                            len(status_logs),
+                            row.execution_metadata.rollout_id,
+                            status_logs[0],
+                        )
                     # Use the first log with status information
                     status_log = status_logs[0]
                     status_dict = status_log.get("status")
+                    raw_extras = status_log.get("extras") or {}
+                    status_extras = {
+                        k: v for k, v in raw_extras.items() if k not in ("logger_name", "level", "timestamp")
+                    }
 
                     logger.info(
                         f"Found status log for rollout {row.execution_metadata.rollout_id}: {status_log.get('message', '')}"
@@ -169,6 +160,11 @@ def _get_status() -> Dict[str, Any]:
                         details=status_details,
                     )
 
+                    if row.execution_metadata.extra:
+                        row.execution_metadata.extra.update(status_extras)
+                    else:
+                        row.execution_metadata.extra = status_extras
+
                     logger.info("Stopping polling for rollout %s", row.execution_metadata.rollout_id)
                     break
 
@@ -200,5 +196,21 @@ async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
         tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
         return tasks
 
+    async def acleanup(self) -> None:
+        """Async cleanup: close the HTTP session if one is open. Preferred when you can await."""
+        session = self._session
+        if not session or session.closed:
+            # Nothing was ever opened, or it has already been closed.
+            return
+        await session.close()
+
     def cleanup(self) -> None:
-        return None
+        """Sync cleanup - best-effort, schedules close if event loop is running.
+
+        Does nothing when there is no session or it is already closed. When no
+        event loop is running the session cannot be closed from here, so a
+        warning is logged instead; prefer ``await processor.acleanup()``.
+        """
+        if self._session and not self._session.closed:
+            try:
+                loop = asyncio.get_running_loop()
+            except RuntimeError:
+                # No running event loop - can't safely close the session.
+                # The session will be garbage collected eventually, but warn about it.
+                logger.warning(
+                    "RemoteRolloutProcessor.cleanup() called outside of async context. "
+                    "Session may not be properly closed. Use `await processor.acleanup()` when possible."
+                )
+                return
+            # asyncio keeps only weak references to tasks; hold a strong one so the
+            # scheduled close cannot be garbage-collected before it runs.
+            self._close_task = loop.create_task(self._session.close())
@@ -19,6 +19,10 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) ->
         """Process evaluation rows and return async tasks. Must be implemented by subclasses."""
         pass
 
+    async def acleanup(self) -> None:
+        """Async cleanup hook, preferred when the caller can await. The default does nothing."""
+
     def cleanup(self) -> None:
         """Cleanup resources. Override in subclasses if cleanup is needed."""
         pass