From b57e6d29bcca1b5b83213ac04998a53a3098b145 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Mon, 29 Sep 2025 16:29:56 -0700
Subject: [PATCH 1/4] Add row_ids parameter to evaluation_test function for
 filtering evaluations

- Introduced row_ids as an optional parameter to allow filtering of evaluation rows based on specified identifiers.
- Updated documentation to reflect the new parameter and its usage in the evaluation process.
---
 eval_protocol/pytest/evaluation_test.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 2cf8ce0c..6ddaad8a 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -79,6 +79,7 @@ def evaluation_test(
     aggregation_method: AggregationMethod = "mean",
     passed_threshold: EvaluationThreshold | float | EvaluationThresholdDict | None = None,
     num_runs: int = 1,
+    row_ids: Sequence[str] | None = None,
     max_dataset_rows: int | None = None,
     mcp_config_path: str | None = None,
     max_concurrent_rollouts: int = 8,
@@ -146,6 +147,7 @@ def evaluation_test(
             Success rate must be above success, and if set, standard error must be below standard_error.
             Success rate +/- one standard_error is equivalent to 68% confidence interval.
         num_runs: Number of times to repeat the rollout and evaluations.
+        row_ids: List of row_ids to use filter for the evaluation. If provided, only the rows with the given row_ids will be evaluated.
         max_dataset_rows: Limit dataset to the first N rows.
         mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
         max_concurrent_rollouts: Maximum number of concurrent rollouts to run in parallel.
@@ -286,6 +288,9 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
                     else:
                         raise ValueError("No input dataset, input messages, or input rows provided")
 
+                    if row_ids is not None:
+                        data = [row for row in data if row.input_metadata.row_id in row_ids]
+
                     """
                     data_loaders handles preprocess_fn internally so we want
                     to specially handle data_loaders here so we don't double

From c90758d3d4a927faf06d675ac56db69a4711937a Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Mon, 29 Sep 2025 16:30:12 -0700
Subject: [PATCH 2/4] Handle timeout in RemoteRolloutProcessor by updating
 rollout status

- Added logic to set the rollout status to an error when the polling loop completes without a successful break, indicating a timeout.
- Enhanced error handling to provide clearer feedback on rollout timeouts.
---
 eval_protocol/pytest/remote_rollout_processor.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
index d6e563a4..7201e688 100644
--- a/eval_protocol/pytest/remote_rollout_processor.py
+++ b/eval_protocol/pytest/remote_rollout_processor.py
@@ -155,7 +155,13 @@ def _get_status() -> Dict[str, Any]:
                 except Exception:
                     # transient errors; continue polling
                     pass
+
                 await asyncio.sleep(poll_interval)
+            else:
+                # Loop completed without breaking, which means we timed out
+                row.rollout_status = Status.rollout_error(
+                    f"Rollout {row.execution_metadata.rollout_id} timed out after {timeout_seconds} seconds"
+                )
 
             # Update duration, regardless of termination
             row.execution_metadata.duration_seconds = time.perf_counter() - start_time

From 9a6fe7a924cb1c24a526dec5004a85f65c02e7c9 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Mon, 29 Sep 2025 16:30:21 -0700
Subject: [PATCH 3/4] Add optional status field to StatusResponse model in
 remote rollout processor

- Introduced an optional status indicator in the StatusResponse model to differentiate between successful and failed rollouts.
- Updated documentation to clarify the purpose of the new status field for better understanding in the eval-protocol context.
---
 eval_protocol/types/remote_rollout_processor.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/eval_protocol/types/remote_rollout_processor.py b/eval_protocol/types/remote_rollout_processor.py
index 21a821e0..817d1c3f 100644
--- a/eval_protocol/types/remote_rollout_processor.py
+++ b/eval_protocol/types/remote_rollout_processor.py
@@ -4,7 +4,7 @@
 
 from typing import Any, Dict, List, Optional
 from pydantic import BaseModel, Field
-from eval_protocol.models import Message
+from eval_protocol.models import Message, Status
 
 
 class RolloutMetadata(BaseModel):
@@ -40,6 +40,12 @@ class StatusResponse(BaseModel):
     terminated: bool
     info: Optional[Dict[str, Any]] = None
 
+    status: Optional[Status] = None
+    """
+    Optional status indicator for the rollout to be used by eval-protocol. This
+    is useful to distinguish between successful and failed rollouts.
+    """
+
 
 def create_langfuse_config_tags(init_request: InitRequest) -> List[str]:
     """Create Langfuse tags from InitRequest metadata."""

From 1744b558ceb39f3a910a898ed6dd3df6a4576691 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Mon, 29 Sep 2025 16:40:31 -0700
Subject: [PATCH 4/4] Rename row_ids parameter to filtered_row_ids in
 evaluation_test function for clarity

- Updated the parameter name from row_ids to filtered_row_ids to better reflect its purpose in filtering evaluation rows.
- Adjusted related documentation to ensure consistency and clarity regarding the new parameter name.
---
 eval_protocol/pytest/evaluation_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 6ddaad8a..5ebd0dfa 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -79,7 +79,7 @@ def evaluation_test(
     aggregation_method: AggregationMethod = "mean",
     passed_threshold: EvaluationThreshold | float | EvaluationThresholdDict | None = None,
     num_runs: int = 1,
-    row_ids: Sequence[str] | None = None,
+    filtered_row_ids: Sequence[str] | None = None,
     max_dataset_rows: int | None = None,
     mcp_config_path: str | None = None,
     max_concurrent_rollouts: int = 8,
@@ -147,7 +147,7 @@ def evaluation_test(
             Success rate must be above success, and if set, standard error must be below standard_error.
             Success rate +/- one standard_error is equivalent to 68% confidence interval.
         num_runs: Number of times to repeat the rollout and evaluations.
-        row_ids: List of row_ids to use filter for the evaluation. If provided, only the rows with the given row_ids will be evaluated.
+        filtered_row_ids: Row IDs to keep for the evaluation. If provided, only the rows whose row_id is in this sequence will be evaluated; all other rows are skipped.
         max_dataset_rows: Limit dataset to the first N rows.
         mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
         max_concurrent_rollouts: Maximum number of concurrent rollouts to run in parallel.
@@ -288,8 +288,8 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
                     else:
                         raise ValueError("No input dataset, input messages, or input rows provided")
 
-                    if row_ids is not None:
-                        data = [row for row in data if row.input_metadata.row_id in row_ids]
+                    if filtered_row_ids is not None:
+                        data = [row for row in data if row.input_metadata.row_id in filtered_row_ids]
 
                     """
                     data_loaders handles preprocess_fn internally so we want