From b57e6d29bcca1b5b83213ac04998a53a3098b145 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 29 Sep 2025 16:29:56 -0700 Subject: [PATCH 1/4] Add row_ids parameter to evaluation_test function for filtering evaluations - Introduced row_ids as an optional parameter to allow filtering of evaluation rows based on specified identifiers. - Updated documentation to reflect the new parameter and its usage in the evaluation process. --- eval_protocol/pytest/evaluation_test.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 2cf8ce0c..6ddaad8a 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -79,6 +79,7 @@ def evaluation_test( aggregation_method: AggregationMethod = "mean", passed_threshold: EvaluationThreshold | float | EvaluationThresholdDict | None = None, num_runs: int = 1, + row_ids: Sequence[str] | None = None, max_dataset_rows: int | None = None, mcp_config_path: str | None = None, max_concurrent_rollouts: int = 8, @@ -146,6 +147,7 @@ def evaluation_test( Success rate must be above success, and if set, standard error must be below standard_error. Success rate +/- one standard_error is equivalent to 68% confidence interval. num_runs: Number of times to repeat the rollout and evaluations. + row_ids: List of row_ids to use filter for the evaluation. If provided, only the rows with the given row_ids will be evaluated. max_dataset_rows: Limit dataset to the first N rows. mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema max_concurrent_rollouts: Maximum number of concurrent rollouts to run in parallel. @@ -286,6 +288,9 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo else: raise ValueError("No input dataset, input messages, or input rows provided") + if row_ids is not None: + data = [row for row in data if row.input_metadata.row_id in row_ids] + """ data_loaders handles preprocess_fn internally so we want to specially handle data_loaders here so we don't double From c90758d3d4a927faf06d675ac56db69a4711937a Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 29 Sep 2025 16:30:12 -0700 Subject: [PATCH 2/4] Handle timeout in RemoteRolloutProcessor by updating rollout status - Added logic to set the rollout status to an error when the polling loop completes without a successful break, indicating a timeout. - Enhanced error handling to provide clearer feedback on rollout timeouts. --- eval_protocol/pytest/remote_rollout_processor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py index d6e563a4..7201e688 100644 --- a/eval_protocol/pytest/remote_rollout_processor.py +++ b/eval_protocol/pytest/remote_rollout_processor.py @@ -155,7 +155,13 @@ def _get_status() -> Dict[str, Any]: except Exception: # transient errors; continue polling pass + await asyncio.sleep(poll_interval) + else: + # Loop completed without breaking, which means we timed out + row.rollout_status = Status.rollout_error( + f"Rollout {row.execution_metadata.rollout_id} timed out after {timeout_seconds} seconds" + ) # Update duration, regardless of termination row.execution_metadata.duration_seconds = time.perf_counter() - start_time From 9a6fe7a924cb1c24a526dec5004a85f65c02e7c9 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 29 Sep 2025 16:30:21 -0700 Subject: [PATCH 3/4] Add optional status field to StatusResponse model in remote rollout processor - Introduced an optional status indicator in the StatusResponse model to differentiate between successful and failed rollouts. - Updated documentation to clarify the purpose of the new status field for better understanding in the eval-protocol context. --- eval_protocol/types/remote_rollout_processor.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/eval_protocol/types/remote_rollout_processor.py b/eval_protocol/types/remote_rollout_processor.py index 21a821e0..817d1c3f 100644 --- a/eval_protocol/types/remote_rollout_processor.py +++ b/eval_protocol/types/remote_rollout_processor.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional from pydantic import BaseModel, Field -from eval_protocol.models import Message +from eval_protocol.models import Message, Status class RolloutMetadata(BaseModel): @@ -40,6 +40,12 @@ class StatusResponse(BaseModel): terminated: bool info: Optional[Dict[str, Any]] = None + status: Optional[Status] = None + """ + Optional status indicator for the rollout to be used by eval-protocol. This + is useful to distinguish between successful and failed rollouts. + """ + def create_langfuse_config_tags(init_request: InitRequest) -> List[str]: """Create Langfuse tags from InitRequest metadata.""" From 1744b558ceb39f3a910a898ed6dd3df6a4576691 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 29 Sep 2025 16:40:31 -0700 Subject: [PATCH 4/4] Rename row_ids parameter to filtered_row_ids in evaluation_test function for clarity - Updated the parameter name from row_ids to filtered_row_ids to better reflect its purpose in filtering evaluation rows. - Adjusted related documentation to ensure consistency and clarity regarding the new parameter name. --- eval_protocol/pytest/evaluation_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 6ddaad8a..5ebd0dfa 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -79,7 +79,7 @@ def evaluation_test( aggregation_method: AggregationMethod = "mean", passed_threshold: EvaluationThreshold | float | EvaluationThresholdDict | None = None, num_runs: int = 1, - row_ids: Sequence[str] | None = None, + filtered_row_ids: Sequence[str] | None = None, max_dataset_rows: int | None = None, mcp_config_path: str | None = None, max_concurrent_rollouts: int = 8, @@ -147,7 +147,7 @@ def evaluation_test( Success rate must be above success, and if set, standard error must be below standard_error. Success rate +/- one standard_error is equivalent to 68% confidence interval. num_runs: Number of times to repeat the rollout and evaluations. - row_ids: List of row_ids to use filter for the evaluation. If provided, only the rows with the given row_ids will be evaluated. + filtered_row_ids: List of row_ids to filter for the evaluation. If provided, only the rows with the given row_ids will be evaluated. max_dataset_rows: Limit dataset to the first N rows. mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema max_concurrent_rollouts: Maximum number of concurrent rollouts to run in parallel. @@ -288,8 +288,8 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo else: raise ValueError("No input dataset, input messages, or input rows provided") - if row_ids is not None: - data = [row for row in data if row.input_metadata.row_id in row_ids] + if filtered_row_ids is not None: + data = [row for row in data if row.input_metadata.row_id in filtered_row_ids] """ data_loaders handles preprocess_fn internally so we want