From e9adadc9c5e945963c2ef1474c1c9730dd05167e Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 21 Aug 2025 16:00:46 -0700 Subject: [PATCH 01/16] test reproduces error properly --- eval_protocol/mcp/mcp_multi_client.py | 5 +- .../docs_mcp_config_broken.json | 7 +++ tests/pytest/test_pytest_propagate_error.py | 63 +++++++++++++++++++ 3 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 tests/pytest/mcp_configurations/docs_mcp_config_broken.json create mode 100644 tests/pytest/test_pytest_propagate_error.py diff --git a/eval_protocol/mcp/mcp_multi_client.py b/eval_protocol/mcp/mcp_multi_client.py index 38f2dbee..5b0c676c 100644 --- a/eval_protocol/mcp/mcp_multi_client.py +++ b/eval_protocol/mcp/mcp_multi_client.py @@ -69,10 +69,7 @@ async def connect_to_servers(self): return for server_name, server_config in self.config.mcpServers.items(): - try: - await self._connect_to_server(server_name, server_config) - except Exception as e: - print(f"Failed to connect to server '{server_name}': {e}") + await self._connect_to_server(server_name, server_config) async def _connect_to_server( self, server_name: str, server_config: Union[MCPConfigurationServerStdio, MCPConfigurationServerUrl] diff --git a/tests/pytest/mcp_configurations/docs_mcp_config_broken.json b/tests/pytest/mcp_configurations/docs_mcp_config_broken.json new file mode 100644 index 00000000..11a3bb56 --- /dev/null +++ b/tests/pytest/mcp_configurations/docs_mcp_config_broken.json @@ -0,0 +1,7 @@ +{ + "mcpServers": { + "docs.fireworks.ai": { + "url": "https://docs.fireworks.ai/mcp-non-existent" + } + } +} diff --git a/tests/pytest/test_pytest_propagate_error.py b/tests/pytest/test_pytest_propagate_error.py new file mode 100644 index 00000000..b6bdd517 --- /dev/null +++ b/tests/pytest/test_pytest_propagate_error.py @@ -0,0 +1,63 @@ +from typing import Set +from eval_protocol.models import EvaluationRow, Message +from eval_protocol.pytest.default_agent_rollout_processor import AgentRolloutProcessor +from eval_protocol.dataset_logger import DatasetLogger + + +class TrackingLogger(DatasetLogger): + """Custom logger that ensures that the final row is in an error state.""" + + def __init__(self, rollouts: dict[str, EvaluationRow]): + self.rollouts = rollouts + + def log(self, row: EvaluationRow): + self.rollouts[row.execution_metadata.rollout_id] = row + + def read(self): + return [] + + +async def test_pytest_propagate_error(): + """ + Properly propagate errors from rollout processing to eval_metadata.status. + To test this, we use a broken MCP configuration that should fail during the + rollout processing. Then the final eval_metadata.status should be an error. + This way the UI can properly render an error state for the rollout and a + developer can identify and investigate the error. + """ + from eval_protocol.pytest.evaluation_test import evaluation_test + + input_messages = [ + [ + Message( + role="system", + content="You are a helpful assistant that can answer questions about Fireworks.", + ), + ] + ] + completion_params_list = [ + {"model": "dummy/local-model"}, + ] + + rollouts: dict[str, EvaluationRow] = {} + logger = TrackingLogger(rollouts) + + @evaluation_test( + input_messages=input_messages, + completion_params=completion_params_list, + rollout_processor=AgentRolloutProcessor(), + mode="pointwise", + num_runs=5, + mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config_broken.json", + logger=logger, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + return row + + # Manually invoke all parameter combinations within a single test + for params in completion_params_list: + await eval_fn(input_messages=input_messages, completion_params=params) + + # assert that the status of eval_metadata.status is "error" + assert len(rollouts) == 5 + assert all(row.eval_metadata.status == "error" for row in rollouts.values()) From 087695c02ae124743eab7a0c6b7d4ea82860a569 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 21 Aug 2025 20:43:59 -0700 Subject: [PATCH 02/16] propagate error from rollout status --- eval_protocol/pytest/evaluation_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index ce5f8817..d7ba46dd 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -726,7 +726,10 @@ async def _collect_result(config, lst): for r in results: if r.eval_metadata is not None: - r.eval_metadata.status = "finished" + if r.rollout_status.is_error(): + r.eval_metadata.status = "error" + else: + r.eval_metadata.status = "finished" active_logger.log(r) # for groupwise mode, the result contains eval otuput from multiple completion_params, we need to differentiate them From 29b3aeab0001094036d4a5a045fe14d567a5cc7b Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 21 Aug 2025 21:15:27 -0700 Subject: [PATCH 03/16] migrate eval_metadata.status to AIP-193 --- eval_protocol/models.py | 26 +++++-- eval_protocol/pytest/evaluation_test.py | 25 ++++--- eval_protocol/pytest/utils.py | 2 +- eval_protocol/utils/logs_server.py | 12 +++- tests/pytest/test_pytest_propagate_error.py | 2 +- vite-app/src/App.tsx | 6 +- vite-app/src/components/EvaluationRow.tsx | 14 ++-- vite-app/src/components/StatusIndicator.tsx | 25 ++++--- vite-app/src/types/eval-protocol.ts | 77 +++++++++++++++++---- 9 files changed, 140 insertions(+), 49 deletions(-) diff --git a/eval_protocol/models.py b/eval_protocol/models.py index 61b96e54..7f72a96a 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -107,13 +107,29 @@ class Code(int, Enum): DATA_LOSS = 15 UNAUTHENTICATED = 16 - # Custom codes for rollout states (using higher numbers to avoid conflicts) + # Custom codes for EP (using higher numbers to avoid conflicts) FINISHED = 100 + RUNNING = 101 @classmethod def rollout_running(cls) -> "Status": """Create a status indicating the rollout is running.""" - return cls(code=cls.Code.OK, message="Rollout is running", details=[]) + return cls(code=cls.Code.RUNNING, message="Rollout is running", details=[]) + + @classmethod + def eval_running(cls) -> "Status": + """Create a status indicating the evaluation is running.""" + return cls(code=cls.Code.RUNNING, message="Evaluation is running", details=[]) + + @classmethod + def eval_finished(cls) -> "Status": + """Create a status indicating the evaluation finished.""" + return cls(code=cls.Code.FINISHED, message="Evaluation finished", details=[]) + + @classmethod + def aborted(cls, message: str, details: Optional[List[Dict[str, Any]]] = None) -> "Status": + """Create a status indicating the evaluation was aborted.""" + return cls(code=cls.Code.ABORTED, message=message, details=details or []) @classmethod def rollout_finished( @@ -144,7 +160,7 @@ def error(cls, error_message: str, details: Optional[List[Dict[str, Any]]] = Non def is_running(self) -> bool: """Check if the status indicates the rollout is running.""" - return self.code == self.Code.OK and self.message == "Rollout is running" + return self.code == self.Code.RUNNING def is_finished(self) -> bool: """Check if the status indicates the rollout finished successfully.""" @@ -436,9 +452,7 @@ class EvalMetadata(BaseModel): default_factory=get_pep440_version, description="Version of the evaluation. Should be populated with a PEP 440 version string.", ) - status: Optional[Literal["running", "finished", "error", "stopped"]] = Field( - None, description="Status of the evaluation" - ) + status: Optional[Status] = Field(None, description="Status of the evaluation") num_runs: int = Field(..., description="Number of times the evaluation was repeated") aggregation_method: str = Field(..., description="Method used to aggregate scores across runs") passed_threshold: Optional[EvaluationThreshold] = Field( diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index d7ba46dd..0ce0353c 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -21,11 +21,13 @@ from eval_protocol.human_id import generate_id, num_combinations from eval_protocol.models import ( CompletionParams, + ErrorInfo, EvalMetadata, EvaluationRow, EvaluationThreshold, InputMetadata, Message, + Status, ) from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor @@ -57,6 +59,7 @@ ) from eval_protocol.pytest.exception_config import ExceptionHandlerConfig from eval_protocol.stats.confidence_intervals import compute_fixed_set_mu_ci +from eval_protocol.types.types import TerminationReason from ..common_utils import load_jsonl @@ -419,7 +422,7 @@ async def execute_with_params( if mode == "groupwise": combinations = generate_parameter_combinations( input_dataset, - None, + completion_params, input_messages, input_rows, evaluation_test_kwargs, @@ -482,9 +485,7 @@ async def wrapper_body(**kwargs): experiment_id = generate_id() - def _log_eval_error( - status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool - ) -> None: + def _log_eval_error(status: Status, rows: Optional[List[EvaluationRow]] | None, passed: bool) -> None: log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger) try: @@ -556,7 +557,7 @@ def _log_eval_error( eval_metadata = EvalMetadata( name=test_func.__name__, description=test_func.__doc__, - status="running", + status=Status.eval_running(), num_runs=num_runs, aggregation_method=aggregation_method, passed_threshold=threshold, @@ -727,9 +728,11 @@ async def _collect_result(config, lst): for r in results: if r.eval_metadata is not None: if r.rollout_status.is_error(): - r.eval_metadata.status = "error" + r.eval_metadata.status = Status.error( + r.rollout_status.message, r.rollout_status.details + ) else: - r.eval_metadata.status = "finished" + r.eval_metadata.status = Status.eval_finished() active_logger.log(r) # for groupwise mode, the result contains eval otuput from multiple completion_params, we need to differentiate them @@ -767,14 +770,16 @@ async def _collect_result(config, lst): except AssertionError: _log_eval_error( - "finished", + Status.eval_finished(), processed_rows_in_run if "processed_rows_in_run" in locals() else None, passed=False, ) raise - except Exception: + except Exception as e: _log_eval_error( - "error", processed_rows_in_run if "processed_rows_in_run" in locals() else None, passed=False + Status.error(str(e)), + processed_rows_in_run if "processed_rows_in_run" in locals() else None, + passed=False, ) raise diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index 57af60b9..2a723093 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -113,7 +113,7 @@ async def wrapper(**kwargs): def log_eval_status_and_rows( eval_metadata: Optional[EvalMetadata], rows: Optional[List[EvaluationRow]] | None, - status: Literal["finished", "error"], + status: Status, passed: bool, logger: DatasetLogger, ) -> None: diff --git a/eval_protocol/utils/logs_server.py b/eval_protocol/utils/logs_server.py index e5e6e4a3..0ea4c272 100644 --- a/eval_protocol/utils/logs_server.py +++ b/eval_protocol/utils/logs_server.py @@ -15,6 +15,7 @@ from eval_protocol.dataset_logger import default_logger from eval_protocol.dataset_logger.dataset_logger import LOG_EVENT_TYPE from eval_protocol.event_bus import event_bus +from eval_protocol.models import Status from eval_protocol.utils.vite_server import ViteServer if TYPE_CHECKING: @@ -179,7 +180,9 @@ def _check_running_evaluations(self): if self._should_update_status(row): logger.info(f"Updating status to 'stopped' for row {row.input_metadata.row_id} (PID {row.pid})") if row.eval_metadata is not None: - row.eval_metadata.status = "stopped" + row.eval_metadata.status = Status.aborted( + f"Evaluation aborted since process {row.pid} stopped" + ) updated_rows.append(row) # Log all updated rows @@ -194,7 +197,12 @@ def _check_running_evaluations(self): def _should_update_status(self, row: "EvaluationRow") -> bool: """Check if a row's status should be updated to 'stopped'.""" # Check if the row has running status and a PID - if row.eval_metadata and row.eval_metadata.status == "running" and row.pid is not None: + if ( + row.eval_metadata + and row.eval_metadata.status + and row.eval_metadata.status.is_running() + and row.pid is not None + ): # Check if the process is still running try: process = psutil.Process(row.pid) diff --git a/tests/pytest/test_pytest_propagate_error.py b/tests/pytest/test_pytest_propagate_error.py index b6bdd517..e0aa59e6 100644 --- a/tests/pytest/test_pytest_propagate_error.py +++ b/tests/pytest/test_pytest_propagate_error.py @@ -60,4 +60,4 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: # assert that the status of eval_metadata.status is "error" assert len(rollouts) == 5 - assert all(row.eval_metadata.status == "error" for row in rollouts.values()) + assert all(row.eval_metadata.status.is_error() for row in rollouts.values()) diff --git a/vite-app/src/App.tsx b/vite-app/src/App.tsx index bfdeb141..f322333d 100644 --- a/vite-app/src/App.tsx +++ b/vite-app/src/App.tsx @@ -150,7 +150,11 @@ const App = observer(() => {