diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 8f96021b..f15403ac 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -620,7 +620,13 @@ async def _collect_result(config, lst): experiment_duration_seconds = time.perf_counter() - experiment_start_time - # for groupwise mode, the result contains eval otuput from multiple completion_params, we need to differentiate them + if not all(r.evaluation_result is not None for run_results in all_results for r in run_results): + raise AssertionError( + "Some EvaluationRow instances are missing evaluation_result. " + "Your @evaluation_test function must set `row.evaluation_result`" + ) + + # for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them # rollout_id is used to differentiate the result from different completion_params if mode == "groupwise": results_by_group = [ diff --git a/tests/data_loader/test_dynamic_data_loader.py b/tests/data_loader/test_dynamic_data_loader.py index 73e24bbb..134cc9da 100644 --- a/tests/data_loader/test_dynamic_data_loader.py +++ b/tests/data_loader/test_dynamic_data_loader.py @@ -1,5 +1,5 @@ from eval_protocol.data_loader import DynamicDataLoader -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest import evaluation_test @@ -27,6 +27,7 @@ def test_dynamic_data_loader(row: EvaluationRow) -> EvaluationRow: == "Factory function that generates evaluation rows dynamically." ) assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row @@ -45,6 +46,7 @@ def test_dynamic_data_loader_lambda(row: EvaluationRow) -> EvaluationRow: assert row.input_metadata.dataset_info.get("data_loader_num_rows_after_preprocessing") == 1 assert row.input_metadata.dataset_info.get("data_loader_type") == "DynamicDataLoader" assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row @@ -72,4 +74,5 @@ def test_dynamic_data_loader_max_dataset_rows(row: EvaluationRow) -> EvaluationR assert row.input_metadata.dataset_info.get("data_loader_type") == "DynamicDataLoader" assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row diff --git a/tests/data_loader/test_inline_data_loader.py b/tests/data_loader/test_inline_data_loader.py index 2df3fa24..c36cd9c1 100644 --- a/tests/data_loader/test_inline_data_loader.py +++ b/tests/data_loader/test_inline_data_loader.py @@ -1,5 +1,5 @@ from eval_protocol.data_loader.inline_data_loader import InlineDataLoader -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor @@ -20,6 +20,7 @@ def test_inline_data_loader(row: EvaluationRow) -> EvaluationRow: assert row.input_metadata.dataset_info.get("data_loader_type") == "InlineDataLoader" assert row.input_metadata.dataset_info.get("data_loader_variant_description") is None assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row @@ -41,4 +42,5 @@ def test_inline_data_loader_max_dataset_rows(row: EvaluationRow) -> EvaluationRo assert row.input_metadata.dataset_info.get("data_loader_type") == "InlineDataLoader" assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row diff --git a/tests/pytest/test_get_metadata.py b/tests/pytest/test_get_metadata.py index e89f5a9e..ff757215 100644 --- a/tests/pytest/test_get_metadata.py +++ b/tests/pytest/test_get_metadata.py @@ -1,7 +1,7 @@ import asyncio from eval_protocol.pytest import evaluation_test -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult @evaluation_test( @@ -22,6 +22,8 @@ ) def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]: """Run math evaluation on sample dataset using pytest interface.""" + for row in rows: + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return rows diff --git a/tests/pytest/test_pydantic_agent.py b/tests/pytest/test_pydantic_agent.py index 1280c326..c22faf64 100644 --- a/tests/pytest/test_pydantic_agent.py +++ b/tests/pytest/test_pydantic_agent.py @@ -2,7 +2,7 @@ from pydantic_ai.models.openai import OpenAIChatModel import pytest -from eval_protocol.models import EvaluationRow, Message, Status +from eval_protocol.models import EvaluationRow, Message, Status, EvaluateResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor @@ -28,4 +28,5 @@ async def test_pydantic_agent(row: EvaluationRow) -> EvaluationRow: Super simple hello world test for Pydantic AI. """ assert row.rollout_status.code == Status.Code.FINISHED + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row diff --git a/tests/pytest/test_pydantic_multi_agent.py b/tests/pytest/test_pydantic_multi_agent.py index 9632b77c..9fb2f176 100644 --- a/tests/pytest/test_pydantic_multi_agent.py +++ b/tests/pytest/test_pydantic_multi_agent.py @@ -10,7 +10,7 @@ from pydantic_ai.models.openai import OpenAIChatModel import pytest -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest import evaluation_test from pydantic_ai import Agent @@ -82,4 +82,5 @@ async def test_pydantic_multi_agent(row: EvaluationRow) -> EvaluationRow: """ Super simple hello world test for Pydantic AI. """ + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row diff --git a/tests/pytest/test_pytest_async.py b/tests/pytest/test_pytest_async.py index b1183a6d..bbdfaa9f 100644 --- a/tests/pytest/test_pytest_async.py +++ b/tests/pytest/test_pytest_async.py @@ -1,6 +1,6 @@ import pytest -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest import evaluation_test @@ -20,6 +20,8 @@ ) async def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]: """Run math evaluation on sample dataset using pytest interface.""" + for row in rows: + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return rows @@ -36,6 +38,7 @@ async def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]: ) async def test_pytest_async_pointwise(row: EvaluationRow) -> EvaluationRow: """Run pointwise evaluation on sample dataset using pytest interface.""" + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py index d5324b4d..d057108b 100644 --- a/tests/pytest/test_pytest_default_agent_rollout_processor.py +++ b/tests/pytest/test_pytest_default_agent_rollout_processor.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import List -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test @@ -24,4 +24,6 @@ ) def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]: """Run math evaluation on sample dataset using pytest interface.""" + for row in rows: + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return rows diff --git a/tests/pytest/test_pytest_ensure_logging.py b/tests/pytest/test_pytest_ensure_logging.py index 48c3dce4..9f46b7a3 100644 --- a/tests/pytest/test_pytest_ensure_logging.py +++ b/tests/pytest/test_pytest_ensure_logging.py @@ -26,7 +26,7 @@ async def test_ensure_logging(monkeypatch): with patch( "eval_protocol.dataset_logger.sqlite_dataset_logger_adapter.SqliteEvaluationRowStore", return_value=mock_store ): - from eval_protocol.models import EvaluationRow + from eval_protocol.models import EvaluationRow, EvaluateResult from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor from eval_protocol.pytest.evaluation_test import evaluation_test from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row @@ -44,6 +44,9 @@ async def test_ensure_logging(monkeypatch): # Don't pass logger parameter - let it use the default_logger (which we've replaced) ) def eval_fn(row: EvaluationRow) -> EvaluationRow: + # This test is only about logging behavior; attach a dummy evaluation_result + # so that evaluation_test's invariant about evaluation_result is satisfied. + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row await eval_fn( diff --git a/tests/pytest/test_pytest_env_overwrite.py b/tests/pytest/test_pytest_env_overwrite.py index 348d7ef8..ec02d213 100644 --- a/tests/pytest/test_pytest_env_overwrite.py +++ b/tests/pytest/test_pytest_env_overwrite.py @@ -1,7 +1,7 @@ import atexit import shutil import tempfile -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor @@ -23,6 +23,7 @@ def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow: """Run math evaluation on sample dataset using pytest interface.""" assert row.messages[0].content == "What is the capital of France?" assert row.execution_metadata.invocation_id == "test-invocation-123" + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row @@ -38,6 +39,7 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow: """Run math evaluation on sample dataset using pytest interface.""" assert row.messages[0].content == "What is 5 * 6?" assert row.input_metadata.completion_params["model"] == "gpt-40" + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row @@ -60,6 +62,7 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow: ) def test_input_override(row: EvaluationRow) -> EvaluationRow: assert row.messages[0].content == "What is 10 / 2?" + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row @@ -79,6 +82,7 @@ def test_no_op_rollout_processor_override_from_none(row: EvaluationRow) -> Evalu # Verify that no actual model call was made (NoOpRolloutProcessor doesn't modify messages) assert len(row.messages) == 1 assert row.messages[0].role == "user" + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row @evaluation_test( @@ -96,6 +100,7 @@ def test_no_op_rollout_processor_override_from_other(row: EvaluationRow) -> Eval assert row.messages[0].role == "user" # Verify the original message content is preserved (no assistant response added) assert row.messages[0].content == "Test override" + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row @evaluation_test( @@ -115,6 +120,7 @@ def test_no_op_rollout_processor_override_multiple_rows(row: EvaluationRow) -> E # Verify rows pass through unchanged assert len(row.messages) == 1 assert row.messages[0].role == "user" + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row diff --git a/tests/pytest/test_pytest_ids.py b/tests/pytest/test_pytest_ids.py index b6bb4a35..79b8c28f 100644 --- a/tests/pytest/test_pytest_ids.py +++ b/tests/pytest/test_pytest_ids.py @@ -2,7 +2,7 @@ import eval_protocol.dataset_logger as dataset_logger from eval_protocol.dataset_logger.dataset_logger import DatasetLogger -from eval_protocol.models import EvaluationRow +from eval_protocol.models import EvaluationRow, EvaluateResult from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row @@ -37,6 +37,7 @@ async def test_evaluation_test_decorator(monkeypatch): logger=logger, ) def eval_fn(row: EvaluationRow) -> EvaluationRow: + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row dataset_paths = [ @@ -83,6 +84,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: unique_rollout_ids.add(row.execution_metadata.rollout_id) unique_invocation_ids.add(row.execution_metadata.invocation_id) unique_row_ids.add(row.input_metadata.row_id) + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row dataset_paths = [ diff --git a/tests/pytest/test_pytest_input_messages.py b/tests/pytest/test_pytest_input_messages.py index c601c74b..f545f0f5 100644 --- a/tests/pytest/test_pytest_input_messages.py +++ b/tests/pytest/test_pytest_input_messages.py @@ -1,7 +1,7 @@ from typing import List import pytest -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test @@ -19,4 +19,6 @@ ) def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]: """Run math evaluation on sample dataset using pytest interface.""" + for row in rows: + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return rows diff --git a/tests/pytest/test_pytest_input_rows.py b/tests/pytest/test_pytest_input_rows.py index 79689783..22a55887 100644 --- a/tests/pytest/test_pytest_input_rows.py +++ b/tests/pytest/test_pytest_input_rows.py @@ -1,4 +1,4 @@ -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor @@ -12,4 +12,5 @@ def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow: """Run math evaluation on sample dataset using pytest interface.""" assert row.messages[0].content == "What is the capital of France?" + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row diff --git a/tests/pytest/test_pytest_input_rows_parametrized_completion_params.py b/tests/pytest/test_pytest_input_rows_parametrized_completion_params.py index b06476ca..16e2088b 100644 --- a/tests/pytest/test_pytest_input_rows_parametrized_completion_params.py +++ b/tests/pytest/test_pytest_input_rows_parametrized_completion_params.py @@ -1,4 +1,4 @@ -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest import evaluation_test @@ -18,4 +18,5 @@ def test_pytest_input_rows_parametrized_completion_params(row: EvaluationRow, ** else: assert "gpt-4" in seen_models seen_models.add(model) + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row diff --git a/tests/pytest/test_pytest_mcp_config.py b/tests/pytest/test_pytest_mcp_config.py index d43bfe6d..d1259ce2 100644 --- a/tests/pytest/test_pytest_mcp_config.py +++ b/tests/pytest/test_pytest_mcp_config.py @@ -87,6 +87,9 @@ def read(self, row_id: str | None = None) -> list[EvaluationRow]: logger=logger, ) def eval_fn(row: EvaluationRow) -> EvaluationRow: + # Attach a dummy evaluation_result so the invariant is satisfied; + # this test only cares about tools being added to the row. + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row await eval_fn(input_messages=input_messages, completion_params=completion_params_list[0]) # pyright: ignore[reportCallIssue] diff --git a/tests/pytest/test_pytest_missing_evaluation_result.py b/tests/pytest/test_pytest_missing_evaluation_result.py new file mode 100644 index 00000000..a2840212 --- /dev/null +++ b/tests/pytest/test_pytest_missing_evaluation_result.py @@ -0,0 +1,32 @@ +import pytest + +from eval_protocol.models import EvaluationRow, Message +from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor +from eval_protocol.pytest.evaluation_test import evaluation_test + + +@pytest.mark.asyncio +async def test_missing_evaluation_result_raises_assertion_error() -> None: + """evaluation_test should raise if any EvaluationRow is missing evaluation_result.""" + + input_messages = [ + [Message(role="user", content="Test message")], + ] + + @evaluation_test( + input_messages=[input_messages], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + num_runs=1, + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + # Intentionally forget to set row.evaluation_result + return row + + with pytest.raises(AssertionError) as excinfo: + # Trigger the evaluation; this should hit the assertion added in evaluation_test.py + await eval_fn(input_messages=input_messages) # pyright: ignore[reportCallIssue] + + msg = str(excinfo.value) + assert "Some EvaluationRow instances are missing evaluation_result" in msg + assert "must set `row.evaluation_result`" in msg diff --git a/tests/pytest/test_pytest_propagate_error.py b/tests/pytest/test_pytest_propagate_error.py index 2472e527..20939106 100644 --- a/tests/pytest/test_pytest_propagate_error.py +++ b/tests/pytest/test_pytest_propagate_error.py @@ -1,5 +1,5 @@ from typing_extensions import override -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest.default_agent_rollout_processor import AgentRolloutProcessor from eval_protocol.dataset_logger.dataset_logger import DatasetLogger @@ -56,6 +56,9 @@ async def test_pytest_propagate_error(): logger=logger, ) def eval_fn(row: EvaluationRow) -> EvaluationRow: + # Attach a dummy evaluation_result so the invariant is satisfied; + # this test only cares that eval_metadata.status reflects rollout errors. + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row # Manually invoke all parameter combinations within a single test diff --git a/tests/pytest/test_pytest_stable_row_id.py b/tests/pytest/test_pytest_stable_row_id.py index c2a5709a..8ae47028 100644 --- a/tests/pytest/test_pytest_stable_row_id.py +++ b/tests/pytest/test_pytest_stable_row_id.py @@ -1,6 +1,6 @@ from typing import List -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row @@ -30,6 +30,7 @@ async def test_evaluation_test_decorator_ids_single(): ) def eval_fn(row: EvaluationRow) -> EvaluationRow: row_ids.add(row.input_metadata.row_id) + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row # Manually invoke all parameter combinations within a single test @@ -81,6 +82,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert row.input_metadata is not None assert row.input_metadata.row_id is not None and isinstance(row.input_metadata.row_id, str) row_ids.add(row.input_metadata.row_id) + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") return row # Single invocation (one dataset, one param set) with multiple runs diff --git a/tests/remote_server/test_remote_fireworks.py b/tests/remote_server/test_remote_fireworks.py index 48a56c08..4ef67f8e 100644 --- a/tests/remote_server/test_remote_fireworks.py +++ b/tests/remote_server/test_remote_fireworks.py @@ -10,7 +10,7 @@ import requests from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, EvaluateResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter @@ -119,6 +119,8 @@ async def test_remote_rollout_and_fetch_fireworks(row: EvaluationRow) -> Evaluat - trigger remote rollout via RemoteRolloutProcessor (calls init/status) - fetch traces from Langfuse via Fireworks tracing proxy filtered by metadata via output_data_loader; FAIL if none found """ + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") + assert row.messages[0].content == "What is the capital of France?", "Row should have correct message content" assert len(row.messages) > 1, "Row should have a response. If this fails, we fellback to the original row." diff --git a/tests/remote_server/test_remote_fireworks_propagate_status.py b/tests/remote_server/test_remote_fireworks_propagate_status.py index 7c05172f..8e2aaaa8 100644 --- a/tests/remote_server/test_remote_fireworks_propagate_status.py +++ b/tests/remote_server/test_remote_fireworks_propagate_status.py @@ -9,7 +9,7 @@ import requests from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader -from eval_protocol.models import EvaluationRow, Message, Status +from eval_protocol.models import EvaluationRow, Message, Status, EvaluateResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter @@ -96,6 +96,8 @@ def rows() -> List[EvaluationRow]: ), ) async def test_remote_rollout_and_fetch_fireworks_propagate_status(row: EvaluationRow) -> EvaluationRow: + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") + assert row.rollout_status.code == Status.Code.INTERNAL assert row.rollout_status.message == "test error" return row