Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,13 @@ async def _collect_result(config, lst):

experiment_duration_seconds = time.perf_counter() - experiment_start_time

# for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
if not all(r.evaluation_result is not None for run_results in all_results for r in run_results):
raise AssertionError(
"Some EvaluationRow instances are missing evaluation_result. "
"Your @evaluation_test function must set `row.evaluation_result`"
)

# for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
# rollout_id is used to differentiate the result from different completion_params
if mode == "groupwise":
results_by_group = [
Expand Down
32 changes: 32 additions & 0 deletions tests/pytest/test_pytest_missing_evaluation_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pytest

from eval_protocol.models import EvaluationRow, Message
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
from eval_protocol.pytest.evaluation_test import evaluation_test


@pytest.mark.asyncio
async def test_missing_evaluation_result_raises_assertion_error() -> None:
    """evaluation_test should raise if any EvaluationRow is missing evaluation_result."""

    conversation = [
        [Message(role="user", content="Test message")],
    ]

    @evaluation_test(
        input_messages=[conversation],
        rollout_processor=NoOpRolloutProcessor(),
        mode="pointwise",
        num_runs=1,
    )
    def eval_fn(row: EvaluationRow) -> EvaluationRow:
        # Deliberately skip assigning row.evaluation_result so the guard fires.
        return row

    with pytest.raises(AssertionError) as excinfo:
        # Running the evaluation should trip the missing-result check in evaluation_test.py.
        await eval_fn(input_messages=conversation)  # pyright: ignore[reportCallIssue]

    message = str(excinfo.value)
    assert "Some EvaluationRow instances are missing evaluation_result" in message
    assert "must set `row.evaluation_result`" in message
4 changes: 3 additions & 1 deletion tests/remote_server/test_remote_fireworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import requests

from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
from eval_protocol.models import EvaluationRow, Message
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter
Expand Down Expand Up @@ -119,6 +119,8 @@ async def test_remote_rollout_and_fetch_fireworks(row: EvaluationRow) -> Evaluat
- trigger remote rollout via RemoteRolloutProcessor (calls init/status)
- fetch traces from Langfuse via Fireworks tracing proxy filtered by metadata via output_data_loader; FAIL if none found
"""
row.evaluation_result = EvaluateResult(score=0.0, reason="Test reason")
Comment thread
xzrderek marked this conversation as resolved.
Outdated

assert row.messages[0].content == "What is the capital of France?", "Row should have correct message content"
assert len(row.messages) > 1, "Row should have a response. If this fails, we fellback to the original row."

Expand Down
Loading