Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,13 @@ async def _collect_result(config, lst):

experiment_duration_seconds = time.perf_counter() - experiment_start_time

# for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
if not all(r.evaluation_result is not None for run_results in all_results for r in run_results):
raise AssertionError(
"Some EvaluationRow instances are missing evaluation_result. "
"Your @evaluation_test function must set `row.evaluation_result`"
)

# for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
# rollout_id is used to differentiate the result from different completion_params
if mode == "groupwise":
results_by_group = [
Expand Down
32 changes: 32 additions & 0 deletions tests/pytest/test_pytest_missing_evaluation_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pytest

from eval_protocol.models import EvaluationRow, Message
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
from eval_protocol.pytest.evaluation_test import evaluation_test


@pytest.mark.asyncio
async def test_missing_evaluation_result_raises_assertion_error() -> None:
    """evaluation_test should raise if any EvaluationRow is missing evaluation_result."""

    conversation = [
        [Message(role="user", content="Test message")],
    ]

    @evaluation_test(
        input_messages=[conversation],
        rollout_processor=NoOpRolloutProcessor(),
        mode="pointwise",
        num_runs=1,
    )
    def eval_fn(row: EvaluationRow) -> EvaluationRow:
        # Deliberately skip assigning row.evaluation_result so the guard fires.
        return row

    with pytest.raises(AssertionError) as excinfo:
        # Running the evaluation should trip the missing-result check in evaluation_test.py.
        await eval_fn(input_messages=conversation)  # pyright: ignore[reportCallIssue]

    message = str(excinfo.value)
    assert "Some EvaluationRow instances are missing evaluation_result" in message
    assert "must set `row.evaluation_result`" in message
4 changes: 3 additions & 1 deletion tests/remote_server/test_remote_fireworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import requests

from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
from eval_protocol.models import EvaluationRow, Message
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter
Expand Down Expand Up @@ -119,6 +119,8 @@ async def test_remote_rollout_and_fetch_fireworks(row: EvaluationRow) -> Evaluat
- trigger remote rollout via RemoteRolloutProcessor (calls init/status)
- fetch traces from Langfuse via Fireworks tracing proxy filtered by metadata via output_data_loader; FAIL if none found
"""
row.evaluation_result = EvaluateResult(score=0.0, reason="Test reason")
Comment thread
xzrderek marked this conversation as resolved.
Outdated

assert row.messages[0].content == "What is the capital of France?", "Row should have correct message content"
assert len(row.messages) > 1, "Row should have a response. If this fails, we fellback to the original row."

Expand Down
Loading