8 changes: 7 additions & 1 deletion eval_protocol/pytest/evaluation_test.py
@@ -620,7 +620,13 @@ async def _collect_result(config, lst):
 
     experiment_duration_seconds = time.perf_counter() - experiment_start_time
 
-    # for groupwise mode, the result contains eval otuput from multiple completion_params, we need to differentiate them
+    if not all(r.evaluation_result is not None for run_results in all_results for r in run_results):
+        raise AssertionError(
+            "Some EvaluationRow instances are missing evaluation_result. "
+            "Your @evaluation_test function must set `row.evaluation_result`"
+        )
+
+    # for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
     # rollout_id is used to differentiate the result from different completion_params
     if mode == "groupwise":
         results_by_group = [
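For reference, a minimal pointwise sketch that satisfies the new invariant. The imports, NoOpRolloutProcessor, and the EvaluateResult(score=..., reason=...) shape mirror the test updates below; the message content and score value are illustrative:

    from eval_protocol.models import EvaluationRow, Message, EvaluateResult
    from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
    from eval_protocol.pytest.evaluation_test import evaluation_test

    @evaluation_test(
        input_messages=[[[Message(role="user", content="What is 2 + 2?")]]],
        rollout_processor=NoOpRolloutProcessor(),
        mode="pointwise",
        num_runs=1,
    )
    def eval_fn(row: EvaluationRow) -> EvaluationRow:
        # Setting evaluation_result satisfies the new invariant; returning the
        # row without it raises the AssertionError introduced above.
        row.evaluation_result = EvaluateResult(score=1.0, reason="Illustrative score")
        return row

Any @evaluation_test body that returns rows without evaluation_result now fails fast with the message above.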
5 changes: 4 additions & 1 deletion tests/data_loader/test_dynamic_data_loader.py
@@ -1,5 +1,5 @@
 from eval_protocol.data_loader import DynamicDataLoader
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest import evaluation_test
 
 
@@ -27,6 +27,7 @@ def test_dynamic_data_loader(row: EvaluationRow) -> EvaluationRow:
         == "Factory function that generates evaluation rows dynamically."
     )
     assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
 
 
@@ -45,6 +46,7 @@ def test_dynamic_data_loader_lambda(row: EvaluationRow) -> EvaluationRow:
     assert row.input_metadata.dataset_info.get("data_loader_num_rows_after_preprocessing") == 1
     assert row.input_metadata.dataset_info.get("data_loader_type") == "DynamicDataLoader"
     assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
 
 
@@ -72,4 +74,5 @@ def test_dynamic_data_loader_max_dataset_rows(row: EvaluationRow) -> EvaluationRow:
     assert row.input_metadata.dataset_info.get("data_loader_type") == "DynamicDataLoader"
     assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False
 
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
4 changes: 3 additions & 1 deletion tests/data_loader/test_inline_data_loader.py
@@ -1,5 +1,5 @@
 from eval_protocol.data_loader.inline_data_loader import InlineDataLoader
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
 
@@ -20,6 +20,7 @@ def test_inline_data_loader(row: EvaluationRow) -> EvaluationRow:
     assert row.input_metadata.dataset_info.get("data_loader_type") == "InlineDataLoader"
     assert row.input_metadata.dataset_info.get("data_loader_variant_description") is None
     assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
 
 
@@ -41,4 +42,5 @@ def test_inline_data_loader_max_dataset_rows(row: EvaluationRow) -> EvaluationRow:
     assert row.input_metadata.dataset_info.get("data_loader_type") == "InlineDataLoader"
     assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False
 
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
4 changes: 3 additions & 1 deletion tests/pytest/test_get_metadata.py
@@ -1,7 +1,7 @@
 import asyncio
 
 from eval_protocol.pytest import evaluation_test
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 
 
 @evaluation_test(
@@ -22,6 +22,8 @@
 )
 def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
+    for row in rows:
+        row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return rows
 
 
3 changes: 2 additions & 1 deletion tests/pytest/test_pydantic_agent.py
@@ -2,7 +2,7 @@
 from pydantic_ai.models.openai import OpenAIChatModel
 import pytest
 
-from eval_protocol.models import EvaluationRow, Message, Status
+from eval_protocol.models import EvaluationRow, Message, Status, EvaluateResult
 from eval_protocol.pytest import evaluation_test
 
 from eval_protocol.pytest.default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor
@@ -28,4 +28,5 @@ async def test_pydantic_agent(row: EvaluationRow) -> EvaluationRow:
     Super simple hello world test for Pydantic AI.
     """
     assert row.rollout_status.code == Status.Code.FINISHED
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
3 changes: 2 additions & 1 deletion tests/pytest/test_pydantic_multi_agent.py
@@ -10,7 +10,7 @@
 from pydantic_ai.models.openai import OpenAIChatModel
 import pytest
 
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest import evaluation_test
 from pydantic_ai import Agent
 
@@ -82,4 +82,5 @@ async def test_pydantic_multi_agent(row: EvaluationRow) -> EvaluationRow:
     """
     Super simple hello world test for Pydantic AI.
     """
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
5 changes: 4 additions & 1 deletion tests/pytest/test_pytest_async.py
@@ -1,6 +1,6 @@
 import pytest
 
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest import evaluation_test
 
 
@@ -20,6 +20,8 @@
 )
 async def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
+    for row in rows:
+        row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return rows
 
 
@@ -36,6 +38,7 @@ async def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]:
 )
 async def test_pytest_async_pointwise(row: EvaluationRow) -> EvaluationRow:
     """Run pointwise evaluation on sample dataset using pytest interface."""
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
 
 
4 changes: 3 additions & 1 deletion tests/pytest/test_pytest_default_agent_rollout_processor.py
@@ -1,7 +1,7 @@
 from datetime import datetime
 from typing import List
 
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test
 
 
@@ -24,4 +24,6 @@
 )
 def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
+    for row in rows:
+        row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return rows
5 changes: 4 additions & 1 deletion tests/pytest/test_pytest_ensure_logging.py
@@ -26,7 +26,7 @@ async def test_ensure_logging(monkeypatch):
     with patch(
         "eval_protocol.dataset_logger.sqlite_dataset_logger_adapter.SqliteEvaluationRowStore", return_value=mock_store
     ):
-        from eval_protocol.models import EvaluationRow
+        from eval_protocol.models import EvaluationRow, EvaluateResult
         from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
         from eval_protocol.pytest.evaluation_test import evaluation_test
         from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
@@ -44,6 +44,9 @@ async def test_ensure_logging(monkeypatch):
             # Don't pass logger parameter - let it use the default_logger (which we've replaced)
         )
        def eval_fn(row: EvaluationRow) -> EvaluationRow:
+            # This test is only about logging behavior; attach a dummy evaluation_result
+            # so that evaluation_test's invariant about evaluation_result is satisfied.
+            row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
             return row
 
         await eval_fn(
8 changes: 7 additions & 1 deletion tests/pytest/test_pytest_env_overwrite.py
@@ -1,7 +1,7 @@
 import atexit
 import shutil
 import tempfile
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
 from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
@@ -23,6 +23,7 @@ def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow:
     """Run math evaluation on sample dataset using pytest interface."""
     assert row.messages[0].content == "What is the capital of France?"
     assert row.execution_metadata.invocation_id == "test-invocation-123"
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
 
 
@@ -38,6 +39,7 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow:
     """Run math evaluation on sample dataset using pytest interface."""
     assert row.messages[0].content == "What is 5 * 6?"
     assert row.input_metadata.completion_params["model"] == "gpt-40"
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
 
 
@@ -60,6 +62,7 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow:
 )
 def test_input_override(row: EvaluationRow) -> EvaluationRow:
     assert row.messages[0].content == "What is 10 / 2?"
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
 
 
@@ -79,6 +82,7 @@ def test_no_op_rollout_processor_override_from_none(row: EvaluationRow) -> EvaluationRow:
     # Verify that no actual model call was made (NoOpRolloutProcessor doesn't modify messages)
     assert len(row.messages) == 1
     assert row.messages[0].role == "user"
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
 
 @evaluation_test(
@@ -96,6 +100,7 @@ def test_no_op_rollout_processor_override_from_other(row: EvaluationRow) -> EvaluationRow:
     assert row.messages[0].role == "user"
     # Verify the original message content is preserved (no assistant response added)
     assert row.messages[0].content == "Test override"
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
 
 @evaluation_test(
@@ -115,6 +120,7 @@ def test_no_op_rollout_processor_override_multiple_rows(row: EvaluationRow) -> EvaluationRow:
     # Verify rows pass through unchanged
     assert len(row.messages) == 1
     assert row.messages[0].role == "user"
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
 
 
4 changes: 3 additions & 1 deletion tests/pytest/test_pytest_ids.py
@@ -2,7 +2,7 @@
 
 import eval_protocol.dataset_logger as dataset_logger
 from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
-from eval_protocol.models import EvaluationRow
+from eval_protocol.models import EvaluationRow, EvaluateResult
 from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
 from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
 
@@ -37,6 +37,7 @@ async def test_evaluation_test_decorator(monkeypatch):
         logger=logger,
     )
     def eval_fn(row: EvaluationRow) -> EvaluationRow:
+        row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
         return row
 
     dataset_paths = [
@@ -83,6 +84,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
         unique_rollout_ids.add(row.execution_metadata.rollout_id)
         unique_invocation_ids.add(row.execution_metadata.invocation_id)
         unique_row_ids.add(row.input_metadata.row_id)
+        row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
         return row
 
     dataset_paths = [
4 changes: 3 additions & 1 deletion tests/pytest/test_pytest_input_messages.py
@@ -1,7 +1,7 @@
 from typing import List
 
 import pytest
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
 
 
@@ -19,4 +19,6 @@
 )
 def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
+    for row in rows:
+        row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return rows
3 changes: 2 additions & 1 deletion tests/pytest/test_pytest_input_rows.py
@@ -1,4 +1,4 @@
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
 
@@ -12,4 +12,5 @@
 def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow:
     """Run math evaluation on sample dataset using pytest interface."""
     assert row.messages[0].content == "What is the capital of France?"
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
3 changes: 2 additions & 1 deletion
@@ -1,4 +1,4 @@
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest import evaluation_test
 
 
@@ -18,4 +18,5 @@ def test_pytest_input_rows_parametrized_completion_params(row: EvaluationRow, **
     else:
         assert "gpt-4" in seen_models
     seen_models.add(model)
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
     return row
3 changes: 3 additions & 0 deletions tests/pytest/test_pytest_mcp_config.py
@@ -87,6 +87,9 @@ def read(self, row_id: str | None = None) -> list[EvaluationRow]:
         logger=logger,
     )
     def eval_fn(row: EvaluationRow) -> EvaluationRow:
+        # Attach a dummy evaluation_result so the invariant is satisfied;
+        # this test only cares about tools being added to the row.
+        row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
         return row
 
     await eval_fn(input_messages=input_messages, completion_params=completion_params_list[0])  # pyright: ignore[reportCallIssue]
32 changes: 32 additions & 0 deletions tests/pytest/test_pytest_missing_evaluation_result.py
@@ -0,0 +1,32 @@
+import pytest
+
+from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
+from eval_protocol.pytest.evaluation_test import evaluation_test
+
+
+@pytest.mark.asyncio
+async def test_missing_evaluation_result_raises_assertion_error() -> None:
+    """evaluation_test should raise if any EvaluationRow is missing evaluation_result."""
+
+    input_messages = [
+        [Message(role="user", content="Test message")],
+    ]
+
+    @evaluation_test(
+        input_messages=[input_messages],
+        rollout_processor=NoOpRolloutProcessor(),
+        mode="pointwise",
+        num_runs=1,
+    )
+    def eval_fn(row: EvaluationRow) -> EvaluationRow:
+        # Intentionally forget to set row.evaluation_result
+        return row
+
+    with pytest.raises(AssertionError) as excinfo:
+        # Trigger the evaluation; this should hit the assertion added in evaluation_test.py
+        await eval_fn(input_messages=input_messages)  # pyright: ignore[reportCallIssue]
+
+    msg = str(excinfo.value)
+    assert "Some EvaluationRow instances are missing evaluation_result" in msg
+    assert "must set `row.evaluation_result`" in msg
5 changes: 4 additions & 1 deletion tests/pytest/test_pytest_propagate_error.py
@@ -1,5 +1,5 @@
 from typing_extensions import override
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest.default_agent_rollout_processor import AgentRolloutProcessor
 from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
 
@@ -56,6 +56,9 @@ async def test_pytest_propagate_error():
         logger=logger,
     )
     def eval_fn(row: EvaluationRow) -> EvaluationRow:
+        # Attach a dummy evaluation_result so the invariant is satisfied;
+        # this test only cares that eval_metadata.status reflects rollout errors.
+        row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
         return row
 
     # Manually invoke all parameter combinations within a single test
4 changes: 3 additions & 1 deletion tests/pytest/test_pytest_stable_row_id.py
@@ -1,6 +1,6 @@
 from typing import List
 
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
 from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
 
@@ -30,6 +30,7 @@ async def test_evaluation_test_decorator_ids_single():
     )
     def eval_fn(row: EvaluationRow) -> EvaluationRow:
         row_ids.add(row.input_metadata.row_id)
+        row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
         return row
 
     # Manually invoke all parameter combinations within a single test
@@ -81,6 +82,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
         assert row.input_metadata is not None
         assert row.input_metadata.row_id is not None and isinstance(row.input_metadata.row_id, str)
         row_ids.add(row.input_metadata.row_id)
+        row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
         return row
 
     # Single invocation (one dataset, one param set) with multiple runs
4 changes: 3 additions & 1 deletion tests/remote_server/test_remote_fireworks.py
@@ -10,7 +10,7 @@
 import requests
 
 from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import EvaluationRow, Message, EvaluateResult
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
 from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter
@@ -119,6 +119,8 @@ async def test_remote_rollout_and_fetch_fireworks(row: EvaluationRow) -> EvaluationRow:
     - trigger remote rollout via RemoteRolloutProcessor (calls init/status)
     - fetch traces from Langfuse via Fireworks tracing proxy filtered by metadata via output_data_loader; FAIL if none found
     """
+    row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
+
     assert row.messages[0].content == "What is the capital of France?", "Row should have correct message content"
     assert len(row.messages) > 1, "Row should have a response. If this fails, we fellback to the original row."
 