Skip to content

Commit 4b80012

Browse files
committed
update more tests
1 parent d845ccc commit 4b80012

7 files changed

+20
-6
lines changed

tests/pytest/test_pydantic_agent.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from pydantic_ai.models.openai import OpenAIChatModel
33
import pytest
44

5-
from eval_protocol.models import EvaluationRow, Message, Status
5+
from eval_protocol.models import EvaluationRow, Message, Status, EvaluateResult
66
from eval_protocol.pytest import evaluation_test
77

88
from eval_protocol.pytest.default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor
@@ -28,4 +28,5 @@ async def test_pydantic_agent(row: EvaluationRow) -> EvaluationRow:
2828
Super simple hello world test for Pydantic AI.
2929
"""
3030
assert row.rollout_status.code == Status.Code.FINISHED
31+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
3132
return row

tests/pytest/test_pydantic_multi_agent.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from pydantic_ai.models.openai import OpenAIChatModel
1111
import pytest
1212

13-
from eval_protocol.models import EvaluationRow, Message
13+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
1414
from eval_protocol.pytest import evaluation_test
1515
from pydantic_ai import Agent
1616

@@ -82,4 +82,5 @@ async def test_pydantic_multi_agent(row: EvaluationRow) -> EvaluationRow:
8282
"""
8383
Super simple hello world test for Pydantic AI.
8484
"""
85+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
8586
return row

tests/pytest/test_pytest_default_agent_rollout_processor.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from datetime import datetime
22
from typing import List
33

4-
from eval_protocol.models import EvaluationRow, Message
4+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
55
from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test
66

77

@@ -24,4 +24,6 @@
2424
)
2525
def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:
2626
"""Run math evaluation on sample dataset using pytest interface."""
27+
for row in rows:
28+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
2729
return rows

tests/pytest/test_pytest_ensure_logging.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ async def test_ensure_logging(monkeypatch):
2626
with patch(
2727
"eval_protocol.dataset_logger.sqlite_dataset_logger_adapter.SqliteEvaluationRowStore", return_value=mock_store
2828
):
29-
from eval_protocol.models import EvaluationRow
29+
from eval_protocol.models import EvaluationRow, EvaluateResult
3030
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
3131
from eval_protocol.pytest.evaluation_test import evaluation_test
3232
from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
@@ -44,6 +44,9 @@ async def test_ensure_logging(monkeypatch):
4444
# Don't pass logger parameter - let it use the default_logger (which we've replaced)
4545
)
4646
def eval_fn(row: EvaluationRow) -> EvaluationRow:
47+
# This test is only about logging behavior; attach a dummy evaluation_result
48+
# so that evaluation_test's invariant about evaluation_result is satisfied.
49+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
4750
return row
4851

4952
await eval_fn(

tests/pytest/test_pytest_input_rows_parametrized_completion_params.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from eval_protocol.models import EvaluationRow, Message
1+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
22
from eval_protocol.pytest import evaluation_test
33

44

@@ -18,4 +18,5 @@ def test_pytest_input_rows_parametrized_completion_params(row: EvaluationRow, **
1818
else:
1919
assert "gpt-4" in seen_models
2020
seen_models.add(model)
21+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
2122
return row

tests/pytest/test_pytest_mcp_config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ def read(self, row_id: str | None = None) -> list[EvaluationRow]:
8787
logger=logger,
8888
)
8989
def eval_fn(row: EvaluationRow) -> EvaluationRow:
90+
# Attach a dummy evaluation_result so the invariant is satisfied;
91+
# this test only cares about tools being added to the row.
92+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
9093
return row
9194

9295
await eval_fn(input_messages=input_messages, completion_params=completion_params_list[0]) # pyright: ignore[reportCallIssue]

tests/pytest/test_pytest_propagate_error.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from typing_extensions import override
2-
from eval_protocol.models import EvaluationRow, Message
2+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
33
from eval_protocol.pytest.default_agent_rollout_processor import AgentRolloutProcessor
44
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
55

@@ -56,6 +56,9 @@ async def test_pytest_propagate_error():
5656
logger=logger,
5757
)
5858
def eval_fn(row: EvaluationRow) -> EvaluationRow:
59+
# Attach a dummy evaluation_result so the invariant is satisfied;
60+
# this test only cares that eval_metadata.status reflects rollout errors.
61+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
5962
return row
6063

6164
# Manually invoke all parameter combinations within a single test

0 commit comments

Comments (0)