Skip to content

Commit d845ccc

Browse files
committed
fix test
1 parent aa46413 commit d845ccc

11 files changed

+36
-11
lines changed

tests/data_loader/test_dynamic_data_loader.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from eval_protocol.data_loader import DynamicDataLoader
2-
from eval_protocol.models import EvaluationRow, Message
2+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
33
from eval_protocol.pytest import evaluation_test
44

55

@@ -27,6 +27,7 @@ def test_dynamic_data_loader(row: EvaluationRow) -> EvaluationRow:
2727
== "Factory function that generates evaluation rows dynamically."
2828
)
2929
assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False
30+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
3031
return row
3132

3233

@@ -45,6 +46,7 @@ def test_dynamic_data_loader_lambda(row: EvaluationRow) -> EvaluationRow:
4546
assert row.input_metadata.dataset_info.get("data_loader_num_rows_after_preprocessing") == 1
4647
assert row.input_metadata.dataset_info.get("data_loader_type") == "DynamicDataLoader"
4748
assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False
49+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
4850
return row
4951

5052

@@ -72,4 +74,5 @@ def test_dynamic_data_loader_max_dataset_rows(row: EvaluationRow) -> EvaluationR
7274
assert row.input_metadata.dataset_info.get("data_loader_type") == "DynamicDataLoader"
7375
assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False
7476

77+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
7578
return row

tests/data_loader/test_inline_data_loader.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from eval_protocol.data_loader.inline_data_loader import InlineDataLoader
2-
from eval_protocol.models import EvaluationRow, Message
2+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
33
from eval_protocol.pytest import evaluation_test
44
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
55

@@ -20,6 +20,7 @@ def test_inline_data_loader(row: EvaluationRow) -> EvaluationRow:
2020
assert row.input_metadata.dataset_info.get("data_loader_type") == "InlineDataLoader"
2121
assert row.input_metadata.dataset_info.get("data_loader_variant_description") is None
2222
assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False
23+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
2324
return row
2425

2526

@@ -41,4 +42,5 @@ def test_inline_data_loader_max_dataset_rows(row: EvaluationRow) -> EvaluationRo
4142
assert row.input_metadata.dataset_info.get("data_loader_type") == "InlineDataLoader"
4243
assert row.input_metadata.dataset_info.get("data_loader_preprocessed") is False
4344

45+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
4446
return row

tests/pytest/test_get_metadata.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import asyncio
22

33
from eval_protocol.pytest import evaluation_test
4-
from eval_protocol.models import EvaluationRow, Message
4+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
55

66

77
@evaluation_test(
@@ -22,6 +22,8 @@
2222
)
2323
def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]:
2424
"""Run math evaluation on sample dataset using pytest interface."""
25+
for row in rows:
26+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
2527
return rows
2628

2729

tests/pytest/test_pytest_async.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import pytest
22

3-
from eval_protocol.models import EvaluationRow, Message
3+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
44
from eval_protocol.pytest import evaluation_test
55

66

@@ -20,6 +20,8 @@
2020
)
2121
async def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]:
2222
"""Run math evaluation on sample dataset using pytest interface."""
23+
for row in rows:
24+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
2325
return rows
2426

2527

@@ -36,6 +38,7 @@ async def test_pytest_async(rows: list[EvaluationRow]) -> list[EvaluationRow]:
3638
)
3739
async def test_pytest_async_pointwise(row: EvaluationRow) -> EvaluationRow:
3840
"""Run pointwise evaluation on sample dataset using pytest interface."""
41+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
3942
return row
4043

4144

tests/pytest/test_pytest_env_overwrite.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import atexit
22
import shutil
33
import tempfile
4-
from eval_protocol.models import EvaluationRow, Message
4+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
55
from eval_protocol.pytest import evaluation_test
66
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
77
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
@@ -23,6 +23,7 @@ def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow:
2323
"""Run math evaluation on sample dataset using pytest interface."""
2424
assert row.messages[0].content == "What is the capital of France?"
2525
assert row.execution_metadata.invocation_id == "test-invocation-123"
26+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
2627
return row
2728

2829

@@ -38,6 +39,7 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow:
3839
"""Run math evaluation on sample dataset using pytest interface."""
3940
assert row.messages[0].content == "What is 5 * 6?"
4041
assert row.input_metadata.completion_params["model"] == "gpt-40"
42+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
4143
return row
4244

4345

@@ -60,6 +62,7 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow:
6062
)
6163
def test_input_override(row: EvaluationRow) -> EvaluationRow:
6264
assert row.messages[0].content == "What is 10 / 2?"
65+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
6366
return row
6467

6568

@@ -79,6 +82,7 @@ def test_no_op_rollout_processor_override_from_none(row: EvaluationRow) -> Evalu
7982
# Verify that no actual model call was made (NoOpRolloutProcessor doesn't modify messages)
8083
assert len(row.messages) == 1
8184
assert row.messages[0].role == "user"
85+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
8286
return row
8387

8488
@evaluation_test(
@@ -96,6 +100,7 @@ def test_no_op_rollout_processor_override_from_other(row: EvaluationRow) -> Eval
96100
assert row.messages[0].role == "user"
97101
# Verify the original message content is preserved (no assistant response added)
98102
assert row.messages[0].content == "Test override"
103+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
99104
return row
100105

101106
@evaluation_test(
@@ -115,6 +120,7 @@ def test_no_op_rollout_processor_override_multiple_rows(row: EvaluationRow) -> E
115120
# Verify rows pass through unchanged
116121
assert len(row.messages) == 1
117122
assert row.messages[0].role == "user"
123+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
118124
return row
119125

120126

tests/pytest/test_pytest_ids.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import eval_protocol.dataset_logger as dataset_logger
44
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
5-
from eval_protocol.models import EvaluationRow
5+
from eval_protocol.models import EvaluationRow, EvaluateResult
66
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
77
from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
88

@@ -37,6 +37,7 @@ async def test_evaluation_test_decorator(monkeypatch):
3737
logger=logger,
3838
)
3939
def eval_fn(row: EvaluationRow) -> EvaluationRow:
40+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
4041
return row
4142

4243
dataset_paths = [
@@ -83,6 +84,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
8384
unique_rollout_ids.add(row.execution_metadata.rollout_id)
8485
unique_invocation_ids.add(row.execution_metadata.invocation_id)
8586
unique_row_ids.add(row.input_metadata.row_id)
87+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
8688
return row
8789

8890
dataset_paths = [

tests/pytest/test_pytest_input_messages.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import List
22

33
import pytest
4-
from eval_protocol.models import EvaluationRow, Message
4+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
55
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
66

77

@@ -19,4 +19,6 @@
1919
)
2020
def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]:
2121
"""Run math evaluation on sample dataset using pytest interface."""
22+
for row in rows:
23+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
2224
return rows

tests/pytest/test_pytest_input_rows.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from eval_protocol.models import EvaluationRow, Message
1+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
22
from eval_protocol.pytest import evaluation_test
33
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
44

@@ -12,4 +12,5 @@
1212
def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow:
1313
"""Run math evaluation on sample dataset using pytest interface."""
1414
assert row.messages[0].content == "What is the capital of France?"
15+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
1516
return row

tests/pytest/test_pytest_stable_row_id.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from typing import List
22

3-
from eval_protocol.models import EvaluationRow, Message
3+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
44
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
55
from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
66

@@ -30,6 +30,7 @@ async def test_evaluation_test_decorator_ids_single():
3030
)
3131
def eval_fn(row: EvaluationRow) -> EvaluationRow:
3232
row_ids.add(row.input_metadata.row_id)
33+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
3334
return row
3435

3536
# Manually invoke all parameter combinations within a single test
@@ -81,6 +82,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
8182
assert row.input_metadata is not None
8283
assert row.input_metadata.row_id is not None and isinstance(row.input_metadata.row_id, str)
8384
row_ids.add(row.input_metadata.row_id)
85+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
8486
return row
8587

8688
# Single invocation (one dataset, one param set) with multiple runs

tests/remote_server/test_remote_fireworks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ async def test_remote_rollout_and_fetch_fireworks(row: EvaluationRow) -> Evaluat
119119
- trigger remote rollout via RemoteRolloutProcessor (calls init/status)
120120
- fetch traces from Langfuse via Fireworks tracing proxy filtered by metadata via output_data_loader; FAIL if none found
121121
"""
122-
row.evaluation_result = EvaluateResult(score=0.0, reason="Test reason")
122+
row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result")
123123

124124
assert row.messages[0].content == "What is the capital of France?", "Row should have correct message content"
125125
assert len(row.messages) > 1, "Row should have a response. If this fails, we fell back to the original row."

0 commit comments

Comments
 (0)