Skip to content

Commit 068cac5

Browse files
committed
Merge branch 'main' into derekx/raise-on-assert
2 parents d5ea771 + 92cd591 commit 068cac5

File tree

3 files changed

+145
-1
lines changed

3 files changed

+145
-1
lines changed

eval_protocol/pytest/evaluation_test.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,11 @@ def evaluation_test(
179179
completion_params = [None]
180180
else:
181181
completion_params_provided = True
182-
if rollout_processor is None:
182+
183+
# Override rollout processor if flag is set
184+
if os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR") == "1":
185+
rollout_processor = NoOpRolloutProcessor()
186+
elif rollout_processor is None:
183187
rollout_processor = NoOpRolloutProcessor()
184188

185189
active_logger: DatasetLogger = logger if logger else default_logger
@@ -729,6 +733,11 @@ async def _collect_result(config, lst):
729733
test_func, mode, max_concurrent_rollouts, max_concurrent_evaluations, pytest_wrapper
730734
)
731735

736+
# Make this test discoverable by pytest regardless of pytest configuration,
737+
# so an eval can be given any name as long as it is decorated
738+
# with @evaluation_test.
739+
dual_mode_wrapper.__test__ = True
740+
732741
setattr(dual_mode_wrapper, "__ep_params__", ep_params)
733742
return dual_mode_wrapper # pyright: ignore[reportReturnType, reportUnknownVariableType]
734743

eval_protocol/pytest/plugin.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,14 @@ def pytest_addoption(parser) -> None:
133133
default=None,
134134
help=("If set, use this base URL for remote rollout processing. Example: http://localhost:8000"),
135135
)
136+
group.addoption(
137+
"--ep-no-op-rollout-processor",
138+
action="store_true",
139+
default=False,
140+
help=(
141+
"Override the rollout processor to use NoOpRolloutProcessor, which passes input dataset through unchanged."
142+
),
143+
)
136144
group.addoption(
137145
"--ep-output-dir",
138146
default=None,
@@ -267,6 +275,9 @@ def pytest_configure(config) -> None:
267275
# set this to save eval results to the target dir in jsonl format
268276
os.environ["EP_OUTPUT_DIR"] = config.getoption("--ep-output-dir")
269277

278+
if config.getoption("--ep-no-op-rollout-processor"):
279+
os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] = "1"
280+
270281
if config.getoption("--ep-no-upload"):
271282
os.environ["EP_NO_UPLOAD"] = "1"
272283

tests/pytest/test_pytest_env_overwrite.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44
from eval_protocol.models import EvaluationRow, Message
55
from eval_protocol.pytest import evaluation_test
66
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
7+
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
8+
from eval_protocol.pytest.plugin import pytest_configure
79
import os
810
from unittest import mock
11+
from unittest.mock import MagicMock
912

1013

1114
with mock.patch.dict(os.environ, {"EP_INVOCATION_ID": "test-invocation-123"}):
@@ -58,3 +61,124 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow:
5861
def test_input_override(row: EvaluationRow) -> EvaluationRow:
5962
assert row.messages[0].content == "What is 10 / 2?"
6063
return row
64+
65+
66+
# Tests for EP_USE_NO_OP_ROLLOUT_PROCESSOR override
67+
# NOTE(review): the override is presumably read while @evaluation_test is
# applied, which is why the patch wraps the decorated definitions rather than
# the test bodies -- confirm against evaluation_test.
with mock.patch.dict(os.environ, {"EP_USE_NO_OP_ROLLOUT_PROCESSOR": "1"}):

    @evaluation_test(
        input_rows=[[EvaluationRow(messages=[Message(role="user", content="Test message")])]],
        completion_params=[{"model": "no-op"}],
        rollout_processor=None,  # Should be overridden to NoOpRolloutProcessor
        mode="pointwise",
    )
    def test_no_op_rollout_processor_override_from_none(row: EvaluationRow) -> EvaluationRow:
        """Check the override applies when no rollout processor was passed."""
        # Assert the length first so an empty message list fails with a clear
        # assertion instead of an IndexError on the lookups below.
        assert len(row.messages) == 1
        # The single remaining message must be the untouched user input,
        # i.e. no assistant response was appended.
        assert row.messages[0].role == "user"
        assert row.messages[0].content == "Test message"
        return row

    @evaluation_test(
        input_rows=[[EvaluationRow(messages=[Message(role="user", content="Test override")])]],
        completion_params=[{"model": "no-op"}],
        rollout_processor=SingleTurnRolloutProcessor(),  # Should be overridden to NoOpRolloutProcessor
        mode="pointwise",
    )
    def test_no_op_rollout_processor_override_from_other(row: EvaluationRow) -> EvaluationRow:
        """Check the override replaces an explicitly supplied rollout processor."""
        # Length first, for the same reason as above.
        assert len(row.messages) == 1
        # Original content preserved and still the only message: nothing was
        # added to the conversation.
        assert row.messages[0].role == "user"
        assert row.messages[0].content == "Test override"
        return row

    @evaluation_test(
        input_rows=[
            [
                EvaluationRow(messages=[Message(role="user", content="First")]),
                EvaluationRow(messages=[Message(role="user", content="Second")]),
            ]
        ],
        completion_params=[{"model": "no-op"}],
        rollout_processor=SingleTurnRolloutProcessor(),  # Should be overridden
        mode="pointwise",
    )
    def test_no_op_rollout_processor_override_multiple_rows(row: EvaluationRow) -> EvaluationRow:
        """Check the override also holds when the dataset has several rows."""
        assert len(row.messages) == 1
        assert row.messages[0].role == "user"
        # Pointwise mode hands over one row at a time; either input is valid.
        assert row.messages[0].content in ["First", "Second"]
        return row
119+
120+
121+
def test_pytest_plugin_sets_no_op_rollout_processor_env_var():
    """Check pytest_configure exports EP_USE_NO_OP_ROLLOUT_PROCESSOR=1 when the flag is on."""

    def getoption_side_effect(opt):
        # Only the flag under test is enabled; every other option reads as unset.
        return True if opt == "--ep-no-op-rollout-processor" else None

    mock_config = MagicMock()
    mock_config.getoption = MagicMock(side_effect=getoption_side_effect)

    # patch.dict snapshots os.environ on entry and restores it on exit, so
    # every variable pytest_configure touches (not just the one asserted on)
    # is rolled back even if the assertion fails.
    with mock.patch.dict(os.environ):
        # Start from a known state: the variable must not be set beforehand.
        os.environ.pop("EP_USE_NO_OP_ROLLOUT_PROCESSOR", None)

        pytest_configure(mock_config)

        assert os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR") == "1"
153+
154+
155+
def test_pytest_plugin_does_not_set_env_var_when_flag_not_provided():
    """Check pytest_configure leaves EP_USE_NO_OP_ROLLOUT_PROCESSOR unset when the flag is off."""

    def getoption_side_effect(opt):
        # The flag under test is explicitly off; every other option reads as unset.
        return False if opt == "--ep-no-op-rollout-processor" else None

    mock_config = MagicMock()
    mock_config.getoption = MagicMock(side_effect=getoption_side_effect)

    # patch.dict restores the whole environment on exit. If pytest_configure
    # wrongly sets the variable, the failure stays confined to this test
    # instead of leaking the variable into later ones.
    with mock.patch.dict(os.environ):
        # Start from a known state: the variable must not be set beforehand.
        os.environ.pop("EP_USE_NO_OP_ROLLOUT_PROCESSOR", None)

        pytest_configure(mock_config)

        assert "EP_USE_NO_OP_ROLLOUT_PROCESSOR" not in os.environ

0 commit comments

Comments
 (0)