Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,11 @@ def evaluation_test(
completion_params = [None]
else:
completion_params_provided = True
if rollout_processor is None:

# Override rollout processor if flag is set
if os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR") == "1":
rollout_processor = NoOpRolloutProcessor()
elif rollout_processor is None:
rollout_processor = NoOpRolloutProcessor()

active_logger: DatasetLogger = logger if logger else default_logger
Expand Down
11 changes: 11 additions & 0 deletions eval_protocol/pytest/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,14 @@ def pytest_addoption(parser) -> None:
default=None,
help=("If set, use this base URL for remote rollout processing. Example: http://localhost:8000"),
)
group.addoption(
"--ep-no-op-rollout-processor",
action="store_true",
default=False,
help=(
"Override the rollout processor to use NoOpRolloutProcessor, which passes input dataset through unchanged."
),
)
group.addoption(
"--ep-output-dir",
default=None,
Expand Down Expand Up @@ -267,6 +275,9 @@ def pytest_configure(config) -> None:
# set this to save eval results to the target dir in jsonl format
os.environ["EP_OUTPUT_DIR"] = config.getoption("--ep-output-dir")

if config.getoption("--ep-no-op-rollout-processor"):
os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] = "1"

if config.getoption("--ep-no-upload"):
os.environ["EP_NO_UPLOAD"] = "1"

Expand Down
124 changes: 124 additions & 0 deletions tests/pytest/test_pytest_env_overwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
from eval_protocol.models import EvaluationRow, Message
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
from eval_protocol.pytest.plugin import pytest_configure
import os
from unittest import mock
from unittest.mock import MagicMock


with mock.patch.dict(os.environ, {"EP_INVOCATION_ID": "test-invocation-123"}):
Expand Down Expand Up @@ -58,3 +61,124 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow:
def test_input_override(row: EvaluationRow) -> EvaluationRow:
assert row.messages[0].content == "What is 10 / 2?"
return row


# Tests for EP_USE_NO_OP_ROLLOUT_PROCESSOR override
with mock.patch.dict(os.environ, {"EP_USE_NO_OP_ROLLOUT_PROCESSOR": "1"}):
    # The flag is present in the environment only for the duration of this
    # block, i.e. while the decorators below are being applied.

    @evaluation_test(
        input_rows=[[EvaluationRow(messages=[Message(role="user", content="Test message")])]],
        completion_params=[{"model": "no-op"}],
        rollout_processor=None,  # Should be overridden to NoOpRolloutProcessor
        mode="pointwise",
    )
    def test_no_op_rollout_processor_override_from_none(row: EvaluationRow) -> EvaluationRow:
        """With the flag set, a ``None`` rollout processor leaves the row untouched."""
        first_message = row.messages[0]
        assert first_message.content == "Test message"
        # A pass-through rollout adds nothing: still exactly the one user message.
        assert len(row.messages) == 1
        assert first_message.role == "user"
        return row

    @evaluation_test(
        input_rows=[[EvaluationRow(messages=[Message(role="user", content="Test override")])]],
        completion_params=[{"model": "no-op"}],
        rollout_processor=SingleTurnRolloutProcessor(),  # Should be overridden to NoOpRolloutProcessor
        mode="pointwise",
    )
    def test_no_op_rollout_processor_override_from_other(row: EvaluationRow) -> EvaluationRow:
        """With the flag set, an explicitly supplied rollout processor is replaced."""
        first_message = row.messages[0]
        assert first_message.content == "Test override"
        # No assistant reply was appended, so no model call took place.
        assert len(row.messages) == 1
        assert first_message.role == "user"
        # The original user content is still intact after the rollout step.
        assert first_message.content == "Test override"
        return row

    @evaluation_test(
        input_rows=[
            [
                EvaluationRow(messages=[Message(role="user", content="First")]),
                EvaluationRow(messages=[Message(role="user", content="Second")]),
            ]
        ],
        completion_params=[{"model": "no-op"}],
        rollout_processor=SingleTurnRolloutProcessor(),  # Should be overridden
        mode="pointwise",
    )
    def test_no_op_rollout_processor_override_multiple_rows(row: EvaluationRow) -> EvaluationRow:
        """With the flag set, every row of a multi-row dataset passes through unchanged."""
        first_message = row.messages[0]
        assert first_message.content in ["First", "Second"]
        # Each row still carries only its single user message.
        assert len(row.messages) == 1
        assert first_message.role == "user"
        return row


def test_pytest_plugin_sets_no_op_rollout_processor_env_var():
    """Test that pytest_configure sets EP_USE_NO_OP_ROLLOUT_PROCESSOR when flag is provided.

    The call runs inside ``mock.patch.dict(os.environ)``, which snapshots the
    whole environment and restores it on exit. Every variable that
    ``pytest_configure`` writes is therefore rolled back, whether the
    assertion passes or fails.
    """

    def getoption_side_effect(opt):
        # Only the flag under test is enabled; every other option reads as unset.
        if opt == "--ep-no-op-rollout-processor":
            return True
        return None

    mock_config = MagicMock()
    mock_config.getoption = MagicMock(side_effect=getoption_side_effect)

    with mock.patch.dict(os.environ):
        # Start from a known state so a pre-existing value cannot mask a failure.
        os.environ.pop("EP_USE_NO_OP_ROLLOUT_PROCESSOR", None)

        pytest_configure(mock_config)

        assert os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR") == "1"


def test_pytest_plugin_does_not_set_env_var_when_flag_not_provided():
    """Test that pytest_configure does not set EP_USE_NO_OP_ROLLOUT_PROCESSOR when flag is not provided.

    The call runs inside ``mock.patch.dict(os.environ)``, which snapshots the
    whole environment and restores it on exit. If ``pytest_configure`` sets
    the variable by mistake, the assertion fails but the variable is still
    removed afterwards and cannot leak into later tests.
    """

    def getoption_side_effect(opt):
        # The flag under test is explicitly off; every other option reads as unset.
        if opt == "--ep-no-op-rollout-processor":
            return False
        return None

    mock_config = MagicMock()
    mock_config.getoption = MagicMock(side_effect=getoption_side_effect)

    with mock.patch.dict(os.environ):
        # Start from a known state so a pre-existing value cannot fail the check.
        os.environ.pop("EP_USE_NO_OP_ROLLOUT_PROCESSOR", None)

        pytest_configure(mock_config)

        assert "EP_USE_NO_OP_ROLLOUT_PROCESSOR" not in os.environ
Loading