# Patch summary: add an opt-in switch that forces NoOpRolloutProcessor.
#
# eval_protocol/pytest/plugin.py
#   - pytest_addoption: registers the boolean flag --ep-no-op-rollout-processor
#     (action="store_true", default=False).
#   - pytest_configure: when the flag is given, sets
#     os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] = "1".
# eval_protocol/pytest/evaluation_test.py
#   - evaluation_test: when EP_USE_NO_OP_ROLLOUT_PROCESSOR equals the exact
#     string "1", the caller-supplied rollout_processor is replaced by
#     NoOpRolloutProcessor(); otherwise the pre-existing fallback
#     (rollout_processor is None -> NoOpRolloutProcessor()) still applies.
# tests/pytest/test_pytest_env_overwrite.py
#   - Three @evaluation_test cases are decorated inside
#     mock.patch.dict(os.environ, {"EP_USE_NO_OP_ROLLOUT_PROCESSOR": "1"});
#     each asserts the row still holds exactly one message with role "user".
#   - Two tests call pytest_configure with a MagicMock config and check that
#     the variable is "1" when the flag is True and absent when it is False.
#
# NOTE(review): in this copy the hunks' line breaks appear collapsed into
#   spaces (e.g. "@@ -179,7 +179,11 @@" announces 7/11 lines that all sit on one
#   physical line); the line structure must be restored before this can be
#   applied - TODO confirm against the original patch.
# NOTE(review): in evaluation_test the new `if` and the `elif` branch both
#   assign NoOpRolloutProcessor(); a single
#   `if <env check> or rollout_processor is None:` looks equivalent.
# NOTE(review): the two pytest_configure tests save/delete/restore
#   EP_USE_NO_OP_ROLLOUT_PROCESSOR by hand; mock.patch.dict(os.environ), which
#   this test module already uses, would restore the whole mapping
#   automatically, including any other variables pytest_configure may set -
#   verify against plugin.py.
# NOTE(review): test_no_op_rollout_processor_override_from_other asserts
#   row.messages[0].content == "Test override" twice.
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 857765d3..9ec6afff 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -179,7 +179,11 @@ def evaluation_test( completion_params = [None] else: completion_params_provided = True - if rollout_processor is None: + + # Override rollout processor if flag is set + if os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR") == "1": + rollout_processor = NoOpRolloutProcessor() + elif rollout_processor is None: rollout_processor = NoOpRolloutProcessor() active_logger: DatasetLogger = logger if logger else default_logger diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index d0c4af4d..6fe5989c 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -133,6 +133,14 @@ def pytest_addoption(parser) -> None: default=None, help=("If set, use this base URL for remote rollout processing. Example: http://localhost:8000"), ) + group.addoption( + "--ep-no-op-rollout-processor", + action="store_true", + default=False, + help=( + "Override the rollout processor to use NoOpRolloutProcessor, which passes input dataset through unchanged." 
+ ), + ) group.addoption( "--ep-output-dir", default=None, @@ -267,6 +275,9 @@ def pytest_configure(config) -> None: # set this to save eval results to the target dir in jsonl format os.environ["EP_OUTPUT_DIR"] = config.getoption("--ep-output-dir") + if config.getoption("--ep-no-op-rollout-processor"): + os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] = "1" + if config.getoption("--ep-no-upload"): os.environ["EP_NO_UPLOAD"] = "1" diff --git a/tests/pytest/test_pytest_env_overwrite.py b/tests/pytest/test_pytest_env_overwrite.py index b4101c72..348d7ef8 100644 --- a/tests/pytest/test_pytest_env_overwrite.py +++ b/tests/pytest/test_pytest_env_overwrite.py @@ -4,8 +4,11 @@ from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor +from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor +from eval_protocol.pytest.plugin import pytest_configure import os from unittest import mock +from unittest.mock import MagicMock with mock.patch.dict(os.environ, {"EP_INVOCATION_ID": "test-invocation-123"}): @@ -58,3 +61,124 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow: def test_input_override(row: EvaluationRow) -> EvaluationRow: assert row.messages[0].content == "What is 10 / 2?" 
return row + + +# Tests for EP_USE_NO_OP_ROLLOUT_PROCESSOR override +with mock.patch.dict(os.environ, {"EP_USE_NO_OP_ROLLOUT_PROCESSOR": "1"}): + + @evaluation_test( + input_rows=[[EvaluationRow(messages=[Message(role="user", content="Test message")])]], + completion_params=[{"model": "no-op"}], + rollout_processor=None, # Should be overridden to NoOpRolloutProcessor + mode="pointwise", + ) + def test_no_op_rollout_processor_override_from_none(row: EvaluationRow) -> EvaluationRow: + """Test that EP_USE_NO_OP_ROLLOUT_PROCESSOR overrides None rollout processor.""" + assert row.messages[0].content == "Test message" + # With NoOpRolloutProcessor, the row should pass through unchanged + # Verify that no actual model call was made (NoOpRolloutProcessor doesn't modify messages) + assert len(row.messages) == 1 + assert row.messages[0].role == "user" + return row + + @evaluation_test( + input_rows=[[EvaluationRow(messages=[Message(role="user", content="Test override")])]], + completion_params=[{"model": "no-op"}], + rollout_processor=SingleTurnRolloutProcessor(), # Should be overridden to NoOpRolloutProcessor + mode="pointwise", + ) + def test_no_op_rollout_processor_override_from_other(row: EvaluationRow) -> EvaluationRow: + """Test that EP_USE_NO_OP_ROLLOUT_PROCESSOR overrides other rollout processors.""" + assert row.messages[0].content == "Test override" + # With NoOpRolloutProcessor, the row should pass through unchanged without calling the model + # Verify that no actual model call was made (NoOpRolloutProcessor doesn't modify messages) + assert len(row.messages) == 1 + assert row.messages[0].role == "user" + # Verify the original message content is preserved (no assistant response added) + assert row.messages[0].content == "Test override" + return row + + @evaluation_test( + input_rows=[ + [ + EvaluationRow(messages=[Message(role="user", content="First")]), + EvaluationRow(messages=[Message(role="user", content="Second")]), + ] + ], + completion_params=[{"model": 
"no-op"}], + rollout_processor=SingleTurnRolloutProcessor(), # Should be overridden + mode="pointwise", + ) + def test_no_op_rollout_processor_override_multiple_rows(row: EvaluationRow) -> EvaluationRow: + """Test that EP_USE_NO_OP_ROLLOUT_PROCESSOR works with multiple rows.""" + assert row.messages[0].content in ["First", "Second"] + # Verify rows pass through unchanged + assert len(row.messages) == 1 + assert row.messages[0].role == "user" + return row + + +def test_pytest_plugin_sets_no_op_rollout_processor_env_var(): + """Test that pytest_configure sets EP_USE_NO_OP_ROLLOUT_PROCESSOR when flag is provided.""" + # Create a mock config object + mock_config = MagicMock() + + # Mock getoption to return True when called with the flag name, None for others + def getoption_side_effect(opt): + if opt == "--ep-no-op-rollout-processor": + return True + return None + + mock_config.getoption = MagicMock(side_effect=getoption_side_effect) + + # Save original env var value if it exists + original_value = os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR") + + # Clear the environment variable first + if "EP_USE_NO_OP_ROLLOUT_PROCESSOR" in os.environ: + del os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] + + try: + # Call pytest_configure + pytest_configure(mock_config) + + # Verify the environment variable was set + assert os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR") == "1" + finally: + # Clean up - restore original or remove + if original_value is not None: + os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] = original_value + elif "EP_USE_NO_OP_ROLLOUT_PROCESSOR" in os.environ: + del os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] + + +def test_pytest_plugin_does_not_set_env_var_when_flag_not_provided(): + """Test that pytest_configure does not set EP_USE_NO_OP_ROLLOUT_PROCESSOR when flag is not provided.""" + # Create a mock config object + mock_config = MagicMock() + + # Mock getoption to return False when called with the flag name, None for others + def getoption_side_effect(opt): 
+ if opt == "--ep-no-op-rollout-processor": + return False + return None + + mock_config.getoption = MagicMock(side_effect=getoption_side_effect) + + # Save original env var value if it exists + original_value = os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR") + + # Clear the environment variable first + if "EP_USE_NO_OP_ROLLOUT_PROCESSOR" in os.environ: + del os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] + + try: + # Call pytest_configure + pytest_configure(mock_config) + + # Verify the environment variable was NOT set + assert "EP_USE_NO_OP_ROLLOUT_PROCESSOR" not in os.environ + finally: + # Clean up - restore original if it existed + if original_value is not None: + os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] = original_value