|
4 | 4 | from eval_protocol.models import EvaluationRow, Message |
5 | 5 | from eval_protocol.pytest import evaluation_test |
6 | 6 | from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor |
| 7 | +from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor |
| 8 | +from eval_protocol.pytest.plugin import pytest_configure |
7 | 9 | import os |
8 | 10 | from unittest import mock |
| 11 | +from unittest.mock import MagicMock |
9 | 12 |
|
10 | 13 |
|
11 | 14 | with mock.patch.dict(os.environ, {"EP_INVOCATION_ID": "test-invocation-123"}): |
@@ -58,3 +61,124 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow: |
58 | 61 | def test_input_override(row: EvaluationRow) -> EvaluationRow: |
59 | 62 | assert row.messages[0].content == "What is 10 / 2?" |
60 | 63 | return row |
| 64 | + |
| 65 | + |
# Tests for the EP_USE_NO_OP_ROLLOUT_PROCESSOR override.
# The variable is patched only while the definitions below (and their
# decorators) are evaluated; it is restored when the ``with`` block exits.
# NOTE(review): presumably ``evaluation_test`` reads the flag at decoration
# time - confirm against its implementation.
with mock.patch.dict(os.environ, {"EP_USE_NO_OP_ROLLOUT_PROCESSOR": "1"}):

    @evaluation_test(
        input_rows=[
            [
                EvaluationRow(messages=[Message(role="user", content="Test message")]),
            ]
        ],
        completion_params=[{"model": "no-op"}],
        rollout_processor=None,  # Expected to be replaced by NoOpRolloutProcessor
        mode="pointwise",
    )
    def test_no_op_rollout_processor_override_from_none(row: EvaluationRow) -> EvaluationRow:
        """Check the override when the configured rollout processor is ``None``.

        The row must arrive with only its single original user message,
        i.e. nothing was appended by a rollout.
        """
        first = row.messages[0]
        assert first.content == "Test message"
        # Exactly one message, still the user's: nothing was appended to the row.
        assert len(row.messages) == 1
        assert first.role == "user"
        return row

    @evaluation_test(
        input_rows=[
            [
                EvaluationRow(messages=[Message(role="user", content="Test override")]),
            ]
        ],
        completion_params=[{"model": "no-op"}],
        rollout_processor=SingleTurnRolloutProcessor(),  # Expected to be replaced by NoOpRolloutProcessor
        mode="pointwise",
    )
    def test_no_op_rollout_processor_override_from_other(row: EvaluationRow) -> EvaluationRow:
        """Check the override when a different rollout processor is configured.

        The row must arrive with only its single original user message,
        i.e. no assistant response was added.
        """
        first = row.messages[0]
        assert first.content == "Test override"
        # Exactly one message, still the user's: no model output was appended.
        assert len(row.messages) == 1
        assert first.role == "user"
        # The original content is still intact after processing.
        assert first.content == "Test override"
        return row

    @evaluation_test(
        input_rows=[
            [
                EvaluationRow(messages=[Message(role="user", content="First")]),
                EvaluationRow(messages=[Message(role="user", content="Second")]),
            ]
        ],
        completion_params=[{"model": "no-op"}],
        rollout_processor=SingleTurnRolloutProcessor(),  # Expected to be replaced
        mode="pointwise",
    )
    def test_no_op_rollout_processor_override_multiple_rows(row: EvaluationRow) -> EvaluationRow:
        """Check the override when the dataset contains more than one row."""
        first = row.messages[0]
        assert first.content in ("First", "Second")
        # Each row keeps only its single original user message.
        assert len(row.messages) == 1
        assert first.role == "user"
        return row
| 119 | + |
| 120 | + |
def test_pytest_plugin_sets_no_op_rollout_processor_env_var():
    """Test that pytest_configure sets EP_USE_NO_OP_ROLLOUT_PROCESSOR when flag is provided.

    ``pytest_configure`` is called with a mocked config whose ``getoption``
    returns ``True`` for ``--ep-no-op-rollout-processor`` and ``None`` for
    every other option. The environment is snapshotted with
    ``mock.patch.dict`` so that every change made during the call is rolled
    back when the test finishes, whether it passes or fails.
    """
    flag_option = "--ep-no-op-rollout-processor"
    env_var = "EP_USE_NO_OP_ROLLOUT_PROCESSOR"

    def getoption_side_effect(opt):
        # Only the flag under test is reported as enabled.
        if opt == flag_option:
            return True
        return None

    mock_config = MagicMock()
    mock_config.getoption = MagicMock(side_effect=getoption_side_effect)

    # patch.dict restores os.environ to its prior state on exit, replacing
    # the manual save / delete / restore bookkeeping.
    with mock.patch.dict(os.environ):
        # Start from a clean slate so the assertion reflects this call only.
        os.environ.pop(env_var, None)

        pytest_configure(mock_config)

        assert os.environ.get(env_var) == "1"
| 153 | + |
| 154 | + |
def test_pytest_plugin_does_not_set_env_var_when_flag_not_provided():
    """Test that pytest_configure does not set EP_USE_NO_OP_ROLLOUT_PROCESSOR when flag is not provided.

    ``pytest_configure`` is called with a mocked config whose ``getoption``
    returns ``False`` for ``--ep-no-op-rollout-processor`` and ``None`` for
    every other option. The environment is snapshotted with
    ``mock.patch.dict`` so that, even if the plugin wrongly sets the variable
    and the assertion fails, the variable does not leak into later tests.
    """
    flag_option = "--ep-no-op-rollout-processor"
    env_var = "EP_USE_NO_OP_ROLLOUT_PROCESSOR"

    def getoption_side_effect(opt):
        # The flag under test is explicitly reported as disabled.
        if opt == flag_option:
            return False
        return None

    mock_config = MagicMock()
    mock_config.getoption = MagicMock(side_effect=getoption_side_effect)

    # patch.dict restores os.environ to its prior state on exit, including
    # removing keys that were added while the block was active.
    with mock.patch.dict(os.environ):
        # Remove any pre-existing value so only this call can introduce it.
        os.environ.pop(env_var, None)

        pytest_configure(mock_config)

        assert env_var not in os.environ
0 commit comments