Merge branch 'main' into derekx/raise-on-assert

xzrderek · xzrderek · commit 068cac503c06 · 2025-11-06T23:01:24.000-08:00
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -179,7 +179,11 @@ def evaluation_test(
         completion_params = [None]
     else:
         completion_params_provided = True
-    if rollout_processor is None:
+
+    # Force NoOpRolloutProcessor when EP_USE_NO_OP_ROLLOUT_PROCESSOR=1, even if a processor was passed in
+    if os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR") == "1":
+        rollout_processor = NoOpRolloutProcessor()
+    elif rollout_processor is None:
         rollout_processor = NoOpRolloutProcessor()
 
     active_logger: DatasetLogger = logger if logger else default_logger
@@ -729,6 +733,11 @@ async def _collect_result(config, lst):
             test_func, mode, max_concurrent_rollouts, max_concurrent_evaluations, pytest_wrapper
         )
 
+        # Make this test discoverable by pytest regardless of pytest
+        # configuration, so you can name your eval whatever you want as long
+        # as it's decorated with @evaluation_test.
+        dual_mode_wrapper.__test__ = True
+
         setattr(dual_mode_wrapper, "__ep_params__", ep_params)
         return dual_mode_wrapper  # pyright: ignore[reportReturnType, reportUnknownVariableType]
 
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
@@ -133,6 +133,14 @@ def pytest_addoption(parser) -> None:
         default=None,
         help=("If set, use this base URL for remote rollout processing. Example: http://localhost:8000"),
     )
+    group.addoption(
+        "--ep-no-op-rollout-processor",
+        action="store_true",
+        default=False,
+        help=(
+            "Override the rollout processor to use NoOpRolloutProcessor, which passes input dataset through unchanged."
+        ),
+    )
     group.addoption(
         "--ep-output-dir",
         default=None,
@@ -267,6 +275,9 @@ def pytest_configure(config) -> None:
         # set this to save eval results to the target dir in jsonl format
         os.environ["EP_OUTPUT_DIR"] = config.getoption("--ep-output-dir")
 
+    if config.getoption("--ep-no-op-rollout-processor"):
+        os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] = "1"
+
     if config.getoption("--ep-no-upload"):
         os.environ["EP_NO_UPLOAD"] = "1"
 
diff --git a/tests/pytest/test_pytest_env_overwrite.py b/tests/pytest/test_pytest_env_overwrite.py
@@ -4,8 +4,11 @@
 from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor
+from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
+from eval_protocol.pytest.plugin import pytest_configure
 import os
 from unittest import mock
+from unittest.mock import MagicMock
 
 
 with mock.patch.dict(os.environ, {"EP_INVOCATION_ID": "test-invocation-123"}):
@@ -58,3 +61,124 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow:
     def test_input_override(row: EvaluationRow) -> EvaluationRow:
         assert row.messages[0].content == "What is 10 / 2?"
         return row
+
+
+# Tests for EP_USE_NO_OP_ROLLOUT_PROCESSOR override
+with mock.patch.dict(os.environ, {"EP_USE_NO_OP_ROLLOUT_PROCESSOR": "1"}):
+
+    @evaluation_test(
+        input_rows=[[EvaluationRow(messages=[Message(role="user", content="Test message")])]],
+        completion_params=[{"model": "no-op"}],
+        rollout_processor=None,  # Should be overridden to NoOpRolloutProcessor
+        mode="pointwise",
+    )
+    def test_no_op_rollout_processor_override_from_none(row: EvaluationRow) -> EvaluationRow:
+        """Test that EP_USE_NO_OP_ROLLOUT_PROCESSOR overrides None rollout processor."""
+        assert row.messages[0].content == "Test message"
+        # With NoOpRolloutProcessor, the row should pass through unchanged
+        # Verify that no actual model call was made (NoOpRolloutProcessor doesn't modify messages)
+        assert len(row.messages) == 1
+        assert row.messages[0].role == "user"
+        return row
+
+    @evaluation_test(
+        input_rows=[[EvaluationRow(messages=[Message(role="user", content="Test override")])]],
+        completion_params=[{"model": "no-op"}],
+        rollout_processor=SingleTurnRolloutProcessor(),  # Should be overridden to NoOpRolloutProcessor
+        mode="pointwise",
+    )
+    def test_no_op_rollout_processor_override_from_other(row: EvaluationRow) -> EvaluationRow:
+        """Test that EP_USE_NO_OP_ROLLOUT_PROCESSOR overrides other rollout processors."""
+        assert row.messages[0].content == "Test override"
+        # With NoOpRolloutProcessor, the row should pass through unchanged without calling the model
+        # Verify that no actual model call was made (NoOpRolloutProcessor doesn't modify messages)
+        assert len(row.messages) == 1
+        assert row.messages[0].role == "user"
+        # Verify the original message content is preserved (no assistant response added)
+        assert row.messages[0].content == "Test override"
+        return row
+
+    @evaluation_test(
+        input_rows=[
+            [
+                EvaluationRow(messages=[Message(role="user", content="First")]),
+                EvaluationRow(messages=[Message(role="user", content="Second")]),
+            ]
+        ],
+        completion_params=[{"model": "no-op"}],
+        rollout_processor=SingleTurnRolloutProcessor(),  # Should be overridden
+        mode="pointwise",
+    )
+    def test_no_op_rollout_processor_override_multiple_rows(row: EvaluationRow) -> EvaluationRow:
+        """Test that EP_USE_NO_OP_ROLLOUT_PROCESSOR works with multiple rows."""
+        assert row.messages[0].content in ["First", "Second"]
+        # Verify rows pass through unchanged
+        assert len(row.messages) == 1
+        assert row.messages[0].role == "user"
+        return row
+
+
+def test_pytest_plugin_sets_no_op_rollout_processor_env_var():
+    """Test that pytest_configure sets EP_USE_NO_OP_ROLLOUT_PROCESSOR when flag is provided."""
+    # Create a mock config object
+    mock_config = MagicMock()
+
+    # Mock getoption to return True for --ep-no-op-rollout-processor and None for every other option
+    def getoption_side_effect(opt):
+        if opt == "--ep-no-op-rollout-processor":
+            return True
+        return None
+
+    mock_config.getoption = MagicMock(side_effect=getoption_side_effect)
+
+    # Save original env var value if it exists
+    original_value = os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR")
+
+    # Clear the environment variable first
+    if "EP_USE_NO_OP_ROLLOUT_PROCESSOR" in os.environ:
+        del os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"]
+
+    try:
+        # Call pytest_configure
+        pytest_configure(mock_config)
+
+        # Verify the environment variable was set
+        assert os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR") == "1"
+    finally:
+        # Clean up - restore original or remove
+        if original_value is not None:
+            os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] = original_value
+        elif "EP_USE_NO_OP_ROLLOUT_PROCESSOR" in os.environ:
+            del os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"]
+
+
+def test_pytest_plugin_does_not_set_env_var_when_flag_not_provided():
+    """Test that pytest_configure does not set EP_USE_NO_OP_ROLLOUT_PROCESSOR when flag is not provided."""
+    # Create a mock config object
+    mock_config = MagicMock()
+
+    # Mock getoption to return False for --ep-no-op-rollout-processor and None for every other option
+    def getoption_side_effect(opt):
+        if opt == "--ep-no-op-rollout-processor":
+            return False
+        return None
+
+    mock_config.getoption = MagicMock(side_effect=getoption_side_effect)
+
+    # Save original env var value if it exists
+    original_value = os.environ.get("EP_USE_NO_OP_ROLLOUT_PROCESSOR")
+
+    # Clear the environment variable first
+    if "EP_USE_NO_OP_ROLLOUT_PROCESSOR" in os.environ:
+        del os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"]
+
+    try:
+        # Call pytest_configure
+        pytest_configure(mock_config)
+
+        # Verify the environment variable was NOT set
+        assert "EP_USE_NO_OP_ROLLOUT_PROCESSOR" not in os.environ
+    finally:
+        # Clean up - restore original if it existed
+        if original_value is not None:
+            os.environ["EP_USE_NO_OP_ROLLOUT_PROCESSOR"] = original_value