From afb347d5bafce7270a6b0016572789214edb9e61 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Fri, 26 Sep 2025 15:10:31 -0700 Subject: [PATCH 1/6] support overwrite --- eval_protocol/pytest/evaluation_test.py | 10 +++++ eval_protocol/pytest/plugin.py | 35 ++++++++++++++++++ .../pytest/remote_rollout_processor.py | 3 ++ eval_protocol/pytest/utils.py | 37 +++++++++++++++++++ 4 files changed, 85 insertions(+) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 4625114a..e47fface 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -52,10 +52,12 @@ add_cost_metrics, log_eval_status_and_rows, parse_ep_completion_params, + parse_ep_completion_params_overwrite, parse_ep_max_concurrent_rollouts, parse_ep_max_rows, parse_ep_num_runs, parse_ep_passed_threshold, + parse_ep_dataloaders, rollout_processor_with_retry, run_tasks_with_eval_progress, run_tasks_with_run_progress, @@ -187,10 +189,18 @@ def evaluation_test( max_concurrent_rollouts = parse_ep_max_concurrent_rollouts(max_concurrent_rollouts) max_dataset_rows = parse_ep_max_rows(max_dataset_rows) completion_params = parse_ep_completion_params(completion_params) + completion_params = parse_ep_completion_params_overwrite(completion_params) original_completion_params = completion_params passed_threshold = parse_ep_passed_threshold(passed_threshold) + data_loaders = parse_ep_dataloaders(data_loaders) custom_invocation_id = os.environ.get("EP_INVOCATION_ID", None) + # dataloader might be overridden here, to avoid conflict, manually unset other data input params + if data_loaders: + input_dataset = None + input_messages = None + input_rows = None + def decorator( test_func: TestFunction, ) -> TestFunction: diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index 8d369e70..df07bfd1 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -19,6 +19,7 @@ import pathlib import sys from pytest import StashKey +import pytest def pytest_addoption(parser) -> None: @@ -56,6 +57,7 @@ def pytest_addoption(parser) -> None: default=None, help=("Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)."), ) + # deprecate this later group.addoption( "--ep-input-param", action="append", @@ -115,6 +117,27 @@ def pytest_addoption(parser) -> None: "Default: false (experiment JSONs are saved and uploaded by default)." ), ) + group.addoption( + "--ep-jsonl-path", + default=None, + help=("Load input from a jsonl file that is already in EvaluationRow or openai CHAT format") + ) + group.addoption( + "--ep-completion-params", + default=[], + action="append", + help=( + "Overwrite completion params with json. Can be used multiple times. " + ), + ) + group.addoption( + "--ep-remote-rollout-processor-base-url", + default=None, + help=( + "If set, use this base URL for remote rollout processing. " + "Example: http://localhost:8000" + ), + ) def _normalize_max_rows(val: Optional[str]) -> Optional[str]: @@ -243,6 +266,18 @@ def pytest_configure(config) -> None: if config.getoption("--ep-no-upload"): os.environ["EP_NO_UPLOAD"] = "1" + if config.getoption("--ep-jsonl-path"): + os.environ["EP_JSONL_PATH"] = config.getoption("--ep-jsonl-path") + + if config.getoption("--ep-completion-params"): + # redump to json to make sure they are legit + os.environ["EP_COMPLETION_PARAMS"] = json.dumps([ + json.loads(s) for s in config.getoption("--ep-completion-params") or [] + ]) + + if config.getoption("--ep-remote-rollout-processor-base-url"): + os.environ["EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL"] = config.getoption("--ep-remote-rollout-processor-base-url") + # Allow ad-hoc overrides of input params via CLI flags try: merged: dict = {} diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py index 5efa793e..6e9d5521 100644 --- a/eval_protocol/pytest/remote_rollout_processor.py +++ b/eval_protocol/pytest/remote_rollout_processor.py @@ -8,6 +8,7 @@ from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader from .rollout_processor import RolloutProcessor from .types import RolloutProcessorConfig +import os class RemoteRolloutProcessor(RolloutProcessor): @@ -46,6 +47,8 @@ def __init__( # Prefer constructor-provided configuration. These can be overridden via # config.kwargs at call time for backward compatibility. self._remote_base_url = remote_base_url + if os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL"): + self._remote_base_url = os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL") self._poll_interval = poll_interval self._timeout_seconds = timeout_seconds self._output_data_loader = output_data_loader diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index 4ac472ef..aec31679 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -19,6 +19,8 @@ EvaluationThresholdDict, Status, ) +from eval_protocol.data_loader import DynamicDataLoader +from eval_protocol.data_loader.models import EvaluationDataLoader from eval_protocol.pytest.rollout_processor import RolloutProcessor from eval_protocol.pytest.types import ( RolloutProcessorConfig, @@ -238,6 +240,41 @@ def parse_ep_completion_params( pass return completion_params +def parse_ep_completion_params_overwrite(completion_params: Sequence[CompletionParams | None] | None) -> Sequence[CompletionParams | None]: + new_completion_params = os.getenv("EP_COMPLETION_PARAMS") + if new_completion_params: + try: + new_completion_params_list = json.loads(new_completion_params) + if isinstance(new_completion_params_list, list): + return new_completion_params_list + except Exception: + pass + return completion_params or [] + +def _rows_from_jsonl(path: str) -> list[EvaluationRow]: + rows = [] + try: + with open(path, "r", encoding="utf-8") as f: + for line in f: + rows.append(EvaluationRow(**json.loads(line))) + except Exception as e: + print(f"❌ Failed to load rows from JSONL at {path}: {e}") + return [] + + return rows + +def parse_ep_dataloaders( + dataloaders: Sequence[EvaluationDataLoader] | EvaluationDataLoader | None, +) -> Sequence[EvaluationDataLoader] | EvaluationDataLoader | None: + try: + load_from_jsonl_path = os.getenv("EP_JSONL_PATH") + if load_from_jsonl_path: + return DynamicDataLoader( + generators=[lambda path=load_from_jsonl_path: _rows_from_jsonl(path)]) + except Exception: + pass + return dataloaders or None + def parse_ep_passed_threshold( default_value: float | EvaluationThresholdDict | EvaluationThreshold | None, From 61d066dbed283f0045a6141b01b7b3c244f3264f Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Fri, 26 Sep 2025 15:11:22 -0700 Subject: [PATCH 2/6] format --- eval_protocol/pytest/evaluation_test.py | 4 ++-- eval_protocol/pytest/plugin.py | 19 +++++++------------ eval_protocol/pytest/utils.py | 14 +++++++++----- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index e47fface..fd35dad6 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -189,7 +189,7 @@ def evaluation_test( max_concurrent_rollouts = parse_ep_max_concurrent_rollouts(max_concurrent_rollouts) max_dataset_rows = parse_ep_max_rows(max_dataset_rows) completion_params = parse_ep_completion_params(completion_params) - completion_params = parse_ep_completion_params_overwrite(completion_params) + completion_params = parse_ep_completion_params_overwrite(completion_params) original_completion_params = completion_params passed_threshold = parse_ep_passed_threshold(passed_threshold) data_loaders = parse_ep_dataloaders(data_loaders) @@ -199,7 +199,7 @@ def evaluation_test( if data_loaders: input_dataset = None input_messages = None - input_rows = None + input_rows = None def decorator( test_func: TestFunction, diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index df07bfd1..524fe834 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -120,23 +120,18 @@ def pytest_addoption(parser) -> None: group.addoption( "--ep-jsonl-path", default=None, - help=("Load input from a jsonl file that is already in EvaluationRow or openai CHAT format") + help=("Load input from a jsonl file that is already in EvaluationRow or openai CHAT format"), ) group.addoption( "--ep-completion-params", default=[], action="append", - help=( - "Overwrite completion params with json. Can be used multiple times. " - ), + help=("Overwrite completion params with json. Can be used multiple times. "), ) group.addoption( "--ep-remote-rollout-processor-base-url", default=None, - help=( - "If set, use this base URL for remote rollout processing. " - "Example: http://localhost:8000" - ), + help=("If set, use this base URL for remote rollout processing. Example: http://localhost:8000"), ) @@ -271,10 +266,10 @@ def pytest_configure(config) -> None: if config.getoption("--ep-completion-params"): # redump to json to make sure they are legit - os.environ["EP_COMPLETION_PARAMS"] = json.dumps([ - json.loads(s) for s in config.getoption("--ep-completion-params") or [] - ]) - + os.environ["EP_COMPLETION_PARAMS"] = json.dumps( + [json.loads(s) for s in config.getoption("--ep-completion-params") or []] + ) + if config.getoption("--ep-remote-rollout-processor-base-url"): os.environ["EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL"] = config.getoption("--ep-remote-rollout-processor-base-url") diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index aec31679..557f5be2 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -20,7 +20,7 @@ Status, ) from eval_protocol.data_loader import DynamicDataLoader -from eval_protocol.data_loader.models import EvaluationDataLoader +from eval_protocol.data_loader.models import EvaluationDataLoader from eval_protocol.pytest.rollout_processor import RolloutProcessor from eval_protocol.pytest.types import ( RolloutProcessorConfig, @@ -240,7 +240,10 @@ def parse_ep_completion_params( pass return completion_params -def parse_ep_completion_params_overwrite(completion_params: Sequence[CompletionParams | None] | None) -> Sequence[CompletionParams | None]: + +def parse_ep_completion_params_overwrite( + completion_params: Sequence[CompletionParams | None] | None, +) -> Sequence[CompletionParams | None]: new_completion_params = os.getenv("EP_COMPLETION_PARAMS") if new_completion_params: try: @@ -251,6 +254,7 @@ def parse_ep_completion_params_overwrite(completion_params: Sequence[CompletionP pass return completion_params or [] + def _rows_from_jsonl(path: str) -> list[EvaluationRow]: rows = [] try: @@ -260,17 +264,17 @@ def _rows_from_jsonl(path: str) -> list[EvaluationRow]: except Exception as e: print(f"❌ Failed to load rows from JSONL at {path}: {e}") return [] - + return rows + def parse_ep_dataloaders( dataloaders: Sequence[EvaluationDataLoader] | EvaluationDataLoader | None, ) -> Sequence[EvaluationDataLoader] | EvaluationDataLoader | None: try: load_from_jsonl_path = os.getenv("EP_JSONL_PATH") if load_from_jsonl_path: - return DynamicDataLoader( - generators=[lambda path=load_from_jsonl_path: _rows_from_jsonl(path)]) + return DynamicDataLoader(generators=[lambda path=load_from_jsonl_path: _rows_from_jsonl(path)]) except Exception: pass return dataloaders or None From b47a51e209e89f7cd7766144e78d5ed08eaf3970 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Fri, 26 Sep 2025 15:19:09 -0700 Subject: [PATCH 3/6] remove useless override --- eval_protocol/pytest/plugin.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index 524fe834..030c367e 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -270,9 +270,6 @@ def pytest_configure(config) -> None: [json.loads(s) for s in config.getoption("--ep-completion-params") or []] ) - if config.getoption("--ep-remote-rollout-processor-base-url"): - os.environ["EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL"] = config.getoption("--ep-remote-rollout-processor-base-url") - # Allow ad-hoc overrides of input params via CLI flags try: merged: dict = {} From b37b35a91e6951e1cd01be20417c40cd2d0d72cf Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Fri, 26 Sep 2025 15:38:19 -0700 Subject: [PATCH 4/6] add test --- tests/pytest/test_pytest_env_overwrite.py | 40 +++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/pytest/test_pytest_env_overwrite.py b/tests/pytest/test_pytest_env_overwrite.py index c88dd2b8..c6458bce 100644 --- a/tests/pytest/test_pytest_env_overwrite.py +++ b/tests/pytest/test_pytest_env_overwrite.py @@ -1,3 +1,6 @@ +import atexit +import shutil +import tempfile from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor @@ -18,3 +21,40 @@ def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow: assert row.messages[0].content == "What is the capital of France?" assert row.execution_metadata.invocation_id == "test-invocation-123" return row + + + +with mock.patch.dict(os.environ, {"EP_COMPLETION_PARAMS": "[{\"model\": \"gpt-40\"}]"}): + @evaluation_test( + input_rows=[[EvaluationRow(messages=[Message(role="user", content="What is 5 * 6?")])]], + completion_params=[{"model": "no-op"}], # This should be overridden by the env var + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + ) + def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow: + """Run math evaluation on sample dataset using pytest interface.""" + assert row.messages[0].content == "What is 5 * 6?" + assert row.input_metadata.completion_params["model"] == "gpt-40" + return row + + + +_jsonl_tmpdir = tempfile.mkdtemp() +atexit.register(shutil.rmtree, _jsonl_tmpdir, ignore_errors=True) + +input_path = os.path.join(_jsonl_tmpdir, "input.jsonl") +with open(input_path, "w") as f: + f.write( + '{"messages": [{"role": "user", "content": "What is 10 / 2?"}], "input_metadata": {"some_key": "some_value"}}\n' + ) +print(f"finish prepare input file {input_path}") +with mock.patch.dict(os.environ, {"EP_JSONL_PATH": input_path}): + @evaluation_test( + input_rows=[[EvaluationRow(messages=[Message(role="user", content="This will be ignored")])]], + completion_params=[{"model": "no-op"}], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + ) + def test_input_override(row: EvaluationRow) -> EvaluationRow: + assert row.messages[0].content == "What is 10 / 2?" + return row From 6295fd91ae46815ca39885ae7a996838705a994c Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Fri, 26 Sep 2025 15:40:00 -0700 Subject: [PATCH 5/6] format --- tests/pytest/test_pytest_env_overwrite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pytest/test_pytest_env_overwrite.py b/tests/pytest/test_pytest_env_overwrite.py index c6458bce..b4101c72 100644 --- a/tests/pytest/test_pytest_env_overwrite.py +++ b/tests/pytest/test_pytest_env_overwrite.py @@ -23,8 +23,8 @@ def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow: return row +with mock.patch.dict(os.environ, {"EP_COMPLETION_PARAMS": '[{"model": "gpt-40"}]'}): -with mock.patch.dict(os.environ, {"EP_COMPLETION_PARAMS": "[{\"model\": \"gpt-40\"}]"}): @evaluation_test( input_rows=[[EvaluationRow(messages=[Message(role="user", content="What is 5 * 6?")])]], completion_params=[{"model": "no-op"}], # This should be overridden by the env var @@ -38,7 +38,6 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow: return row - _jsonl_tmpdir = tempfile.mkdtemp() atexit.register(shutil.rmtree, _jsonl_tmpdir, ignore_errors=True) @@ -49,6 +48,7 @@ def test_input_messages_in_env(row: EvaluationRow) -> EvaluationRow: ) print(f"finish prepare input file {input_path}") with mock.patch.dict(os.environ, {"EP_JSONL_PATH": input_path}): + @evaluation_test( input_rows=[[EvaluationRow(messages=[Message(role="user", content="This will be ignored")])]], completion_params=[{"model": "no-op"}], From b61a56fe4d0b7567b9b8b04fbf5ad7ef0811a099 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Mon, 29 Sep 2025 23:50:33 -0700 Subject: [PATCH 6/6] update comment --- eval_protocol/pytest/evaluation_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index d283fddf..8a468843 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -197,7 +197,7 @@ def evaluation_test( data_loaders = parse_ep_dataloaders(data_loaders) custom_invocation_id = os.environ.get("EP_INVOCATION_ID", None) - # dataloader might be overridden here, to avoid conflict, manually unset other data input params + # ignore other data input params when dataloader is provided if data_loaders: input_dataset = None input_messages = None