From 65abb771dc0f93a629e80f1745a42edf013bb608 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Thu, 21 Aug 2025 23:56:04 -0700 Subject: [PATCH 1/2] Fix Completion Params --- eval_protocol/pytest/evaluation_test.py | 27 +++++-------------------- eval_protocol/pytest/utils.py | 19 +++++++++++++++++ 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 2cacc09f..11b2f287 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -54,6 +54,7 @@ parse_ep_max_rows, parse_ep_max_concurrent_rollouts, parse_ep_num_runs, + parse_ep_completion_params, rollout_processor_with_retry, sanitize_filename, ) @@ -338,6 +339,7 @@ def evaluation_test( # noqa: C901 num_runs = parse_ep_num_runs(num_runs) max_concurrent_rollouts = parse_ep_max_concurrent_rollouts(max_concurrent_rollouts) max_dataset_rows = parse_ep_max_rows(max_dataset_rows) + completion_params = parse_ep_completion_params(completion_params) def decorator( test_func: TestFunction, @@ -420,9 +422,6 @@ async def execute_with_params( else: return test_func(**kwargs) - # preserve the original completion_params list for groupwise mode - original_completion_params_list = completion_params - # Calculate all possible combinations of parameters if mode == "groupwise": combinations = generate_parameter_combinations( @@ -544,20 +543,6 @@ def _log_eval_error(status: Status, rows: Optional[List[EvaluationRow]] | None, "No model provided. Please provide a model in the completion parameters object." ) - # Optional global overrides via environment for ad-hoc experimentation - # EP_INPUT_PARAMS_JSON can contain a JSON object that will be deep-merged - # into input_params (e.g., '{"temperature":0,"extra_body":{"reasoning":{"effort":"low"}}}'). - try: - import json as _json - - _env_override = os.getenv("EP_INPUT_PARAMS_JSON") - if _env_override: - override_obj = _json.loads(_env_override) - if isinstance(override_obj, dict): - completion_params = deep_update_dict(dict(completion_params), override_obj) - except Exception: - pass - # Create eval metadata with test function info and current commit hash eval_metadata = EvalMetadata( name=test_func.__name__, @@ -661,7 +646,7 @@ async def _execute_eval_with_semaphore(**inner_kwargs): row_groups = defaultdict(list) # key: row_id, value: list of rollout_result tasks: List[asyncio.Task[List[EvaluationRow]]] = [] # completion_groups = [] - for idx, cp in enumerate(original_completion_params_list): + for idx, cp in enumerate(completion_params): config = RolloutProcessorConfig( completion_params=cp, mcp_config_path=mcp_config_path or "", @@ -743,9 +728,7 @@ async def _collect_result(config, lst): # for groupwise mode, the result contains eval otuput from multiple completion_params, we need to differentiate them # rollout_id is used to differentiate the result from different completion_params if mode == "groupwise": - results_by_group = [ - [[] for _ in range(num_runs)] for _ in range(len(original_completion_params_list)) - ] + results_by_group = [[[] for _ in range(num_runs)] for _ in range(len(completion_params))] for i_run, result in enumerate(all_results): for r in result: completion_param_idx = int(r.execution_metadata.rollout_id.split("_")[1]) @@ -757,7 +740,7 @@ async def _collect_result(config, lst): threshold, active_logger, mode, - original_completion_params_list[rollout_id], + completion_params[rollout_id], test_func.__name__, num_runs, ) diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index c24fbdc9..bad097a7 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -170,6 +170,25 @@ def parse_ep_max_concurrent_rollouts(default_value: int) -> int: return int(raw) if raw is not None else default_value +def parse_ep_completion_params(completion_params: List[CompletionParams]) -> List[CompletionParams]: + """Apply EP_INPUT_PARAMS_JSON overrides to completion_params. + + Reads the environment variable set by plugin.py and applies deep merge to each completion param. + """ + try: + import json as _json + + _env_override = os.getenv("EP_INPUT_PARAMS_JSON") + if _env_override: + override_obj = _json.loads(_env_override) + if isinstance(override_obj, dict): + # Apply override to each completion_params item + return [deep_update_dict(dict(cp), override_obj) for cp in completion_params] + except Exception: + pass + return completion_params + + def deep_update_dict(base: dict, override: dict) -> dict: """Recursively update nested dictionaries in-place and return base.""" for key, value in override.items(): From 7943cdf2a9747d3e5f1e255661670d45f6fa82cf Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Fri, 22 Aug 2025 01:17:04 -0700 Subject: [PATCH 2/2] test fix --- eval_protocol/pytest/evaluation_test.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 11b2f287..2dd4dbc0 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -335,11 +335,14 @@ def evaluation_test( # noqa: C901 active_logger: DatasetLogger = logger if logger else default_logger - # Apply override from pytest flags if present + # Optional global overrides via environment for ad-hoc experimentation + # EP_INPUT_PARAMS_JSON can contain a JSON object that will be deep-merged + # into input_params (e.g., '{"temperature":0,"extra_body":{"reasoning":{"effort":"low"}}}'). num_runs = parse_ep_num_runs(num_runs) max_concurrent_rollouts = parse_ep_max_concurrent_rollouts(max_concurrent_rollouts) max_dataset_rows = parse_ep_max_rows(max_dataset_rows) completion_params = parse_ep_completion_params(completion_params) + original_completion_params = completion_params def decorator( test_func: TestFunction, @@ -646,7 +649,7 @@ async def _execute_eval_with_semaphore(**inner_kwargs): row_groups = defaultdict(list) # key: row_id, value: list of rollout_result tasks: List[asyncio.Task[List[EvaluationRow]]] = [] # completion_groups = [] - for idx, cp in enumerate(completion_params): + for idx, cp in enumerate(original_completion_params): config = RolloutProcessorConfig( completion_params=cp, mcp_config_path=mcp_config_path or "", @@ -728,7 +731,9 @@ async def _collect_result(config, lst): # for groupwise mode, the result contains eval otuput from multiple completion_params, we need to differentiate them # rollout_id is used to differentiate the result from different completion_params if mode == "groupwise": - results_by_group = [[[] for _ in range(num_runs)] for _ in range(len(completion_params))] + results_by_group = [ + [[] for _ in range(num_runs)] for _ in range(len(original_completion_params)) + ] for i_run, result in enumerate(all_results): for r in result: completion_param_idx = int(r.execution_metadata.rollout_id.split("_")[1]) @@ -740,7 +745,7 @@ async def _collect_result(config, lst): threshold, active_logger, mode, - completion_params[rollout_id], + original_completion_params[rollout_id], test_func.__name__, num_runs, )