From 4fb291129d6d4ed3c2db5faa030a9c09b200fd85 Mon Sep 17 00:00:00 2001
From: benjibc
Date: Fri, 10 Oct 2025 15:06:35 +0000
Subject: [PATCH 1/3] proper end to end upload

---
 eval_protocol/auth.py                |  39 +++++++
 eval_protocol/cli_commands/upload.py | 111 +++++++++----------
 eval_protocol/evaluation.py          | 157 +++++++++++++++++++++------
 3 files changed, 210 insertions(+), 97 deletions(-)

diff --git a/eval_protocol/auth.py b/eval_protocol/auth.py
index a9840d96..6b0845bb 100644
--- a/eval_protocol/auth.py
+++ b/eval_protocol/auth.py
@@ -4,6 +4,8 @@
 from pathlib import Path
 from typing import Dict, Optional  # Added Dict

+import requests
+
 logger = logging.getLogger(__name__)

 # Default locations (used for tests and as fallback). Actual resolution is dynamic via _get_auth_ini_file().
@@ -218,3 +220,40 @@ def get_fireworks_api_base() -> str:
     else:
         logger.debug("FIREWORKS_API_BASE not set in environment, defaulting to %s.", api_base)
     return api_base
+
+
+def verify_api_key_and_get_account_id(
+    api_key: Optional[str] = None,
+    api_base: Optional[str] = None,
+) -> Optional[str]:
+    """
+    Calls the Fireworks API verify endpoint to validate the API key and returns the
+    account id from response headers when available.
+
+    Args:
+        api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
+        api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
+
+    Returns:
+        The resolved account id if verification succeeds and the header is present; otherwise None.
+    """
+    try:
+        resolved_key = api_key or get_fireworks_api_key()
+        if not resolved_key:
+            return None
+        resolved_base = api_base or get_fireworks_api_base()
+        url = f"{resolved_base.rstrip('/')}/verifyApiKey"
+        headers = {"Authorization": f"Bearer {resolved_key}"}
+        resp = requests.get(url, headers=headers, timeout=10)
+        if resp.status_code != 200:
+            logger.debug("verifyApiKey returned status %s", resp.status_code)
+            return None
+        # Header keys could vary in case; requests provides case-insensitive dict
+        account_id = resp.headers.get("x-fireworks-account-id") or resp.headers.get("X-Fireworks-Account-Id")
+        if account_id and account_id.strip():
+            logger.debug("Resolved FIREWORKS_ACCOUNT_ID via verifyApiKey: %s", account_id)
+            return account_id.strip()
+        return None
+    except Exception as e:
+        logger.debug("Failed to verify API key for account id resolution: %s", e)
+        return None
diff --git a/eval_protocol/cli_commands/upload.py b/eval_protocol/cli_commands/upload.py
index 9cb49371..acb3501c 100644
--- a/eval_protocol/cli_commands/upload.py
+++ b/eval_protocol/cli_commands/upload.py
@@ -12,7 +12,12 @@ from typing import Any, Callable, Iterable, Optional

 import pytest

-from eval_protocol.auth import get_fireworks_account_id, get_fireworks_api_key
+from eval_protocol.auth import (
+    get_fireworks_account_id,
+    get_fireworks_api_key,
+    get_fireworks_api_base,
+    verify_api_key_and_get_account_id,
+)
 from eval_protocol.platform_api import create_or_update_fireworks_secret
 from eval_protocol.evaluation import create_evaluation

@@ -259,7 +264,7 @@ def _parse_entry(entry: str, cwd: str) -> tuple[str, str]:
     raise ValueError("--entry must be in 'module::function', 'path::function', or 'module:function' format")


-def _generate_ts_mode_code_from_entry(entry: str, cwd: str) -> tuple[str, str, str, str]:
+def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
     target, func = _parse_entry(entry, cwd)

     # Check if target looks like a file path
@@ -293,47 +298,12 @@ def _generate_ts_mode_code_from_entry(entry: str, cwd: str) -> tuple[str, str, str, str]:
         raise ValueError(f"Function '{func}' not found in module '{module_name}'")

     qualname = f"{module_name}.{func}"
-    code, file_name = _generate_ts_mode_code(
-        DiscoveredTest(
-            module_path=module_name,
-            module_name=module_name,
-            qualname=qualname,
-            file_path=getattr(module, "__file__", module_name),
-            lineno=None,
-            has_parametrize=False,
-            param_count=0,
-            nodeids=[],
-        )
-    )
-    return code, file_name, qualname, os.path.abspath(source_file_path) if source_file_path else ""
+    return qualname, os.path.abspath(source_file_path) if source_file_path else ""


 def _generate_ts_mode_code(test: DiscoveredTest) -> tuple[str, str]:
-    # Generate a minimal main.py that imports the test module and calls the function
-    module = test.module_name
-    func = test.qualname.split(".")[-1]
-    code = f"""
-from typing import Any, Dict, List, Optional, Union
-
-from eval_protocol.models import EvaluationRow, Message
-from {module} import {func} as _ep_test
-
-def evaluate(messages: List[Dict[str, Any]], ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs):
-    row = EvaluationRow(messages=[Message(**m) for m in messages], ground_truth=ground_truth)
-    result = _ep_test(row)  # Supports sync/async via decorator's dual-mode
-    if hasattr(result, "__await__"):
-        import asyncio
-        result = asyncio.get_event_loop().run_until_complete(result)
-    if result.evaluation_result is None:
-        return {{"score": 0.0, "reason": "No evaluation_result set"}}
-    out = {{
-        "score": float(result.evaluation_result.score or 0.0),
-        "reason": result.evaluation_result.reason,
-        "metrics": {{k: (v.model_dump() if hasattr(v, "model_dump") else v) for k, v in (result.evaluation_result.metrics or {{}}).items()}},
-    }}
-    return out
-"""
-    return (code, "main.py")
+    # Deprecated: we no longer generate a shim; keep stub for import compatibility
+    return ("", "main.py")


 def _normalize_evaluator_id(evaluator_id: str) -> str:
@@ -522,10 +492,10 @@ def upload_command(args: argparse.Namespace) -> int:
     entries_arg = getattr(args, "entry", None)
     if entries_arg:
         entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
-        selected_specs: list[tuple[str, str, str, str]] = []
+        selected_specs: list[tuple[str, str]] = []
         for e in entries:
-            code, file_name, qualname, resolved_path = _generate_ts_mode_code_from_entry(e, root)
-            selected_specs.append((code, file_name, qualname, resolved_path))
+            qualname, resolved_path = _resolve_entry_to_qual_and_source(e, root)
+            selected_specs.append((qualname, resolved_path))
     else:
         print("Scanning for evaluation tests...")
         tests = _discover_tests(root)
@@ -545,11 +515,7 @@ def upload_command(args: argparse.Namespace) -> int:
         print(" handles all parameter combinations. The evaluator will work with")
         print(" the same logic regardless of which model/parameters are used.")

-        selected_specs = []
-        for t in selected_tests:
-            code, file_name = _generate_ts_mode_code(t)
-            # Store test info for better ID generation
-            selected_specs.append((code, file_name, t.qualname, t.file_path))
+        selected_specs = [(t.qualname, t.file_path) for t in selected_tests]

     base_id = getattr(args, "id", None)
     display_name = getattr(args, "display_name", None)
@@ -560,6 +526,16 @@ def upload_command(args: argparse.Namespace) -> int:
     try:
         fw_account_id = get_fireworks_account_id()
        fw_api_key_value = get_fireworks_api_key()
+        if not fw_account_id and fw_api_key_value:
+            # Attempt to verify and resolve account id from server headers
+            resolved = verify_api_key_and_get_account_id(
+                api_key=fw_api_key_value, api_base=get_fireworks_api_base()
+            )
+            if resolved:
+                fw_account_id = resolved
+                # Propagate to environment so downstream calls use it if needed
+                os.environ["FIREWORKS_ACCOUNT_ID"] = fw_account_id
+                print(f"Resolved FIREWORKS_ACCOUNT_ID via API verification: {fw_account_id}")
         if fw_account_id and fw_api_key_value:
             print("Ensuring FIREWORKS_API_KEY is registered as a secret on Fireworks for rollout...")
             if create_or_update_fireworks_secret(
@@ -579,8 +555,7 @@ def upload_command(args: argparse.Namespace) -> int:
             print(f"Warning: Skipped Fireworks secret registration due to error: {e}")

     exit_code = 0
-    for i, (code, file_name, qualname, source_file_path) in enumerate(selected_specs):
-        # Use ts_mode to upload evaluator
+    for i, (qualname, source_file_path) in enumerate(selected_specs):
         # Generate a short default ID from just the test function name
         if base_id:
             evaluator_id = base_id
@@ -618,17 +593,31 @@ def upload_command(args: argparse.Namespace) -> int:
         print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")

         try:
-            result = create_evaluation(
-                evaluator_id=evaluator_id,
-                python_code_to_evaluate=code,
-                python_file_name_for_code=file_name,
-                criterion_name_for_code=qualname,
-                criterion_description_for_code=description or f"Evaluator for {qualname}",
-                display_name=display_name or evaluator_id,
-                description=description or f"Evaluator for {qualname}",
-                force=force,
-                entry_point=entry_point,
-            )
+            # Upload full directory of the test as multi-metric if the dir contains multiple files
+            test_dir = os.path.dirname(source_file_path) if source_file_path else root
+            # Use multi_metrics if multiple .py files exist at the root dir; otherwise treat as single metric dir
+            py_files = [f for f in os.listdir(test_dir) if f.endswith(".py")]
+            if len(py_files) > 1:
+                result = create_evaluation(
+                    evaluator_id=evaluator_id,
+                    multi_metrics=True,
+                    folder=test_dir,
+                    display_name=display_name or evaluator_id,
+                    description=description or f"Evaluator for {qualname}",
+                    force=force,
+                    entry_point=entry_point,
+                )
+            else:
+                # Single metric mode: metric name derived from folder name; include all files recursively
+                metric_name = os.path.basename(test_dir) or "metric"
+                result = create_evaluation(
+                    evaluator_id=evaluator_id,
+                    metric_folders=[f"{metric_name}={test_dir}"],
+                    display_name=display_name or evaluator_id,
+                    description=description or f"Evaluator for {qualname}",
+                    force=force,
+                    entry_point=entry_point,
+                )

             name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id

             # Print success message with Fireworks dashboard link
diff --git a/eval_protocol/evaluation.py b/eval_protocol/evaluation.py
index 9d75a22a..d76824dc 100644
--- a/eval_protocol/evaluation.py
+++ b/eval_protocol/evaluation.py
@@ -15,7 +15,11 @@

 import requests

-from eval_protocol.auth import get_fireworks_account_id, get_fireworks_api_key
+from eval_protocol.auth import (
+    get_fireworks_account_id,
+    get_fireworks_api_key,
+    verify_api_key_and_get_account_id,
+)
 from eval_protocol.typed_interface import EvaluationMode

 logger = logging.getLogger(__name__)
@@ -195,17 +199,16 @@ def __init__(

     def _load_python_files_from_folder(self, folder_path: str) -> Dict[str, str]:
         """
-        Loads all Python files from a given folder.
+        Recursively loads all Python files from a given folder (excluding common ignored dirs).

         Args:
             folder_path: Absolute path to the folder.

         Returns:
-            A dictionary mapping filenames to their content.
+            A dictionary mapping relative file paths (within folder) to their content.

         Raises:
-            ValueError: If folder_path is invalid, not a directory,
-                        or if main.py is missing or doesn't contain 'evaluate'.
+            ValueError: If folder_path is invalid or not a directory.
         """
         if not os.path.exists(folder_path):
             raise ValueError(f"Folder does not exist: {folder_path}")
@@ -213,21 +216,22 @@ def _load_python_files_from_folder(self, folder_path: str) -> Dict[str, str]:
         if not os.path.isdir(folder_path):
             raise ValueError(f"Not a directory: {folder_path}")

-        files = {}
-        for file_path in Path(folder_path).glob("*.py"):
-            if file_path.is_file():
-                with open(file_path, "r") as f:
-                    filename = file_path.name
+        files: Dict[str, str] = {}
+        ignored_dirs = {".git", "__pycache__", "node_modules", "venv", ".venv", "dist", "build", "vendor"}
+        base_path = Path(folder_path)
+        for dirpath, dirnames, filenames in os.walk(folder_path):
+            # prune ignored directories
+            dirnames[:] = [d for d in dirnames if d not in ignored_dirs and not d.startswith(".")]
+            for name in filenames:
+                if not name.endswith(".py"):
+                    continue
+                abs_path = Path(dirpath) / name
+                rel_path = str(abs_path.relative_to(base_path))
+                with open(abs_path, "r", encoding="utf-8") as f:
                     content = f.read()
-                    files[filename] = content
-
-                    # Check for main.py with evaluate function
-                    if filename == "main.py" and "evaluate" not in content:
-                        raise ValueError(f"main.py in {folder_path} must contain an evaluate function")
-
-        if "main.py" not in files:
-            raise ValueError(f"main.py is required in {folder_path}")
-
+                files[rel_path] = content
+        if not files:
+            raise ValueError(f"No Python files found in {folder_path}")
         return files

     def load_metric_folder(self, metric_name, folder_path):
@@ -348,8 +352,10 @@ def preview(self, sample_file, max_samples=5):
         if not samples:
             raise ValueError(f"No valid samples found in {sample_file}")

-        account_id = self.account_id or get_fireworks_account_id()
         auth_token = self.api_key or get_fireworks_api_key()
+        account_id = self.account_id or get_fireworks_account_id()
+        if not account_id and auth_token:
+            account_id = verify_api_key_and_get_account_id(api_key=auth_token, api_base=self.api_base)
         logger.debug(f"Preview using account_id: {account_id}")

         if not account_id or not auth_token:
@@ -507,8 +513,11 @@ def create(self, evaluator_id, display_name=None, description=None, force=False)
         if not self.remote_url and not self.ts_mode_config and not self.code_files:
             raise ValueError("No code files loaded. Load metric folder(s) or provide ts_mode_config/remote_url first.")

-        account_id = self.account_id or get_fireworks_account_id()
         auth_token = self.api_key or get_fireworks_api_key()
+        account_id = self.account_id or get_fireworks_account_id()
+        if not account_id and auth_token:
+            # Attempt to verify the API key and derive account id from server headers
+            account_id = verify_api_key_and_get_account_id(api_key=auth_token, api_base=self.api_base)

         if not auth_token or not account_id:
             logger.error("Authentication error: API credentials appear to be invalid or incomplete.")
             raise ValueError("Invalid or missing API credentials.")
@@ -648,6 +657,12 @@ def evaluate(messages, ground_truth: Optional[Union[str, List[Dict[str, Any]]]]
             description = self.ts_mode_config.get("description", "Python code execution")
             if not python_code:
                 raise ValueError("python_code is required in ts_mode_config")
+            entry_func = "evaluate"
+            try:
+                if self.entry_point and "::" in self.entry_point:
+                    entry_func = self.entry_point.split("::", 1)[1]
+            except Exception:
+                entry_func = "evaluate"
             assertions.append(
                 {
                     "type": "CODE_SNIPPETS",
@@ -656,6 +671,8 @@ def evaluate(messages, ground_truth: Optional[Union[str, List[Dict[str, Any]]]]
                     "codeSnippets": {
                         "language": "python",
                         "fileContents": {file_name: python_code},
+                        "entryFile": file_name,
+                        "entryFunc": entry_func,
                     },
                 }
             )
@@ -667,11 +684,48 @@ def evaluate(messages, ground_truth: Optional[Union[str, List[Dict[str, Any]]]]
                 file_contents[filename] = self._update_evaluate_signature(content)
             if not file_contents:
                 raise ValueError("No Python files found for multi-metrics mode.")
+            # Determine entry file from entry_point if provided; otherwise detect
+            entry_file = None
+            if self.entry_point and "::" in self.entry_point:
+                try:
+                    ep_file = self.entry_point.split("::", 1)[0]
+                    if ep_file in file_contents:
+                        entry_file = ep_file
+                    else:
+                        ep_base = os.path.basename(ep_file)
+                        for fname in file_contents.keys():
+                            if os.path.basename(fname) == ep_base:
+                                entry_file = fname
+                                break
+                except Exception:
+                    entry_file = None
+            if not entry_file:
+                try:
+                    for fname, content in file_contents.items():
+                        for line in content.split("\n"):
+                            s = line.lstrip()
+                            if s.startswith("def evaluate(") or s.startswith("async def evaluate("):
+                                entry_file = fname
+                                break
+                        if entry_file:
+                            break
+                except Exception:
+                    entry_file = None
+            if not entry_file:
+                entry_file = "main.py" if "main.py" in file_contents else list(file_contents.keys())[0]
+            entry_func = "evaluate"
+            try:
+                if self.entry_point and "::" in self.entry_point:
+                    entry_func = self.entry_point.split("::", 1)[1]
+            except Exception:
+                entry_func = "evaluate"
             assertions.append(
                 {
                     "codeSnippets": {
                         "language": "python",
                         "fileContents": file_contents,
+                        "entryFile": entry_file,
+                        "entryFunc": entry_func,
                     },
                     "name": "eval",
                     "type": "CODE_SNIPPETS",
@@ -681,29 +735,60 @@ def evaluate(messages, ground_truth: Optional[Union[str, List[Dict[str, Any]]]]
         else:  # Folder-based, non-multi_metrics
             for metric_name in self.metric_folders:
                 file_contents = {}
-                # Prioritize sending only main.py for the preview evaluator
-                main_py_key = f"{metric_name}/main.py"
-                if main_py_key in self.code_files:
-                    file_contents["main.py"] = self._update_evaluate_signature(self.code_files[main_py_key])
-                else:
-                    # Fallback to sending all files if main.py isn't found directly under metric_name/ (should not happen with current loading logic)
-                    # Or if a more complex structure was intended. For now, this path means an issue.
-                    logger.warning(
-                        f"main.py not found for metric '{metric_name}' with key '{main_py_key}'. "
-                        "The preview payload might be incorrect or incomplete."
-                    )
-
+                # Include all discovered files for this metric folder, preserving filenames
+                for filename, content in self.code_files.items():
+                    if filename.startswith(f"{metric_name}/") and filename.endswith(".py"):
+                        # Use the file name within the metric folder for clarity
+                        short_name = filename.split(f"{metric_name}/", 1)[1]
+                        file_contents[short_name] = self._update_evaluate_signature(content)
                 if not file_contents:
                     logger.warning(
-                        f"No Python files (specifically main.py) prepared for metric '{metric_name}', skipping this metric for criteria."
+                        f"No Python files prepared for metric '{metric_name}', skipping this metric for criteria."
                     )
                     continue
-
+                # Determine entry file within this metric's files using entry_point if present
+                entry_file = None
+                if self.entry_point and "::" in self.entry_point:
+                    try:
+                        ep_file = self.entry_point.split("::", 1)[0]
+                        if ep_file in file_contents:
+                            entry_file = ep_file
+                        else:
+                            ep_base = os.path.basename(ep_file)
+                            for fname in file_contents.keys():
+                                if os.path.basename(fname) == ep_base:
+                                    entry_file = fname
+                                    break
+                    except Exception:
+                        entry_file = None
+                if not entry_file:
+                    try:
+                        for fname, content in file_contents.items():
+                            for line in content.split("\n"):
+                                s = line.lstrip()
+                                if s.startswith("def evaluate(") or s.startswith("async def evaluate("):
+                                    entry_file = fname
+                                    break
+                            if entry_file:
+                                break
+                    except Exception:
+                        entry_file = None
+                if not entry_file:
+                    entry_file = "main.py" if "main.py" in file_contents else list(file_contents.keys())[0]
+
+                entry_func = "evaluate"
+                try:
+                    if self.entry_point and "::" in self.entry_point:
+                        entry_func = self.entry_point.split("::", 1)[1]
+                except Exception:
+                    entry_func = "evaluate"
                 assertions.append(
                     {
                         "codeSnippets": {
                             "language": "python",
-                            "fileContents": file_contents,  # Should now ideally only contain main.py
+                            "fileContents": file_contents,
+                            "entryFile": entry_file,
+                            "entryFunc": entry_func,
                         },
                         "name": metric_name,
                         "type": "CODE_SNIPPETS",

From f33eb103937f9d3fc6b98f1da59cb848ad6430ae Mon Sep 17 00:00:00 2001
From: benjibc
Date: Sun, 12 Oct 2025 04:56:33 +0000
Subject: [PATCH 2/3] fix multi metrics issue

---
 eval_protocol/cli_commands/upload.py | 38 ++++++++--------------------
 eval_protocol/evaluation.py          | 16 ++++--------
 2 files changed, 16 insertions(+), 38 deletions(-)

diff --git a/eval_protocol/cli_commands/upload.py b/eval_protocol/cli_commands/upload.py
index acb3501c..410f9459 100644
--- a/eval_protocol/cli_commands/upload.py
+++ b/eval_protocol/cli_commands/upload.py
@@ -528,9 +528,7 @@ def upload_command(args: argparse.Namespace) -> int:
         fw_api_key_value = get_fireworks_api_key()
         if not fw_account_id and fw_api_key_value:
             # Attempt to verify and resolve account id from server headers
-            resolved = verify_api_key_and_get_account_id(
-                api_key=fw_api_key_value, api_base=get_fireworks_api_base()
-            )
+            resolved = verify_api_key_and_get_account_id(api_key=fw_api_key_value, api_base=get_fireworks_api_base())
             if resolved:
                 fw_account_id = resolved
                 # Propagate to environment so downstream calls use it if needed
@@ -593,31 +591,17 @@ def upload_command(args: argparse.Namespace) -> int:
         print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")

         try:
-            # Upload full directory of the test as multi-metric if the dir contains multiple files
+            # Always treat as a single evaluator (single-metric) even if folder has helper modules
             test_dir = os.path.dirname(source_file_path) if source_file_path else root
-            # Use multi_metrics if multiple .py files exist at the root dir; otherwise treat as single metric dir
-            py_files = [f for f in os.listdir(test_dir) if f.endswith(".py")]
-            if len(py_files) > 1:
-                result = create_evaluation(
-                    evaluator_id=evaluator_id,
-                    multi_metrics=True,
-                    folder=test_dir,
-                    display_name=display_name or evaluator_id,
-                    description=description or f"Evaluator for {qualname}",
-                    force=force,
-                    entry_point=entry_point,
-                )
-            else:
-                # Single metric mode: metric name derived from folder name; include all files recursively
-                metric_name = os.path.basename(test_dir) or "metric"
-                result = create_evaluation(
-                    evaluator_id=evaluator_id,
-                    metric_folders=[f"{metric_name}={test_dir}"],
-                    display_name=display_name or evaluator_id,
-                    description=description or f"Evaluator for {qualname}",
-                    force=force,
-                    entry_point=entry_point,
-                )
+            metric_name = os.path.basename(test_dir) or "metric"
+            result = create_evaluation(
+                evaluator_id=evaluator_id,
+                metric_folders=[f"{metric_name}={test_dir}"],
+                display_name=display_name or evaluator_id,
+                description=description or f"Evaluator for {qualname}",
+                force=force,
+                entry_point=entry_point,
+            )

             name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id

             # Print success message with Fireworks dashboard link
diff --git a/eval_protocol/evaluation.py b/eval_protocol/evaluation.py
index d76824dc..d7b74d2b 100644
--- a/eval_protocol/evaluation.py
+++ b/eval_protocol/evaluation.py
@@ -362,9 +362,7 @@ def preview(self, sample_file, max_samples=5):
             logger.error("Authentication error: Missing Fireworks Account ID or API Key.")
             raise ValueError("Missing Fireworks Account ID or API Key.")

-        # Determine multiMetrics for payload based on ts_mode_config or original flag
-        payload_multi_metrics = True
-        payload_rollup_settings = {"skipRollup": True}
+        # Do not set multiMetrics/rollupSettings in preview payload; keep minimal

         # For preview, evaluator_id might not be as critical for shim's env var name,
         # but pass it for consistency. Use display_name as a proxy if no specific ID.
@@ -372,10 +370,9 @@ def preview(self, sample_file, max_samples=5):
         evaluator_payload_data = {
             "displayName": self.display_name or "Preview Evaluator",
             "description": self.description or "Preview Evaluator",
-            "multiMetrics": payload_multi_metrics,
+            # multiMetrics omitted intentionally
             "criteria": self._construct_criteria(criteria_data={}),
-            "requirements": self._get_combined_requirements(),  # Changed to use combined requirements
-            "rollupSettings": payload_rollup_settings,
+            "requirements": self._get_combined_requirements(),
         }

         sample_strings = [json.dumps(sample) for sample in samples]
@@ -525,19 +522,16 @@ def create(self, evaluator_id, display_name=None, description=None, force=False)
         self.display_name = display_name or evaluator_id
         self.description = description or f"Evaluator created from {evaluator_id}"

-        # Determine multiMetrics for payload
-        payload_multi_metrics = True
-        payload_rollup_settings = {"skipRollup": True}
+        # Do not set multiMetrics/rollupSettings; server will infer when needed

         payload_data = {
             "evaluator": {
                 "displayName": self.display_name,
                 "description": self.description,
-                "multiMetrics": payload_multi_metrics,  # How results are structured
+                # multiMetrics omitted intentionally
                 # "rewardFunctionMode": self.reward_function_mode,  # How input is processed by user func
                 "criteria": self._construct_criteria(criteria_data={}),
                 "requirements": "",
-                "rollupSettings": payload_rollup_settings,
             },
             "evaluatorId": evaluator_id,
         }

From 844aea63797a4bd08feb2ffd69a965db929f50bf Mon Sep 17 00:00:00 2001
From: benjibc
Date: Sun, 12 Oct 2025 05:44:03 +0000
Subject: [PATCH 3/3] keep multi metric and rollout

---
 eval_protocol/evaluation.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/eval_protocol/evaluation.py b/eval_protocol/evaluation.py
index d7b74d2b..58e0beea 100644
--- a/eval_protocol/evaluation.py
+++ b/eval_protocol/evaluation.py
@@ -362,7 +362,9 @@ def preview(self, sample_file, max_samples=5):
             logger.error("Authentication error: Missing Fireworks Account ID or API Key.")
             raise ValueError("Missing Fireworks Account ID or API Key.")

-        # Do not set multiMetrics/rollupSettings in preview payload; keep minimal
+        # Keep multiMetrics/rollupSettings for backward compatibility with tests
+        payload_multi_metrics = True
+        payload_rollup_settings = {"skipRollup": True}

         # For preview, evaluator_id might not be as critical for shim's env var name,
         # but pass it for consistency. Use display_name as a proxy if no specific ID.
@@ -370,9 +372,10 @@ def preview(self, sample_file, max_samples=5):
         evaluator_payload_data = {
             "displayName": self.display_name or "Preview Evaluator",
             "description": self.description or "Preview Evaluator",
-            # multiMetrics omitted intentionally
+            "multiMetrics": payload_multi_metrics,
             "criteria": self._construct_criteria(criteria_data={}),
             "requirements": self._get_combined_requirements(),
+            "rollupSettings": payload_rollup_settings,
         }

         sample_strings = [json.dumps(sample) for sample in samples]
@@ -522,16 +525,19 @@ def create(self, evaluator_id, display_name=None, description=None, force=False)
         self.display_name = display_name or evaluator_id
         self.description = description or f"Evaluator created from {evaluator_id}"

-        # Do not set multiMetrics/rollupSettings; server will infer when needed
+        # Keep multiMetrics/rollupSettings for backward compatibility with tests
+        payload_multi_metrics = True
+        payload_rollup_settings = {"skipRollup": True}

         payload_data = {
             "evaluator": {
                 "displayName": self.display_name,
                 "description": self.description,
-                # multiMetrics omitted intentionally
+                "multiMetrics": payload_multi_metrics,
                 # "rewardFunctionMode": self.reward_function_mode,  # How input is processed by user func
                 "criteria": self._construct_criteria(criteria_data={}),
                 "requirements": "",
+                "rollupSettings": payload_rollup_settings,
             },
             "evaluatorId": evaluator_id,
         }
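
Note: a minimal sketch of the account-id fallback this series introduces, for reviewers. Every
name used (get_fireworks_account_id, get_fireworks_api_key, get_fireworks_api_base,
verify_api_key_and_get_account_id) comes from eval_protocol.auth as patched in PATCH 1; the
snippet itself is illustrative and not part of the series.

    import os

    from eval_protocol.auth import (
        get_fireworks_account_id,
        get_fireworks_api_key,
        get_fireworks_api_base,
        verify_api_key_and_get_account_id,
    )

    # Resolve credentials the way upload_command does after PATCH 1.
    account_id = get_fireworks_account_id()
    api_key = get_fireworks_api_key()
    if not account_id and api_key:
        # The helper calls {api_base}/verifyApiKey and returns None on any
        # failure, so a missing or invalid key degrades to the old behavior.
        account_id = verify_api_key_and_get_account_id(
            api_key=api_key, api_base=get_fireworks_api_base()
        )
        if account_id:
            # Propagate so downstream calls (e.g. create_evaluation) see it.
            os.environ["FIREWORKS_ACCOUNT_ID"] = account_id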