diff --git a/eval_protocol/cli_commands/upload.py b/eval_protocol/cli_commands/upload.py index 8683c872..68284b76 100644 --- a/eval_protocol/cli_commands/upload.py +++ b/eval_protocol/cli_commands/upload.py @@ -81,6 +81,12 @@ def _is_eval_protocol_test(obj: Any) -> bool: return False # Must have pytest marks from evaluation_test marks = getattr(obj, "pytestmark", []) + # Handle pytest proxy objects (APIRemovedInV1Proxy) + if not isinstance(marks, (list, tuple)): + try: + marks = list(marks) if marks else [] + except (TypeError, AttributeError): + return False return len(marks) > 0 @@ -131,51 +137,103 @@ def _discover_tests(root: str) -> list[DiscoveredTest]: discovered: list[DiscoveredTest] = [] - # Collect all test functions from Python files - for file_path in _iter_python_files(root): + class CollectionPlugin: + """Plugin to capture collected items without running code.""" + + def __init__(self): + self.items = [] + + def pytest_ignore_collect(self, collection_path, config): + """Ignore problematic files before pytest tries to import them.""" + # Ignore specific files + ignored_files = ["setup.py", "versioneer.py", "conf.py", "__main__.py"] + if collection_path.name in ignored_files: + return True + + # Ignore hidden files (starting with .) 
+ if collection_path.name.startswith("."): + return True + + # Ignore test_discovery files + if collection_path.name.startswith("test_discovery"): + return True + + return None + + def pytest_collection_modifyitems(self, items): + """Hook called after collection is done.""" + self.items = items + + plugin = CollectionPlugin() + + # Run pytest collection only (--collect-only prevents code execution) + # Override python_files to collect from ANY .py file + args = [ + abs_root, + "--collect-only", + "-q", + "--pythonwarnings=ignore", + "-o", + "python_files=*.py", # Override to collect all .py files + ] + + try: + # Suppress pytest output + import io + import contextlib + + with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()): + pytest.main(args, plugins=[plugin]) + except Exception: + # If pytest collection fails, fall back to empty list + return [] + + # Process collected items + for item in plugin.items: + if not hasattr(item, "obj"): + continue + + obj = item.obj + if not _is_eval_protocol_test(obj): + continue + + origin = getattr(obj, "_origin_func", obj) try: - unique_name = "ep_upload_" + re.sub(r"[^a-zA-Z0-9_]", "_", os.path.abspath(file_path)) - spec = importlib.util.spec_from_file_location(unique_name, file_path) - if spec and spec.loader: - module = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = module - spec.loader.exec_module(module) # type: ignore[attr-defined] - else: - continue + src_file = inspect.getsourcefile(origin) or str(item.path) + _, lineno = inspect.getsourcelines(origin) except Exception: - continue + src_file, lineno = str(item.path), None - for name, obj in inspect.getmembers(module): - if _is_eval_protocol_test(obj): - origin = getattr(obj, "_origin_func", obj) - try: - src_file = inspect.getsourcefile(origin) or file_path - _, lineno = inspect.getsourcelines(origin) - except Exception: - src_file, lineno = file_path, None - - # Extract parametrization info from marks - 
has_parametrize, param_count, param_ids = _extract_param_info_from_marks(obj) - - # Generate synthetic nodeids for display - base_nodeid = f"{os.path.basename(file_path)}::{name}" - if has_parametrize and param_ids: - nodeids = [f"{base_nodeid}[{pid}]" for pid in param_ids] - else: - nodeids = [base_nodeid] - - discovered.append( - DiscoveredTest( - module_path=module.__name__, - module_name=module.__name__, - qualname=f"{module.__name__}.{name}", - file_path=os.path.abspath(src_file), - lineno=lineno, - has_parametrize=has_parametrize, - param_count=param_count, - nodeids=nodeids, - ) - ) + # Extract parametrization info from marks + has_parametrize, param_count, param_ids = _extract_param_info_from_marks(obj) + + # Get module name and function name + module_name = ( + item.module.__name__ + if hasattr(item, "module") + else item.nodeid.split("::")[0].replace("/", ".").replace(".py", "") + ) + func_name = item.name.split("[")[0] if "[" in item.name else item.name + + # Generate nodeids + base_nodeid = f"{os.path.basename(src_file)}::{func_name}" + if param_ids: + nodeids = [f"{base_nodeid}[{pid}]" for pid in param_ids] + else: + nodeids = [base_nodeid] + + discovered.append( + DiscoveredTest( + module_path=module_name, + module_name=module_name, + qualname=f"{module_name}.{func_name}", + file_path=os.path.abspath(src_file), + lineno=lineno, + has_parametrize=has_parametrize, + param_count=param_count, + nodeids=nodeids, + ) + ) # Deduplicate by qualname (in case same test appears multiple times) by_qual: dict[str, DiscoveredTest] = {} diff --git a/eval_protocol/evaluation.py b/eval_protocol/evaluation.py index 58e0beea..9e5c1e8f 100644 --- a/eval_protocol/evaluation.py +++ b/eval_protocol/evaluation.py @@ -22,6 +22,8 @@ ) from eval_protocol.typed_interface import EvaluationMode +from eval_protocol.get_pep440_version import get_pep440_version + logger = logging.getLogger(__name__) # Flag to track if the preview API was successfully used @@ -509,6 +511,136 @@ def 
_simulated_preview(self, samples): preview_result.total_runtime_ms = max(1, int((end_time - start_time) * 1000)) return preview_result + def _build_minimal_criteria(self) -> List[Dict[str, str]]: + """Build minimal criteria (name, type, description) without code snippets.""" + + # Remote URL mode + if self.remote_url: + return [ + { + "name": "remote_eval_proxy", + "type": "CODE_SNIPPETS", + "description": f"Proxies evaluation to remote URL: {self.remote_url}", + } + ] + + # TS mode (direct code snippet) + elif self.ts_mode_config: + criterion_name = self.ts_mode_config.get("criterion_name", "default_code_criterion") + description = self.ts_mode_config.get("description", "Python code execution") + return [ + { + "name": criterion_name, + "type": "CODE_SNIPPETS", + "description": description, + } + ] + + # Multi-metrics mode + elif self.multi_metrics: + return [ + { + "name": "eval", + "type": "CODE_SNIPPETS", + "description": self.description or "Multi-metric evaluation", + } + ] + + # Single metric folders + else: + criteria = [] + for metric_name in self.metric_folders: + criteria.append( + { + "name": metric_name, + "type": "CODE_SNIPPETS", + "description": self.description or f"Evaluation metric: {metric_name}", + } + ) + return criteria + + @staticmethod + def _parse_ignore_file(ignore_path: str) -> List[str]: + """Parse .gitignore or .dockerignore and return patterns.""" + patterns = [] + if not os.path.exists(ignore_path): + return patterns + + try: + with open(ignore_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + patterns.append(line) + except Exception: + pass + + return patterns + + @staticmethod + def _ensure_requirements_present(source_dir: str) -> None: + req_path = os.path.join(source_dir, "requirements.txt") + pyproj_path = os.path.join(source_dir, "pyproject.toml") + if not (os.path.isfile(req_path) or os.path.isfile(pyproj_path)): + logger.error("Missing requirements.txt or pyproject.toml in upload 
directory: %s", source_dir) + raise ValueError( + "Upload requires either requirements.txt or pyproject.toml in the project root. " + "Please add one and re-run ep upload." + ) + + @staticmethod + def _should_ignore(path: str, ignore_patterns: List[str]) -> bool: + """Check if path matches any ignore pattern.""" + from pathlib import Path + import fnmatch + + default_ignores = [".git", "__pycache__", "*.pyc", ".venv", "venv", "node_modules", "*.egg-info"] + all_patterns = default_ignores + ignore_patterns + + path_obj = Path(path) + for pattern in all_patterns: + if pattern.endswith("/"): + if path_obj.is_dir() and fnmatch.fnmatch(path_obj.name, pattern.rstrip("/")): + return True + elif fnmatch.fnmatch(path_obj.name, pattern) or fnmatch.fnmatch(str(path_obj), pattern): + return True + + return False + + @staticmethod + def _create_tar_gz_with_ignores(output_path: str, source_dir: str) -> int: + """Create tar.gz of source_dir with parent directory included.""" + import tarfile + from pathlib import Path + + source_path = Path(source_dir) + gitignore_patterns = Evaluator._parse_ignore_file(str(source_path / ".gitignore")) + dockerignore_patterns = Evaluator._parse_ignore_file(str(source_path / ".dockerignore")) + all_ignore_patterns = gitignore_patterns + dockerignore_patterns + + logger.info(f"Creating tar.gz with {len(all_ignore_patterns)} ignore patterns") + + # Get directory name for the archive root + dir_name = os.path.basename(source_dir) + parent_dir = os.path.dirname(source_dir) + + with tarfile.open(output_path, "w:gz") as tar: + for root, dirs, files in os.walk(source_dir): + dirs[:] = [d for d in dirs if not Evaluator._should_ignore(os.path.join(root, d), all_ignore_patterns)] + + for file in files: + file_path = os.path.join(root, file) + if Evaluator._should_ignore(file_path, all_ignore_patterns): + continue + + # Include parent directory in archive path + rel_path = os.path.relpath(file_path, parent_dir) # Relative to parent + tar.add(file_path, 
arcname=rel_path) # Keeps "python-sdk/..." structure + + size_bytes = os.path.getsize(output_path) + logger.info(f"Created {output_path} ({size_bytes:,} bytes)") + return size_bytes + def create(self, evaluator_id, display_name=None, description=None, force=False): if not self.remote_url and not self.ts_mode_config and not self.code_files: raise ValueError("No code files loaded. Load metric folder(s) or provide ts_mode_config/remote_url first.") @@ -528,14 +660,24 @@ def create(self, evaluator_id, display_name=None, description=None, force=False) # Keep multiMetrics/rollupSettings for backward compatibility with tests payload_multi_metrics = True payload_rollup_settings = {"skipRollup": True} + parent = f"accounts/{account_id}" + + version_str = None + + try: + version_str = get_pep440_version() + except Exception: + version_str = None payload_data = { + "parent": parent, "evaluator": { "displayName": self.display_name, "description": self.description, "multiMetrics": payload_multi_metrics, + "commitHash": version_str, # "rewardFunctionMode": self.reward_function_mode, # How input is processed by user func - "criteria": self._construct_criteria(criteria_data={}), + "criteria": self._build_minimal_criteria(), "requirements": "", "rollupSettings": payload_rollup_settings, }, @@ -557,21 +699,27 @@ def create(self, evaluator_id, display_name=None, description=None, force=False) if "dev.api.fireworks.ai" in self.api_base and account_id == "fireworks": account_id = "pyroworks-dev" - base_url = f"{self.api_base}/v1/accounts/{account_id}/evaluators" + base_url = f"{self.api_base}/v1/{parent}/evaluatorsV2" headers = { "Authorization": f"Bearer {auth_token}", "Content-Type": "application/json", } + + self._ensure_requirements_present(os.getcwd()) + logger.info(f"Creating evaluator '{evaluator_id}' for account '{account_id}'...") try: if force: - check_url = f"{base_url}/{evaluator_id}" + base_url_2 = f"{self.api_base}/v1/{parent}/evaluators" + check_url = 
f"{base_url_2}/{evaluator_id}" try: + logger.info(f"check_url: {check_url}") check_response = requests.get(check_url, headers=headers) + if check_response.status_code == 200: logger.info(f"Evaluator '{evaluator_id}' already exists, deleting and recreating...") - delete_url = f"{base_url}/{evaluator_id}" + delete_url = f"{base_url_2}/{evaluator_id}" try: delete_response = requests.delete(delete_url, headers=headers) if delete_response.status_code < 400: @@ -582,18 +730,90 @@ def create(self, evaluator_id, display_name=None, description=None, force=False) ) except Exception as e_del: logger.warning(f"Error deleting evaluator: {str(e_del)}") + logger.info(f"base_url: {base_url_2}, payload_data: {payload_data}") response = requests.post(base_url, json=payload_data, headers=headers) else: + logger.info(f"base_url: {base_url}, payload_data: {payload_data}") response = requests.post(base_url, json=payload_data, headers=headers) except requests.exceptions.RequestException: - response = requests.post(base_url, json=payload_data, headers=headers) + response = requests.post(base_url_2 if force else base_url, json=payload_data, headers=headers) else: + logger.info(f"check_url: {base_url}, payload_data: {payload_data}") response = requests.post(base_url, json=payload_data, headers=headers) response.raise_for_status() result = response.json() logger.info(f"Successfully created evaluator '{evaluator_id}'") - return result + + # Upload code as tar.gz to GCS + evaluator_name = result.get("name") # e.g., "accounts/pyroworks/evaluators/test-123" + + if evaluator_name: + try: + # Create tar.gz of current directory + cwd = os.getcwd() + dir_name = os.path.basename(cwd) + tar_filename = f"{dir_name}.tar.gz" + tar_path = os.path.join(cwd, tar_filename) + + tar_size = self._create_tar_gz_with_ignores(tar_path, cwd) + + # Call GetEvaluatorUploadEndpoint + + upload_endpoint_url = f"{self.api_base}/v1/{evaluator_name}:getUploadEndpoint" + 
upload_payload = {"name": evaluator_name, "filename_to_size": {tar_filename: tar_size}} + + logger.info(f"Requesting upload endpoint for {tar_filename}") + upload_response = requests.post(upload_endpoint_url, json=upload_payload, headers=headers) + + upload_response.raise_for_status() + + signed_urls = upload_response.json().get("filenameToSignedUrls", {}) + signed_url = signed_urls.get(tar_filename) + + if signed_url: + logger.info(f"Uploading {tar_filename} to GCS...") + + file_size = os.path.getsize(tar_path) + + with open(tar_path, "rb") as f: + # Create request exactly like Golang + req = requests.Request( + "PUT", + signed_url, + data=f, + headers={ + "Content-Type": "application/octet-stream", + "X-Goog-Content-Length-Range": f"{file_size},{file_size}", + }, + ) + prepared = req.prepare() + + # Don't let requests add extra headers + session = requests.Session() + gcs_response = session.send(prepared, timeout=600) + gcs_response.raise_for_status() + + logger.info(f"Successfully uploaded {tar_filename}") + + # Step 3: Validate upload + validate_url = f"{self.api_base}/v1/{evaluator_name}:validateUpload" + validate_payload = {"name": evaluator_name} + validate_response = requests.post(validate_url, json=validate_payload, headers=headers) + validate_response.raise_for_status() + + logger.info("Upload validated successfully") + + # Clean up tar file + if os.path.exists(tar_path): + os.remove(tar_path) + + except Exception as upload_error: + logger.warning(f"Code upload failed (evaluator created but code not uploaded): {upload_error}") + # Don't fail - evaluator is created, just code upload failed + + # return result # OLD: Direct return + return result # Return after attempting upload except Exception as e: logger.error(f"Error creating evaluator: {str(e)}") if isinstance(e, requests.exceptions.HTTPError) and hasattr(e, "response"):