diff --git a/eval_protocol/cli_commands/upload.py b/eval_protocol/cli_commands/upload.py index 8683c872..d696e664 100644 --- a/eval_protocol/cli_commands/upload.py +++ b/eval_protocol/cli_commands/upload.py @@ -81,6 +81,12 @@ def _is_eval_protocol_test(obj: Any) -> bool: return False # Must have pytest marks from evaluation_test marks = getattr(obj, "pytestmark", []) + # Handle pytest proxy objects (APIRemovedInV1Proxy) + if not isinstance(marks, (list, tuple)): + try: + marks = list(marks) if marks else [] + except (TypeError, AttributeError): + return False return len(marks) > 0 @@ -91,6 +97,14 @@ def _extract_param_info_from_marks(obj: Any) -> tuple[bool, int, list[str]]: (has_parametrize, param_count, param_ids) """ marks = getattr(obj, "pytestmark", []) + + # Handle pytest proxy objects (APIRemovedInV1Proxy) - same as _is_eval_protocol_test + if not isinstance(marks, (list, tuple)): + try: + marks = list(marks) if marks else [] + except (TypeError, AttributeError): + marks = [] + has_parametrize = False total_combinations = 0 all_param_ids: list[str] = [] @@ -131,51 +145,103 @@ def _discover_tests(root: str) -> list[DiscoveredTest]: discovered: list[DiscoveredTest] = [] - # Collect all test functions from Python files - for file_path in _iter_python_files(root): + class CollectionPlugin: + """Plugin to capture collected items without running code.""" + + def __init__(self): + self.items = [] + + def pytest_ignore_collect(self, collection_path, config): + """Ignore problematic files before pytest tries to import them.""" + # Ignore specific files + ignored_files = ["setup.py", "versioneer.py", "conf.py", "__main__.py"] + if collection_path.name in ignored_files: + return True + + # Ignore hidden files (starting with .) + if collection_path.name.startswith("."): + return True + + # Ignore test_discovery files + if collection_path.name.startswith("test_discovery"): + return True + + return None + + def pytest_collection_modifyitems(self, items): + """Hook called after collection is done.""" + self.items = items + + plugin = CollectionPlugin() + + # Run pytest collection only (--collect-only prevents code execution) + # Override python_files to collect from ANY .py file + args = [ + abs_root, + "--collect-only", + "-q", + "--pythonwarnings=ignore", + "-o", + "python_files=*.py", # Override to collect all .py files + ] + + try: + # Suppress pytest output + import io + import contextlib + + with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()): + pytest.main(args, plugins=[plugin]) + except Exception: + # If pytest collection fails, fall back to empty list + return [] + + # Process collected items + for item in plugin.items: + if not hasattr(item, "obj"): + continue + + obj = item.obj + if not _is_eval_protocol_test(obj): + continue + + origin = getattr(obj, "_origin_func", obj) try: - unique_name = "ep_upload_" + re.sub(r"[^a-zA-Z0-9_]", "_", os.path.abspath(file_path)) - spec = importlib.util.spec_from_file_location(unique_name, file_path) - if spec and spec.loader: - module = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = module - spec.loader.exec_module(module) # type: ignore[attr-defined] - else: - continue + src_file = inspect.getsourcefile(origin) or str(item.path) + _, lineno = inspect.getsourcelines(origin) except Exception: - continue + src_file, lineno = str(item.path), None - for name, obj in inspect.getmembers(module): - if _is_eval_protocol_test(obj): - origin = getattr(obj, "_origin_func", obj) - try: - src_file = 
inspect.getsourcefile(origin) or file_path - _, lineno = inspect.getsourcelines(origin) - except Exception: - src_file, lineno = file_path, None - - # Extract parametrization info from marks - has_parametrize, param_count, param_ids = _extract_param_info_from_marks(obj) - - # Generate synthetic nodeids for display - base_nodeid = f"{os.path.basename(file_path)}::{name}" - if has_parametrize and param_ids: - nodeids = [f"{base_nodeid}[{pid}]" for pid in param_ids] - else: - nodeids = [base_nodeid] - - discovered.append( - DiscoveredTest( - module_path=module.__name__, - module_name=module.__name__, - qualname=f"{module.__name__}.{name}", - file_path=os.path.abspath(src_file), - lineno=lineno, - has_parametrize=has_parametrize, - param_count=param_count, - nodeids=nodeids, - ) - ) + # Extract parametrization info from marks + has_parametrize, param_count, param_ids = _extract_param_info_from_marks(obj) + + # Get module name and function name + module_name = ( + item.module.__name__ + if hasattr(item, "module") + else item.nodeid.split("::")[0].replace("/", ".").replace(".py", "") + ) + func_name = item.name.split("[")[0] if "[" in item.name else item.name + + # Generate nodeids + base_nodeid = f"{os.path.basename(src_file)}::{func_name}" + if param_ids: + nodeids = [f"{base_nodeid}[{pid}]" for pid in param_ids] + else: + nodeids = [base_nodeid] + + discovered.append( + DiscoveredTest( + module_path=module_name, + module_name=module_name, + qualname=f"{module_name}.{func_name}", + file_path=os.path.abspath(src_file), + lineno=lineno, + has_parametrize=has_parametrize, + param_count=param_count, + nodeids=nodeids, + ) + ) # Deduplicate by qualname (in case same test appears multiple times) by_qual: dict[str, DiscoveredTest] = {} diff --git a/eval_protocol/evaluation.py b/eval_protocol/evaluation.py index 58e0beea..c2a41a60 100644 --- a/eval_protocol/evaluation.py +++ b/eval_protocol/evaluation.py @@ -22,6 +22,8 @@ ) from eval_protocol.typed_interface import EvaluationMode +from eval_protocol.get_pep440_version import get_pep440_version + logger = logging.getLogger(__name__) # Flag to track if the preview API was successfully used @@ -386,7 +388,7 @@ def preview(self, sample_file, max_samples=5): } api_base = os.environ.get("FIREWORKS_API_BASE", "https://api.fireworks.ai") - print("show payload", payload) + if "dev.api.fireworks.ai" in api_base and account_id == "fireworks": account_id = "pyroworks-dev" @@ -509,6 +511,135 @@ def _simulated_preview(self, samples): preview_result.total_runtime_ms = max(1, int((end_time - start_time) * 1000)) return preview_result + def _build_minimal_criteria(self) -> List[Dict[str, str]]: + """Build minimal criteria (name, type, description) without code snippets.""" + + # Remote URL mode + if self.remote_url: + return [ + { + "name": "remote_eval_proxy", + "type": "CODE_SNIPPETS", + "description": f"Proxies evaluation to remote URL: {self.remote_url}", + } + ] + + # TS mode (direct code snippet) + elif self.ts_mode_config: + criterion_name = self.ts_mode_config.get("criterion_name", "default_code_criterion") + description = self.ts_mode_config.get("description", "Python code execution") + return [ + { + "name": criterion_name, + "type": "CODE_SNIPPETS", + "description": description, + } + ] + + # Multi-metrics mode + elif self.multi_metrics: + return [ + { + "name": "eval", + "type": "CODE_SNIPPETS", + "description": self.description or "Multi-metric evaluation", + } + ] + + # Single metric folders + else: + criteria = [] + for metric_name in 
self.metric_folders: + criteria.append( + { + "name": metric_name, + "type": "CODE_SNIPPETS", + "description": self.description or f"Evaluation metric: {metric_name}", + } + ) + return criteria + + @staticmethod + def _parse_ignore_file(ignore_path: str) -> List[str]: + """Parse .gitignore or .dockerignore and return patterns.""" + patterns = [] + if not os.path.exists(ignore_path): + return patterns + + try: + with open(ignore_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + patterns.append(line) + except Exception: + pass + + return patterns + + @staticmethod + def _ensure_requirements_present(source_dir: str) -> None: + req_path = os.path.join(source_dir, "requirements.txt") + if not os.path.isfile(req_path): + logger.error("Missing requirements.txt in upload directory: %s", source_dir) + raise ValueError( + "Upload requires requirements.txt in the project root. " + "Please add requirements.txt and re-run ep upload." + ) + + @staticmethod + def _should_ignore(path: str, ignore_patterns: List[str]) -> bool: + """Check if path matches any ignore pattern.""" + from pathlib import Path + import fnmatch + + default_ignores = [".git", "__pycache__", "*.pyc", ".venv", "venv", "node_modules", "*.egg-info"] + all_patterns = default_ignores + ignore_patterns + + path_obj = Path(path) + for pattern in all_patterns: + if pattern.endswith("/"): + if path_obj.is_dir() and fnmatch.fnmatch(path_obj.name, pattern.rstrip("/")): + return True + elif fnmatch.fnmatch(path_obj.name, pattern) or fnmatch.fnmatch(str(path_obj), pattern): + return True + + return False + + @staticmethod + def _create_tar_gz_with_ignores(output_path: str, source_dir: str) -> int: + """Create tar.gz of source_dir with parent directory included.""" + import tarfile + from pathlib import Path + + source_path = Path(source_dir) + gitignore_patterns = Evaluator._parse_ignore_file(str(source_path / ".gitignore")) + dockerignore_patterns = Evaluator._parse_ignore_file(str(source_path / ".dockerignore")) + all_ignore_patterns = gitignore_patterns + dockerignore_patterns + + logger.info(f"Creating tar.gz with {len(all_ignore_patterns)} ignore patterns") + + # Get directory name for the archive root + dir_name = os.path.basename(source_dir) + parent_dir = os.path.dirname(source_dir) + + with tarfile.open(output_path, "w:gz") as tar: + for root, dirs, files in os.walk(source_dir): + dirs[:] = [d for d in dirs if not Evaluator._should_ignore(os.path.join(root, d), all_ignore_patterns)] + + for file in files: + file_path = os.path.join(root, file) + if Evaluator._should_ignore(file_path, all_ignore_patterns): + continue + + # Include parent directory in archive path + rel_path = os.path.relpath(file_path, parent_dir) # Relative to parent + tar.add(file_path, arcname=rel_path) # Keeps "python-sdk/..." structure + + size_bytes = os.path.getsize(output_path) + logger.info(f"Created {output_path} ({size_bytes:,} bytes)") + return size_bytes + def create(self, evaluator_id, display_name=None, description=None, force=False): if not self.remote_url and not self.ts_mode_config and not self.code_files: raise ValueError("No code files loaded. 
Load metric folder(s) or provide ts_mode_config/remote_url first.") @@ -528,14 +659,21 @@ def create(self, evaluator_id, display_name=None, description=None, force=False) # Keep multiMetrics/rollupSettings for backward compatibility with tests payload_multi_metrics = True payload_rollup_settings = {"skipRollup": True} + parent = f"accounts/{account_id}" + + try: + version_str = get_pep440_version() + except Exception: + version_str = None payload_data = { + "parent": parent, "evaluator": { "displayName": self.display_name, "description": self.description, "multiMetrics": payload_multi_metrics, - # "rewardFunctionMode": self.reward_function_mode, # How input is processed by user func - "criteria": self._construct_criteria(criteria_data={}), + "commitHash": version_str, + "criteria": self._build_minimal_criteria(), "requirements": "", "rollupSettings": payload_rollup_settings, }, @@ -557,21 +695,26 @@ def create(self, evaluator_id, display_name=None, description=None, force=False) if "dev.api.fireworks.ai" in self.api_base and account_id == "fireworks": account_id = "pyroworks-dev" - base_url = f"{self.api_base}/v1/accounts/{account_id}/evaluators" + base_url = f"{self.api_base}/v1/{parent}/evaluatorsV2" headers = { "Authorization": f"Bearer {auth_token}", "Content-Type": "application/json", } + + self._ensure_requirements_present(os.getcwd()) + logger.info(f"Creating evaluator '{evaluator_id}' for account '{account_id}'...") try: if force: - check_url = f"{base_url}/{evaluator_id}" + check_url = f"{self.api_base}/v1/{parent}/evaluators/{evaluator_id}" try: + logger.info(f"Checking if evaluator exists: {check_url}") check_response = requests.get(check_url, headers=headers) + if check_response.status_code == 200: logger.info(f"Evaluator '{evaluator_id}' already exists, deleting and recreating...") - delete_url = f"{base_url}/{evaluator_id}" + delete_url = f"{self.api_base}/v1/{parent}/evaluators/{evaluator_id}" try: delete_response = requests.delete(delete_url, headers=headers) if delete_response.status_code < 400: @@ -588,12 +731,134 @@ def create(self, evaluator_id, display_name=None, description=None, force=False) except requests.exceptions.RequestException: response = requests.post(base_url, json=payload_data, headers=headers) else: + logger.info(f"Creating evaluator at: {base_url}") response = requests.post(base_url, json=payload_data, headers=headers) response.raise_for_status() result = response.json() logger.info(f"Successfully created evaluator '{evaluator_id}'") - return result + + # Upload code as tar.gz to GCS + evaluator_name = result.get("name") # e.g., "accounts/pyroworks/evaluators/test-123" + + if not evaluator_name: + raise ValueError( + "Create evaluator response missing 'name' field. " + f"Cannot proceed with code upload. 
Response: {result}" + ) + + try: + # Create tar.gz of current directory + cwd = os.getcwd() + dir_name = os.path.basename(cwd) + tar_filename = f"{dir_name}.tar.gz" + tar_path = os.path.join(cwd, tar_filename) + + tar_size = self._create_tar_gz_with_ignores(tar_path, cwd) + + # Call GetEvaluatorUploadEndpoint + upload_endpoint_url = f"{self.api_base}/v1/{evaluator_name}:getUploadEndpoint" + upload_payload = {"name": evaluator_name, "filename_to_size": {tar_filename: tar_size}} + + logger.info(f"Requesting upload endpoint for {tar_filename}") + upload_response = requests.post(upload_endpoint_url, json=upload_payload, headers=headers) + upload_response.raise_for_status() + + # Check for signed URLs + upload_response_data = upload_response.json() + signed_urls = upload_response_data.get("filenameToSignedUrls", {}) + + if not signed_urls: + raise ValueError(f"GetUploadEndpoint returned no signed URLs. Response: {upload_response_data}") + + signed_url = signed_urls.get(tar_filename) + + if not signed_url: + raise ValueError( + f"No signed URL received for {tar_filename}. Available files: {list(signed_urls.keys())}" + ) + + # Upload to GCS + logger.info(f"Uploading {tar_filename} to GCS...") + + file_size = os.path.getsize(tar_path) + + # Retry configuration + max_retries = 3 + retry_delay = 2 # seconds + + for attempt in range(max_retries): + try: + with open(tar_path, "rb") as f: + # Create request exactly like Golang + req = requests.Request( + "PUT", + signed_url, + data=f, + headers={ + "Content-Type": "application/octet-stream", + "X-Goog-Content-Length-Range": f"{file_size},{file_size}", + }, + ) + prepared = req.prepare() + + # Don't let requests add extra headers + session = requests.Session() + gcs_response = session.send(prepared, timeout=600) + gcs_response.raise_for_status() + + logger.info(f"Successfully uploaded {tar_filename}") + break # Success, exit retry loop + + except (requests.exceptions.RequestException, IOError) as e: + if attempt < max_retries - 1: + # Check if it's a retryable error + is_retryable = False + if isinstance(e, requests.exceptions.RequestException): + if hasattr(e, "response") and e.response is not None: + # Retry on 5xx errors or 408 (timeout) + is_retryable = e.response.status_code >= 500 or e.response.status_code == 408 + else: + # Network errors (no response) are retryable + is_retryable = True + else: + # IOError is retryable + is_retryable = True + + if is_retryable: + wait_time = retry_delay * (2**attempt) # Exponential backoff + logger.warning( + f"Upload attempt {attempt + 1}/{max_retries} failed: {e}. " + f"Retrying in {wait_time}s..." 
+ ) + time.sleep(wait_time) + else: + # Non-retryable error, raise immediately + raise + else: + # Last attempt failed + logger.error(f"Upload failed after {max_retries} attempts") + raise + + # Step 3: Validate upload + validate_url = f"{self.api_base}/v1/{evaluator_name}:validateUpload" + validate_payload = {"name": evaluator_name} + validate_response = requests.post(validate_url, json=validate_payload, headers=headers) + validate_response.raise_for_status() + + validate_data = validate_response.json() + + logger.info("Upload validated successfully") + + # Clean up tar file + if os.path.exists(tar_path): + os.remove(tar_path) + + except Exception as upload_error: + logger.warning(f"Code upload failed (evaluator created but code not uploaded): {upload_error}") + # Don't fail - evaluator is created, just code upload failed + + return result # Return after attempting upload except Exception as e: logger.error(f"Error creating evaluator: {str(e)}") if isinstance(e, requests.exceptions.HTTPError) and hasattr(e, "response"): diff --git a/tests/test_ep_upload_e2e.py b/tests/test_ep_upload_e2e.py new file mode 100644 index 00000000..f7986e5d --- /dev/null +++ b/tests/test_ep_upload_e2e.py @@ -0,0 +1,646 @@ +""" +End-to-end tests for ep upload command. + +Tests the complete upload workflow: +1. Discovery of @evaluation_test decorated functions +2. Upload command execution +3. API calls (create, getUploadEndpoint, validateUpload) +4. Tar.gz creation and GCS upload +""" + +import argparse +import json +import os +import shutil +import sys +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + + +def create_test_project_with_evaluation_test(test_content: str, filename: str = "test_eval.py"): + """ + Helper to create a proper test project structure for pytest discovery. 
+ + Creates: + project_dir/ + requirements.txt + {filename} <- test_content goes here (at root, not in subdirectory) + + Returns: + tuple: (project_dir, test_file_path) + """ + test_project_dir = tempfile.mkdtemp() + + # Put test file at root (not in subdirectory) to avoid import issues + test_file_path = Path(test_project_dir) / filename + test_file_path.write_text(test_content) + + # Create requirements.txt (required for upload) + (Path(test_project_dir) / "requirements.txt").write_text("eval-protocol>=0.1.0\n") + + # Add to sys.path for imports + if test_project_dir not in sys.path: + sys.path.insert(0, test_project_dir) + + return test_project_dir, test_file_path + + +@pytest.fixture +def mock_env_variables(monkeypatch): + """Set up test environment variables""" + monkeypatch.setenv("FIREWORKS_API_KEY", "test_api_key") + monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "test_account") + monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai") + + +@pytest.fixture +def mock_requests_get(): + """Mock requests.get for force flow check""" + with patch("requests.get") as mock_get: + mock_get.return_value.status_code = 404 # Evaluator doesn't exist + mock_get.return_value.raise_for_status = MagicMock() + yield mock_get + + +@pytest.fixture +def mock_requests_delete(): + """Mock requests.delete for force flow""" + with patch("requests.delete") as mock_delete: + mock_delete.return_value.status_code = 200 + mock_delete.return_value.raise_for_status = MagicMock() + yield mock_delete + + +@pytest.fixture +def mock_gcs_upload(): + """Mock the GCS upload via requests.Session""" + with patch("requests.Session") as mock_session_class: + mock_session = MagicMock() + mock_session_class.return_value = mock_session + + # Mock successful GCS upload + mock_gcs_response = MagicMock() + mock_gcs_response.status_code = 200 + mock_gcs_response.raise_for_status = MagicMock() + mock_session.send.return_value = mock_gcs_response + + yield mock_session + + +@pytest.fixture +def mock_requests_post(): + """Mock requests.post for all API endpoints""" + with patch("requests.post") as mock_post: + validate_response = {"success": True, "valid": True} + create_response = { + "name": "accounts/test_account/evaluators/test-eval", + "displayName": "Test Evaluator", + "description": "Test description", + } + + def side_effect(*args, **kwargs): + url = args[0] + payload = kwargs.get("json", {}) + response = mock_post.return_value + + if "getUploadEndpoint" in url: + # Dynamically create signed URLs for whatever filenames are requested + filename_to_size = payload.get("filename_to_size", {}) + signed_urls = {} + for filename in filename_to_size.keys(): + signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" + response.json.return_value = {"filenameToSignedUrls": signed_urls} + elif "validateUpload" in url: + response.json.return_value = validate_response + else: + # Create evaluator endpoint + response.json.return_value = create_response + + response.status_code = 200 + return response + + mock_post.side_effect = side_effect + mock_post.return_value.status_code = 200 + mock_post.return_value.raise_for_status = MagicMock() + yield mock_post + + +def test_ep_upload_discovers_and_uploads_evaluation_test( + mock_env_variables, mock_requests_post, mock_requests_get, mock_gcs_upload, monkeypatch +): + """ + Test the complete ep upload flow: + - Create a test file with @evaluation_test + - Discover it using _discover_tests + - Upload via upload_command + - Verify all API calls + """ + from 
eval_protocol.cli_commands.upload import upload_command, _discover_tests + + # 1. CREATE TEST PROJECT STRUCTURE + test_content = """ +from typing import List +from eval_protocol.models import EvaluationRow, Message, EvaluateResult +from eval_protocol.pytest import evaluation_test + +@evaluation_test( + input_rows=[[ + EvaluationRow(messages=[Message(role="user", content="Hello")]), + EvaluationRow(messages=[Message(role="user", content="Test message")]), + ]], + mode="pointwise" +) +async def test_simple_evaluation(row: EvaluationRow) -> EvaluationRow: + '''Simple test evaluator''' + content = row.messages[-1].content if row.messages else "" + word_count = len(content.split()) + score = min(word_count / 10.0, 1.0) + + row.evaluation_result = EvaluateResult( + score=score, + reason=f"Words: {word_count}", + metrics={"words": {"score": score, "is_score_valid": True}} + ) + return row +""" + + test_project_dir, test_file_path = create_test_project_with_evaluation_test(test_content, "test_simple_eval.py") + + # Save current directory + original_cwd = os.getcwd() + + try: + # Change to test project directory - all operations happen from here + os.chdir(test_project_dir) + + # 2. TEST DISCOVERY + discovered_tests = _discover_tests(test_project_dir) + + # Verify discovery + assert len(discovered_tests) == 1, f"Expected 1 test, found {len(discovered_tests)}" + discovered_test = discovered_tests[0] + assert "test_simple_evaluation" in discovered_test.qualname + assert str(test_file_path) in discovered_test.file_path + # input_rows automatically creates parametrization, so has_parametrize is True + assert discovered_test.has_parametrize is True + + # 3. RUN EP UPLOAD COMMAND + args = argparse.Namespace( + path=test_project_dir, + entry=None, # Discover all tests + id="test-simple-eval", # Explicit ID + display_name="Simple Word Count Eval", + description="E2E test evaluator", + force=False, + yes=True, # Non-interactive + ) + + # Mock the selection (auto-select the discovered test) + with patch("eval_protocol.cli_commands.upload._prompt_select") as mock_select: + mock_select.return_value = discovered_tests + + # Execute upload command + exit_code = upload_command(args) + + # 4. VERIFY SUCCESS + assert exit_code == 0, "Upload command should return 0 (success)" + + # 5. VERIFY ALL API CALLS IN UPLOAD FLOW + post_calls = [call[0][0] for call in mock_requests_post.call_args_list] + + # Step 1: Create evaluator (V2 endpoint) + create_calls = [url for url in post_calls if "evaluatorsV2" in url] + assert len(create_calls) >= 1, "Should call V2 create endpoint" + + # Step 2: Get upload endpoint + upload_endpoint_calls = [url for url in post_calls if "getUploadEndpoint" in url] + assert len(upload_endpoint_calls) >= 1, "Should call getUploadEndpoint" + + # Step 3: Validate upload + validate_calls = [url for url in post_calls if "validateUpload" in url] + assert len(validate_calls) >= 1, "Should call validateUpload" + + # Step 4: GCS upload + assert mock_gcs_upload.send.called, "Should upload tar.gz to GCS" + gcs_request = mock_gcs_upload.send.call_args[0][0] + assert gcs_request.method == "PUT", "GCS upload should use PUT" + assert "storage.googleapis.com" in gcs_request.url, "Should upload to GCS" + + # 6. 
VERIFY CREATE PAYLOAD STRUCTURE + create_payload = None + for call in mock_requests_post.call_args_list: + url = call[0][0] + if "evaluatorsV2" in url: + create_payload = call[1].get("json") + break + + assert create_payload is not None + assert "evaluator" in create_payload + assert create_payload["evaluatorId"] == "test-simple-eval" + + evaluator_data = create_payload["evaluator"] + assert evaluator_data["displayName"] == "Simple Word Count Eval" + assert evaluator_data["description"] == "E2E test evaluator" + + # Verify entry point is included + assert "entryPoint" in evaluator_data, "Should include entry point" + entry_point = evaluator_data["entryPoint"] + assert "test_simple_eval.py::test_simple_evaluation" in entry_point + + # Verify criteria structure (minimal, no embedded code) + criteria = evaluator_data["criteria"] + assert len(criteria) > 0 + assert criteria[0]["type"] == "CODE_SNIPPETS" + # Code is uploaded as tar.gz, not embedded in criteria + + finally: + # Restore original directory + os.chdir(original_cwd) + + # Cleanup + if test_project_dir in sys.path: + sys.path.remove(test_project_dir) + shutil.rmtree(test_project_dir, ignore_errors=True) + + +def test_ep_upload_with_parametrized_test( + mock_env_variables, + mock_requests_post, + mock_requests_get, + mock_gcs_upload, +): + """ + Test ep upload with a parametrized @evaluation_test + Verifies that parametrized tests are discovered and uploaded as single evaluator + """ + from eval_protocol.cli_commands.upload import upload_command, _discover_tests + + test_content = """ +import pytest +from typing import List +from eval_protocol.models import EvaluationRow, Message, EvaluateResult +from eval_protocol.pytest import evaluation_test + +@pytest.mark.parametrize("completion_params", [ + {"model": "model-a", "temperature": 0.0}, + {"model": "model-b", "temperature": 0.5}, +]) +@evaluation_test( + input_rows=[[EvaluationRow(messages=[Message(role="user", content="Test")])]], + mode="pointwise" +) +async def test_multi_model_eval(row: EvaluationRow) -> EvaluationRow: + row.evaluation_result = EvaluateResult(score=1.0, reason="Pass") + return row +""" + + test_project_dir, test_file_path = create_test_project_with_evaluation_test(test_content, "test_parametrized.py") + + original_cwd = os.getcwd() + + try: + os.chdir(test_project_dir) + + # Discovery should find it as 1 test (with 2 variants) + discovered_tests = _discover_tests(test_project_dir) + + assert len(discovered_tests) == 1 + discovered_test = discovered_tests[0] + assert "test_multi_model_eval" in discovered_test.qualname + assert discovered_test.has_parametrize is True + assert discovered_test.param_count == 2 + + # Upload should work for parametrized tests + args = argparse.Namespace( + path=test_project_dir, + entry=None, + id="test-param-eval", + display_name="Parametrized Eval", + description="Test parametrized evaluator", + force=False, + yes=True, + ) + + with patch("eval_protocol.cli_commands.upload._prompt_select") as mock_select: + mock_select.return_value = discovered_tests + exit_code = upload_command(args) + + assert exit_code == 0 + + # Verify upload flow completed + post_calls = [call[0][0] for call in mock_requests_post.call_args_list] + assert any("evaluatorsV2" in url for url in post_calls) + assert any("getUploadEndpoint" in url for url in post_calls) + assert any("validateUpload" in url for url in post_calls) + assert mock_gcs_upload.send.called + + finally: + os.chdir(original_cwd) + if test_project_dir in sys.path: + 
sys.path.remove(test_project_dir) + shutil.rmtree(test_project_dir, ignore_errors=True) + + +def test_ep_upload_discovery_skips_problematic_files(mock_env_variables): + """ + Test that discovery properly skips files like setup.py, versioneer.py + that would cause issues during pytest collection + """ + from eval_protocol.cli_commands.upload import _discover_tests + + test_content = """ +from eval_protocol.pytest import evaluation_test +from eval_protocol.models import EvaluationRow + +@evaluation_test(input_rows=[[EvaluationRow()]]) +async def test_good_eval(row: EvaluationRow) -> EvaluationRow: + return row +""" + + test_project_dir, test_file_path = create_test_project_with_evaluation_test(test_content, "test_good.py") + + original_cwd = os.getcwd() + + try: + os.chdir(test_project_dir) + + # Create problematic files that should be ignored + setup_py = Path(test_project_dir) / "setup.py" + setup_py.write_text(""" +from setuptools import setup +setup(name='test') +""") + + versioneer_py = Path(test_project_dir) / "versioneer.py" + versioneer_py.write_text("# versioneer content") + + # Discovery should find only the good test + discovered_tests = _discover_tests(test_project_dir) + + assert len(discovered_tests) == 1 + assert "test_good_eval" in discovered_tests[0].qualname + assert "setup.py" not in discovered_tests[0].file_path + assert "versioneer.py" not in discovered_tests[0].file_path + + finally: + os.chdir(original_cwd) + if test_project_dir in sys.path: + sys.path.remove(test_project_dir) + shutil.rmtree(test_project_dir, ignore_errors=True) + + +def test_ep_upload_discovers_non_test_prefixed_files(mock_env_variables): + """ + Test that discovery finds @evaluation_test in files like quickstart.py + (files that don't start with 'test_') + """ + from eval_protocol.cli_commands.upload import _discover_tests + + test_content = """ +from eval_protocol.pytest import evaluation_test +from eval_protocol.models import EvaluationRow + +@evaluation_test(input_rows=[[EvaluationRow()]]) +async def test_quickstart_eval(row: EvaluationRow) -> EvaluationRow: + return row +""" + + test_project_dir, test_file_path = create_test_project_with_evaluation_test( + test_content, + "quickstart.py", # Non test_* filename + ) + + original_cwd = os.getcwd() + + try: + os.chdir(test_project_dir) + + # Discovery should find it + discovered_tests = _discover_tests(test_project_dir) + + assert len(discovered_tests) == 1 + assert "test_quickstart_eval" in discovered_tests[0].qualname + assert "quickstart.py" in discovered_tests[0].file_path + + finally: + os.chdir(original_cwd) + if test_project_dir in sys.path: + sys.path.remove(test_project_dir) + shutil.rmtree(test_project_dir, ignore_errors=True) + + +def test_ep_upload_complete_workflow_with_entry_point_validation( + mock_env_variables, + mock_requests_post, + mock_requests_get, + mock_gcs_upload, +): + """ + Complete workflow test validating: + - Test file discovery + - Entry point generation + - Upload command execution + - Full 5-step upload flow + - Payload structure + """ + from eval_protocol.cli_commands.upload import upload_command, _discover_tests + + test_content = """ +from typing import List +from eval_protocol.models import EvaluationRow, Message, EvaluateResult +from eval_protocol.pytest import evaluation_test + +@evaluation_test( + input_rows=[[ + EvaluationRow( + messages=[Message(role="user", content="What is 2+2?")], + ground_truth="4" + ) + ]], + mode="pointwise" +) +async def test_math_correctness(row: EvaluationRow) -> EvaluationRow: + 
'''Evaluates math responses''' + response = row.messages[-1].content if len(row.messages) > 1 else "" + ground_truth = row.ground_truth or "" + + score = 1.0 if response.strip() == ground_truth.strip() else 0.0 + + row.evaluation_result = EvaluateResult( + score=score, + reason="Match" if score == 1.0 else "Mismatch", + metrics={"correctness": {"score": score, "is_score_valid": True}} + ) + return row +""" + + test_project_dir, test_file_path = create_test_project_with_evaluation_test(test_content, "test_math_eval.py") + + original_cwd = os.getcwd() + + try: + os.chdir(test_project_dir) + + # 1. TEST DISCOVERY + discovered_tests = _discover_tests(test_project_dir) + + assert len(discovered_tests) == 1 + test = discovered_tests[0] + assert "test_math_correctness" in test.qualname + assert test.lineno is not None + + # 2. RUN UPLOAD COMMAND + args = argparse.Namespace( + path=test_project_dir, + entry=None, + id=None, # Auto-generate from test name + display_name=None, # Auto-generate + description=None, # Auto-generate + force=False, + yes=True, + ) + + with patch("eval_protocol.cli_commands.upload._prompt_select") as mock_select: + mock_select.return_value = discovered_tests + exit_code = upload_command(args) + + assert exit_code == 0 + + # 3. VERIFY 5-STEP UPLOAD FLOW + post_calls = [call[0][0] for call in mock_requests_post.call_args_list] + + # Step 1: Create evaluator + assert any("evaluatorsV2" in url for url in post_calls), "Missing create call" + + # Step 2: Get upload endpoint + assert any("getUploadEndpoint" in url for url in post_calls), "Missing getUploadEndpoint call" + + # Step 3: Upload to GCS + assert mock_gcs_upload.send.called, "Missing GCS upload" + gcs_request = mock_gcs_upload.send.call_args[0][0] + assert gcs_request.method == "PUT" + assert "storage.googleapis.com" in gcs_request.url + + # Step 4: Validate + assert any("validateUpload" in url for url in post_calls), "Missing validateUpload call" + + # 4. VERIFY PAYLOAD DETAILS + create_payload = None + for call in mock_requests_post.call_args_list: + url = call[0][0] + if "evaluatorsV2" in url: + create_payload = call[1].get("json") + break + + assert create_payload is not None + + # Verify evaluator ID auto-generated from filename + test name + evaluator_id = create_payload["evaluatorId"] + assert "test-math-eval" in evaluator_id or "math-correctness" in evaluator_id + + # Verify entry point is path-based (not module-based) + evaluator_data = create_payload["evaluator"] + assert "entryPoint" in evaluator_data, "Should include entry point" + entry_point = evaluator_data["entryPoint"] + assert "test_math_eval.py::test_math_correctness" in entry_point + + # Verify criteria is minimal + criteria = evaluator_data["criteria"] + assert len(criteria) > 0 + assert criteria[0]["type"] == "CODE_SNIPPETS" + # Code is in tar.gz, not in payload + + # 5. 
VERIFY TAR.GZ WAS CREATED AND UPLOADED + # Check getUploadEndpoint call payload + upload_endpoint_payload = None + for call in mock_requests_post.call_args_list: + url = call[0][0] + if "getUploadEndpoint" in url: + upload_endpoint_payload = call[1].get("json") + break + + assert upload_endpoint_payload is not None + assert "filename_to_size" in upload_endpoint_payload + # Tar filename is dynamic (based on directory name) + tar_files = list(upload_endpoint_payload["filename_to_size"].keys()) + assert len(tar_files) == 1, "Should have exactly one tar file" + tar_filename = tar_files[0] + assert tar_filename.endswith(".tar.gz"), "Should be a tar.gz file" + tar_size = upload_endpoint_payload["filename_to_size"][tar_filename] + assert tar_size > 0, "Tar file should have non-zero size" + + finally: + os.chdir(original_cwd) + if test_project_dir in sys.path: + sys.path.remove(test_project_dir) + shutil.rmtree(test_project_dir, ignore_errors=True) + + +def test_ep_upload_force_flag_triggers_delete_flow( + mock_env_variables, + mock_requests_post, + mock_gcs_upload, +): + """ + Test that --force flag triggers the check/delete/recreate flow + """ + from eval_protocol.cli_commands.upload import upload_command, _discover_tests + + test_content = """ +from eval_protocol.pytest import evaluation_test +from eval_protocol.models import EvaluationRow + +@evaluation_test(input_rows=[[EvaluationRow()]]) +async def test_force_eval(row: EvaluationRow) -> EvaluationRow: + return row +""" + + test_project_dir, test_file_path = create_test_project_with_evaluation_test(test_content, "test_force.py") + + original_cwd = os.getcwd() + + try: + os.chdir(test_project_dir) + + # Mock requests.get to return 200 (evaluator exists) + with patch("requests.get") as mock_get: + mock_get.return_value.status_code = 200 + mock_get.return_value.raise_for_status = MagicMock() + + # Mock requests.delete + with patch("requests.delete") as mock_delete: + mock_delete.return_value.status_code = 200 + mock_delete.return_value.raise_for_status = MagicMock() + + discovered_tests = _discover_tests(test_project_dir) + + args = argparse.Namespace( + path=test_project_dir, + entry=None, + id="test-force", + display_name=None, + description=None, + force=True, # Force flag enabled + yes=True, + ) + + with patch("eval_protocol.cli_commands.upload._prompt_select") as mock_select: + mock_select.return_value = discovered_tests + exit_code = upload_command(args) + + assert exit_code == 0 + + # Verify check happened + assert mock_get.called, "Should check if evaluator exists" + + # Verify delete happened (since mock_get returned 200) + assert mock_delete.called, "Should delete existing evaluator" + + finally: + os.chdir(original_cwd) + if test_project_dir in sys.path: + sys.path.remove(test_project_dir) + shutil.rmtree(test_project_dir, ignore_errors=True) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 2c623a4c..7c4ab808 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -1,5 +1,6 @@ import json import os +import shutil import tempfile from pathlib import Path from unittest.mock import MagicMock, patch @@ -30,6 +31,9 @@ def evaluate(messages, original_messages=None, tools=None, **kwargs): } """ ) + # Create requirements.txt (required for upload) + with open(os.path.join(tmp_dir, "requirements.txt"), "w") as f: + f.write("eval-protocol>=0.1.0\n") return tmp_dir @@ -84,8 +88,7 @@ def test_evaluator_load_metric_folder(): assert "test_metric/main.py" in evaluator.code_files assert "evaluate" in 
evaluator.code_files["test_metric/main.py"] finally: - os.unlink(os.path.join(tmp_dir, "main.py")) - os.rmdir(tmp_dir) + shutil.rmtree(tmp_dir, ignore_errors=True) def test_evaluator_load_multi_metrics_folder(): @@ -97,8 +100,7 @@ def test_evaluator_load_multi_metrics_folder(): assert "main.py" in evaluator.code_files assert "evaluate" in evaluator.code_files["main.py"] finally: - os.unlink(os.path.join(tmp_dir, "main.py")) - os.rmdir(tmp_dir) + shutil.rmtree(tmp_dir, ignore_errors=True) def test_evaluator_update_evaluate_signature(): @@ -228,8 +230,7 @@ def mock_post_preview(*args, **kwargs): assert hasattr(preview_result.results[0], "per_metric_evals") # Attribute name in Python object assert "test_metric" in preview_result.results[0].per_metric_evals finally: - os.unlink(os.path.join(tmp_dir, "main.py")) - os.rmdir(tmp_dir) + shutil.rmtree(tmp_dir, ignore_errors=True) os.unlink(sample_file) @@ -325,8 +326,7 @@ def mock_post_helper_preview(*args, **kwargs): assert len(preview_result.results) == 2 assert preview_result.results[0].score == 0.65 finally: - os.unlink(os.path.join(tmp_dir, "main.py")) - os.rmdir(tmp_dir) + shutil.rmtree(tmp_dir, ignore_errors=True) os.unlink(sample_file) @@ -336,6 +336,8 @@ def test_create_evaluation_helper(monkeypatch): monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "test_account") monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai") + original_cwd = os.getcwd() + class MockResponse: def __init__(self, json_data, status_code=200): self.json_data = json_data @@ -349,46 +351,81 @@ def raise_for_status(self): # pragma: no cover if self.status_code != 200: raise Exception("API Error") + create_called = False + upload_endpoint_called = False + validate_called = False + def mock_post(*args, **kwargs): + nonlocal create_called, upload_endpoint_called, validate_called + url = args[0] payload = kwargs.get("json", {}) - assert "evaluator" in payload - assert "evaluatorId" in payload - evaluator_data = payload["evaluator"] - assert "criteria" in evaluator_data - criteria = evaluator_data["criteria"] - assert len(criteria) > 0 - criterion = criteria[0] - assert criterion["type"] == "CODE_SNIPPETS" - assert "codeSnippets" in criterion - assert "fileContents" in criterion["codeSnippets"] - assert "main.py" in criterion["codeSnippets"]["fileContents"] - - return MockResponse( - { - "evaluator": { + + # Handle different endpoints in the upload flow + if "getUploadEndpoint" in url: + upload_endpoint_called = True + # Dynamically create signed URLs for whatever filenames are requested + filename_to_size = payload.get("filename_to_size", {}) + signed_urls = {} + for filename in filename_to_size.keys(): + signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" + return MockResponse({"filenameToSignedUrls": signed_urls}) + elif "validateUpload" in url: + validate_called = True + return MockResponse({"success": True, "valid": True}) + else: + # Create evaluator endpoint + create_called = True + assert "evaluator" in payload + assert "evaluatorId" in payload + evaluator_data = payload["evaluator"] + assert "criteria" in evaluator_data + criteria = evaluator_data["criteria"] + assert len(criteria) > 0 + criterion = criteria[0] + assert criterion["type"] == "CODE_SNIPPETS" + # Code is now uploaded as tar.gz, not in criteria + + return MockResponse( + { "name": "accounts/test_account/evaluators/test-eval", "displayName": "Test Evaluator", "description": "Test description", "multiMetrics": False, - }, - "evaluatorId": "test-eval", - } - 
) + } + ) + + # Mock GCS upload + from unittest.mock import MagicMock + + mock_session = MagicMock() + mock_gcs_response = MagicMock() + mock_gcs_response.status_code = 200 + mock_gcs_response.raise_for_status = MagicMock() + mock_session.send.return_value = mock_gcs_response monkeypatch.setattr("requests.post", mock_post) + monkeypatch.setattr("requests.Session", lambda: mock_session) try: + os.chdir(tmp_dir) api_response = create_evaluation( evaluator_id="test-eval", metric_folders=[f"test_metric={tmp_dir}"], display_name="Test Evaluator", description="Test description", ) - assert "evaluator" in api_response - created_evaluator_data = api_response["evaluator"] - assert created_evaluator_data["name"] == "accounts/test_account/evaluators/test-eval" - assert created_evaluator_data["displayName"] == "Test Evaluator" - assert created_evaluator_data["description"] == "Test description" + + # Verify response + assert api_response["name"] == "accounts/test_account/evaluators/test-eval" + assert api_response["displayName"] == "Test Evaluator" + assert api_response["description"] == "Test description" + + # Verify full upload flow was executed + assert create_called, "Create endpoint should be called" + assert upload_endpoint_called, "GetUploadEndpoint should be called" + assert validate_called, "ValidateUpload should be called" + assert mock_session.send.called, "GCS upload should happen" + finally: - os.unlink(os.path.join(tmp_dir, "main.py")) - os.rmdir(tmp_dir) + os.chdir(original_cwd) + shutil.rmtree(tmp_dir, ignore_errors=True) diff --git a/tests/test_evaluation_integration.py b/tests/test_evaluation_integration.py index 690d765b..003dee30 100644 --- a/tests/test_evaluation_integration.py +++ b/tests/test_evaluation_integration.py @@ -1,5 +1,6 @@ import json import os +import shutil import tempfile from pathlib import Path from unittest.mock import MagicMock, patch @@ -28,6 +29,9 @@ def evaluate(messages, ground_truth=None, tools=None, **kwargs): # Changed origi } """ ) + # Create requirements.txt (required for upload) + with open(os.path.join(tmp_dir, "requirements.txt"), "w") as f: + f.write("eval-protocol>=0.1.0\n") return tmp_dir @@ -72,6 +76,40 @@ def mock_env_variables(monkeypatch): monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai") +@pytest.fixture +def mock_requests_get(): + """Mock requests.get for force flow check""" + with patch("requests.get") as mock_get: + mock_get.return_value.status_code = 404 # Evaluator doesn't exist + mock_get.return_value.raise_for_status = MagicMock() + yield mock_get + + +@pytest.fixture +def mock_requests_delete(): + """Mock requests.delete for force flow""" + with patch("requests.delete") as mock_delete: + mock_delete.return_value.status_code = 200 + mock_delete.return_value.raise_for_status = MagicMock() + yield mock_delete + + +@pytest.fixture +def mock_gcs_upload(): + """Mock the GCS upload via requests.Session""" + with patch("requests.Session") as mock_session_class: + mock_session = MagicMock() + mock_session_class.return_value = mock_session + + # Mock successful GCS upload + mock_gcs_response = MagicMock() + mock_gcs_response.status_code = 200 + mock_gcs_response.raise_for_status = MagicMock() + mock_session.send.return_value = mock_gcs_response + + yield mock_session + + @pytest.fixture def mock_requests_post(): with patch("requests.post") as mock_post: @@ -97,12 +135,23 @@ def mock_requests_post(): }, ], } + validate_response = {"success": True, "valid": True} def side_effect(*args, **kwargs): url = args[0] + payload = 
kwargs.get("json", {}) response = mock_post.return_value if "previewEvaluator" in url: response.json.return_value = preview_response + elif "getUploadEndpoint" in url: + # Dynamically create signed URLs for whatever filenames are requested + filename_to_size = payload.get("filename_to_size", {}) + signed_urls = {} + for filename in filename_to_size.keys(): + signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" + response.json.return_value = {"filenameToSignedUrls": signed_urls} + elif "validateUpload" in url: + response.json.return_value = validate_response else: response.json.return_value = default_response return response @@ -110,13 +159,16 @@ def side_effect(*args, **kwargs): mock_post.side_effect = side_effect mock_post.return_value.status_code = 200 mock_post.return_value.json.return_value = default_response + mock_post.return_value.raise_for_status = MagicMock() yield mock_post -def test_integration_single_metric(mock_env_variables, mock_requests_post): +def test_integration_single_metric(mock_env_variables, mock_requests_post, mock_gcs_upload): tmp_dir = create_test_folder() sample_file = create_sample_file() + original_cwd = os.getcwd() try: + os.chdir(tmp_dir) preview_result = preview_evaluation( metric_folders=[f"test_metric={tmp_dir}"], sample_file=sample_file, @@ -132,29 +184,51 @@ def test_integration_single_metric(mock_env_variables, mock_requests_post): ) assert evaluator["name"] == "accounts/test_account/evaluators/test-eval" assert evaluator["displayName"] == "Test Evaluator" - assert mock_requests_post.call_count >= 1 - args_call, kwargs_call = mock_requests_post.call_args_list[-1] - url = args_call[0] - payload = kwargs_call.get("json") - assert "api.fireworks.ai/v1/accounts/test_account/evaluators" in url - if "evaluator" in payload: # Dev API - assert "evaluatorId" in payload and payload["evaluatorId"] == "test-eval" - assert "criteria" in payload["evaluator"] and len(payload["evaluator"]["criteria"]) > 0 - assert payload["evaluator"]["criteria"][0]["type"] == "CODE_SNIPPETS" - else: # Prod API - assert "evaluationId" in payload and payload["evaluationId"] == "test-eval" - assert "assertions" in payload["evaluation"] and len(payload["evaluation"]["assertions"]) > 0 - assert payload["evaluation"]["assertions"][0]["assertionType"] == "CODE" + + # Verify all API calls in the new upload flow + post_calls = [call[0][0] for call in mock_requests_post.call_args_list] + + # 1. Create evaluator call (V2 endpoint) + assert any("evaluatorsV2" in url for url in post_calls), "Should call V2 create endpoint" + + # 2. Get upload endpoint call + assert any("getUploadEndpoint" in url for url in post_calls), "Should call getUploadEndpoint" + + # 3. Validate upload call + assert any("validateUpload" in url for url in post_calls), "Should call validateUpload" + + # 4. 
Verify GCS upload happened + assert mock_gcs_upload.send.called, "Should upload tar.gz to GCS" + gcs_request = mock_gcs_upload.send.call_args[0][0] + assert gcs_request.method == "PUT", "GCS upload should use PUT" + assert "storage.googleapis.com" in gcs_request.url, "Should upload to GCS" + + # Verify create payload structure + create_call_payload = None + for call in mock_requests_post.call_args_list: + url = call[0][0] + if "evaluatorsV2" in url: + create_call_payload = call[1].get("json") + break + + assert create_call_payload is not None, "Should have create payload" + assert "evaluator" in create_call_payload + assert "evaluatorId" in create_call_payload and create_call_payload["evaluatorId"] == "test-eval" + assert "criteria" in create_call_payload["evaluator"] + assert len(create_call_payload["evaluator"]["criteria"]) > 0 + assert create_call_payload["evaluator"]["criteria"][0]["type"] == "CODE_SNIPPETS" finally: - os.unlink(os.path.join(tmp_dir, "main.py")) - os.rmdir(tmp_dir) + os.chdir(original_cwd) + shutil.rmtree(tmp_dir, ignore_errors=True) os.unlink(sample_file) -def test_integration_multi_metrics(mock_env_variables, mock_requests_post): +def test_integration_multi_metrics(mock_env_variables, mock_requests_post, mock_gcs_upload): tmp_dir = create_test_folder() sample_file = create_sample_file() + original_cwd = os.getcwd() try: + os.chdir(tmp_dir) preview_result = preview_evaluation(multi_metrics=True, folder=tmp_dir, sample_file=sample_file, max_samples=2) assert preview_result.total_samples == 2 assert len(preview_result.results) == 2 @@ -175,17 +249,33 @@ def test_integration_multi_metrics(mock_env_variables, mock_requests_post): description="Test multi-metrics evaluator", ) assert evaluator["name"] == "accounts/test_account/evaluators/test-eval" - assert mock_requests_post.call_count >= 1 - args_call, kwargs_call = mock_requests_post.call_args_list[-1] - payload = kwargs_call.get("json") - if "evaluator" in payload: # Dev API - assert payload["evaluatorId"] == "multi-metrics-eval" - assert payload["evaluator"]["multiMetrics"] is True - else: # Prod API - assert payload["evaluationId"] == "multi-metrics-eval" + + # Verify all API calls in the new upload flow + post_calls = [call[0][0] for call in mock_requests_post.call_args_list] + assert any("evaluatorsV2" in url for url in post_calls), "Should call V2 create endpoint" + assert any("getUploadEndpoint" in url for url in post_calls), "Should call getUploadEndpoint" + assert any("validateUpload" in url for url in post_calls), "Should call validateUpload" + + # Verify GCS upload happened + assert mock_gcs_upload.send.called, "Should upload tar.gz to GCS" + + # Verify create payload uses V2 format + create_call_payload = None + for call in mock_requests_post.call_args_list: + url = call[0][0] + if "evaluatorsV2" in url: + create_call_payload = call[1].get("json") + break + + assert create_call_payload is not None + assert "evaluator" in create_call_payload + assert create_call_payload["evaluatorId"] == "multi-metrics-eval" + assert create_call_payload["evaluator"]["multiMetrics"] is True finally: - os.unlink(os.path.join(tmp_dir, "main.py")) - os.rmdir(tmp_dir) + import shutil + + os.chdir(original_cwd) + shutil.rmtree(tmp_dir, ignore_errors=True) os.unlink(sample_file) @@ -197,7 +287,9 @@ def test_integration_cli_commands(mock_sys_exit, mock_env_variables, mock_reques tmp_dir = create_test_folder() sample_file = create_sample_file() + original_cwd = os.getcwd() try: + os.chdir(tmp_dir) # Test preview command with 
patch("eval_protocol.cli_commands.preview.preview_evaluation") as mock_preview_eval_func: mock_preview_result = MagicMock() @@ -266,6 +358,8 @@ def test_integration_cli_commands(mock_sys_exit, mock_env_variables, mock_reques huggingface_response_key="response", ) finally: - os.unlink(os.path.join(tmp_dir, "main.py")) - os.rmdir(tmp_dir) + os.chdir(original_cwd) + import shutil + + shutil.rmtree(tmp_dir, ignore_errors=True) os.unlink(sample_file) diff --git a/tests/test_evaluation_preview_integration.py b/tests/test_evaluation_preview_integration.py index 092d4e20..fe45ca0c 100644 --- a/tests/test_evaluation_preview_integration.py +++ b/tests/test_evaluation_preview_integration.py @@ -91,13 +91,51 @@ def mock_create_api(): "description": "Evaluates responses based on word count", } + def side_effect(*args, **kwargs): + url = args[0] + payload = kwargs.get("json", {}) + response = mock_post.return_value + + if "getUploadEndpoint" in url: + # Return signed URL for upload + filename_to_size = payload.get("filename_to_size", {}) + signed_urls = {} + for filename in filename_to_size.keys(): + signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" + response.json.return_value = {"filenameToSignedUrls": signed_urls} + elif "validateUpload" in url: + response.json.return_value = {"success": True, "valid": True} + else: + response.json.return_value = create_response + + response.status_code = 200 + return response + + mock_post.side_effect = side_effect mock_post.return_value = MagicMock() mock_post.return_value.status_code = 200 mock_post.return_value.json.return_value = create_response + mock_post.return_value.raise_for_status = MagicMock() yield mock_post +@pytest.fixture +def mock_gcs_upload(): + """Mock the GCS upload via requests.Session""" + with patch("requests.Session") as mock_session_class: + mock_session = MagicMock() + mock_session_class.return_value = mock_session + + # Mock successful GCS upload + mock_gcs_response = MagicMock() + mock_gcs_response.status_code = 200 + mock_gcs_response.raise_for_status = MagicMock() + mock_session.send.return_value = mock_gcs_response + + yield mock_session + + @pytest.fixture def mock_word_count_metric(): """Create a temporary directory with a word count metric""" @@ -255,7 +293,7 @@ def evaluate(messages, ground_truth=None, tools=None, **kwargs): assert "word_count" in result.results[0].per_metric_evals -def test_create_evaluation(mock_env_variables, mock_create_api, monkeypatch): +def test_create_evaluation(mock_env_variables, mock_create_api, mock_gcs_upload, monkeypatch): """Test the create_evaluation function in isolation""" from eval_protocol.evaluation import create_evaluation @@ -285,22 +323,33 @@ def evaluate(messages, ground_truth=None, tools=None, **kwargs): """ ) - # Call create_evaluation - result = create_evaluation( - evaluator_id="word-count-eval", - metric_folders=[f"word_count={os.path.join(tmp_dir, 'word_count')}"], - display_name="Word Count Evaluator", - description="Evaluates responses based on word count", - force=True, - ) + # Create requirements.txt + with open(os.path.join(tmp_dir, "requirements.txt"), "w") as f: + f.write("eval-protocol>=0.1.0\n") - # Verify results - assert result["name"] == "accounts/test_account/evaluators/word-count-eval" - assert result["displayName"] == "Word Count Evaluator" - assert result["description"] == "Evaluates responses based on word count" + # Change to temp directory + original_cwd = os.getcwd() + os.chdir(tmp_dir) + + try: + # Call create_evaluation + 
result = create_evaluation( + evaluator_id="word-count-eval", + metric_folders=[f"word_count={os.path.join(tmp_dir, 'word_count')}"], + display_name="Word Count Evaluator", + description="Evaluates responses based on word count", + force=True, + ) + # Verify results + assert result["name"] == "accounts/test_account/evaluators/word-count-eval" + assert result["displayName"] == "Word Count Evaluator" + assert result["description"] == "Evaluates responses based on word count" + finally: + os.chdir(original_cwd) -def test_preview_then_create(monkeypatch, mock_env_variables, mock_preview_api, mock_create_api): + +def test_preview_then_create(monkeypatch, mock_env_variables, mock_preview_api, mock_create_api, mock_gcs_upload): """Test the full example flow (simulated)""" # Patch input to always return 'y' monkeypatch.setattr("builtins.input", lambda _: "y") @@ -330,6 +379,10 @@ def evaluate(messages, ground_truth=None, tools=None, **kwargs): """ ) + # Create requirements.txt + with open(os.path.join(tmp_dir, "requirements.txt"), "w") as f: + f.write("eval-protocol>=0.1.0\n") + # Create a temporary sample file sample_fd, sample_path = tempfile.mkstemp(suffix=".jsonl") with os.fdopen(sample_fd, "w") as f: @@ -365,46 +418,53 @@ def evaluate(messages, ground_truth=None, tools=None, **kwargs): # Create a patched example module with modified paths from eval_protocol.evaluation import create_evaluation, preview_evaluation - # Define a patched main function - def patched_main(): - # Preview the evaluation using metrics folder and samples file - print("Previewing evaluation...") - preview_result = preview_evaluation( - metric_folders=[f"word_count={os.path.join(tmp_dir, 'word_count')}"], - sample_file=sample_path, - max_samples=2, - ) - - preview_result.display() - - # Check if 'used_preview_api' attribute exists and is True - import eval_protocol.evaluation as evaluation_module + # Change to temp directory + original_cwd = os.getcwd() + os.chdir(tmp_dir) - # For testing, always assume the API was used successfully - evaluation_module.used_preview_api = True - - print("\nCreating evaluation...") - try: - evaluator = create_evaluation( - evaluator_id="word-count-eval", + try: + # Define a patched main function + def patched_main(): + # Preview the evaluation using metrics folder and samples file + print("Previewing evaluation...") + preview_result = preview_evaluation( metric_folders=[f"word_count={os.path.join(tmp_dir, 'word_count')}"], - display_name="Word Count Evaluator", - description="Evaluates responses based on word count", - force=True, + sample_file=sample_path, + max_samples=2, ) - print(f"Created evaluator: {evaluator['name']}") - return evaluator - except Exception as e: - print(f"Error creating evaluator: {str(e)}") - print("Make sure you have proper Fireworks API credentials set up.") - return None - - # Run the patched main function - result = patched_main() - - # Clean up - os.unlink(sample_path) - # Verify the result - assert result is not None - assert result["name"] == "accounts/test_account/evaluators/word-count-eval" + preview_result.display() + + # Check if 'used_preview_api' attribute exists and is True + import eval_protocol.evaluation as evaluation_module + + # For testing, always assume the API was used successfully + evaluation_module.used_preview_api = True + + print("\nCreating evaluation...") + try: + evaluator = create_evaluation( + evaluator_id="word-count-eval", + metric_folders=[f"word_count={os.path.join(tmp_dir, 'word_count')}"], + display_name="Word Count Evaluator", 
+                    description="Evaluates responses based on word count",
+                    force=True,
+                )
+                print(f"Created evaluator: {evaluator['name']}")
+                return evaluator
+            except Exception as e:
+                print(f"Error creating evaluator: {str(e)}")
+                print("Make sure you have proper Fireworks API credentials set up.")
+                return None
+
+        # Run the patched main function
+        result = patched_main()
+
+        # Clean up
+        os.unlink(sample_path)
+
+        # Verify the result
+        assert result is not None
+        assert result["name"] == "accounts/test_account/evaluators/word-count-eval"
+    finally:
+        os.chdir(original_cwd)
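
Reviewer note: below is a condensed sketch of the upload flow this patch implements — create the evaluator on the `evaluatorsV2` endpoint with minimal criteria, request signed URLs via `:getUploadEndpoint`, PUT the project tar.gz to the signed URL, then call `:validateUpload`. The account ID, evaluator ID, and archive path are placeholders, the create payload is trimmed to the fields the tests assert on, and the retry/backoff, archive construction (`_create_tar_gz_with_ignores`), and cleanup logic from `Evaluator.create()` are omitted. The patch also sends the GCS PUT through a manually prepared `requests.Request` to keep the header set minimal; plain `requests.put` is used here for brevity.

```python
# Illustrative sketch of the new evaluator upload flow (placeholder IDs and paths).
import os
import requests

API_BASE = os.environ.get("FIREWORKS_API_BASE", "https://api.fireworks.ai")
ACCOUNT_ID = "my-account"   # placeholder
EVALUATOR_ID = "my-eval"    # placeholder
HEADERS = {
    "Authorization": f"Bearer {os.environ['FIREWORKS_API_KEY']}",
    "Content-Type": "application/json",
}

# 1. Create the evaluator on the V2 endpoint; criteria are minimal (no embedded code).
create_url = f"{API_BASE}/v1/accounts/{ACCOUNT_ID}/evaluatorsV2"
create_payload = {
    "parent": f"accounts/{ACCOUNT_ID}",
    "evaluatorId": EVALUATOR_ID,
    "evaluator": {
        "displayName": "My Evaluator",
        "criteria": [{"name": "eval", "type": "CODE_SNIPPETS", "description": "Evaluation"}],
    },
}
resp = requests.post(create_url, json=create_payload, headers=HEADERS)
resp.raise_for_status()
evaluator_name = resp.json()["name"]  # e.g. "accounts/<account>/evaluators/<id>"

# 2. Request a signed upload URL for the project archive.
tar_path = "my-project.tar.gz"  # built from the cwd by _create_tar_gz_with_ignores
tar_size = os.path.getsize(tar_path)
endpoint_resp = requests.post(
    f"{API_BASE}/v1/{evaluator_name}:getUploadEndpoint",
    json={"name": evaluator_name, "filename_to_size": {os.path.basename(tar_path): tar_size}},
    headers=HEADERS,
)
endpoint_resp.raise_for_status()
signed_url = endpoint_resp.json()["filenameToSignedUrls"][os.path.basename(tar_path)]

# 3. PUT the archive to GCS; the size-range header must match the file size exactly.
with open(tar_path, "rb") as f:
    gcs_resp = requests.put(
        signed_url,
        data=f,
        headers={
            "Content-Type": "application/octet-stream",
            "X-Goog-Content-Length-Range": f"{tar_size},{tar_size}",
        },
        timeout=600,
    )
gcs_resp.raise_for_status()

# 4. Tell the API the upload is complete so it can validate the archive.
requests.post(
    f"{API_BASE}/v1/{evaluator_name}:validateUpload",
    json={"name": evaluator_name},
    headers=HEADERS,
).raise_for_status()
```

The real flow additionally requires a `requirements.txt` at the project root (enforced by `_ensure_requirements_present`) and builds the archive from the current working directory while honoring `.gitignore` and `.dockerignore` patterns.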