Return the evaluator version ID from evaluator creation and update the polling functions to target that specific evaluator version rather than the evaluator as a whole. Refactor the related CLI commands and tests to accommodate these changes, with clearer status messages and improved error handling.

Dylan Huang · Dylan Huang · commit 26fbc2de81a9 · 2026-01-13T13:09:53.000-08:00
diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py
@@ -7,20 +7,18 @@
 import time
 from typing import Any, Callable, Dict, Optional
 import inspect
-import requests
 import tempfile
 from pydantic import ValidationError
 
 from ..auth import get_fireworks_api_base, get_fireworks_api_key
 from ..fireworks_client import create_fireworks_client
-from ..common_utils import get_user_agent, load_jsonl
+from ..common_utils import load_jsonl
 from ..fireworks_rft import (
     create_dataset_from_jsonl,
     detect_dataset_builder,
     materialize_dataset_via_builder,
 )
 from ..models import EvaluationRow
-from .upload import upload_command
 from .utils import (
     _build_entry_point,
     _build_trimmed_dataset_id,
@@ -222,64 +220,68 @@ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str)
         return None
 
 
-def _poll_evaluator_status(
-    evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
+def _poll_evaluator_version_status(
+    evaluator_id: str,
+    version_id: str,
+    api_key: str,
+    api_base: str,
+    timeout_minutes: int = 10,
 ) -> bool:
     """
-    Poll evaluator status until it becomes ACTIVE or times out.
+    Poll a specific evaluator version status until it becomes ACTIVE or times out.
+
+    Uses the Fireworks SDK to get the specified version of the evaluator and checks
+    its build state.
 
     Args:
-        evaluator_resource_name: Full evaluator resource name (e.g., accounts/xxx/evaluators/yyy)
+        evaluator_id: The evaluator ID (not full resource name)
+        version_id: The specific version ID to poll
         api_key: Fireworks API key
         api_base: Fireworks API base URL
         timeout_minutes: Maximum time to wait in minutes
 
     Returns:
-        True if evaluator becomes ACTIVE, False if timeout or BUILD_FAILED
+        True if evaluator version becomes ACTIVE, False if timeout or BUILD_FAILED
     """
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-        "User-Agent": get_user_agent(),
-    }
-
-    check_url = f"{api_base}/v1/{evaluator_resource_name}"
     timeout_seconds = timeout_minutes * 60
     poll_interval = 10  # seconds
     start_time = time.time()
 
-    print(f"Polling evaluator status (timeout: {timeout_minutes}m, interval: {poll_interval}s)...")
+    print(
+        f"Polling evaluator version '{version_id}' status (timeout: {timeout_minutes}m, interval: {poll_interval}s)..."
+    )
+
+    client = create_fireworks_client(api_key=api_key, base_url=api_base)
 
     while time.time() - start_time < timeout_seconds:
         try:
-            response = requests.get(check_url, headers=headers, timeout=30)
-            response.raise_for_status()
-
-            evaluator_data = response.json()
-            state = evaluator_data.get("state", "STATE_UNSPECIFIED")
-            status = evaluator_data.get("status", "")
+            version = client.evaluator_versions.get(version_id, evaluator_id=evaluator_id)
+            state = version.state or "STATE_UNSPECIFIED"
+            status_msg = ""
+            if version.status and version.status.message:
+                status_msg = version.status.message
 
             if state == "ACTIVE":
-                print("✅ Evaluator is ACTIVE and ready!")
+                print("✅ Evaluator version is ACTIVE and ready!")
                 return True
             elif state == "BUILD_FAILED":
-                print(f"❌ Evaluator build failed. Status: {status}")
+                print(f"❌ Evaluator version build failed. Status: {status_msg}")
                 return False
             elif state == "BUILDING":
                 elapsed_minutes = (time.time() - start_time) / 60
-                print(f"⏳ Evaluator is still building... ({elapsed_minutes:.1f}m elapsed)")
+                print(f"⏳ Evaluator version is still building... ({elapsed_minutes:.1f}m elapsed)")
             else:
-                print(f"⏳ Evaluator state: {state}, status: {status}")
+                print(f"⏳ Evaluator version state: {state}, status: {status_msg}")
 
-        except requests.exceptions.RequestException as e:
-            print(f"Warning: Failed to check evaluator status: {e}")
+        except Exception as e:
+            print(f"Warning: Failed to check evaluator version status: {e}")
 
         # Wait before next poll
         time.sleep(poll_interval)
 
     # Timeout reached
     elapsed_minutes = (time.time() - start_time) / 60
-    print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator is not yet ACTIVE")
+    print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator version is not yet ACTIVE")
     return False
 
 
@@ -564,40 +566,16 @@ def _upload_dataset(
 def _upload_and_ensure_evaluator(
     project_root: str,
     evaluator_id: str,
-    evaluator_resource_name: str,
     api_key: str,
     api_base: str,
 ) -> bool:
-    """Ensure the evaluator exists and is ACTIVE, uploading it if needed."""
-    # Check if evaluator already exists
-    try:
-        headers = {
-            "Authorization": f"Bearer {api_key}",
-            "Content-Type": "application/json",
-            "User-Agent": get_user_agent(),
-        }
-        resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
-        if resp.ok:
-            state = resp.json().get("state", "STATE_UNSPECIFIED")
-            print(f"✓ Evaluator exists (state: {state}). Skipping upload.")
-            # Poll for ACTIVE before proceeding
-            print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
-            if not _poll_evaluator_status(
-                evaluator_resource_name=evaluator_resource_name,
-                api_key=api_key,
-                api_base=api_base,
-                timeout_minutes=10,
-            ):
-                dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
-                print("\n❌ Evaluator is not ready within the timeout period.")
-                print(f"📊 Please check the evaluator status at: {dashboard_url}")
-                print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
-                return False
-            return True
-    except requests.exceptions.RequestException:
-        pass
+    """Upload evaluator and ensure its version becomes ACTIVE.
+
+    Creates/updates the evaluator and uploads the code, then polls the specific
+    version until it becomes ACTIVE.
+    """
+    from eval_protocol.evaluation import create_evaluation
 
-    # Ensure evaluator exists by invoking the upload flow programmatically
     try:
         tests = _discover_tests(project_root)
         selected_entry: Optional[str] = None
@@ -614,39 +592,37 @@ def _upload_and_ensure_evaluator(
             )
             return False
 
-        upload_args = argparse.Namespace(
-            path=project_root,
-            entry=selected_entry,
-            id=evaluator_id,
-            display_name=None,
-            description=None,
-            yes=True,
-            env_file=None,
+        print(f"\nUploading evaluator '{evaluator_id}'...")
+        result, version_id = create_evaluation(
+            evaluator_id=evaluator_id,
+            display_name=evaluator_id,
+            description=f"Evaluator for {evaluator_id}",
+            entry_point=selected_entry,
         )
 
-        rc = upload_command(upload_args)
-        if rc == 0:
-            print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
-
-            # Poll for evaluator status
-            print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
-            is_active = _poll_evaluator_status(
-                evaluator_resource_name=evaluator_resource_name,
-                api_key=api_key,
-                api_base=api_base,
-                timeout_minutes=10,
-            )
+        if not version_id:
+            print("Warning: Evaluator created but version upload failed.")
+            return False
 
-            if not is_active:
-                dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
-                print("\n❌ Evaluator is not ready within the timeout period.")
-                print(f"📊 Please check the evaluator status at: {dashboard_url}")
-                print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
-                return False
-            return True
-        else:
-            print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
+        print(f"✓ Uploaded evaluator: {evaluator_id} (version: {version_id})")
+
+        # Poll for the specific evaluator version status
+        print(f"Waiting for evaluator '{evaluator_id}' version '{version_id}' to become ACTIVE...")
+        is_active = _poll_evaluator_version_status(
+            evaluator_id=evaluator_id,
+            version_id=version_id,
+            api_key=api_key,
+            api_base=api_base,
+            timeout_minutes=10,
+        )
+
+        if not is_active:
+            dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
+            print("\n❌ Evaluator version is not ready within the timeout period.")
+            print(f"📊 Please check the evaluator status at: {dashboard_url}")
+            print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
             return False
+        return True
     except Exception as e:
         print(f"Warning: Failed to upload evaluator automatically: {e}")
         return False
@@ -802,11 +778,10 @@ def create_rft_command(args) -> int:
     if not dataset_id or not dataset_resource:
         return 1
 
-    # 5) Ensure evaluator exists and is ACTIVE (upload + poll if needed)
+    # 5) Upload the evaluator and wait for the newly created version to become ACTIVE
     if not _upload_and_ensure_evaluator(
         project_root=project_root,
         evaluator_id=evaluator_id,
-        evaluator_resource_name=evaluator_resource_name,
         api_key=api_key,
         api_base=api_base,
     ):
diff --git a/eval_protocol/cli_commands/upload.py b/eval_protocol/cli_commands/upload.py
@@ -377,7 +377,7 @@ def upload_command(args: argparse.Namespace) -> int:
 
         print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
         try:
-            result = create_evaluation(
+            result, version_id = create_evaluation(
                 evaluator_id=evaluator_id,
                 display_name=display_name or evaluator_id,
                 description=description or f"Evaluator for {qualname}",
@@ -387,6 +387,8 @@ def upload_command(args: argparse.Namespace) -> int:
 
             # Print success message with Fireworks dashboard link
             print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
+            if version_id:
+                print(f"   Version: {version_id}")
             print("📊 View in Fireworks Dashboard:")
             dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
             print(f"   {dashboard_url}\n")
diff --git a/eval_protocol/evaluation.py b/eval_protocol/evaluation.py
@@ -351,8 +351,10 @@ def create(self, evaluator_id, display_name=None, description=None):
             except Exception as upload_error:
                 logger.warning(f"Code upload failed (evaluator created but code not uploaded): {upload_error}")
                 # Don't fail - evaluator is created, just code upload failed
+                # Return None for version_id since upload failed
+                return result, None
 
-            return result  # Return after attempting upload
+            return result, evaluator_version_id  # Return evaluator result and version ID
         except fireworks.APIStatusError as e:
             logger.error(f"Error creating evaluator: {str(e)}")
             logger.error(f"Status code: {e.status_code}, Response: {e.response.text}")
@@ -392,6 +394,10 @@ def create_evaluation(
         account_id: Optional Fireworks account ID
         api_key: Optional Fireworks API key
         entry_point: Optional entry point (module::function or path::function)
+
+    Returns:
+        A tuple of (evaluator_result, version_id) where version_id is the ID of the
+        created evaluator version, or None if upload failed.
     """
     evaluator = Evaluator(
         account_id=account_id,
diff --git a/tests/test_cli_create_rft.py b/tests/test_cli_create_rft.py
@@ -1,7 +1,6 @@
 import json
 import os
 import argparse
-import requests
 from types import SimpleNamespace
 from unittest.mock import patch
 from typing import Any, cast
@@ -106,7 +105,7 @@ def rft_test_harness(tmp_path, monkeypatch, stub_fireworks):
 
     monkeypatch.setattr(upload_mod, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
     monkeypatch.setattr(upload_mod, "upload_command", lambda args: 0)
-    monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
+    monkeypatch.setattr(cr, "_poll_evaluator_version_status", lambda **kwargs: True)
     monkeypatch.setattr(cr, "_upload_and_ensure_evaluator", lambda *a, **k: True)
 
     return project
@@ -446,7 +445,7 @@ def test_create_rft_picks_most_recent_evaluator_and_dataset_id_follows(rft_test_
     monkeypatch.setattr(cr, "_discover_and_select_tests", lambda cwd, non_interactive=False: [single_disc])
     monkeypatch.setattr(upload_mod, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
     monkeypatch.setattr(upload_mod, "upload_command", lambda args: 0)
-    monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
+    monkeypatch.setattr(cr, "_poll_evaluator_version_status", lambda **kwargs: True)
 
     captured = {"dataset_id": None}
 
@@ -641,17 +640,8 @@ def test_create_rft_quiet_existing_evaluator_skips_upload(tmp_path, monkeypatch,
     monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
     monkeypatch.setattr(cli_utils, "verify_api_key_and_get_account_id", lambda *a, **k: "acct123")
 
-    # Mock evaluator exists and is ACTIVE
-    class _Resp:
-        ok = True
-
-        def json(self):
-            return {"state": "ACTIVE"}
-
-        def raise_for_status(self):
-            return None
-
-    monkeypatch.setattr(cr.requests, "get", lambda *a, **k: _Resp())
+    # Mock evaluator upload and version polling - evaluator becomes ACTIVE
+    monkeypatch.setattr(cr, "_upload_and_ensure_evaluator", lambda *a, **k: True)
 
     # Provide dataset via --dataset-jsonl so no test discovery needed
     ds_path = project / "dataset.jsonl"
@@ -703,11 +693,8 @@ def test_create_rft_quiet_new_evaluator_ambiguous_without_entry_errors(tmp_path,
     monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
     monkeypatch.setattr(cli_utils, "verify_api_key_and_get_account_id", lambda *a, **k: "acct123")
 
-    # Evaluator does not exist (force path into upload section)
-    def _raise(*a, **k):
-        raise requests.exceptions.RequestException("nope")
-
-    monkeypatch.setattr(cr.requests, "get", _raise)
+    # Mock _upload_and_ensure_evaluator to fail (ambiguous tests)
+    monkeypatch.setattr(cr, "_upload_and_ensure_evaluator", lambda *a, **k: False)
 
     # Two discovered tests (ambiguous)
     f1 = project / "a.py"
@@ -948,18 +935,8 @@ def test_create_rft_quiet_existing_evaluator_infers_dataset_from_matching_test(r
     d2 = SimpleNamespace(qualname="beta.test_two", file_path=str(f2))
     monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [d1, d2])
 
-    # Evaluator exists and is ACTIVE (skip upload)
-    class _Resp:
-        ok = True
-
-        def json(self):
-            return {"state": "ACTIVE"}
-
-        def raise_for_status(self):
-            return None
-
-    monkeypatch.setattr(cr.requests, "get", lambda *a, **k: _Resp())
-    monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
+    # Evaluator upload succeeds and version becomes ACTIVE
+    monkeypatch.setattr(cr, "_upload_and_ensure_evaluator", lambda *a, **k: True)
 
     # We will provide JSONL via input_dataset extractor for matching test (beta.test_two)
     jsonl_path = project / "data.jsonl"
@@ -1040,17 +1017,8 @@ def test_cli_full_command_style_evaluator_and_dataset_flags(tmp_path, monkeypatc
     monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
     monkeypatch.setattr(cli_utils, "verify_api_key_and_get_account_id", lambda *a, **k: "pyroworks-dev")
 
-    # Mock evaluator exists and ACTIVE
-    class _Resp:
-        ok = True
-
-        def json(self):
-            return {"state": "ACTIVE"}
-
-        def raise_for_status(self):
-            return None
-
-    monkeypatch.setattr(cr.requests, "get", lambda *a, **k: _Resp())
+    # Mock evaluator upload succeeds and version becomes ACTIVE
+    monkeypatch.setattr(cr, "_upload_and_ensure_evaluator", lambda *a, **k: True)
 
     captured = stub_fireworks
 
@@ -1133,7 +1101,7 @@ def test_create_rft_prefers_explicit_dataset_jsonl_over_input_dataset(rft_test_h
 
     monkeypatch.setattr(upload_mod, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
     monkeypatch.setattr(upload_mod, "upload_command", lambda args: 0)
-    monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
+    monkeypatch.setattr(cr, "_poll_evaluator_version_status", lambda **kwargs: True)
 
     # Prepare two JSONL paths: one explicit via --dataset-jsonl and one inferable via input_dataset
     explicit_jsonl = project / "metric" / "explicit.jsonl"
diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py