Skip to content

Commit 26fbc2d

Browse files
author
Dylan Huang
committed
Enhance evaluator handling: `create_evaluation` now returns the evaluator version ID alongside the creation result, and the polling function now targets that specific evaluator version instead of the evaluator as a whole. Refactor the related CLI commands and tests to accommodate these changes, with clearer status messages and improved error handling.
1 parent ea673f4 commit 26fbc2d

File tree

5 files changed

+90
-136
lines changed

5 files changed

+90
-136
lines changed

eval_protocol/cli_commands/create_rft.py

Lines changed: 65 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,18 @@
77
import time
88
from typing import Any, Callable, Dict, Optional
99
import inspect
10-
import requests
1110
import tempfile
1211
from pydantic import ValidationError
1312

1413
from ..auth import get_fireworks_api_base, get_fireworks_api_key
1514
from ..fireworks_client import create_fireworks_client
16-
from ..common_utils import get_user_agent, load_jsonl
15+
from ..common_utils import load_jsonl
1716
from ..fireworks_rft import (
1817
create_dataset_from_jsonl,
1918
detect_dataset_builder,
2019
materialize_dataset_via_builder,
2120
)
2221
from ..models import EvaluationRow
23-
from .upload import upload_command
2422
from .utils import (
2523
_build_entry_point,
2624
_build_trimmed_dataset_id,
@@ -222,64 +220,68 @@ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str)
222220
return None
223221

224222

225-
def _poll_evaluator_status(
226-
evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
223+
def _poll_evaluator_version_status(
224+
evaluator_id: str,
225+
version_id: str,
226+
api_key: str,
227+
api_base: str,
228+
timeout_minutes: int = 10,
227229
) -> bool:
228230
"""
229-
Poll evaluator status until it becomes ACTIVE or times out.
231+
Poll a specific evaluator version status until it becomes ACTIVE or times out.
232+
233+
Uses the Fireworks SDK to get the specified version of the evaluator and checks
234+
its build state.
230235
231236
Args:
232-
evaluator_resource_name: Full evaluator resource name (e.g., accounts/xxx/evaluators/yyy)
237+
evaluator_id: The evaluator ID (not full resource name)
238+
version_id: The specific version ID to poll
233239
api_key: Fireworks API key
234240
api_base: Fireworks API base URL
235241
timeout_minutes: Maximum time to wait in minutes
236242
237243
Returns:
238-
True if evaluator becomes ACTIVE, False if timeout or BUILD_FAILED
244+
True if evaluator version becomes ACTIVE, False if timeout or BUILD_FAILED
239245
"""
240-
headers = {
241-
"Authorization": f"Bearer {api_key}",
242-
"Content-Type": "application/json",
243-
"User-Agent": get_user_agent(),
244-
}
245-
246-
check_url = f"{api_base}/v1/{evaluator_resource_name}"
247246
timeout_seconds = timeout_minutes * 60
248247
poll_interval = 10 # seconds
249248
start_time = time.time()
250249

251-
print(f"Polling evaluator status (timeout: {timeout_minutes}m, interval: {poll_interval}s)...")
250+
print(
251+
f"Polling evaluator version '{version_id}' status (timeout: {timeout_minutes}m, interval: {poll_interval}s)..."
252+
)
253+
254+
client = create_fireworks_client(api_key=api_key, base_url=api_base)
252255

253256
while time.time() - start_time < timeout_seconds:
254257
try:
255-
response = requests.get(check_url, headers=headers, timeout=30)
256-
response.raise_for_status()
257-
258-
evaluator_data = response.json()
259-
state = evaluator_data.get("state", "STATE_UNSPECIFIED")
260-
status = evaluator_data.get("status", "")
258+
version = client.evaluator_versions.get(version_id, evaluator_id=evaluator_id)
259+
state = version.state or "STATE_UNSPECIFIED"
260+
status_msg = ""
261+
if version.status and version.status.message:
262+
status_msg = version.status.message
261263

262264
if state == "ACTIVE":
263-
print("✅ Evaluator is ACTIVE and ready!")
265+
print("✅ Evaluator version is ACTIVE and ready!")
264266
return True
265267
elif state == "BUILD_FAILED":
266-
print(f"❌ Evaluator build failed. Status: {status}")
268+
print(f"❌ Evaluator version build failed. Status: {status_msg}")
267269
return False
268270
elif state == "BUILDING":
269271
elapsed_minutes = (time.time() - start_time) / 60
270-
print(f"⏳ Evaluator is still building... ({elapsed_minutes:.1f}m elapsed)")
272+
print(f"⏳ Evaluator version is still building... ({elapsed_minutes:.1f}m elapsed)")
271273
else:
272-
print(f"⏳ Evaluator state: {state}, status: {status}")
274+
print(f"⏳ Evaluator version state: {state}, status: {status_msg}")
273275

274-
except requests.exceptions.RequestException as e:
275-
print(f"Warning: Failed to check evaluator status: {e}")
276+
except Exception as e:
277+
print(f"Warning: Failed to check evaluator version status: {e}")
276278

277279
# Wait before next poll
278280
time.sleep(poll_interval)
279281

280282
# Timeout reached
281283
elapsed_minutes = (time.time() - start_time) / 60
282-
print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator is not yet ACTIVE")
284+
print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator version is not yet ACTIVE")
283285
return False
284286

285287

@@ -564,40 +566,16 @@ def _upload_dataset(
564566
def _upload_and_ensure_evaluator(
565567
project_root: str,
566568
evaluator_id: str,
567-
evaluator_resource_name: str,
568569
api_key: str,
569570
api_base: str,
570571
) -> bool:
571-
"""Ensure the evaluator exists and is ACTIVE, uploading it if needed."""
572-
# Check if evaluator already exists
573-
try:
574-
headers = {
575-
"Authorization": f"Bearer {api_key}",
576-
"Content-Type": "application/json",
577-
"User-Agent": get_user_agent(),
578-
}
579-
resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
580-
if resp.ok:
581-
state = resp.json().get("state", "STATE_UNSPECIFIED")
582-
print(f"✓ Evaluator exists (state: {state}). Skipping upload.")
583-
# Poll for ACTIVE before proceeding
584-
print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
585-
if not _poll_evaluator_status(
586-
evaluator_resource_name=evaluator_resource_name,
587-
api_key=api_key,
588-
api_base=api_base,
589-
timeout_minutes=10,
590-
):
591-
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
592-
print("\n❌ Evaluator is not ready within the timeout period.")
593-
print(f"📊 Please check the evaluator status at: {dashboard_url}")
594-
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
595-
return False
596-
return True
597-
except requests.exceptions.RequestException:
598-
pass
572+
"""Upload evaluator and ensure its version becomes ACTIVE.
573+
574+
Creates/updates the evaluator and uploads the code, then polls the specific
575+
version until it becomes ACTIVE.
576+
"""
577+
from eval_protocol.evaluation import create_evaluation
599578

600-
# Ensure evaluator exists by invoking the upload flow programmatically
601579
try:
602580
tests = _discover_tests(project_root)
603581
selected_entry: Optional[str] = None
@@ -614,39 +592,37 @@ def _upload_and_ensure_evaluator(
614592
)
615593
return False
616594

617-
upload_args = argparse.Namespace(
618-
path=project_root,
619-
entry=selected_entry,
620-
id=evaluator_id,
621-
display_name=None,
622-
description=None,
623-
yes=True,
624-
env_file=None,
595+
print(f"\nUploading evaluator '{evaluator_id}'...")
596+
result, version_id = create_evaluation(
597+
evaluator_id=evaluator_id,
598+
display_name=evaluator_id,
599+
description=f"Evaluator for {evaluator_id}",
600+
entry_point=selected_entry,
625601
)
626602

627-
rc = upload_command(upload_args)
628-
if rc == 0:
629-
print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
630-
631-
# Poll for evaluator status
632-
print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
633-
is_active = _poll_evaluator_status(
634-
evaluator_resource_name=evaluator_resource_name,
635-
api_key=api_key,
636-
api_base=api_base,
637-
timeout_minutes=10,
638-
)
603+
if not version_id:
604+
print("Warning: Evaluator created but version upload failed.")
605+
return False
639606

640-
if not is_active:
641-
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
642-
print("\n❌ Evaluator is not ready within the timeout period.")
643-
print(f"📊 Please check the evaluator status at: {dashboard_url}")
644-
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
645-
return False
646-
return True
647-
else:
648-
print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
607+
print(f"✓ Uploaded evaluator: {evaluator_id} (version: {version_id})")
608+
609+
# Poll for the specific evaluator version status
610+
print(f"Waiting for evaluator '{evaluator_id}' version '{version_id}' to become ACTIVE...")
611+
is_active = _poll_evaluator_version_status(
612+
evaluator_id=evaluator_id,
613+
version_id=version_id,
614+
api_key=api_key,
615+
api_base=api_base,
616+
timeout_minutes=10,
617+
)
618+
619+
if not is_active:
620+
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
621+
print("\n❌ Evaluator version is not ready within the timeout period.")
622+
print(f"📊 Please check the evaluator status at: {dashboard_url}")
623+
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
649624
return False
625+
return True
650626
except Exception as e:
651627
print(f"Warning: Failed to upload evaluator automatically: {e}")
652628
return False
@@ -802,11 +778,10 @@ def create_rft_command(args) -> int:
802778
if not dataset_id or not dataset_resource:
803779
return 1
804780

805-
# 5) Ensure evaluator exists and is ACTIVE (upload + poll if needed)
781+
# 5) Upload the evaluator and ensure the newly created version becomes ACTIVE (always uploads, then polls that version)
806782
if not _upload_and_ensure_evaluator(
807783
project_root=project_root,
808784
evaluator_id=evaluator_id,
809-
evaluator_resource_name=evaluator_resource_name,
810785
api_key=api_key,
811786
api_base=api_base,
812787
):

eval_protocol/cli_commands/upload.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ def upload_command(args: argparse.Namespace) -> int:
377377

378378
print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
379379
try:
380-
result = create_evaluation(
380+
result, version_id = create_evaluation(
381381
evaluator_id=evaluator_id,
382382
display_name=display_name or evaluator_id,
383383
description=description or f"Evaluator for {qualname}",
@@ -387,6 +387,8 @@ def upload_command(args: argparse.Namespace) -> int:
387387

388388
# Print success message with Fireworks dashboard link
389389
print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
390+
if version_id:
391+
print(f" Version: {version_id}")
390392
print("📊 View in Fireworks Dashboard:")
391393
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
392394
print(f" {dashboard_url}\n")

eval_protocol/evaluation.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,8 +351,10 @@ def create(self, evaluator_id, display_name=None, description=None):
351351
except Exception as upload_error:
352352
logger.warning(f"Code upload failed (evaluator created but code not uploaded): {upload_error}")
353353
# Don't fail - evaluator is created, just code upload failed
354+
# Return None for version_id since upload failed
355+
return result, None
354356

355-
return result # Return after attempting upload
357+
return result, evaluator_version_id # Return evaluator result and version ID
356358
except fireworks.APIStatusError as e:
357359
logger.error(f"Error creating evaluator: {str(e)}")
358360
logger.error(f"Status code: {e.status_code}, Response: {e.response.text}")
@@ -392,6 +394,10 @@ def create_evaluation(
392394
account_id: Optional Fireworks account ID
393395
api_key: Optional Fireworks API key
394396
entry_point: Optional entry point (module::function or path::function)
397+
398+
Returns:
399+
A tuple of (evaluator_result, version_id) where version_id is the ID of the
400+
created evaluator version, or None if upload failed.
395401
"""
396402
evaluator = Evaluator(
397403
account_id=account_id,

tests/test_cli_create_rft.py

Lines changed: 11 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import json
22
import os
33
import argparse
4-
import requests
54
from types import SimpleNamespace
65
from unittest.mock import patch
76
from typing import Any, cast
@@ -106,7 +105,7 @@ def rft_test_harness(tmp_path, monkeypatch, stub_fireworks):
106105

107106
monkeypatch.setattr(upload_mod, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
108107
monkeypatch.setattr(upload_mod, "upload_command", lambda args: 0)
109-
monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
108+
monkeypatch.setattr(cr, "_poll_evaluator_version_status", lambda **kwargs: True)
110109
monkeypatch.setattr(cr, "_upload_and_ensure_evaluator", lambda *a, **k: True)
111110

112111
return project
@@ -446,7 +445,7 @@ def test_create_rft_picks_most_recent_evaluator_and_dataset_id_follows(rft_test_
446445
monkeypatch.setattr(cr, "_discover_and_select_tests", lambda cwd, non_interactive=False: [single_disc])
447446
monkeypatch.setattr(upload_mod, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
448447
monkeypatch.setattr(upload_mod, "upload_command", lambda args: 0)
449-
monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
448+
monkeypatch.setattr(cr, "_poll_evaluator_version_status", lambda **kwargs: True)
450449

451450
captured = {"dataset_id": None}
452451

@@ -641,17 +640,8 @@ def test_create_rft_quiet_existing_evaluator_skips_upload(tmp_path, monkeypatch,
641640
monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
642641
monkeypatch.setattr(cli_utils, "verify_api_key_and_get_account_id", lambda *a, **k: "acct123")
643642

644-
# Mock evaluator exists and is ACTIVE
645-
class _Resp:
646-
ok = True
647-
648-
def json(self):
649-
return {"state": "ACTIVE"}
650-
651-
def raise_for_status(self):
652-
return None
653-
654-
monkeypatch.setattr(cr.requests, "get", lambda *a, **k: _Resp())
643+
# Mock evaluator upload and version polling - evaluator becomes ACTIVE
644+
monkeypatch.setattr(cr, "_upload_and_ensure_evaluator", lambda *a, **k: True)
655645

656646
# Provide dataset via --dataset-jsonl so no test discovery needed
657647
ds_path = project / "dataset.jsonl"
@@ -703,11 +693,8 @@ def test_create_rft_quiet_new_evaluator_ambiguous_without_entry_errors(tmp_path,
703693
monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
704694
monkeypatch.setattr(cli_utils, "verify_api_key_and_get_account_id", lambda *a, **k: "acct123")
705695

706-
# Evaluator does not exist (force path into upload section)
707-
def _raise(*a, **k):
708-
raise requests.exceptions.RequestException("nope")
709-
710-
monkeypatch.setattr(cr.requests, "get", _raise)
696+
# Mock _upload_and_ensure_evaluator to return False, simulating the failure that occurs when the discovered tests are ambiguous
697+
monkeypatch.setattr(cr, "_upload_and_ensure_evaluator", lambda *a, **k: False)
711698

712699
# Two discovered tests (ambiguous)
713700
f1 = project / "a.py"
@@ -948,18 +935,8 @@ def test_create_rft_quiet_existing_evaluator_infers_dataset_from_matching_test(r
948935
d2 = SimpleNamespace(qualname="beta.test_two", file_path=str(f2))
949936
monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [d1, d2])
950937

951-
# Evaluator exists and is ACTIVE (skip upload)
952-
class _Resp:
953-
ok = True
954-
955-
def json(self):
956-
return {"state": "ACTIVE"}
957-
958-
def raise_for_status(self):
959-
return None
960-
961-
monkeypatch.setattr(cr.requests, "get", lambda *a, **k: _Resp())
962-
monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
938+
# Evaluator upload succeeds and version becomes ACTIVE
939+
monkeypatch.setattr(cr, "_upload_and_ensure_evaluator", lambda *a, **k: True)
963940

964941
# We will provide JSONL via input_dataset extractor for matching test (beta.test_two)
965942
jsonl_path = project / "data.jsonl"
@@ -1040,17 +1017,8 @@ def test_cli_full_command_style_evaluator_and_dataset_flags(tmp_path, monkeypatc
10401017
monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
10411018
monkeypatch.setattr(cli_utils, "verify_api_key_and_get_account_id", lambda *a, **k: "pyroworks-dev")
10421019

1043-
# Mock evaluator exists and ACTIVE
1044-
class _Resp:
1045-
ok = True
1046-
1047-
def json(self):
1048-
return {"state": "ACTIVE"}
1049-
1050-
def raise_for_status(self):
1051-
return None
1052-
1053-
monkeypatch.setattr(cr.requests, "get", lambda *a, **k: _Resp())
1020+
# Mock evaluator upload succeeds and version becomes ACTIVE
1021+
monkeypatch.setattr(cr, "_upload_and_ensure_evaluator", lambda *a, **k: True)
10541022

10551023
captured = stub_fireworks
10561024

@@ -1133,7 +1101,7 @@ def test_create_rft_prefers_explicit_dataset_jsonl_over_input_dataset(rft_test_h
11331101

11341102
monkeypatch.setattr(upload_mod, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
11351103
monkeypatch.setattr(upload_mod, "upload_command", lambda args: 0)
1136-
monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
1104+
monkeypatch.setattr(cr, "_poll_evaluator_version_status", lambda **kwargs: True)
11371105

11381106
# Prepare two JSONL paths: one explicit via --dataset-jsonl and one inferable via input_dataset
11391107
explicit_jsonl = project / "metric" / "explicit.jsonl"

0 commit comments

Comments
 (0)