From 64efd1cabe5da634556c91d7cf4831cbf1676ed9 Mon Sep 17 00:00:00 2001 From: Benny Chen Date: Sat, 8 Nov 2025 17:58:00 -0800 Subject: [PATCH 1/4] auto select evaluators correctly --- eval_protocol/cli_commands/create_rft.py | 114 ++++++++++++++++++-- tests/test_cli_create_rft_infer.py | 126 +++++++++++++++++++++++ 2 files changed, 231 insertions(+), 9 deletions(-) create mode 100644 tests/test_cli_create_rft_infer.py diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py index 5828b3ea..c90512ae 100644 --- a/eval_protocol/cli_commands/create_rft.py +++ b/eval_protocol/cli_commands/create_rft.py @@ -23,6 +23,87 @@ from .upload import _discover_tests, _normalize_evaluator_id, _resolve_entry_to_qual_and_source +def _last_evaluator_paths(cwd: str) -> list[str]: + return [ + os.path.join(cwd, ".eval_protocol", "last_evaluator.json"), + os.path.expanduser(os.path.join("~", ".eval_protocol", "last_evaluator.json")), + ] + + +def _load_last_evaluator(cwd: str) -> Optional[str]: + import json + + for p in _last_evaluator_paths(cwd): + try: + if os.path.isfile(p): + with open(p, "r", encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict) and data.get("evaluator_id"): + return str(data["evaluator_id"]) + except Exception: + # ignore and continue + pass + return None + + +def _save_last_evaluator(cwd: str, evaluator_id: str) -> None: + import json + + base = os.path.join(cwd, ".eval_protocol") + try: + os.makedirs(base, exist_ok=True) + with open(os.path.join(base, "last_evaluator.json"), "w", encoding="utf-8") as f: + json.dump({"evaluator_id": evaluator_id, "ts": time.time()}, f) + except Exception: + # best-effort only + pass + + +def _gather_evaluator_traces(cwd: str) -> list[dict]: + roots = [ + os.path.join(cwd, ".eval_protocol", "evaluators"), + os.path.expanduser(os.path.join("~", ".eval_protocol", "evaluators")), + ] + records: list[dict] = [] + for root in roots: + if os.path.isdir(root): + for name in os.listdir(root): + if name.endswith(".json"): + full = os.path.join(root, name) + try: + mtime = os.path.getmtime(full) + except Exception: + mtime = 0.0 + records.append({"id": name[:-5], "path": full, "mtime": mtime}) + # dedupe by id keeping most recent mtime + dedup: dict[str, dict] = {} + for rec in records: + cur = dedup.get(rec["id"]) + if not cur or rec["mtime"] > cur["mtime"]: + dedup[rec["id"]] = rec + return list(dedup.values()) + + +def _prompt_select_evaluator(candidates: list[dict]) -> Optional[str]: + print("\nMultiple evaluators detected. Select one:") + ordered = sorted(candidates, key=lambda x: -x["mtime"]) + for i, c in enumerate(ordered, start=1): + print(f" {i}) {c['id']} (from {c['path']})") + try: + choice = input("Enter a number (or press Enter to cancel): ").strip() + except KeyboardInterrupt: + print("\nCancelled.") + return None + if not choice or not choice.isdigit(): + return None + n = int(choice) + if 1 <= n <= len(ordered): + sel = ordered[n - 1]["id"] + print(f"āœ“ Using evaluator: {sel}") + return sel + return None + + def _ensure_account_id() -> Optional[str]: account_id = get_fireworks_account_id() api_key = get_fireworks_api_key() @@ -248,14 +329,27 @@ def _build_trimmed_dataset_id(evaluator_id: str) -> str: return f"{base}{suffix}" -def _auto_select_evaluator_id(cwd: str) -> Optional[str]: - # Try local traces - traces_dir = os.path.join(cwd, ".eval_protocol", "evaluators") - if os.path.isdir(traces_dir): - candidates = [f[:-5] for f in os.listdir(traces_dir) if f.endswith(".json")] - if len(candidates) == 1: - return candidates[0] - # Fall back to discovering a single evaluation_test +def _auto_select_evaluator_id(cwd: str, *, non_interactive: bool = False) -> Optional[str]: + # 1) Use last used pointer if available + last = _load_last_evaluator(cwd) + if last: + return last + + # 2) Look for evaluator traces in project and home + traces = _gather_evaluator_traces(cwd) + if len(traces) == 1: + return traces[0]["id"] + if len(traces) > 1: + if non_interactive: + sel = sorted(traces, key=lambda x: -x["mtime"])[0]["id"] + print(f"āš ļø Multiple evaluators found; using most recent: {sel}. Override with --evaluator-id.") + return sel + chosen = _prompt_select_evaluator(traces) + if chosen: + return chosen + return None + + # 3) Fall back to discovering a single evaluation_test tests = _discover_tests(cwd) if len(tests) == 1: qualname, source_file_path = tests[0].qualname, tests[0].file_path @@ -348,10 +442,12 @@ def create_rft_command(args) -> int: # Resolve evaluator id if omitted project_root = os.getcwd() if not evaluator_id: - evaluator_id = _auto_select_evaluator_id(project_root) + evaluator_id = _auto_select_evaluator_id(project_root, non_interactive=non_interactive) if not evaluator_id: print("Error: Could not infer evaluator id. Provide --evaluator-id or run 'eval-protocol upload' first.") return 1 + # Persist last selected/used evaluator for next runs + _save_last_evaluator(project_root, evaluator_id) # Resolve evaluator resource name to fully-qualified format required by API evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}" diff --git a/tests/test_cli_create_rft_infer.py b/tests/test_cli_create_rft_infer.py new file mode 100644 index 00000000..ccbe7441 --- /dev/null +++ b/tests/test_cli_create_rft_infer.py @@ -0,0 +1,126 @@ +import json +import os +import time +from types import SimpleNamespace +from unittest.mock import patch + +import pytest + +from eval_protocol.cli_commands import create_rft as cr + + +def _write_json(path: str, data: dict) -> None: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + json.dump(data, f) + + +def test_load_and_save_last_evaluator(tmp_path, monkeypatch): + # Force HOME to temp so expanduser paths remain inside tmp + monkeypatch.setenv("HOME", str(tmp_path / "home")) + project = tmp_path / "proj" + project.mkdir() + + # Initially none + assert cr._load_last_evaluator(str(project)) is None + + # Save and load + cr._save_last_evaluator(str(project), "evaluator-abc") + assert cr._load_last_evaluator(str(project)) == "evaluator-abc" + + +def test_auto_select_uses_last_pointer(tmp_path, monkeypatch): + monkeypatch.setenv("HOME", str(tmp_path / "home")) + project = tmp_path / "proj" + project.mkdir() + + # Write last pointer under project + last_path = project / ".eval_protocol" / "last_evaluator.json" + _write_json(str(last_path), {"evaluator_id": "chosen-id"}) + + eid = cr._auto_select_evaluator_id(str(project)) + assert eid == "chosen-id" + + +def test_auto_select_single_trace(tmp_path, monkeypatch): + monkeypatch.setenv("HOME", str(tmp_path / "home")) + project = tmp_path / "proj" + project.mkdir() + + # Single evaluator trace under project + trace = project / ".eval_protocol" / "evaluators" / "only-one.json" + _write_json(str(trace), {"dummy": True}) + + eid = cr._auto_select_evaluator_id(str(project)) + assert eid == "only-one" + + +def test_auto_select_multiple_traces_non_interactive_most_recent(tmp_path, monkeypatch): + monkeypatch.setenv("HOME", str(tmp_path / "home")) + project = tmp_path / "proj" + project.mkdir() + + # Two traces with different mtimes + older = project / ".eval_protocol" / "evaluators" / "older.json" + newer = project / ".eval_protocol" / "evaluators" / "newer.json" + _write_json(str(older), {}) + _write_json(str(newer), {}) + # Set older then newer mtime + t0 = time.time() - 100 + os.utime(str(older), (t0, t0)) + t1 = time.time() + os.utime(str(newer), (t1, t1)) + + eid = cr._auto_select_evaluator_id(str(project), non_interactive=True) + assert eid == "newer" + + +def test_auto_select_multiple_traces_interactive_prompt(tmp_path, monkeypatch): + monkeypatch.setenv("HOME", str(tmp_path / "home")) + project = tmp_path / "proj" + project.mkdir() + + # Two traces with different mtimes to force ordering: newer first, older second + older = project / ".eval_protocol" / "evaluators" / "older.json" + newer = project / ".eval_protocol" / "evaluators" / "newer.json" + _write_json(str(older), {}) + _write_json(str(newer), {}) + t0 = time.time() - 100 + os.utime(str(older), (t0, t0)) + t1 = time.time() + os.utime(str(newer), (t1, t1)) + + with patch("builtins.input", return_value="2"): + eid = cr._auto_select_evaluator_id(str(project), non_interactive=False) + # Choosing "2" should pick the second item by recency => "older" + assert eid == "older" + + +def test_auto_select_falls_back_to_single_discovered_test(tmp_path, monkeypatch): + monkeypatch.setenv("HOME", str(tmp_path / "home")) + project = tmp_path / "proj" + project.mkdir() + + # No traces; provide exactly one discovered test + test_file = project / "metric" / "test_calendar.py" + test_file.parent.mkdir(parents=True, exist_ok=True) + test_file.write_text("# dummy", encoding="utf-8") + + dummy = SimpleNamespace(qualname="calendar_agent.test_calendar_agent_evaluation", file_path=str(test_file)) + monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [dummy]) + + eid = cr._auto_select_evaluator_id(str(project)) + assert eid is not None + # Should incorporate function name suffix + assert "test_calendar_agent_evaluation".split("_")[-1] in eid or "test-calendar-agent-evaluation" in eid + + +def test_auto_select_returns_none_when_no_candidates(tmp_path, monkeypatch): + monkeypatch.setenv("HOME", str(tmp_path / "home")) + project = tmp_path / "proj" + project.mkdir() + + # No traces, no tests + monkeypatch.setattr(cr, "_discover_tests", lambda cwd: []) + eid = cr._auto_select_evaluator_id(str(project)) + assert eid is None From e6cbe868a5475116f62043f25ca6f408ba876a90 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Sat, 8 Nov 2025 18:43:26 -0800 Subject: [PATCH 2/4] add new test to verify dataset id and fix code --- eval_protocol/cli_commands/create_rft.py | 6 +- tests/test_cli_create_rft_infer.py | 194 ++++++++++++++++++++++- 2 files changed, 194 insertions(+), 6 deletions(-) diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py index c90512ae..49893690 100644 --- a/eval_protocol/cli_commands/create_rft.py +++ b/eval_protocol/cli_commands/create_rft.py @@ -446,9 +446,6 @@ def create_rft_command(args) -> int: if not evaluator_id: print("Error: Could not infer evaluator id. Provide --evaluator-id or run 'eval-protocol upload' first.") return 1 - # Persist last selected/used evaluator for next runs - _save_last_evaluator(project_root, evaluator_id) - # Resolve evaluator resource name to fully-qualified format required by API evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}" @@ -515,6 +512,9 @@ def create_rft_command(args) -> int: print(f"šŸ“Š Please check the evaluator status at: {dashboard_url}") print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.") return 1 + else: + # Only persist last-used evaluator after successful ensure + ACTIVE + _save_last_evaluator(project_root, evaluator_id) else: print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.") except Exception as e: diff --git a/tests/test_cli_create_rft_infer.py b/tests/test_cli_create_rft_infer.py index ccbe7441..42307253 100644 --- a/tests/test_cli_create_rft_infer.py +++ b/tests/test_cli_create_rft_infer.py @@ -102,17 +102,17 @@ def test_auto_select_falls_back_to_single_discovered_test(tmp_path, monkeypatch) project.mkdir() # No traces; provide exactly one discovered test - test_file = project / "metric" / "test_calendar.py" + test_file = project / "metric" / "test_dummy.py" test_file.parent.mkdir(parents=True, exist_ok=True) test_file.write_text("# dummy", encoding="utf-8") - dummy = SimpleNamespace(qualname="calendar_agent.test_calendar_agent_evaluation", file_path=str(test_file)) + dummy = SimpleNamespace(qualname="dummy_module.test_dummy_evaluation", file_path=str(test_file)) monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [dummy]) eid = cr._auto_select_evaluator_id(str(project)) assert eid is not None # Should incorporate function name suffix - assert "test_calendar_agent_evaluation".split("_")[-1] in eid or "test-calendar-agent-evaluation" in eid + assert "test_dummy_evaluation".split("_")[-1] in eid or "test-dummy-evaluation" in eid def test_auto_select_returns_none_when_no_candidates(tmp_path, monkeypatch): @@ -124,3 +124,191 @@ def test_auto_select_returns_none_when_no_candidates(tmp_path, monkeypatch): monkeypatch.setattr(cr, "_discover_tests", lambda cwd: []) eid = cr._auto_select_evaluator_id(str(project)) assert eid is None + + +def test_create_rft_picks_most_recent_evaluator_and_dataset_id_follows(tmp_path, monkeypatch): + # Isolate HOME so expanduser paths remain inside tmp + monkeypatch.setenv("HOME", str(tmp_path / "home")) + + # Create a fake project and chdir into it (create_rft uses os.getcwd()) + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + + # Prepare two evaluator traces with different mtimes + traces_dir = project / ".eval_protocol" / "evaluators" + traces_dir.mkdir(parents=True, exist_ok=True) + older = traces_dir / "example-eval-1.json" + newer = traces_dir / "example-eval-2.json" + older.write_text("{}", encoding="utf-8") + newer.write_text("{}", encoding="utf-8") + t0 = time.time() - 200 + os.utime(str(older), (t0, t0)) + t1 = time.time() + os.utime(str(newer), (t1, t1)) + + # Create a dummy dataset jsonl file + ds_path = project / "evaluator" / "dummy_dataset.jsonl" + ds_path.parent.mkdir(parents=True, exist_ok=True) + ds_path.write_text('{"input":"x"}\n', encoding="utf-8") + + # Env required by create_rft_command + monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy") + monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "acct123") + monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai") + + # Stub out networked/subcommands used by create_rft + # Patch upload command in its own module (create_rft imports it at call time) + import eval_protocol.cli_commands.upload as upload_mod + + monkeypatch.setattr(upload_mod, "upload_command", lambda args: 0) + monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True) + + captured = {"dataset_id": None} + + def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, display_name, jsonl_path): + captured["dataset_id"] = dataset_id + return dataset_id, {"name": f"accounts/{account_id}/datasets/{dataset_id}", "state": "UPLOADING"} + + monkeypatch.setattr(cr, "create_dataset_from_jsonl", _fake_create_dataset_from_jsonl) + monkeypatch.setattr(cr, "create_reinforcement_fine_tuning_job", lambda *a, **k: {"name": "jobs/123"}) + + # Build args: non_interactive (yes=True), no explicit evaluator_id, valid warm_start_from + args = type("Args", (), {})() + setattr(args, "evaluator_id", None) + setattr(args, "yes", True) + setattr(args, "dry_run", False) + setattr(args, "force", False) + setattr(args, "env_file", None) + setattr(args, "dataset_id", None) + setattr(args, "dataset_jsonl", str(ds_path)) + setattr(args, "dataset_display_name", None) + setattr(args, "dataset_builder", None) + setattr(args, "base_model", None) + setattr(args, "warm_start_from", "accounts/acct123/models/ft-abc123") + setattr(args, "output_model", None) + setattr(args, "n", None) + setattr(args, "max_tokens", None) + setattr(args, "learning_rate", None) + setattr(args, "batch_size", None) + setattr(args, "epochs", None) + setattr(args, "lora_rank", None) + setattr(args, "max_context_length", None) + setattr(args, "chunk_size", None) + setattr(args, "eval_auto_carveout", None) + + rc = cr.create_rft_command(args) + assert rc == 0 + + # Assert dataset id followed the most recent evaluator id ("example-eval-2") + assert captured["dataset_id"] is not None + assert captured["dataset_id"].startswith("example-eval-2-dataset-") + + +def test_create_rft_passes_matching_evaluator_id_and_entry_with_multiple_tests(tmp_path, monkeypatch): + # Ensure expanduser paths stay under tmp + monkeypatch.setenv("HOME", str(tmp_path / "home")) + + # Project structure and CWD + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + + # Two evaluator traces: make the target evaluator the most recent + traces_dir = project / ".eval_protocol" / "evaluators" + traces_dir.mkdir(parents=True, exist_ok=True) + svg_id = "example-svg-evaluation" + # Use an evaluator id that matches normalization logic for mapping to foo_eval.py::test_bar_evaluation + target_id = cr._normalize_evaluator_id("foo_eval-test_bar_evaluation") + older = traces_dir / f"{svg_id}.json" + newer = traces_dir / f"{target_id}.json" + older.write_text("{}", encoding="utf-8") + newer.write_text("{}", encoding="utf-8") + t0 = time.time() - 200 + os.utime(str(older), (t0, t0)) + t1 = time.time() + os.utime(str(newer), (t1, t1)) + + # Create dummy test files for discovery + eval_dir = project / "evaluator" + eval_dir.mkdir(parents=True, exist_ok=True) + cal_file = eval_dir / "foo_eval.py" + svg_file = eval_dir / "bar_eval.py" + cal_file.write_text("# foo", encoding="utf-8") + svg_file.write_text("# bar", encoding="utf-8") + + # Fake discovered tests: foo and bar + cal_disc = SimpleNamespace(qualname="foo_eval.test_bar_evaluation", file_path=str(cal_file)) + svg_disc = SimpleNamespace(qualname="bar_eval.test_baz_evaluation", file_path=str(svg_file)) + monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [cal_disc, svg_disc]) + + # Env for CLI + monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy") + monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "acct123") + monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai") + + # Capture what upload receives (id and entry) + captured = {"id": None, "entry": None, "dataset_id": None} + + # Monkeypatch the upload command from the upload module (the function imports it inside) + import eval_protocol.cli_commands.upload as upload_mod + + def _fake_upload(ns): + captured["id"] = getattr(ns, "id", None) + captured["entry"] = getattr(ns, "entry", None) + return 0 + + monkeypatch.setattr(upload_mod, "upload_command", _fake_upload) + + # Avoid network and capture dataset id + monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True) + + def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, display_name, jsonl_path): + captured["dataset_id"] = dataset_id + return dataset_id, {"name": f"accounts/{account_id}/datasets/{dataset_id}", "state": "UPLOADING"} + + monkeypatch.setattr(cr, "create_dataset_from_jsonl", _fake_create_dataset_from_jsonl) + monkeypatch.setattr(cr, "create_reinforcement_fine_tuning_job", lambda *a, **k: {"name": "jobs/123"}) + + # Provide a dataset jsonl so flow proceeds + ds_path = eval_dir / "dummy_dataset.jsonl" + ds_path.write_text('{"input":"x"}\n', encoding="utf-8") + + # Build args: non-interactive, no explicit evaluator id + import argparse + + args = argparse.Namespace( + evaluator_id=None, + yes=True, + dry_run=False, + force=False, + env_file=None, + dataset_id=None, + dataset_jsonl=str(ds_path), + dataset_display_name=None, + dataset_builder=None, + base_model=None, + warm_start_from="accounts/acct123/models/ft-abc123", + output_model=None, + n=None, + max_tokens=None, + learning_rate=None, + batch_size=None, + epochs=None, + lora_rank=None, + max_context_length=None, + chunk_size=None, + eval_auto_carveout=None, + ) + + rc = cr.create_rft_command(args) + assert rc == 0 + + # Assert evaluator_id passed to upload matches the most recent trace (target) + assert captured["id"] == target_id + # Assert entry points to the foo test (should map when id matches normalization) + assert captured["entry"] is not None and captured["entry"].endswith("foo_eval.py::test_bar_evaluation") + # Assert dataset id is derived from the same evaluator id (trimmed base + '-dataset-') + assert captured["dataset_id"] is not None + expected_prefix = cr._build_trimmed_dataset_id(target_id).split("-dataset-")[0] + "-dataset-" + assert captured["dataset_id"].startswith(expected_prefix) From fb1202880cabd70f06bafe6779264cf23681e6ae Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Sat, 8 Nov 2025 19:12:02 -0800 Subject: [PATCH 3/4] try skipping if possible --- eval_protocol/cli_commands/create_rft.py | 172 +++++++++++++++-------- 1 file changed, 110 insertions(+), 62 deletions(-) diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py index 49893690..26e663d1 100644 --- a/eval_protocol/cli_commands/create_rft.py +++ b/eval_protocol/cli_commands/create_rft.py @@ -321,6 +321,8 @@ def _build_trimmed_dataset_id(evaluator_id: str) -> str: if not base: base = "dataset" # Ensure first char is a letter + if not base: + base = "dataset" if not base[0].isalpha(): base = f"eval-{base}" if len(base) > max_base_len: @@ -449,76 +451,122 @@ def create_rft_command(args) -> int: # Resolve evaluator resource name to fully-qualified format required by API evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}" + # Optional short-circuit: if evaluator already exists and not forcing, skip upload path + skip_upload = False + if not force: + try: + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + "User-Agent": get_user_agent(), + } + resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10) + if resp.ok: + state = resp.json().get("state", "STATE_UNSPECIFIED") + print(f"āœ“ Evaluator exists (state: {state}). Skipping upload (use --force to overwrite).") + # Poll for ACTIVE before proceeding + print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...") + if not _poll_evaluator_status( + evaluator_resource_name=evaluator_resource_name, + api_key=api_key, + api_base=api_base, + timeout_minutes=10, + ): + app_base = _map_api_host_to_app_host(api_base) + evaluator_slug = _extract_terminal_segment(evaluator_id) + dashboard_url = f"{app_base}/dashboard/evaluators/{evaluator_slug}" + print("\nāŒ Evaluator is not ready within the timeout period.") + print(f"šŸ“Š Please check the evaluator status at: {dashboard_url}") + print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.") + return 1 + _save_last_evaluator(project_root, evaluator_id) + skip_upload = True + except requests.exceptions.RequestException: + pass + # Ensure evaluator exists by invoking the upload flow programmatically - try: - from .upload import upload_command + if not skip_upload: + try: + from .upload import upload_command - tests = _discover_tests(project_root) - selected_entry: Optional[str] = None - if len(tests) == 1: - func_name = tests[0].qualname.split(".")[-1] - abs_path = os.path.abspath(tests[0].file_path) - try: - rel = os.path.relpath(abs_path, project_root) - except Exception: - rel = abs_path - selected_entry = f"{rel}::{func_name}" - else: - # Try to match evaluator_id to a discovered test's normalized ID - for t in tests: - func_name = t.qualname.split(".")[-1] - source_file_name = os.path.splitext(os.path.basename(t.file_path))[0] - candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}") - if candidate == evaluator_id: - abs_path = os.path.abspath(t.file_path) - try: - rel = os.path.relpath(abs_path, project_root) - except Exception: - rel = abs_path - selected_entry = f"{rel}::{func_name}" - break - - upload_args = argparse.Namespace( - path=project_root, - entry=selected_entry, - id=evaluator_id, - display_name=None, - description=None, - force=force, # Pass through the --force flag - yes=True, - env_file=None, # Add the new env_file parameter - ) + tests = _discover_tests(project_root) + selected_entry: Optional[str] = None + if len(tests) == 1: + func_name = tests[0].qualname.split(".")[-1] + abs_path = os.path.abspath(tests[0].file_path) + try: + rel = os.path.relpath(abs_path, project_root) + except Exception: + rel = abs_path + selected_entry = f"{rel}::{func_name}" + else: + # Try to match evaluator_id to a discovered test's normalized ID + for t in tests: + func_name = t.qualname.split(".")[-1] + source_file_name = os.path.splitext(os.path.basename(t.file_path))[0] + candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}") + if candidate == evaluator_id: + abs_path = os.path.abspath(t.file_path) + try: + rel = os.path.relpath(abs_path, project_root) + except Exception: + rel = abs_path + selected_entry = f"{rel}::{func_name}" + break + # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators + if selected_entry is None: + print( + f"Error: Multiple evaluation tests found, and the selected evaluator_id {evaluator_id} does not match any discovered test.\n" + " Please re-run specifying the evaluator id.\n" + " Hints:\n" + " - eval-protocol create rft --evaluator-id \n" + ) + return 1 + + upload_args = argparse.Namespace( + path=project_root, + entry=selected_entry, + id=evaluator_id, + display_name=None, + description=None, + force=force, # Pass through the --force flag + yes=True, + env_file=None, # Add the new env_file parameter + ) - if force: - print(f"šŸ”„ Force flag enabled - will overwrite existing evaluator '{evaluator_id}'") + if force: + print(f"šŸ”„ Force flag enabled - will overwrite existing evaluator '{evaluator_id}'") - rc = upload_command(upload_args) - if rc == 0: - print(f"āœ“ Uploaded/ensured evaluator: {evaluator_id}") + rc = upload_command(upload_args) + if rc == 0: + print(f"āœ“ Uploaded/ensured evaluator: {evaluator_id}") - # Poll for evaluator status - print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...") - is_active = _poll_evaluator_status( - evaluator_resource_name=evaluator_resource_name, api_key=api_key, api_base=api_base, timeout_minutes=10 - ) + # Poll for evaluator status + print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...") + is_active = _poll_evaluator_status( + evaluator_resource_name=evaluator_resource_name, + api_key=api_key, + api_base=api_base, + timeout_minutes=10, + ) - if not is_active: - # Print helpful message with dashboard link - app_base = _map_api_host_to_app_host(api_base) - evaluator_slug = _extract_terminal_segment(evaluator_id) - dashboard_url = f"{app_base}/dashboard/evaluators/{evaluator_slug}" + if not is_active: + # Print helpful message with dashboard link + app_base = _map_api_host_to_app_host(api_base) + evaluator_slug = _extract_terminal_segment(evaluator_id) + dashboard_url = f"{app_base}/dashboard/evaluators/{evaluator_slug}" - print("\nāŒ Evaluator is not ready within the timeout period.") - print(f"šŸ“Š Please check the evaluator status at: {dashboard_url}") - print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.") - return 1 + print("\nāŒ Evaluator is not ready within the timeout period.") + print(f"šŸ“Š Please check the evaluator status at: {dashboard_url}") + print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.") + return 1 + else: + # Only persist last-used evaluator after successful ensure + ACTIVE + _save_last_evaluator(project_root, evaluator_id) else: - # Only persist last-used evaluator after successful ensure + ACTIVE - _save_last_evaluator(project_root, evaluator_id) - else: - print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.") - except Exception as e: - print(f"Warning: Failed to upload evaluator automatically: {e}") + print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.") + except Exception as e: + print(f"Warning: Failed to upload evaluator automatically: {e}") # Determine dataset id and materialization path dataset_id = getattr(args, "dataset_id", None) From cca18e66bb48fa09b10b6f340b6c7f192da3a0d2 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Sat, 8 Nov 2025 19:21:55 -0800 Subject: [PATCH 4/4] fix --- eval_protocol/cli_commands/create_rft.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py index 26e663d1..8b6bce4d 100644 --- a/eval_protocol/cli_commands/create_rft.py +++ b/eval_protocol/cli_commands/create_rft.py @@ -513,15 +513,15 @@ def create_rft_command(args) -> int: rel = abs_path selected_entry = f"{rel}::{func_name}" break - # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators - if selected_entry is None: - print( - f"Error: Multiple evaluation tests found, and the selected evaluator_id {evaluator_id} does not match any discovered test.\n" - " Please re-run specifying the evaluator id.\n" - " Hints:\n" - " - eval-protocol create rft --evaluator-id \n" - ) - return 1 + # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators + if selected_entry is None and len(tests) > 1: + print( + f"Error: Multiple evaluation tests found, and the selected evaluator_id {evaluator_id} does not match any discovered test.\n" + " Please re-run specifying the evaluator id.\n" + " Hints:\n" + " - eval-protocol create rft --evaluator-id \n" + ) + return 1 upload_args = argparse.Namespace( path=project_root,