From 64efd1cabe5da634556c91d7cf4831cbf1676ed9 Mon Sep 17 00:00:00 2001
From: Benny Chen <youfychenbc5000@gmail.com>
Date: Sat, 8 Nov 2025 17:58:00 -0800
Subject: [PATCH 1/4] auto-select evaluator id from last-used pointer, traces, or single discovered test

---
 eval_protocol/cli_commands/create_rft.py | 114 ++++++++++++++++++--
 tests/test_cli_create_rft_infer.py       | 126 +++++++++++++++++++++++
 2 files changed, 231 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_cli_create_rft_infer.py

diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py
index 5828b3ea..c90512ae 100644
--- a/eval_protocol/cli_commands/create_rft.py
+++ b/eval_protocol/cli_commands/create_rft.py
@@ -23,6 +23,87 @@
 from .upload import _discover_tests, _normalize_evaluator_id, _resolve_entry_to_qual_and_source
 
 
+def _last_evaluator_paths(cwd: str) -> list[str]:
+    return [
+        os.path.join(cwd, ".eval_protocol", "last_evaluator.json"),
+        os.path.expanduser(os.path.join("~", ".eval_protocol", "last_evaluator.json")),
+    ]
+
+
+def _load_last_evaluator(cwd: str) -> Optional[str]:
+    import json
+
+    for p in _last_evaluator_paths(cwd):
+        try:
+            if os.path.isfile(p):
+                with open(p, "r", encoding="utf-8") as f:
+                    data = json.load(f)
+                if isinstance(data, dict) and data.get("evaluator_id"):
+                    return str(data["evaluator_id"])
+        except Exception:
+            # ignore and continue
+            pass
+    return None
+
+
+def _save_last_evaluator(cwd: str, evaluator_id: str) -> None:
+    import json
+
+    base = os.path.join(cwd, ".eval_protocol")
+    try:
+        os.makedirs(base, exist_ok=True)
+        with open(os.path.join(base, "last_evaluator.json"), "w", encoding="utf-8") as f:
+            json.dump({"evaluator_id": evaluator_id, "ts": time.time()}, f)
+    except Exception:
+        # best-effort only
+        pass
+
+
+def _gather_evaluator_traces(cwd: str) -> list[dict]:
+    roots = [
+        os.path.join(cwd, ".eval_protocol", "evaluators"),
+        os.path.expanduser(os.path.join("~", ".eval_protocol", "evaluators")),
+    ]
+    records: list[dict] = []
+    for root in roots:
+        if os.path.isdir(root):
+            for name in os.listdir(root):
+                if name.endswith(".json"):
+                    full = os.path.join(root, name)
+                    try:
+                        mtime = os.path.getmtime(full)
+                    except Exception:
+                        mtime = 0.0
+                    records.append({"id": name[:-5], "path": full, "mtime": mtime})
+    # dedupe by id keeping most recent mtime
+    dedup: dict[str, dict] = {}
+    for rec in records:
+        cur = dedup.get(rec["id"])
+        if not cur or rec["mtime"] > cur["mtime"]:
+            dedup[rec["id"]] = rec
+    return list(dedup.values())
+
+
+def _prompt_select_evaluator(candidates: list[dict]) -> Optional[str]:
+    print("\nMultiple evaluators detected. Select one:")
+    ordered = sorted(candidates, key=lambda x: -x["mtime"])
+    for i, c in enumerate(ordered, start=1):
+        print(f"  {i}) {c['id']}  (from {c['path']})")
+    try:
+        choice = input("Enter a number (or press Enter to cancel): ").strip()
+    except KeyboardInterrupt:
+        print("\nCancelled.")
+        return None
+    if not choice or not choice.isdigit():
+        return None
+    n = int(choice)
+    if 1 <= n <= len(ordered):
+        sel = ordered[n - 1]["id"]
+        print(f"✓ Using evaluator: {sel}")
+        return sel
+    return None
+
+
 def _ensure_account_id() -> Optional[str]:
     account_id = get_fireworks_account_id()
     api_key = get_fireworks_api_key()
@@ -248,14 +329,27 @@ def _build_trimmed_dataset_id(evaluator_id: str) -> str:
     return f"{base}{suffix}"
 
 
-def _auto_select_evaluator_id(cwd: str) -> Optional[str]:
-    # Try local traces
-    traces_dir = os.path.join(cwd, ".eval_protocol", "evaluators")
-    if os.path.isdir(traces_dir):
-        candidates = [f[:-5] for f in os.listdir(traces_dir) if f.endswith(".json")]
-        if len(candidates) == 1:
-            return candidates[0]
-    # Fall back to discovering a single evaluation_test
+def _auto_select_evaluator_id(cwd: str, *, non_interactive: bool = False) -> Optional[str]:
+    # 1) Use last used pointer if available
+    last = _load_last_evaluator(cwd)
+    if last:
+        return last
+
+    # 2) Look for evaluator traces in project and home
+    traces = _gather_evaluator_traces(cwd)
+    if len(traces) == 1:
+        return traces[0]["id"]
+    if len(traces) > 1:
+        if non_interactive:
+            sel = sorted(traces, key=lambda x: -x["mtime"])[0]["id"]
+            print(f"⚠️  Multiple evaluators found; using most recent: {sel}. Override with --evaluator-id.")
+            return sel
+        chosen = _prompt_select_evaluator(traces)
+        if chosen:
+            return chosen
+        return None
+
+    # 3) Fall back to discovering a single evaluation_test
     tests = _discover_tests(cwd)
     if len(tests) == 1:
         qualname, source_file_path = tests[0].qualname, tests[0].file_path
@@ -348,10 +442,12 @@ def create_rft_command(args) -> int:
     # Resolve evaluator id if omitted
     project_root = os.getcwd()
     if not evaluator_id:
-        evaluator_id = _auto_select_evaluator_id(project_root)
+        evaluator_id = _auto_select_evaluator_id(project_root, non_interactive=non_interactive)
         if not evaluator_id:
             print("Error: Could not infer evaluator id. Provide --evaluator-id or run 'eval-protocol upload' first.")
             return 1
+    # Persist last selected/used evaluator for next runs
+    _save_last_evaluator(project_root, evaluator_id)
 
     # Resolve evaluator resource name to fully-qualified format required by API
     evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
diff --git a/tests/test_cli_create_rft_infer.py b/tests/test_cli_create_rft_infer.py
new file mode 100644
index 00000000..ccbe7441
--- /dev/null
+++ b/tests/test_cli_create_rft_infer.py
@@ -0,0 +1,126 @@
+import json
+import os
+import time
+from types import SimpleNamespace
+from unittest.mock import patch
+
+import pytest
+
+from eval_protocol.cli_commands import create_rft as cr
+
+
+def _write_json(path: str, data: dict) -> None:
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(data, f)
+
+
+def test_load_and_save_last_evaluator(tmp_path, monkeypatch):
+    # Force HOME to temp so expanduser paths remain inside tmp
+    monkeypatch.setenv("HOME", str(tmp_path / "home"))
+    project = tmp_path / "proj"
+    project.mkdir()
+
+    # Initially none
+    assert cr._load_last_evaluator(str(project)) is None
+
+    # Save and load
+    cr._save_last_evaluator(str(project), "evaluator-abc")
+    assert cr._load_last_evaluator(str(project)) == "evaluator-abc"
+
+
+def test_auto_select_uses_last_pointer(tmp_path, monkeypatch):
+    monkeypatch.setenv("HOME", str(tmp_path / "home"))
+    project = tmp_path / "proj"
+    project.mkdir()
+
+    # Write last pointer under project
+    last_path = project / ".eval_protocol" / "last_evaluator.json"
+    _write_json(str(last_path), {"evaluator_id": "chosen-id"})
+
+    eid = cr._auto_select_evaluator_id(str(project))
+    assert eid == "chosen-id"
+
+
+def test_auto_select_single_trace(tmp_path, monkeypatch):
+    monkeypatch.setenv("HOME", str(tmp_path / "home"))
+    project = tmp_path / "proj"
+    project.mkdir()
+
+    # Single evaluator trace under project
+    trace = project / ".eval_protocol" / "evaluators" / "only-one.json"
+    _write_json(str(trace), {"dummy": True})
+
+    eid = cr._auto_select_evaluator_id(str(project))
+    assert eid == "only-one"
+
+
+def test_auto_select_multiple_traces_non_interactive_most_recent(tmp_path, monkeypatch):
+    monkeypatch.setenv("HOME", str(tmp_path / "home"))
+    project = tmp_path / "proj"
+    project.mkdir()
+
+    # Two traces with different mtimes
+    older = project / ".eval_protocol" / "evaluators" / "older.json"
+    newer = project / ".eval_protocol" / "evaluators" / "newer.json"
+    _write_json(str(older), {})
+    _write_json(str(newer), {})
+    # Set older then newer mtime
+    t0 = time.time() - 100
+    os.utime(str(older), (t0, t0))
+    t1 = time.time()
+    os.utime(str(newer), (t1, t1))
+
+    eid = cr._auto_select_evaluator_id(str(project), non_interactive=True)
+    assert eid == "newer"
+
+
+def test_auto_select_multiple_traces_interactive_prompt(tmp_path, monkeypatch):
+    monkeypatch.setenv("HOME", str(tmp_path / "home"))
+    project = tmp_path / "proj"
+    project.mkdir()
+
+    # Two traces with different mtimes to force ordering: newer first, older second
+    older = project / ".eval_protocol" / "evaluators" / "older.json"
+    newer = project / ".eval_protocol" / "evaluators" / "newer.json"
+    _write_json(str(older), {})
+    _write_json(str(newer), {})
+    t0 = time.time() - 100
+    os.utime(str(older), (t0, t0))
+    t1 = time.time()
+    os.utime(str(newer), (t1, t1))
+
+    with patch("builtins.input", return_value="2"):
+        eid = cr._auto_select_evaluator_id(str(project), non_interactive=False)
+    # Choosing "2" should pick the second item by recency => "older"
+    assert eid == "older"
+
+
+def test_auto_select_falls_back_to_single_discovered_test(tmp_path, monkeypatch):
+    monkeypatch.setenv("HOME", str(tmp_path / "home"))
+    project = tmp_path / "proj"
+    project.mkdir()
+
+    # No traces; provide exactly one discovered test
+    test_file = project / "metric" / "test_calendar.py"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("# dummy", encoding="utf-8")
+
+    dummy = SimpleNamespace(qualname="calendar_agent.test_calendar_agent_evaluation", file_path=str(test_file))
+    monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [dummy])
+
+    eid = cr._auto_select_evaluator_id(str(project))
+    assert eid is not None
+    # Should incorporate function name suffix
+    assert "test_calendar_agent_evaluation".split("_")[-1] in eid or "test-calendar-agent-evaluation" in eid
+
+
+def test_auto_select_returns_none_when_no_candidates(tmp_path, monkeypatch):
+    monkeypatch.setenv("HOME", str(tmp_path / "home"))
+    project = tmp_path / "proj"
+    project.mkdir()
+
+    # No traces, no tests
+    monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [])
+    eid = cr._auto_select_evaluator_id(str(project))
+    assert eid is None

From e6cbe868a5475116f62043f25ca6f408ba876a90 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sat, 8 Nov 2025 18:43:26 -0800
Subject: [PATCH 2/4] add create-rft inference tests; persist last evaluator only once it is ACTIVE

---
 eval_protocol/cli_commands/create_rft.py |   6 +-
 tests/test_cli_create_rft_infer.py       | 194 ++++++++++++++++++++++-
 2 files changed, 194 insertions(+), 6 deletions(-)

diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py
index c90512ae..49893690 100644
--- a/eval_protocol/cli_commands/create_rft.py
+++ b/eval_protocol/cli_commands/create_rft.py
@@ -446,9 +446,6 @@ def create_rft_command(args) -> int:
         if not evaluator_id:
             print("Error: Could not infer evaluator id. Provide --evaluator-id or run 'eval-protocol upload' first.")
             return 1
-    # Persist last selected/used evaluator for next runs
-    _save_last_evaluator(project_root, evaluator_id)
-
     # Resolve evaluator resource name to fully-qualified format required by API
     evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
 
@@ -515,6 +512,9 @@ def create_rft_command(args) -> int:
                 print(f"📊 Please check the evaluator status at: {dashboard_url}")
                 print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
                 return 1
+            else:
+                # Only persist last-used evaluator after successful ensure + ACTIVE
+                _save_last_evaluator(project_root, evaluator_id)
         else:
             print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
     except Exception as e:
diff --git a/tests/test_cli_create_rft_infer.py b/tests/test_cli_create_rft_infer.py
index ccbe7441..42307253 100644
--- a/tests/test_cli_create_rft_infer.py
+++ b/tests/test_cli_create_rft_infer.py
@@ -102,17 +102,17 @@ def test_auto_select_falls_back_to_single_discovered_test(tmp_path, monkeypatch)
     project.mkdir()
 
     # No traces; provide exactly one discovered test
-    test_file = project / "metric" / "test_calendar.py"
+    test_file = project / "metric" / "test_dummy.py"
     test_file.parent.mkdir(parents=True, exist_ok=True)
     test_file.write_text("# dummy", encoding="utf-8")
 
-    dummy = SimpleNamespace(qualname="calendar_agent.test_calendar_agent_evaluation", file_path=str(test_file))
+    dummy = SimpleNamespace(qualname="dummy_module.test_dummy_evaluation", file_path=str(test_file))
     monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [dummy])
 
     eid = cr._auto_select_evaluator_id(str(project))
     assert eid is not None
     # Should incorporate function name suffix
-    assert "test_calendar_agent_evaluation".split("_")[-1] in eid or "test-calendar-agent-evaluation" in eid
+    assert "test_dummy_evaluation".split("_")[-1] in eid or "test-dummy-evaluation" in eid
 
 
 def test_auto_select_returns_none_when_no_candidates(tmp_path, monkeypatch):
@@ -124,3 +124,191 @@ def test_auto_select_returns_none_when_no_candidates(tmp_path, monkeypatch):
     monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [])
     eid = cr._auto_select_evaluator_id(str(project))
     assert eid is None
+
+
+def test_create_rft_picks_most_recent_evaluator_and_dataset_id_follows(tmp_path, monkeypatch):
+    # Isolate HOME so expanduser paths remain inside tmp
+    monkeypatch.setenv("HOME", str(tmp_path / "home"))
+
+    # Create a fake project and chdir into it (create_rft uses os.getcwd())
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    # Prepare two evaluator traces with different mtimes
+    traces_dir = project / ".eval_protocol" / "evaluators"
+    traces_dir.mkdir(parents=True, exist_ok=True)
+    older = traces_dir / "example-eval-1.json"
+    newer = traces_dir / "example-eval-2.json"
+    older.write_text("{}", encoding="utf-8")
+    newer.write_text("{}", encoding="utf-8")
+    t0 = time.time() - 200
+    os.utime(str(older), (t0, t0))
+    t1 = time.time()
+    os.utime(str(newer), (t1, t1))
+
+    # Create a dummy dataset jsonl file
+    ds_path = project / "evaluator" / "dummy_dataset.jsonl"
+    ds_path.parent.mkdir(parents=True, exist_ok=True)
+    ds_path.write_text('{"input":"x"}\n', encoding="utf-8")
+
+    # Env required by create_rft_command
+    monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy")
+    monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "acct123")
+    monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
+
+    # Stub out networked/subcommands used by create_rft
+    # Patch upload command in its own module (create_rft imports it at call time)
+    import eval_protocol.cli_commands.upload as upload_mod
+
+    monkeypatch.setattr(upload_mod, "upload_command", lambda args: 0)
+    monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
+
+    captured = {"dataset_id": None}
+
+    def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, display_name, jsonl_path):
+        captured["dataset_id"] = dataset_id
+        return dataset_id, {"name": f"accounts/{account_id}/datasets/{dataset_id}", "state": "UPLOADING"}
+
+    monkeypatch.setattr(cr, "create_dataset_from_jsonl", _fake_create_dataset_from_jsonl)
+    monkeypatch.setattr(cr, "create_reinforcement_fine_tuning_job", lambda *a, **k: {"name": "jobs/123"})
+
+    # Build args: non_interactive (yes=True), no explicit evaluator_id, valid warm_start_from
+    args = type("Args", (), {})()
+    setattr(args, "evaluator_id", None)
+    setattr(args, "yes", True)
+    setattr(args, "dry_run", False)
+    setattr(args, "force", False)
+    setattr(args, "env_file", None)
+    setattr(args, "dataset_id", None)
+    setattr(args, "dataset_jsonl", str(ds_path))
+    setattr(args, "dataset_display_name", None)
+    setattr(args, "dataset_builder", None)
+    setattr(args, "base_model", None)
+    setattr(args, "warm_start_from", "accounts/acct123/models/ft-abc123")
+    setattr(args, "output_model", None)
+    setattr(args, "n", None)
+    setattr(args, "max_tokens", None)
+    setattr(args, "learning_rate", None)
+    setattr(args, "batch_size", None)
+    setattr(args, "epochs", None)
+    setattr(args, "lora_rank", None)
+    setattr(args, "max_context_length", None)
+    setattr(args, "chunk_size", None)
+    setattr(args, "eval_auto_carveout", None)
+
+    rc = cr.create_rft_command(args)
+    assert rc == 0
+
+    # Assert dataset id followed the most recent evaluator id ("example-eval-2")
+    assert captured["dataset_id"] is not None
+    assert captured["dataset_id"].startswith("example-eval-2-dataset-")
+
+
+def test_create_rft_passes_matching_evaluator_id_and_entry_with_multiple_tests(tmp_path, monkeypatch):
+    # Ensure expanduser paths stay under tmp
+    monkeypatch.setenv("HOME", str(tmp_path / "home"))
+
+    # Project structure and CWD
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    # Two evaluator traces: make the target evaluator the most recent
+    traces_dir = project / ".eval_protocol" / "evaluators"
+    traces_dir.mkdir(parents=True, exist_ok=True)
+    svg_id = "example-svg-evaluation"
+    # Use an evaluator id that matches normalization logic for mapping to foo_eval.py::test_bar_evaluation
+    target_id = cr._normalize_evaluator_id("foo_eval-test_bar_evaluation")
+    older = traces_dir / f"{svg_id}.json"
+    newer = traces_dir / f"{target_id}.json"
+    older.write_text("{}", encoding="utf-8")
+    newer.write_text("{}", encoding="utf-8")
+    t0 = time.time() - 200
+    os.utime(str(older), (t0, t0))
+    t1 = time.time()
+    os.utime(str(newer), (t1, t1))
+
+    # Create dummy test files for discovery
+    eval_dir = project / "evaluator"
+    eval_dir.mkdir(parents=True, exist_ok=True)
+    cal_file = eval_dir / "foo_eval.py"
+    svg_file = eval_dir / "bar_eval.py"
+    cal_file.write_text("# foo", encoding="utf-8")
+    svg_file.write_text("# bar", encoding="utf-8")
+
+    # Fake discovered tests: foo and bar
+    cal_disc = SimpleNamespace(qualname="foo_eval.test_bar_evaluation", file_path=str(cal_file))
+    svg_disc = SimpleNamespace(qualname="bar_eval.test_baz_evaluation", file_path=str(svg_file))
+    monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [cal_disc, svg_disc])
+
+    # Env for CLI
+    monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy")
+    monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "acct123")
+    monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
+
+    # Capture what upload receives (id and entry)
+    captured = {"id": None, "entry": None, "dataset_id": None}
+
+    # Monkeypatch the upload command from the upload module (the function imports it inside)
+    import eval_protocol.cli_commands.upload as upload_mod
+
+    def _fake_upload(ns):
+        captured["id"] = getattr(ns, "id", None)
+        captured["entry"] = getattr(ns, "entry", None)
+        return 0
+
+    monkeypatch.setattr(upload_mod, "upload_command", _fake_upload)
+
+    # Avoid network and capture dataset id
+    monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
+
+    def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, display_name, jsonl_path):
+        captured["dataset_id"] = dataset_id
+        return dataset_id, {"name": f"accounts/{account_id}/datasets/{dataset_id}", "state": "UPLOADING"}
+
+    monkeypatch.setattr(cr, "create_dataset_from_jsonl", _fake_create_dataset_from_jsonl)
+    monkeypatch.setattr(cr, "create_reinforcement_fine_tuning_job", lambda *a, **k: {"name": "jobs/123"})
+
+    # Provide a dataset jsonl so flow proceeds
+    ds_path = eval_dir / "dummy_dataset.jsonl"
+    ds_path.write_text('{"input":"x"}\n', encoding="utf-8")
+
+    # Build args: non-interactive, no explicit evaluator id
+    import argparse
+
+    args = argparse.Namespace(
+        evaluator_id=None,
+        yes=True,
+        dry_run=False,
+        force=False,
+        env_file=None,
+        dataset_id=None,
+        dataset_jsonl=str(ds_path),
+        dataset_display_name=None,
+        dataset_builder=None,
+        base_model=None,
+        warm_start_from="accounts/acct123/models/ft-abc123",
+        output_model=None,
+        n=None,
+        max_tokens=None,
+        learning_rate=None,
+        batch_size=None,
+        epochs=None,
+        lora_rank=None,
+        max_context_length=None,
+        chunk_size=None,
+        eval_auto_carveout=None,
+    )
+
+    rc = cr.create_rft_command(args)
+    assert rc == 0
+
+    # Assert evaluator_id passed to upload matches the most recent trace (target)
+    assert captured["id"] == target_id
+    # Assert entry points to the foo test (should map when id matches normalization)
+    assert captured["entry"] is not None and captured["entry"].endswith("foo_eval.py::test_bar_evaluation")
+    # Assert dataset id is derived from the same evaluator id (trimmed base + '-dataset-<timestamp>')
+    assert captured["dataset_id"] is not None
+    expected_prefix = cr._build_trimmed_dataset_id(target_id).split("-dataset-")[0] + "-dataset-"
+    assert captured["dataset_id"].startswith(expected_prefix)

From fb1202880cabd70f06bafe6779264cf23681e6ae Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sat, 8 Nov 2025 19:12:02 -0800
Subject: [PATCH 3/4] skip evaluator upload when it already exists (unless --force)

---
 eval_protocol/cli_commands/create_rft.py | 172 +++++++++++++++--------
 1 file changed, 110 insertions(+), 62 deletions(-)

diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py
index 49893690..26e663d1 100644
--- a/eval_protocol/cli_commands/create_rft.py
+++ b/eval_protocol/cli_commands/create_rft.py
@@ -321,6 +321,8 @@ def _build_trimmed_dataset_id(evaluator_id: str) -> str:
         if not base:
             base = "dataset"
     # Ensure first char is a letter
+    if not base:
+        base = "dataset"
     if not base[0].isalpha():
         base = f"eval-{base}"
         if len(base) > max_base_len:
@@ -449,76 +451,122 @@ def create_rft_command(args) -> int:
     # Resolve evaluator resource name to fully-qualified format required by API
     evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
 
+    # Optional short-circuit: if evaluator already exists and not forcing, skip upload path
+    skip_upload = False
+    if not force:
+        try:
+            headers = {
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
+                "User-Agent": get_user_agent(),
+            }
+            resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
+            if resp.ok:
+                state = resp.json().get("state", "STATE_UNSPECIFIED")
+                print(f"✓ Evaluator exists (state: {state}). Skipping upload (use --force to overwrite).")
+                # Poll for ACTIVE before proceeding
+                print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
+                if not _poll_evaluator_status(
+                    evaluator_resource_name=evaluator_resource_name,
+                    api_key=api_key,
+                    api_base=api_base,
+                    timeout_minutes=10,
+                ):
+                    app_base = _map_api_host_to_app_host(api_base)
+                    evaluator_slug = _extract_terminal_segment(evaluator_id)
+                    dashboard_url = f"{app_base}/dashboard/evaluators/{evaluator_slug}"
+                    print("\n❌ Evaluator is not ready within the timeout period.")
+                    print(f"📊 Please check the evaluator status at: {dashboard_url}")
+                    print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
+                    return 1
+                _save_last_evaluator(project_root, evaluator_id)
+                skip_upload = True
+        except requests.exceptions.RequestException:
+            pass
+
     # Ensure evaluator exists by invoking the upload flow programmatically
-    try:
-        from .upload import upload_command
+    if not skip_upload:
+        try:
+            from .upload import upload_command
 
-        tests = _discover_tests(project_root)
-        selected_entry: Optional[str] = None
-        if len(tests) == 1:
-            func_name = tests[0].qualname.split(".")[-1]
-            abs_path = os.path.abspath(tests[0].file_path)
-            try:
-                rel = os.path.relpath(abs_path, project_root)
-            except Exception:
-                rel = abs_path
-            selected_entry = f"{rel}::{func_name}"
-        else:
-            # Try to match evaluator_id to a discovered test's normalized ID
-            for t in tests:
-                func_name = t.qualname.split(".")[-1]
-                source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
-                candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
-                if candidate == evaluator_id:
-                    abs_path = os.path.abspath(t.file_path)
-                    try:
-                        rel = os.path.relpath(abs_path, project_root)
-                    except Exception:
-                        rel = abs_path
-                    selected_entry = f"{rel}::{func_name}"
-                    break
-
-        upload_args = argparse.Namespace(
-            path=project_root,
-            entry=selected_entry,
-            id=evaluator_id,
-            display_name=None,
-            description=None,
-            force=force,  # Pass through the --force flag
-            yes=True,
-            env_file=None,  # Add the new env_file parameter
-        )
+            tests = _discover_tests(project_root)
+            selected_entry: Optional[str] = None
+            if len(tests) == 1:
+                func_name = tests[0].qualname.split(".")[-1]
+                abs_path = os.path.abspath(tests[0].file_path)
+                try:
+                    rel = os.path.relpath(abs_path, project_root)
+                except Exception:
+                    rel = abs_path
+                selected_entry = f"{rel}::{func_name}"
+            else:
+                # Try to match evaluator_id to a discovered test's normalized ID
+                for t in tests:
+                    func_name = t.qualname.split(".")[-1]
+                    source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
+                    candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
+                    if candidate == evaluator_id:
+                        abs_path = os.path.abspath(t.file_path)
+                        try:
+                            rel = os.path.relpath(abs_path, project_root)
+                        except Exception:
+                            rel = abs_path
+                        selected_entry = f"{rel}::{func_name}"
+                        break
+                # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
+                if selected_entry is None:
+                    print(
+                        f"Error: Multiple evaluation tests found, and the selected evaluator_id {evaluator_id} does not match any discovered test.\n"
+                        "       Please re-run specifying the evaluator id.\n"
+                        "       Hints:\n"
+                        "         - eval-protocol create rft --evaluator-id <existing-evaluator-id>\n"
+                    )
+                    return 1
+
+            upload_args = argparse.Namespace(
+                path=project_root,
+                entry=selected_entry,
+                id=evaluator_id,
+                display_name=None,
+                description=None,
+                force=force,  # Pass through the --force flag
+                yes=True,
+                env_file=None,  # Add the new env_file parameter
+            )
 
-        if force:
-            print(f"🔄 Force flag enabled - will overwrite existing evaluator '{evaluator_id}'")
+            if force:
+                print(f"🔄 Force flag enabled - will overwrite existing evaluator '{evaluator_id}'")
 
-        rc = upload_command(upload_args)
-        if rc == 0:
-            print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
+            rc = upload_command(upload_args)
+            if rc == 0:
+                print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
 
-            # Poll for evaluator status
-            print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
-            is_active = _poll_evaluator_status(
-                evaluator_resource_name=evaluator_resource_name, api_key=api_key, api_base=api_base, timeout_minutes=10
-            )
+                # Poll for evaluator status
+                print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
+                is_active = _poll_evaluator_status(
+                    evaluator_resource_name=evaluator_resource_name,
+                    api_key=api_key,
+                    api_base=api_base,
+                    timeout_minutes=10,
+                )
 
-            if not is_active:
-                # Print helpful message with dashboard link
-                app_base = _map_api_host_to_app_host(api_base)
-                evaluator_slug = _extract_terminal_segment(evaluator_id)
-                dashboard_url = f"{app_base}/dashboard/evaluators/{evaluator_slug}"
+                if not is_active:
+                    # Print helpful message with dashboard link
+                    app_base = _map_api_host_to_app_host(api_base)
+                    evaluator_slug = _extract_terminal_segment(evaluator_id)
+                    dashboard_url = f"{app_base}/dashboard/evaluators/{evaluator_slug}"
 
-                print("\n❌ Evaluator is not ready within the timeout period.")
-                print(f"📊 Please check the evaluator status at: {dashboard_url}")
-                print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
-                return 1
+                    print("\n❌ Evaluator is not ready within the timeout period.")
+                    print(f"📊 Please check the evaluator status at: {dashboard_url}")
+                    print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
+                    return 1
+                else:
+                    # Only persist last-used evaluator after successful ensure + ACTIVE
+                    _save_last_evaluator(project_root, evaluator_id)
             else:
-                # Only persist last-used evaluator after successful ensure + ACTIVE
-                _save_last_evaluator(project_root, evaluator_id)
-        else:
-            print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
-    except Exception as e:
-        print(f"Warning: Failed to upload evaluator automatically: {e}")
+                print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
+        except Exception as e:
+            print(f"Warning: Failed to upload evaluator automatically: {e}")
 
     # Determine dataset id and materialization path
     dataset_id = getattr(args, "dataset_id", None)

From cca18e66bb48fa09b10b6f340b6c7f192da3a0d2 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sat, 8 Nov 2025 19:21:55 -0800
Subject: [PATCH 4/4] fix: fail fast on unmatched evaluator id only when multiple tests exist

---
 eval_protocol/cli_commands/create_rft.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py
index 26e663d1..8b6bce4d 100644
--- a/eval_protocol/cli_commands/create_rft.py
+++ b/eval_protocol/cli_commands/create_rft.py
@@ -513,15 +513,15 @@ def create_rft_command(args) -> int:
                             rel = abs_path
                         selected_entry = f"{rel}::{func_name}"
                         break
-                # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
-                if selected_entry is None:
-                    print(
-                        f"Error: Multiple evaluation tests found, and the selected evaluator_id {evaluator_id} does not match any discovered test.\n"
-                        "       Please re-run specifying the evaluator id.\n"
-                        "       Hints:\n"
-                        "         - eval-protocol create rft --evaluator-id <existing-evaluator-id>\n"
-                    )
-                    return 1
+            # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
+            if selected_entry is None and len(tests) > 1:
+                print(
+                    f"Error: Multiple evaluation tests found, and the selected evaluator_id {evaluator_id} does not match any discovered test.\n"
+                    "       Please re-run specifying the evaluator id.\n"
+                    "       Hints:\n"
+                    "         - eval-protocol create rft --evaluator-id <existing-evaluator-id>\n"
+                )
+                return 1
 
             upload_args = argparse.Namespace(
                 path=project_root,