2020 create_dataset_from_jsonl ,
2121 create_reinforcement_fine_tuning_job ,
2222)
23- from .upload import _discover_tests , _normalize_evaluator_id , _resolve_entry_to_qual_and_source
24-
25-
26- def _last_evaluator_paths (cwd : str ) -> list [str ]:
27- return [
28- os .path .join (cwd , ".eval_protocol" , "last_evaluator.json" ),
29- os .path .expanduser (os .path .join ("~" , ".eval_protocol" , "last_evaluator.json" )),
30- ]
31-
32-
def _load_last_evaluator(cwd: str) -> Optional[str]:
    """Return the evaluator id recorded by a previous run, if any.

    Scans the candidate pointer files in priority order; unreadable or
    malformed files are skipped silently (this lookup is best-effort).
    """
    import json

    for candidate in _last_evaluator_paths(cwd):
        if not os.path.isfile(candidate):
            continue
        try:
            with open(candidate, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
        except Exception:
            # Unreadable or corrupt file: fall through to the next candidate.
            continue
        if isinstance(payload, dict) and payload.get("evaluator_id"):
            return str(payload["evaluator_id"])
    return None
47-
48-
49- def _save_last_evaluator (cwd : str , evaluator_id : str ) -> None :
50- import json
51-
52- base = os .path .join (cwd , ".eval_protocol" )
53- try :
54- os .makedirs (base , exist_ok = True )
55- with open (os .path .join (base , "last_evaluator.json" ), "w" , encoding = "utf-8" ) as f :
56- json .dump ({"evaluator_id" : evaluator_id , "ts" : time .time ()}, f )
57- except Exception :
58- # best-effort only
59- pass
60-
61-
62- def _gather_evaluator_traces (cwd : str ) -> list [dict ]:
63- roots = [
64- os .path .join (cwd , ".eval_protocol" , "evaluators" ),
65- os .path .expanduser (os .path .join ("~" , ".eval_protocol" , "evaluators" )),
66- ]
67- records : list [dict ] = []
68- for root in roots :
69- if os .path .isdir (root ):
70- for name in os .listdir (root ):
71- if name .endswith (".json" ):
72- full = os .path .join (root , name )
73- try :
74- mtime = os .path .getmtime (full )
75- except Exception :
76- mtime = 0.0
77- records .append ({"id" : name [:- 5 ], "path" : full , "mtime" : mtime })
78- # dedupe by id keeping most recent mtime
79- dedup : dict [str , dict ] = {}
80- for rec in records :
81- cur = dedup .get (rec ["id" ])
82- if not cur or rec ["mtime" ] > cur ["mtime" ]:
83- dedup [rec ["id" ]] = rec
84- return list (dedup .values ())
85-
86-
def _prompt_select_evaluator(candidates: list[dict]) -> Optional[str]:
    """Interactively ask the user to pick one evaluator from *candidates*.

    Candidates are listed newest-first (by ``mtime``). Returns the chosen
    evaluator id, or None when the user cancels, enters an invalid choice,
    or no interactive stdin is available.
    """
    print("\nMultiple evaluators detected. Select one:")
    ordered = sorted(candidates, key=lambda x: -x["mtime"])
    for i, c in enumerate(ordered, start=1):
        print(f"  {i}) {c['id']} (from {c['path']})")
    try:
        choice = input("Enter a number (or press Enter to cancel): ").strip()
    except (KeyboardInterrupt, EOFError):
        # Fix: input() raises EOFError when stdin is closed or piped (e.g. in
        # CI); treat that like a cancel instead of crashing the CLI.
        print("\nCancelled.")
        return None
    if not choice or not choice.isdigit():
        return None
    n = int(choice)
    if 1 <= n <= len(ordered):
        sel = ordered[n - 1]["id"]
        print(f"✓ Using evaluator: {sel}")
        return sel
    return None
23+ from ..fireworks_rft import detect_dataset_builder , materialize_dataset_via_builder
24+ from .upload import _discover_tests , _normalize_evaluator_id , _prompt_select
10525
10626
10727def _ensure_account_id () -> Optional [str ]:
@@ -331,35 +251,35 @@ def _build_trimmed_dataset_id(evaluator_id: str) -> str:
331251 return f"{ base } { suffix } "
332252
333253
def _auto_select_evaluator_id(cwd: str, *, non_interactive: bool = False) -> Optional[str]:
    """Infer an evaluator id when the user did not pass one explicitly.

    Resolution order: the last-used pointer file, then cached evaluator
    traces (prompting on ambiguity unless *non_interactive*), and finally a
    single discovered evaluation test. Returns None when nothing matches.
    """
    # 1) Last-used pointer wins outright.
    remembered = _load_last_evaluator(cwd)
    if remembered:
        return remembered

    # 2) Evaluator traces cached in the project or home directory.
    traces = _gather_evaluator_traces(cwd)
    if len(traces) == 1:
        return traces[0]["id"]
    if len(traces) > 1:
        if not non_interactive:
            chosen = _prompt_select_evaluator(traces)
            return chosen if chosen else None
        sel = max(traces, key=lambda rec: rec["mtime"])["id"]
        print(f"⚠️ Multiple evaluators found; using most recent: {sel}. Override with --evaluator-id.")
        return sel

    # 3) Fall back to a single discovered evaluation test.
    tests = _discover_tests(cwd)
    if len(tests) != 1:
        return None
    func_name = tests[0].qualname.split(".")[-1]
    file_stem = os.path.splitext(os.path.basename(tests[0].file_path))[0]
    return _normalize_evaluator_id(f"{file_stem}-{func_name}")
254+ def _resolve_selected_test (
255+ project_root : str ,
256+ evaluator_id : Optional [ str ],
257+ selected_tests : Optional [ list ] = None ,
258+ ) -> tuple [ Optional [ str ], Optional [ str ]]:
259+ """
260+ Resolve a single test's source file path and function name to use downstream.
261+ Priority:
262+ 1) If selected_tests provided and length == 1, use it.
263+ 2) Else discover tests; if exactly one test, use it.
264+ 3) Else, if evaluator_id provided, match by normalized '<file-stem>-<func-name>'.
265+ Returns: (file_path, func_name) or (None, None) if unresolved.
266+ """
267+ try :
268+ tests = selected_tests if selected_tests is not None else _discover_tests ( project_root )
269+ if not tests :
270+ return None , None
271+ if len ( tests ) == 1 :
272+ return tests [ 0 ]. file_path , tests [ 0 ]. qualname . split ( "." )[ - 1 ]
273+ if evaluator_id :
274+ for t in tests :
275+ func_name = t . qualname . split ( "." )[ - 1 ]
276+ source_file_name = os . path . splitext ( os . path . basename ( t . file_path ))[ 0 ]
277+ candidate = _normalize_evaluator_id ( f" { source_file_name } - { func_name } " )
278+ if candidate == evaluator_id :
279+ return t . file_path , func_name
280+ return None , None
281+ except Exception :
282+ return None , None
363283
364284
365285def _poll_evaluator_status (
@@ -428,6 +348,9 @@ def create_rft_command(args) -> int:
428348 non_interactive : bool = bool (getattr (args , "yes" , False ))
429349 dry_run : bool = bool (getattr (args , "dry_run" , False ))
430350 force : bool = bool (getattr (args , "force" , False ))
351+ # Track the specifically chosen test (if any) to aid dataset inference later
352+ selected_test_file_path : Optional [str ] = None
353+ selected_test_func_name : Optional [str ] = None
431354
432355 api_key = get_fireworks_api_key ()
433356 if not api_key :
@@ -441,13 +364,52 @@ def create_rft_command(args) -> int:
441364
442365 api_base = get_fireworks_api_base ()
443366
444- # Resolve evaluator id if omitted
367+ # Resolve evaluator id/entry if omitted (reuse upload's selector flow)
445368 project_root = os .getcwd ()
446369 if not evaluator_id :
447- evaluator_id = _auto_select_evaluator_id (project_root , non_interactive = non_interactive )
448- if not evaluator_id :
449- print ("Error: Could not infer evaluator id. Provide --evaluator-id or run 'eval-protocol upload' first." )
370+ print ("Scanning for evaluation tests..." )
371+ tests = _discover_tests (project_root )
372+ if not tests :
373+ print ("No evaluation tests found." )
374+ print ("\n Hint: Make sure your tests use the @evaluation_test decorator." )
375+ return 1
376+ # Always interactive selection here (no implicit quiet unless --evaluator-id was provided)
377+ try :
378+ selected_tests = _prompt_select (tests , non_interactive = non_interactive )
379+ except Exception :
380+ print ("Error: Failed to open selector UI. Please pass --evaluator-id or --entry explicitly." )
381+ return 1
382+ if not selected_tests :
383+ print ("No tests selected." )
450384 return 1
385+ if len (selected_tests ) != 1 :
386+ if non_interactive and len (selected_tests ) > 1 :
387+ print ("Error: Multiple evaluation tests found in --yes (non-interactive) mode." )
388+ print (" Please pass --evaluator-id or --entry to disambiguate." )
389+ try :
390+ # Offer candidate evaluator ids for convenience
391+ tests = _discover_tests (project_root )
392+ if tests :
393+ print (" Candidate evaluator ids:" )
394+ for t in tests :
395+ func = t .qualname .split ("." )[- 1 ]
396+ stem = os .path .splitext (os .path .basename (t .file_path ))[0 ]
397+ cand = _normalize_evaluator_id (f"{ stem } -{ func } " )
398+ print (f" - { cand } " )
399+ except Exception :
400+ pass
401+ else :
402+ print ("Error: Please select exactly one evaluation test for 'create rft'." )
403+ return 1
404+ # Derive evaluator_id from user's single selection
405+ chosen = selected_tests [0 ]
406+ func_name = chosen .qualname .split ("." )[- 1 ]
407+ source_file_name = os .path .splitext (os .path .basename (chosen .file_path ))[0 ]
408+ evaluator_id = _normalize_evaluator_id (f"{ source_file_name } -{ func_name } " )
409+ # Resolve selected test once for downstream
410+ selected_test_file_path , selected_test_func_name = _resolve_selected_test (
411+ project_root , evaluator_id , selected_tests = selected_tests
412+ )
451413 # Resolve evaluator resource name to fully-qualified format required by API
452414 evaluator_resource_name = f"accounts/{ account_id } /evaluators/{ evaluator_id } "
453415
@@ -479,8 +441,12 @@ def create_rft_command(args) -> int:
479441 print (f"📊 Please check the evaluator status at: { dashboard_url } " )
480442 print (" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again." )
481443 return 1
482- _save_last_evaluator (project_root , evaluator_id )
483444 skip_upload = True
445+ # Populate selected test info for dataset inference later
446+ st_path , st_func = _resolve_selected_test (project_root , evaluator_id )
447+ if st_path and st_func :
448+ selected_test_file_path = st_path
449+ selected_test_func_name = st_func
484450 except requests .exceptions .RequestException :
485451 pass
486452
@@ -491,28 +457,16 @@ def create_rft_command(args) -> int:
491457
492458 tests = _discover_tests (project_root )
493459 selected_entry : Optional [str ] = None
494- if len ( tests ) == 1 :
495- func_name = tests [ 0 ]. qualname . split ( "." )[ - 1 ]
496- abs_path = os .path .abspath (tests [ 0 ]. file_path )
460+ st_path , st_func = _resolve_selected_test ( project_root , evaluator_id , selected_tests = tests )
461+ if st_path and st_func :
462+ abs_path = os .path .abspath (st_path )
497463 try :
498464 rel = os .path .relpath (abs_path , project_root )
499465 except Exception :
500466 rel = abs_path
501- selected_entry = f"{ rel } ::{ func_name } "
502- else :
503- # Try to match evaluator_id to a discovered test's normalized ID
504- for t in tests :
505- func_name = t .qualname .split ("." )[- 1 ]
506- source_file_name = os .path .splitext (os .path .basename (t .file_path ))[0 ]
507- candidate = _normalize_evaluator_id (f"{ source_file_name } -{ func_name } " )
508- if candidate == evaluator_id :
509- abs_path = os .path .abspath (t .file_path )
510- try :
511- rel = os .path .relpath (abs_path , project_root )
512- except Exception :
513- rel = abs_path
514- selected_entry = f"{ rel } ::{ func_name } "
515- break
467+ selected_entry = f"{ rel } ::{ st_func } "
468+ selected_test_file_path = st_path
469+ selected_test_func_name = st_func
516470 # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
517471 if selected_entry is None and len (tests ) > 1 :
518472 print (
@@ -561,8 +515,8 @@ def create_rft_command(args) -> int:
561515 print (" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again." )
562516 return 1
563517 else :
564- # Only persist last-used evaluator after successful ensure + ACTIVE
565- _save_last_evaluator ( project_root , evaluator_id )
518+ # Evaluator ACTIVE; proceed
519+ pass
566520 else :
567521 print ("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation." )
568522 except Exception as e :
@@ -575,30 +529,48 @@ def create_rft_command(args) -> int:
575529 dataset_builder = getattr (args , "dataset_builder" , None ) # accepted but unused in simplified flow
576530
577531 if not dataset_id :
578- # Prefer explicit --dataset-jsonl, else attempt to extract from data loader or input_dataset of the single discovered test
532+ # Prefer explicit --dataset-jsonl, else attempt to extract from the selected test's data loader or input_dataset.
579533 if not dataset_jsonl :
580- tests = _discover_tests (project_root )
581- if len (tests ) == 1 :
582- func_name = tests [0 ].qualname .split ("." )[- 1 ]
583- # Try data_loaders first (existing behavior)
584- dataset_jsonl = _extract_jsonl_from_dataloader (tests [0 ].file_path , func_name )
534+ # Use specifically selected test if available; else only infer when exactly one test exists
535+ test_file_for_infer = None
536+ func_for_infer = None
537+ if selected_test_file_path and selected_test_func_name :
538+ test_file_for_infer = selected_test_file_path
539+ func_for_infer = selected_test_func_name
540+ else :
541+ tests = _discover_tests (project_root )
542+ if len (tests ) == 1 :
543+ test_file_for_infer = tests [0 ].file_path
544+ func_for_infer = tests [0 ].qualname .split ("." )[- 1 ]
545+ if test_file_for_infer and func_for_infer :
546+ # Try data_loaders first
547+ dataset_jsonl = _extract_jsonl_from_dataloader (test_file_for_infer , func_for_infer )
585548 if dataset_jsonl :
586- # Display relative path for readability
587549 try :
588550 rel = os .path .relpath (dataset_jsonl , project_root )
589551 except Exception :
590552 rel = dataset_jsonl
591553 print (f"✓ Using JSONL from data loader: { rel } " )
592- else :
554+ if not dataset_jsonl :
593555 # Fall back to input_dataset (dataset_path)
594- dataset_jsonl = _extract_jsonl_from_input_dataset (tests [ 0 ]. file_path , func_name )
556+ dataset_jsonl = _extract_jsonl_from_input_dataset (test_file_for_infer , func_for_infer )
595557 if dataset_jsonl :
596- # Display relative path for readability
597558 try :
598559 rel = os .path .relpath (dataset_jsonl , project_root )
599560 except Exception :
600561 rel = dataset_jsonl
601562 print (f"✓ Using JSONL from input_dataset: { rel } " )
563+ if not dataset_jsonl :
564+ # Last resort: attempt to detect and run a dataset builder in the test's directory
565+ metric_dir = os .path .dirname (test_file_for_infer )
566+ builder_spec = detect_dataset_builder (metric_dir )
567+ if builder_spec :
568+ try :
569+ tmp_jsonl , count = materialize_dataset_via_builder (builder_spec )
570+ dataset_jsonl = tmp_jsonl
571+ print (f"✓ Materialized { count } rows via dataset builder: { builder_spec } " )
572+ except Exception as e :
573+ print (f"Warning: dataset builder failed: { e } " )
602574 if not dataset_jsonl :
603575 print (
604576 "Error: Could not determine dataset. Provide --dataset-id or --dataset-jsonl, or ensure a JSONL-based data loader or input_dataset is used in your single discovered test."
0 commit comments