@@ -86,7 +86,7 @@ def _setup_metrics(self):
8686 registry = MetricsRegistry ()
8787 return registry
8888
89- def _prepare_benchmark (self , benchmark_name , data_path , limit , agent_system , agent_config , verbose ):
89+ def _prepare_benchmark (self , benchmark_name , data_path , limit , agent_system , agent_config , verbose , data_id = None ):
9090 """
9191 Run a benchmark with the specified configuration.
9292
@@ -131,6 +131,15 @@ def _prepare_benchmark(self, benchmark_name, data_path, limit, agent_system, age
131131 except FileNotFoundError :
132132 raise FileNotFoundError (f"Data file not found: { data_path } " )
133133
# Optionally restrict the run to a single task selected by ``data_id``.
if data_id:
    # The benchmark config declares which raw field holds the task id
    # (problems were just loaded from disk; normalization has not
    # happened yet at this point — TODO confirm against the loader).
    primary_id = benchmark_config.get("normalization_keys", {}).get("id", None)
    if primary_id is not None:
        for problem in problems:
            # Fix: compare against the configured id field instead of a
            # hard-coded "task_id" key (primary_id was fetched but never
            # used). Coerce both sides to str so int ids from JSON match
            # string CLI arguments.
            if str(problem[primary_id]) == str(data_id):
                problems = [problem]
                break

# Down-sample when the caller asked for fewer problems than are available.
if limit and limit < len(problems):
    problems = random.sample(problems, limit)
@@ -364,16 +373,16 @@ def _run_failure_attribution(self, all_results, agent_system, verbose):
print(f"  --output_dir {failure_output_dir}")
# print("-" * 80)
rprint("\n[bold]Alternative analysis methods:[/bold]")
# Fix: the "\n" belongs *before* the "#" so each hint starts on its own
# line (matching the step_by_step hint below); "#\n For ..." printed a
# stray "#" and broke the comment text mid-line.
print("\n# For comprehensive analysis:")
print(f"python {failure_inference_script} --method all_at_once --model gpt-4.1 --directory_path {failed_responses_dir} --output_dir {failure_output_dir}")
print("\n# For efficient error localization in long conversations:")
print(f"python {failure_inference_script} --method binary_search --model gpt-4.1 --directory_path {failed_responses_dir} --output_dir {failure_output_dir}")
print("\n # For detailed incremental analysis:")
print(f"python {failure_inference_script} --method step_by_step --model gpt-4.1 --directory_path {failed_responses_dir} --output_dir {failure_output_dir}")

print("=" * 80)
def run(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True, data_id=None):
    """
    Run a benchmark sequentially. This is a wrapper around arun.
    """
    # Delegate to the async entry point with concurrency pinned to 1 so
    # problems are processed one at a time.
    return asyncio.run(self.arun(
        benchmark_name=benchmark_name,
        data_path=data_path,
        limit=limit,
        agent_system=agent_system,
        agent_config=agent_config,
        verbose=verbose,
        data_id=data_id,
        concurrency=1,  # Run sequentially
    ))
390- async def arun (self , benchmark_name = "math" , data_path = None , limit = None , agent_system = "single_agent" , agent_config = None , verbose = True , concurrency = 10 ):
400+ async def arun (self , benchmark_name = "math" , data_path = None , limit = None , agent_system = "single_agent" , agent_config = None , verbose = True , data_id = None , concurrency = 10 ):
391401 agent , problems , benchmark_config , output_file = self ._prepare_benchmark (
392- benchmark_name , data_path , limit , agent_system , agent_config , verbose
402+ benchmark_name , data_path , limit , agent_system , agent_config , verbose , data_id
393403 )
394404
395405 if verbose :
0 commit comments