diff --git a/src/aggregators/aggregate_results.py b/src/aggregators/aggregate_results.py index 6180b55b..04a6cca9 100755 --- a/src/aggregators/aggregate_results.py +++ b/src/aggregators/aggregate_results.py @@ -84,6 +84,20 @@ def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]: results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) # Current layout: results//__/run-N/__/ + # Some pipelines include task-set suffix in service dir (e.g., "filesystem-easy"). + # Normalize such names back to canonical service keys used by tasks/ (filesystem, github, notion, playwright, postgres). + + def normalize_service_name(name: str) -> str: + # Strip known task-set suffixes like "-easy" or "-standard" + if name.endswith("-easy") or name.endswith("-standard"): + base = name.rsplit("-", 1)[0] + else: + base = name + + # Map variant names to canonical service + if base == "playwright_webarena": + return "playwright" + return base for model_service_dir in exp_dir.iterdir(): if not model_service_dir.is_dir() or "__" not in model_service_dir.name: continue @@ -944,13 +958,15 @@ def main(): # Print validation report with summary table print_validation_report(complete_models, incomplete_models, invalid_models, all_tasks, args.k, single_run_models, results) - - if not complete_models: + + # Determine which models to include in output (strict: only complete models) + models_for_output = dict(complete_models) + if not models_for_output: return 1 # Calculate metrics print("\nšŸ“Š Calculating metrics...") - summary = calculate_metrics(complete_models, all_tasks, args.k, single_run_models) + summary = calculate_metrics(models_for_output, all_tasks, args.k, single_run_models) summary["experiment_name"] = args.exp_name summary["task_set"] = args.task_set @@ -962,12 +978,12 @@ def main(): # Generate model_results print("šŸ“ Generating model_results...") - generate_model_results(exp_dir, complete_models, all_tasks) - print(f" Created {len(complete_models)} model directories") + generate_model_results(exp_dir, models_for_output, all_tasks) + print(f" Created {len(models_for_output)} model directories") # Generate task_results print("šŸ“ Generating task_results...") - generate_task_results(exp_dir, complete_models, all_tasks) + generate_task_results(exp_dir, models_for_output, all_tasks) print(f" Created {total_tasks} task files") # Generate README diff --git a/src/errors.py b/src/errors.py index d39d5f05..759ba5b9 100644 --- a/src/errors.py +++ b/src/errors.py @@ -46,14 +46,10 @@ def standardize_error_message(error: str, mcp_service: Optional[str] = None) -> "connection refused" in error_str.lower() or "econnrefused" in error_str.lower() ): base_msg = "Connection refused" - elif "authentication" in error_str.lower() or "unauthorized" in error_str.lower(): - base_msg = "Authentication failed" elif "not found" in error_str.lower(): base_msg = "Resource not found" elif "already exists" in error_str.lower(): base_msg = "Resource already exists" - elif "mcp" in error_str.lower() and "error" in error_str.lower(): - base_msg = "MCP service error" else: # Return original message if no standardization applies return error_str