eval-sys · xyliugo · Dec 2, 2025 · Nov 17, 2025 · Dec 2, 2025 · Dec 2, 2025
diff --git a/src/aggregators/aggregate_results.py b/src/aggregators/aggregate_results.py
@@ -84,6 +84,20 @@ def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]:
     results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
 
     # Current layout: results/<exp>/<model>__<service>/run-N/<category>__<task>/
+    # Some pipelines include task-set suffix in service dir (e.g., "filesystem-easy").
+    # Normalize such names back to canonical service keys used by tasks/ (filesystem, github, notion, playwright, postgres).
+
+    def normalize_service_name(name: str) -> str:
+        # Strip known task-set suffixes like "-easy" or "-standard"
+        if name.endswith("-easy") or name.endswith("-standard"):
+            base = name.rsplit("-", 1)[0]
+        else:
+            base = name
+
+        # Map variant names to canonical service
+        if base == "playwright_webarena":
+            return "playwright"
+        return base
     for model_service_dir in exp_dir.iterdir():
         if not model_service_dir.is_dir() or "__" not in model_service_dir.name:
             continue
@@ -944,13 +958,15 @@ def main():
     # Print validation report with summary table
     print_validation_report(complete_models, incomplete_models, invalid_models, 
                            all_tasks, args.k, single_run_models, results)
-
-    if not complete_models:
+
+    # Determine which models to include in output (strict: only complete models)
+    models_for_output = dict(complete_models)
+    if not models_for_output:
         return 1
 
     # Calculate metrics
     print("\n📊 Calculating metrics...")
-    summary = calculate_metrics(complete_models, all_tasks, args.k, single_run_models)
+    summary = calculate_metrics(models_for_output, all_tasks, args.k, single_run_models)
     summary["experiment_name"] = args.exp_name
     summary["task_set"] = args.task_set
 
@@ -962,12 +978,12 @@ def main():
 
     # Generate model_results
     print("📁 Generating model_results...")
-    generate_model_results(exp_dir, complete_models, all_tasks)
-    print(f"  Created {len(complete_models)} model directories")
+    generate_model_results(exp_dir, models_for_output, all_tasks)
+    print(f"  Created {len(models_for_output)} model directories")
 
     # Generate task_results
     print("📁 Generating task_results...")
-    generate_task_results(exp_dir, complete_models, all_tasks)
+    generate_task_results(exp_dir, models_for_output, all_tasks)
     print(f"  Created {total_tasks} task files")
 
     # Generate README

diff --git a/src/errors.py b/src/errors.py
@@ -46,14 +46,10 @@ def standardize_error_message(error: str, mcp_service: Optional[str] = None) ->
         "connection refused" in error_str.lower() or "econnrefused" in error_str.lower()
     ):
         base_msg = "Connection refused"
-    elif "authentication" in error_str.lower() or "unauthorized" in error_str.lower():
-        base_msg = "Authentication failed"
     elif "not found" in error_str.lower():
         base_msg = "Resource not found"
     elif "already exists" in error_str.lower():
         base_msg = "Resource already exists"
-    elif "mcp" in error_str.lower() and "error" in error_str.lower():
-        base_msg = "MCP service error"
     else:
         # Return original message if no standardization applies
         return error_str