Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions src/aggregators/aggregate_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,20 @@ def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]:
results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

# Current layout: results/<exp>/<model>__<service>/run-N/<category>__<task>/
# Some pipelines include task-set suffix in service dir (e.g., "filesystem-easy").
# Normalize such names back to canonical service keys used by tasks/ (filesystem, github, notion, playwright, postgres).

def normalize_service_name(name: str) -> str:
    """Return the canonical service key for a service directory name.

    A trailing task-set suffix ("-easy" or "-standard") is stripped, and
    the variant name "playwright_webarena" is mapped to "playwright".
    Any other name is returned unchanged.

    Args:
        name: Service part of a result directory name, e.g.
            "filesystem-easy".

    Returns:
        The normalized service name, e.g. "filesystem".
    """
    # Task-set suffixes that some pipelines append to the service name.
    task_set_suffixes = ("-easy", "-standard")
    # Variant service names and the canonical service they belong to.
    variant_aliases = {"playwright_webarena": "playwright"}

    base = name
    # str.endswith accepts a tuple, so one call covers every known suffix.
    if name.endswith(task_set_suffixes):
        # Drop only the final "-<task-set>" segment; earlier hyphens stay.
        base = name.rsplit("-", 1)[0]

    return variant_aliases.get(base, base)
for model_service_dir in exp_dir.iterdir():
if not model_service_dir.is_dir() or "__" not in model_service_dir.name:
continue
Expand Down Expand Up @@ -944,13 +958,15 @@ def main():
# Print validation report with summary table
print_validation_report(complete_models, incomplete_models, invalid_models,
all_tasks, args.k, single_run_models, results)

if not complete_models:

# Determine which models to include in output (strict: only complete models)
models_for_output = dict(complete_models)
if not models_for_output:
return 1

# Calculate metrics
print("\n📊 Calculating metrics...")
summary = calculate_metrics(complete_models, all_tasks, args.k, single_run_models)
summary = calculate_metrics(models_for_output, all_tasks, args.k, single_run_models)
summary["experiment_name"] = args.exp_name
summary["task_set"] = args.task_set

Expand All @@ -962,12 +978,12 @@ def main():

# Generate model_results
print("📁 Generating model_results...")
generate_model_results(exp_dir, complete_models, all_tasks)
print(f" Created {len(complete_models)} model directories")
generate_model_results(exp_dir, models_for_output, all_tasks)
print(f" Created {len(models_for_output)} model directories")

# Generate task_results
print("📁 Generating task_results...")
generate_task_results(exp_dir, complete_models, all_tasks)
generate_task_results(exp_dir, models_for_output, all_tasks)
print(f" Created {total_tasks} task files")

# Generate README
Expand Down
4 changes: 0 additions & 4 deletions src/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,10 @@ def standardize_error_message(error: str, mcp_service: Optional[str] = None) ->
"connection refused" in error_str.lower() or "econnrefused" in error_str.lower()
):
base_msg = "Connection refused"
elif "authentication" in error_str.lower() or "unauthorized" in error_str.lower():
base_msg = "Authentication failed"
elif "not found" in error_str.lower():
base_msg = "Resource not found"
elif "already exists" in error_str.lower():
base_msg = "Resource already exists"
elif "mcp" in error_str.lower() and "error" in error_str.lower():
base_msg = "MCP service error"
else:
# Return original message if no standardization applies
return error_str
Expand Down