#!/usr/bin/env python3
# Provenance: reconstructed from patch 572dc051d2ccb7db91af0b6890c5718e9c6c569b
# ("Add SWE-bench run status checker / eval debugging tool", Cursor Agent,
# Tue, 17 Feb 2026 19:24:30 +0000, Co-authored-by: Derek Xu), which creates
# check_run_status.py.
"""
SWE-bench run status checker / eval debugging tool.

Usage:
    # With API key from environment:
    export FIREWORKS_API_KEY="your-key"
    python check_run_status.py pyroworks-d0xzrqmi

    # Or pass key inline:
    FIREWORKS_API_KEY="your-key" python check_run_status.py pyroworks-d0xzrqmi

    # Check remote server invocations:
    python check_run_status.py --server http://35.209.134.123:3000

    # Check specific invocation on remote server:
    python check_run_status.py --server http://35.209.134.123:3000 pyroworks-d0xzrqmi
"""

import argparse
import json
import os
import sys
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from urllib.parse import quote

import requests

# Value of rollout_status["code"] that marks a trace row as errored.
_ROLLOUT_ERROR_CODE = 5

# Query window shared by the raw tracing endpoints and the adapter fallback.
_TRACE_LIMIT = 500
_TRACE_HOURS_BACK = 720  # 30 days


def _as_float(value: Any) -> float:
    """Return *value* as a float, or 0.0 when it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def _fetch_json(url: str, timeout: float) -> Any:
    """GET *url* and return the decoded JSON body.

    The HTTP status is intentionally not checked: the server reports some
    conditions (e.g. ``status == "not_found"``) inside the JSON body itself.
    """
    return requests.get(url, timeout=timeout).json()


def _format_invocation(inv: Any, invocation_id: Optional[str]) -> str:
    """Build the listing line for one entry of the ``/invocations`` response."""
    if not isinstance(inv, dict):
        # Unexpected shape: show it raw instead of aborting the whole listing.
        return f" {inv}"
    raw_name = inv.get("name", "unknown")
    name = "unknown" if raw_name is None else str(raw_name)
    mtime = inv.get("mtime", "")
    # A non-numeric size must not abort the listing with a format error.
    size = _as_float(inv.get("size_mb", 0))
    empty = inv.get("empty_responses", "N/A")
    marker = " <-- MATCH" if invocation_id and invocation_id in name else ""
    return f" {name:35s} modified={mtime} size={size:.1f}MB empty_responses={empty}{marker}"


def _summarize_traces(traces: List[Any]) -> Dict[str, Any]:
    """Count completed / running / errored / resolved rows in raw trace dicts.

    A row is errored when its rollout status code equals
    ``_ROLLOUT_ERROR_CODE``, completed when it carries a non-empty
    ``evaluation_result``, and running otherwise. A completed row counts as
    resolved when its score is greater than zero. Missing, null or
    non-numeric scores are treated as 0.0 rather than raising.
    """
    completed = running = errored = resolved = 0
    total_score = 0.0

    for trace in traces:
        row = trace if isinstance(trace, dict) else {}
        eval_result = row.get("evaluation_result") or {}
        rollout_status = row.get("rollout_status") or {}

        code = rollout_status.get("code", 0) if isinstance(rollout_status, dict) else 0
        if code == _ROLLOUT_ERROR_CODE:
            errored += 1
        elif eval_result:
            completed += 1
            raw_score = eval_result.get("score", 0) if isinstance(eval_result, dict) else 0
            score = _as_float(raw_score)
            total_score += score
            if score > 0:
                resolved += 1
        else:
            running += 1

    return {
        "total": len(traces),
        "completed": completed,
        "running": running,
        "errored": errored,
        "resolved": resolved,
        "total_score": total_score,
    }


def _print_summary(
    total: int,
    completed: int,
    running: int,
    errored: int,
    resolved: int,
    total_score: float,
) -> None:
    """Print the run summary; ratios are only shown when something completed."""
    print("\n Summary:")
    print(f" Total rows: {total}")
    print(f" Completed: {completed}")
    print(f" Running: {running}")
    print(f" Errored: {errored}")
    if completed:
        print(f" Resolved: {resolved}/{completed} ({resolved/completed*100:.1f}%)")
        print(f" Avg score: {total_score/completed:.4f}")
    else:
        print(" Resolved: N/A")


def check_remote_server(server_url: str, invocation_id: Optional[str] = None) -> None:
    """Check invocations on a remote SWE-bench server.

    Queries ``/health``, ``/active`` and ``/invocations`` and prints what
    they return. When *invocation_id* is given, matching entries in the
    listing are marked and the per-invocation ``/logs/<id>/...`` debug
    endpoints are queried as well.

    Every section is best-effort: failures are printed, never raised. Only
    a failed health check stops the remaining queries.
    """
    base = server_url.rstrip("/")

    # Health check
    try:
        health = _fetch_json(f"{base}/health", timeout=5)
        print(f"Server health: {health}")
    except Exception as e:  # best-effort diagnostic tool: report and stop
        print(f"Server unreachable: {e}")
        return

    # Active invocations
    try:
        active = _fetch_json(f"{base}/active", timeout=5)
        count = active.get("count", 0)
        print(f"\nActive invocations: {count}")
        for inv in active.get("invocations", []):
            print(f" - {inv}")
    except Exception as e:  # best-effort: keep going with the next section
        print(f"Error fetching active: {e}")

    # All invocations
    try:
        data = _fetch_json(f"{base}/invocations", timeout=30)
        all_inv = data.get("invocations", [])
        print(f"\nAll invocations ({data.get('count', len(all_inv))}):")
        for inv in all_inv:
            print(_format_invocation(inv, invocation_id))
    except Exception as e:  # best-effort: keep going with the next section
        print(f"Error fetching invocations: {e}")

    # If specific invocation requested, check debug endpoints
    if invocation_id:
        print(f"\nDebug info for invocation: {invocation_id}")
        # Percent-encode the id so characters like "/" or "?" cannot change
        # which path is requested.
        safe_id = quote(invocation_id, safe="")
        for endpoint in ("empty_responses", "llm_calls"):
            try:
                data = _fetch_json(f"{base}/logs/{safe_id}/{endpoint}", timeout=10)
                if data.get("status", "unknown") == "not_found":
                    print(f" {endpoint}: not found on this server")
                else:
                    entries = data.get("entries", [])
                    print(f" {endpoint}: {len(entries)} entries")
            except Exception as e:  # best-effort: try the next endpoint
                print(f" {endpoint}: error - {e}")


def check_fireworks_tracing(invocation_id: str, api_key: Optional[str] = None) -> None:
    """Check run status via Fireworks tracing API.

    The key comes from *api_key* or the ``FIREWORKS_API_KEY`` environment
    variable; without one the check is skipped. The base URL can be
    overridden with ``FW_TRACING_GATEWAY_BASE_URL``.

    The raw trace endpoints are tried in order and the first one that
    returns traces is summarized. If none does, the optional
    ``eval_protocol`` adapter is used as a fallback. Nothing is raised to
    the caller for network or decoding failures; they are printed.
    """
    api_key = api_key or os.environ.get("FIREWORKS_API_KEY")
    base_url = os.environ.get("FW_TRACING_GATEWAY_BASE_URL", "https://tracing.fireworks.ai")

    if not api_key:
        print("\nFireworks Tracing API: FIREWORKS_API_KEY not set, skipping trace query.")
        print(" Set FIREWORKS_API_KEY to enable querying evaluation results.")
        return

    headers = {
        "Authorization": f"Bearer {api_key}",
        "User-Agent": "eval-protocol-debug/1.0",
    }

    # Query traces by invocation_id tag
    print(f"\nQuerying Fireworks tracing for invocation_id={invocation_id}...")
    tags = [f"invocation_id:{invocation_id}"]
    params = {
        "tags": tags,
        "limit": _TRACE_LIMIT,
        "hours_back": _TRACE_HOURS_BACK,
    }

    for endpoint in ("/v1/traces/pointwise", "/v1/traces"):
        try:
            r = requests.get(f"{base_url}{endpoint}", params=params, headers=headers, timeout=60)
            if r.status_code == 401:
                print(f" {endpoint}: Auth failed (401). Check your API key.")
                continue
            if r.status_code == 404:
                # This gateway does not expose the endpoint; try the next one.
                continue
            r.raise_for_status()
            data = r.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decode failures on requests versions
            # whose decode error is not a RequestException.
            print(f" {endpoint}: Error - {e}")
            continue

        traces = data.get("traces", []) if isinstance(data, dict) else []
        print(f" {endpoint}: Found {len(traces)} traces")

        if traces:
            stats = _summarize_traces(traces)
            _print_summary(
                stats["total"],
                stats["completed"],
                stats["running"],
                stats["errored"],
                stats["resolved"],
                stats["total_score"],
            )
            return

    # No raw endpoint yielded traces: fall back to the eval_protocol adapter.
    print("\nChecking tracing logs...")
    try:
        from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter

        adapter = FireworksTracingAdapter(base_url=base_url)

        # Try to get evaluation rows
        rows = adapter.get_evaluation_rows(
            tags=tags,
            limit=_TRACE_LIMIT,
            hours_back=_TRACE_HOURS_BACK,
        )

        if rows:
            print(f" Found {len(rows)} evaluation rows")
            scores = [_as_float(row.evaluation_result.score) for row in rows if row.evaluation_result]
            completed = sum(1 for row in rows if row.evaluation_result is not None)
            resolved = sum(1 for score in scores if score > 0)
            running = sum(
                1 for row in rows
                if row.evaluation_result is None and not row.rollout_status.is_error()
            )
            errored = sum(1 for row in rows if row.rollout_status.is_error())

            _print_summary(len(rows), completed, running, errored, resolved, sum(scores))

            # Print per-row details
            print("\n Per-row details:")
            for row in rows:
                # str() keeps the ">3s" format spec valid for non-string ids.
                row_id = str(row.input_metadata.row_id or "?")
                if row.evaluation_result:
                    score = _as_float(row.evaluation_result.score)
                    reason = str(row.evaluation_result.reason or "")
                    status = "RESOLVED" if score > 0 else "FAILED"
                    print(f" row={row_id:>3s} {status:8s} score={score:.1f} {reason[:60]}")
                elif row.rollout_status.is_error():
                    print(f" row={row_id:>3s} ERROR {row.rollout_status.message or ''}")
                else:
                    print(f" row={row_id:>3s} RUNNING")
        else:
            print(f" No evaluation rows found for invocation_id={invocation_id}")

    except ImportError:
        print(" eval_protocol not installed, skipping adapter query")
    except Exception as e:  # best-effort: the adapter is an optional extra
        print(f" Error querying evaluation rows: {e}")


def main() -> None:
    """Parse the command line and run the requested status checks."""
    parser = argparse.ArgumentParser(description="SWE-bench run status checker")
    parser.add_argument("invocation_id", nargs="?", help="Invocation ID to check (e.g., pyroworks-d0xzrqmi)")
    parser.add_argument("--server", default="http://35.209.134.123:3000",
                        help="Remote SWE-bench server URL (default: http://35.209.134.123:3000)")
    parser.add_argument("--api-key", help="Fireworks API key (or set FIREWORKS_API_KEY env var)")
    parser.add_argument("--skip-server", action="store_true", help="Skip remote server check")
    parser.add_argument("--skip-tracing", action="store_true", help="Skip Fireworks tracing check")
    args = parser.parse_args()

    # Without an id only the server listing can run, so skipping it too
    # would leave nothing to do.
    if not args.invocation_id and args.skip_server:
        parser.error("invocation_id is required when --skip-server is used")

    separator = "=" * 50

    print("SWE-bench Run Status Checker")
    print(separator)
    if args.invocation_id:
        print(f"Invocation ID: {args.invocation_id}")
    # datetime.utcnow() is deprecated; take an aware UTC time and render the
    # offset as the same trailing "Z" as before.
    timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    print(f"Timestamp: {timestamp}")

    # Check remote server
    if not args.skip_server:
        print(f"\n{separator}")
        print(f"Remote Server: {args.server}")
        print(separator)
        check_remote_server(args.server, args.invocation_id)

    # Check Fireworks tracing
    if args.invocation_id and not args.skip_tracing:
        print(f"\n{separator}")
        print("Fireworks Tracing API")
        print(separator)
        check_fireworks_tracing(args.invocation_id, args.api_key)


if __name__ == "__main__":
    main()