From 572dc051d2ccb7db91af0b6890c5718e9c6c569b Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Tue, 17 Feb 2026 19:24:30 +0000
Subject: [PATCH] Add SWE-bench run status checker / eval debugging tool

Co-authored-by: Derek Xu <xzrderek@users.noreply.github.com>
---
 check_run_status.py | 250 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)
 create mode 100644 check_run_status.py

diff --git a/check_run_status.py b/check_run_status.py
new file mode 100644
index 0000000..d2a72f1
--- /dev/null
+++ b/check_run_status.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""
+SWE-bench run status checker / eval debugging tool.
+
+Usage:
+    # With API key from environment:
+    export FIREWORKS_API_KEY="your-key"
+    python check_run_status.py pyroworks-d0xzrqmi
+
+    # Or pass key inline:
+    FIREWORKS_API_KEY="your-key" python check_run_status.py pyroworks-d0xzrqmi
+
+    # Check remote server invocations:
+    python check_run_status.py --server http://35.209.134.123:3000
+
+    # Check specific invocation on remote server:
+    python check_run_status.py --server http://35.209.134.123:3000 pyroworks-d0xzrqmi
+"""
+
+import argparse
+import json
+import os
+import sys
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+import requests
+
+
+def check_remote_server(server_url: str, invocation_id: Optional[str] = None) -> None:
+    """Print a best-effort status report for a remote SWE-bench server.
+
+    Queries ``/health``, ``/active``, ``/invocations`` and, with ``invocation_id``,
+    ``/logs/<id>/...``; only a failed health check aborts, other errors are printed.
+
+    Args:
+        server_url: Base URL of the server; trailing slashes are ignored.
+        invocation_id: Optional ID; listed names containing it are marked.
+    """
+    base = server_url.rstrip("/")
+    # Health check
+    try:
+        print(f"Server health: {requests.get(base + '/health', timeout=5).json()}")
+    except Exception as e:
+        print(f"Server unreachable: {e}")
+        return
+
+    # Active invocations
+    try:
+        active = requests.get(f"{base}/active", timeout=5).json()
+        print(f"\nActive invocations: {active.get('count', 0)}")
+        for inv in active.get("invocations", []):
+            print(f"  - {inv}")
+    except Exception as e:
+        print(f"Error fetching active: {e}")
+
+    # All invocations
+    try:
+        data = requests.get(f"{base}/invocations", timeout=30).json()
+        all_inv = data.get("invocations", [])
+        print(f"\nAll invocations ({data.get('count', len(all_inv))}):")
+        for inv in all_inv:
+            # `or` fallbacks: a JSON null would break the format specs and abort the listing.
+            name = str(inv.get("name") or "unknown")
+            size = float(inv.get("size_mb") or 0)
+            mtime = inv.get("mtime", "")
+            empty = inv.get("empty_responses", "N/A")
+            marker = " <-- MATCH" if invocation_id and invocation_id in name else ""
+            print(f"  {name:35s}  modified={mtime}  size={size:.1f}MB  empty_responses={empty}{marker}")
+    except Exception as e:
+        print(f"Error fetching invocations: {e}")
+
+    # If specific invocation requested, check debug endpoints
+    if invocation_id:
+        print(f"\nDebug info for invocation: {invocation_id}")
+        for endpoint in ["empty_responses", "llm_calls"]:
+            try:
+                data = requests.get(f"{base}/logs/{invocation_id}/{endpoint}", timeout=10).json()
+                if data.get("status", "unknown") == "not_found":
+                    print(f"  {endpoint}: not found on this server")
+                else:
+                    print(f"  {endpoint}: {len(data.get('entries', []))} entries")
+            except Exception as e:
+                print(f"  {endpoint}: error - {e}")
+
+
+def check_fireworks_tracing(invocation_id: str, api_key: Optional[str] = None) -> None:
+    """Print a pass/fail summary of a run from the Fireworks tracing API.
+
+    Tries the REST trace endpoints first; if none returns traces, falls back to
+    ``FireworksTracingAdapter`` from ``eval_protocol``. Returns early without a key.
+
+    Args:
+        invocation_id: Run ID, sent as the tag ``invocation_id:<id>``.
+        api_key: API key; defaults to the ``FIREWORKS_API_KEY`` env variable.
+    """
+    api_key = api_key or os.environ.get("FIREWORKS_API_KEY")
+    base_url = os.environ.get("FW_TRACING_GATEWAY_BASE_URL", "https://tracing.fireworks.ai")
+
+    if not api_key:
+        print("\nFireworks Tracing API: FIREWORKS_API_KEY not set, skipping trace query.")
+        print("  Set FIREWORKS_API_KEY to enable querying evaluation results.")
+        return
+
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "User-Agent": "eval-protocol-debug/1.0",
+    }
+
+    # Query traces by invocation_id tag
+    print(f"\nQuerying Fireworks tracing for invocation_id={invocation_id}...")
+    params = {
+        "tags": [f"invocation_id:{invocation_id}"],
+        "limit": 500,
+        "hours_back": 720,  # 30 days
+    }
+
+    for endpoint in ["/v1/traces/pointwise", "/v1/traces"]:
+        try:
+            r = requests.get(f"{base_url}{endpoint}", params=params, headers=headers, timeout=60)
+            if r.status_code == 401:
+                print(f"  {endpoint}: Auth failed (401). Check your API key.")
+                continue
+            if r.status_code == 404:
+                continue
+            r.raise_for_status()
+            data = r.json()
+            traces = data.get("traces", [])
+            print(f"  {endpoint}: Found {len(traces)} traces")
+
+            if traces:
+                # Tally rows: error status wins, then "has a result", else still running.
+                completed = running = errored = resolved = 0
+                total_score = 0.0
+                for trace in traces:
+                    row_data = trace if isinstance(trace, dict) else {}
+                    eval_result = row_data.get("evaluation_result") or {}
+                    status_code = (row_data.get("rollout_status") or {}).get("code", 0)
+                    if status_code == 5:  # ERROR
+                        errored += 1
+                    elif eval_result:
+                        completed += 1
+                        # A null score must count as 0 instead of raising TypeError.
+                        score = eval_result.get("score") or 0
+                        total_score += score
+                        if score > 0:
+                            resolved += 1
+                    else:
+                        running += 1
+
+                print("\n  Summary:")
+                print(f"    Total rows: {len(traces)}")
+                print(f"    Completed:  {completed}")
+                print(f"    Running:    {running}")
+                print(f"    Errored:    {errored}")
+                print(f"    Resolved:   {resolved}/{completed} ({resolved/completed*100:.1f}%)" if completed else "    Resolved:   N/A")
+                if completed > 0:
+                    print(f"    Avg score:  {total_score/completed:.4f}")
+                return
+        # ValueError: r.json() on a non-JSON body raises it on older `requests` releases.
+        except (requests.exceptions.RequestException, ValueError) as e:
+            print(f"  {endpoint}: Error - {e}")
+
+    # No traces from the REST endpoints: fall back to the eval_protocol adapter.
+    print("\nChecking tracing logs...")
+    try:
+        from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter
+        adapter = FireworksTracingAdapter(base_url=base_url)
+        # Try to get evaluation rows
+        rows = adapter.get_evaluation_rows(
+            tags=[f"invocation_id:{invocation_id}"],
+            limit=500,
+            hours_back=720,
+        )
+
+        if rows:
+            print(f"  Found {len(rows)} evaluation rows")
+            completed = sum(1 for r in rows if r.evaluation_result is not None)
+            resolved = sum(1 for r in rows if r.evaluation_result and r.evaluation_result.score > 0)
+            running = sum(1 for r in rows if r.evaluation_result is None and not r.rollout_status.is_error())
+            errored = sum(1 for r in rows if r.rollout_status.is_error())
+
+            print("\n  Summary:")
+            print(f"    Total rows: {len(rows)}")
+            print(f"    Completed:  {completed}")
+            print(f"    Running:    {running}")
+            print(f"    Errored:    {errored}")
+            if completed > 0:
+                print(f"    Resolved:   {resolved}/{completed} ({resolved/completed*100:.1f}%)")
+                avg_score = sum(r.evaluation_result.score for r in rows if r.evaluation_result) / completed
+                print(f"    Avg score:  {avg_score:.4f}")
+
+            # Print per-row details
+            print("\n  Per-row details:")
+            for r in rows:
+                row_id = str(r.input_metadata.row_id or "?")  # str(): the ">3s" spec rejects non-strings
+                if r.evaluation_result:
+                    score = r.evaluation_result.score
+                    reason = r.evaluation_result.reason or ""
+                    status = "RESOLVED" if score > 0 else "FAILED"
+                    print(f"    row={row_id:>3s}  {status:8s}  score={score:.1f}  {reason[:60]}")
+                elif r.rollout_status.is_error():
+                    print(f"    row={row_id:>3s}  ERROR     {r.rollout_status.message or ''}")
+                else:
+                    print(f"    row={row_id:>3s}  RUNNING")
+        else:
+            print(f"  No evaluation rows found for invocation_id={invocation_id}")
+
+    except ImportError:
+        print("  eval_protocol not installed, skipping adapter query")
+    except Exception as e:
+        print(f"  Error querying evaluation rows: {e}")
+
+
+def main():
+    """Parse the command line and run the requested status checks.
+
+    The server check runs unless ``--skip-server`` is given; the tracing check
+    runs when an invocation ID is given and ``--skip-tracing`` is not.
+    """
+    from datetime import timezone  # local: the module header imports only `datetime`
+    parser = argparse.ArgumentParser(description="SWE-bench run status checker")
+    parser.add_argument("invocation_id", nargs="?", help="Invocation ID to check (e.g., pyroworks-d0xzrqmi)")
+    parser.add_argument("--server", default="http://35.209.134.123:3000",
+                        help="Remote SWE-bench server URL (default: http://35.209.134.123:3000)")
+    parser.add_argument("--api-key", help="Fireworks API key (or set FIREWORKS_API_KEY env var)")
+    parser.add_argument("--skip-server", action="store_true", help="Skip remote server check")
+    parser.add_argument("--skip-tracing", action="store_true", help="Skip Fireworks tracing check")
+    args = parser.parse_args()
+    if not args.invocation_id and args.skip_server:
+        parser.error("invocation_id is required when --skip-server is used")
+
+    rule = "=" * 50
+    print(f"SWE-bench Run Status Checker\n{rule}")
+    if args.invocation_id:
+        print(f"Invocation ID: {args.invocation_id}")
+    # datetime.utcnow() is deprecated since Python 3.12; dropping tzinfo keeps the old "...Z" text.
+    print(f"Timestamp: {datetime.now(timezone.utc).replace(tzinfo=None).isoformat()}Z")
+
+    if not args.skip_server:
+        print(f"\n{rule}\nRemote Server: {args.server}\n{rule}")
+        check_remote_server(args.server, args.invocation_id)
+
+    if args.invocation_id and not args.skip_tracing:
+        print(f"\n{rule}\nFireworks Tracing API\n{rule}")
+        check_fireworks_tracing(args.invocation_id, args.api_key)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()