Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
250 changes: 250 additions & 0 deletions check_run_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
#!/usr/bin/env python3
"""
SWE-bench run status checker / eval debugging tool.

Usage:
    # With API key from environment:
    export FIREWORKS_API_KEY="your-key"
    python check_run_status.py pyroworks-d0xzrqmi

    # Or pass key inline:
    FIREWORKS_API_KEY="your-key" python check_run_status.py pyroworks-d0xzrqmi

    # Check remote server invocations:
    python check_run_status.py --server http://35.209.134.123:3000

    # Check specific invocation on remote server:
    python check_run_status.py --server http://35.209.134.123:3000 pyroworks-d0xzrqmi
"""

import argparse
import json
import os
import sys
from datetime import datetime
from typing import Any, Dict, List, Optional

import requests


def _format_invocation_line(inv: Any, invocation_id: Optional[str]) -> str:
    """Render one entry of the ``/invocations`` listing as a single output line.

    Tolerates entries that are not dicts and fields that are missing, ``None``
    or of an unexpected type, so that one odd record cannot abort the listing
    of all the records that follow it.
    """
    if not isinstance(inv, dict):
        return f" {inv}"

    # str() keeps the ``:35s`` format spec and the substring match below valid
    # even when the server sends a non-string name.
    name = str(inv.get("name", "unknown"))
    mtime = inv.get("mtime", "")
    empty = inv.get("empty_responses", "N/A")
    try:
        size = f"{float(inv.get('size_mb', 0)):.1f}MB"
    except (TypeError, ValueError):
        size = "N/A"
    marker = " <-- MATCH" if invocation_id and invocation_id in name else ""
    return f" {name:35s} modified={mtime} size={size} empty_responses={empty}{marker}"


def check_remote_server(server_url: str, invocation_id: Optional[str] = None) -> None:
    """Print a status report for a remote SWE-bench server.

    Probes, in order, ``/health``, ``/active`` and ``/invocations``; when
    *invocation_id* is given it also queries
    ``/logs/<invocation_id>/empty_responses`` and
    ``/logs/<invocation_id>/llm_calls``.  Everything is written to stdout and
    nothing is returned.  A failure of one probe is printed and does not stop
    the following probes, except that an unreachable server ends the report.

    Args:
        server_url: Base URL of the server; trailing slashes are ignored.
        invocation_id: Optional invocation to flag in the listing (substring
            match on the entry name) and to fetch log summaries for.
    """
    base = server_url.rstrip("/")

    # Health check.  Only a transport-level failure means "unreachable"; a
    # server that answers with a non-JSON body is still up, so keep going.
    try:
        health_resp = requests.get(f"{base}/health", timeout=5)
    except requests.exceptions.RequestException as e:
        print(f"Server unreachable: {e}")
        return
    try:
        print(f"Server health: {health_resp.json()}")
    except ValueError:
        print(f"Server health: HTTP {health_resp.status_code} (non-JSON body)")

    # Active invocations.  The broad except is deliberate: this is a
    # best-effort diagnostic, so any failure is reported and the run continues.
    try:
        active = requests.get(f"{base}/active", timeout=5).json()
        count = active.get("count", 0)
        invocations = active.get("invocations", [])
        print(f"\nActive invocations: {count}")
        for inv in invocations:
            print(f" - {inv}")
    except Exception as e:
        print(f"Error fetching active: {e}")

    # All invocations
    try:
        data = requests.get(f"{base}/invocations", timeout=30).json()
        all_inv = data.get("invocations", [])
        print(f"\nAll invocations ({data.get('count', len(all_inv))}):")
        for inv in all_inv:
            print(_format_invocation_line(inv, invocation_id))
    except Exception as e:
        print(f"Error fetching invocations: {e}")

    # If a specific invocation was requested, check the debug endpoints.
    if invocation_id:
        print(f"\nDebug info for invocation: {invocation_id}")
        for endpoint in ["empty_responses", "llm_calls"]:
            try:
                r = requests.get(f"{base}/logs/{invocation_id}/{endpoint}", timeout=10)
                data = r.json()
                status = data.get("status", "unknown")
                if status == "not_found":
                    print(f" {endpoint}: not found on this server")
                else:
                    entries = data.get("entries", [])
                    print(f" {endpoint}: {len(entries)} entries")
            except Exception as e:
                print(f" {endpoint}: error - {e}")


def _summarize_traces(traces: List[Any]) -> Dict[str, Any]:
    """Tally raw trace records into summary counters.

    Each record is classified as exactly one of:

    * errored   -- ``rollout_status.code == 5``
    * completed -- otherwise, a truthy ``evaluation_result`` is present
    * running   -- neither of the above

    Completed records with a score above zero are additionally counted as
    resolved.  Records or fields of an unexpected type and missing or
    non-numeric scores are treated as empty / ``0`` instead of raising.

    Returns:
        Dict with integer ``completed``, ``running``, ``errored`` and
        ``resolved`` counts plus the float ``total_score`` of completed rows.
    """
    counts: Dict[str, Any] = {
        "completed": 0,
        "running": 0,
        "errored": 0,
        "resolved": 0,
        "total_score": 0.0,
    }
    for trace in traces:
        row = trace if isinstance(trace, dict) else {}
        eval_result = row.get("evaluation_result")
        rollout_status = row.get("rollout_status")
        code = rollout_status.get("code", 0) if isinstance(rollout_status, dict) else 0

        if code == 5:  # ERROR
            counts["errored"] += 1
        elif eval_result:
            counts["completed"] += 1
            raw_score = eval_result.get("score", 0) if isinstance(eval_result, dict) else 0
            try:
                score = float(raw_score)
            except (TypeError, ValueError):
                # e.g. "score": null -- count the row as completed, unresolved.
                score = 0.0
            counts["total_score"] += score
            if score > 0:
                counts["resolved"] += 1
        else:
            counts["running"] += 1
    return counts


def check_fireworks_tracing(invocation_id: str, api_key: Optional[str] = None) -> None:
    """Print the status of a run as seen by the Fireworks tracing service.

    First queries ``/v1/traces/pointwise`` and then ``/v1/traces`` over HTTP for
    traces tagged ``invocation_id:<invocation_id>`` (limit 500, 720 hours back).
    The first endpoint that returns a non-empty trace list is summarised and
    the function returns.  If neither yields traces, it falls back to
    ``eval_protocol``'s ``FireworksTracingAdapter`` (when importable) and prints
    a summary plus per-row details.  All output goes to stdout.

    Args:
        invocation_id: Invocation whose traces should be looked up.
        api_key: Fireworks API key; when falsy, ``FIREWORKS_API_KEY`` from the
            environment is used.  With no key available the check is skipped.

    The gateway base URL is read from ``FW_TRACING_GATEWAY_BASE_URL`` and
    defaults to ``https://tracing.fireworks.ai``.
    """
    api_key = api_key or os.environ.get("FIREWORKS_API_KEY")
    base_url = os.environ.get("FW_TRACING_GATEWAY_BASE_URL", "https://tracing.fireworks.ai")

    if not api_key:
        print("\nFireworks Tracing API: FIREWORKS_API_KEY not set, skipping trace query.")
        print(" Set FIREWORKS_API_KEY to enable querying evaluation results.")
        return

    headers = {
        "Authorization": f"Bearer {api_key}",
        "User-Agent": "eval-protocol-debug/1.0",
    }

    # Query traces by invocation_id tag
    print(f"\nQuerying Fireworks tracing for invocation_id={invocation_id}...")
    params = {
        "tags": [f"invocation_id:{invocation_id}"],
        "limit": 500,
        "hours_back": 720,  # 30 days
    }

    for endpoint in ["/v1/traces/pointwise", "/v1/traces"]:
        try:
            r = requests.get(f"{base_url}{endpoint}", params=params, headers=headers, timeout=60)
            if r.status_code == 401:
                print(f" {endpoint}: Auth failed (401). Check your API key.")
                continue
            if r.status_code == 404:
                # Endpoint not offered by this gateway; silently try the next.
                continue
            r.raise_for_status()
            data = r.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f" {endpoint}: Error - {e}")
            continue

        traces = data.get("traces") if isinstance(data, dict) else None
        if not isinstance(traces, list):
            traces = []
        print(f" {endpoint}: Found {len(traces)} traces")
        if not traces:
            continue

        counts = _summarize_traces(traces)
        completed = counts["completed"]
        resolved = counts["resolved"]

        print("\n Summary:")
        print(f" Total rows: {len(traces)}")
        print(f" Completed: {completed}")
        print(f" Running: {counts['running']}")
        print(f" Errored: {counts['errored']}")
        if completed:
            print(f" Resolved: {resolved}/{completed} ({resolved/completed*100:.1f}%)")
            print(f" Avg score: {counts['total_score']/completed:.4f}")
        else:
            print(" Resolved: N/A")
        return

    # Also try the logs endpoint (doesn't require auth for some queries)
    print("\nChecking tracing logs...")
    try:
        from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter

        adapter = FireworksTracingAdapter(base_url=base_url)

        # Try to get evaluation rows
        rows = adapter.get_evaluation_rows(
            tags=[f"invocation_id:{invocation_id}"],
            limit=500,
            hours_back=720,
        )

        if rows:
            print(f" Found {len(rows)} evaluation rows")
            completed = sum(1 for row in rows if row.evaluation_result is not None)
            resolved = sum(
                1 for row in rows if row.evaluation_result and row.evaluation_result.score > 0
            )
            running = sum(
                1
                for row in rows
                if row.evaluation_result is None and not row.rollout_status.is_error()
            )
            errored = sum(1 for row in rows if row.rollout_status.is_error())

            print("\n Summary:")
            print(f" Total rows: {len(rows)}")
            print(f" Completed: {completed}")
            print(f" Running: {running}")
            print(f" Errored: {errored}")
            if completed > 0:
                print(f" Resolved: {resolved}/{completed} ({resolved/completed*100:.1f}%)")
                avg_score = (
                    sum(row.evaluation_result.score for row in rows if row.evaluation_result)
                    / completed
                )
                print(f" Avg score: {avg_score:.4f}")

            # Print per-row details
            print("\n Per-row details:")
            for row in rows:
                # str() keeps the ``:>3s`` format spec valid for non-string ids.
                row_id = str(row.input_metadata.row_id or "?")
                if row.evaluation_result:
                    score = row.evaluation_result.score
                    reason = row.evaluation_result.reason or ""
                    status = "RESOLVED" if score > 0 else "FAILED"
                    print(f" row={row_id:>3s} {status:8s} score={score:.1f} {reason[:60]}")
                elif row.rollout_status.is_error():
                    print(f" row={row_id:>3s} ERROR {row.rollout_status.message or ''}")
                else:
                    print(f" row={row_id:>3s} RUNNING")
        else:
            print(f" No evaluation rows found for invocation_id={invocation_id}")

    except ImportError:
        print(" eval_protocol not installed, skipping adapter query")
    except Exception as e:
        # Best-effort diagnostic: report adapter problems instead of crashing.
        print(f" Error querying evaluation rows: {e}")


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point for the run status checker.

    Parses the command line, prints a header with the invocation ID (if any)
    and the current UTC timestamp, then runs the remote-server check unless
    ``--skip-server`` is given and the Fireworks tracing check when an
    invocation ID is present and ``--skip-tracing`` is not given.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``; ``None``
            (the default) keeps the normal command-line behaviour.

    Exits via ``parser.error`` when neither an invocation ID nor a server
    check is requested, since there would be nothing to do.
    """
    # Local import: the file-level import only brings ``datetime`` into scope.
    from datetime import timezone

    parser = argparse.ArgumentParser(description="SWE-bench run status checker")
    parser.add_argument(
        "invocation_id",
        nargs="?",
        help="Invocation ID to check (e.g., pyroworks-d0xzrqmi)",
    )
    parser.add_argument(
        "--server",
        default="http://35.209.134.123:3000",
        help="Remote SWE-bench server URL (default: http://35.209.134.123:3000)",
    )
    parser.add_argument("--api-key", help="Fireworks API key (or set FIREWORKS_API_KEY env var)")
    parser.add_argument("--skip-server", action="store_true", help="Skip remote server check")
    parser.add_argument("--skip-tracing", action="store_true", help="Skip Fireworks tracing check")
    args = parser.parse_args(argv)

    if not args.invocation_id and args.skip_server:
        parser.error("invocation_id is required when --skip-server is used")

    separator = "=" * 50

    print("SWE-bench Run Status Checker")
    print(separator)
    if args.invocation_id:
        print(f"Invocation ID: {args.invocation_id}")
    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so the printed form stays "<naive ISO timestamp>Z".
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    print(f"Timestamp: {now_utc.isoformat()}Z")

    # Check remote server
    if not args.skip_server:
        print(f"\n{separator}")
        print(f"Remote Server: {args.server}")
        print(separator)
        check_remote_server(args.server, args.invocation_id)

    # Check Fireworks tracing
    if args.invocation_id and not args.skip_tracing:
        print(f"\n{separator}")
        print("Fireworks Tracing API")
        print(separator)
        check_fireworks_tracing(args.invocation_id, args.api_key)


if __name__ == "__main__":
    main()