Port SWE-bench example logging and result lookup from Elasticsearch to Fireworks tracing

Shrey Modi · Shrey Modi · commit 867d94757599 · 2025-10-21T10:12:53.000-07:00
diff --git a/examples/swebench/server.py b/examples/swebench/server.py
@@ -7,32 +7,24 @@
 from fastapi import FastAPI
 import uvicorn
 
-from eval_protocol import Status, InitRequest, ElasticsearchDirectHttpHandler, RolloutIdFilter
+from eval_protocol import Status, InitRequest, RolloutIdFilter
+from eval_protocol.log_utils.init import init_external_logging_from_env
 
 app = FastAPI()
 
 # Attach Elasticsearch handler to root logger (Eval Protocol UI)
-handler = ElasticsearchDirectHttpHandler()
-logging.getLogger().addHandler(handler)
+init_external_logging_from_env()
 # rollout_states = {}
 
 
 @app.post("/init")
 def init(req: InitRequest):
     # Allow Eval Protocol to dynamically configure ES endpoint
-    if req.elastic_search_config:
-        handler.configure(req.elastic_search_config)
 
     # Tag all logs for this rollout_id
     logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
     logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
 
-    # rollout_states[req.metadata.rollout_id] = {
-    #     "terminated": False,
-    #     "status": "running",
-    #     "instance_id": req.metadata.row_id,
-    # }
-
     def _worker():
         try:
             # Validate model
@@ -130,6 +122,7 @@ def _worker():
 
             # 2) Run SWE-bench evaluation harness on preds.json
             preds_path_str = str(preds_path)
+            unique_run_id = f"eval-{invocation_id}"
             eval_cmd = [
                 "python3",
                 "-m",
@@ -141,7 +134,7 @@ def _worker():
                 "--max_workers",
                 str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")),
                 "--run_id",
-                "eval-run",
+                unique_run_id,
             ]
             logger.info("Starting SWE-bench harness: %s", " ".join(map(str, eval_cmd)))
             eval_proc = subprocess.Popen(
diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py
@@ -2,9 +2,7 @@
 from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
 from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult
 from eval_protocol.pytest import evaluation_test
-from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env
-
-# from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader
+from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
 from eval_protocol.utils.evaluation_row_utils import create_rows_from_indices
 
 
@@ -23,39 +21,59 @@ def rows() -> List[EvaluationRow]:
         model_base_url="https://tracing.fireworks.ai",
         timeout_seconds=1800,
         disable_elastic_search_setup=True,
-        elastic_search_config=create_elasticsearch_config_from_env(),
     ),
     completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
     max_concurrent_rollouts=3,
 )
 async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow:
-    """Evaluate SWE-bench instance by reading results from Elasticsearch."""
+    """Evaluate SWE-bench instance by reading results from Fireworks tracing logs."""
+    import logging
+
+    logger = logging.getLogger(__name__)
+
     rollout_id = row.execution_metadata.rollout_id
+    logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}")
+
     if not rollout_id:
+        logger.warning("[DEBUG] No rollout_id")
         return row
 
     try:
-        from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient
+        from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter
 
-        es_config = create_elasticsearch_config_from_env()
-        es_client = ElasticsearchClient(es_config)
+        adapter = FireworksTracingAdapter(base_url="https://tracing.fireworks.ai")
+        logger.info("[DEBUG] Created adapter for https://tracing.fireworks.ai")
 
-        # Get all logs for this rollout and find EVAL_RESULT message
-        query = {"match": {"rollout_id": rollout_id}}
-        search_results = es_client.search(query=query, size=50)
+        # Fetch logs for this rollout
+        logger.info(f"[DEBUG] Searching for tag: rollout_id:{rollout_id}")
+        log_entries = adapter.search_logs(tags=[f"rollout_id:{rollout_id}"], limit=100, hours_back=24)
 
-        if search_results and search_results["hits"]["total"]["value"] > 0:
-            for hit in search_results["hits"]["hits"]:
-                message = hit["_source"].get("message", "")
+        logger.info(f"[DEBUG] Received {len(log_entries)} log entries")
+        if log_entries:
+            logger.info(f"[DEBUG] Sample messages: {[e.get('message', '')[:50] for e in log_entries[:3]]}")
 
-                if message.startswith("EVAL_RESULT:"):
-                    result_json = message.replace("EVAL_RESULT:", "")
+        # Find EVAL_RESULT message
+        found = False
+        for entry in log_entries:
+            message = entry.get("message", "")
+            if message.startswith("EVAL_RESULT:"):
+                logger.info("[DEBUG] Found EVAL_RESULT message!")
+                result_json = message.replace("EVAL_RESULT:", "")
+                logger.info(f"[DEBUG] Parsing JSON: {result_json[:100]}...")
+
+                if result_json != "null":
                     row.evaluation_result = EvaluateResult.model_validate_json(result_json)
-                    break
+                    logger.info(
+                        f"[DEBUG] Attached result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}"
+                    )
+                    found = True
+                break
 
-    except Exception as e:
-        import logging
+        if not found:
+            logger.warning(f"[DEBUG] No EVAL_RESULT message found in {len(log_entries)} logs")
 
-        logging.getLogger(__name__).warning(f"Could not read results from Elasticsearch: {e}")
+    except Exception as e:
+        logger.error(f"[DEBUG] Exception: {e}", exc_info=True)
 
+    logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}")
     return row