Skip to content

Commit 867d947

Browse files
author
Shrey Modi
committed
Porting to Fireworks tracing
1 parent 0b38ca4 commit 867d947

File tree

2 files changed

+43
-32
lines changed

2 files changed

+43
-32
lines changed

examples/swebench/server.py

Lines changed: 5 additions and 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,32 +7,24 @@
77
from fastapi import FastAPI
88
import uvicorn
99

10-
from eval_protocol import Status, InitRequest, ElasticsearchDirectHttpHandler, RolloutIdFilter
10+
from eval_protocol import Status, InitRequest, RolloutIdFilter
11+
from eval_protocol.log_utils.init import init_external_logging_from_env
1112

1213
app = FastAPI()
1314

1415
# Attach Elasticsearch handler to root logger (Eval Protocol UI)
15-
handler = ElasticsearchDirectHttpHandler()
16-
logging.getLogger().addHandler(handler)
16+
init_external_logging_from_env()
1717
# rollout_states = {}
1818

1919

2020
@app.post("/init")
2121
def init(req: InitRequest):
2222
# Allow Eval Protocol to dynamically configure ES endpoint
23-
if req.elastic_search_config:
24-
handler.configure(req.elastic_search_config)
2523

2624
# Tag all logs for this rollout_id
2725
logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
2826
logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
2927

30-
# rollout_states[req.metadata.rollout_id] = {
31-
# "terminated": False,
32-
# "status": "running",
33-
# "instance_id": req.metadata.row_id,
34-
# }
35-
3628
def _worker():
3729
try:
3830
# Validate model
@@ -130,6 +122,7 @@ def _worker():
130122

131123
# 2) Run SWE-bench evaluation harness on preds.json
132124
preds_path_str = str(preds_path)
125+
unique_run_id = f"eval-{invocation_id}"
133126
eval_cmd = [
134127
"python3",
135128
"-m",
@@ -141,7 +134,7 @@ def _worker():
141134
"--max_workers",
142135
str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")),
143136
"--run_id",
144-
"eval-run",
137+
unique_run_id,
145138
]
146139
logger.info("Starting SWE-bench harness: %s", " ".join(map(str, eval_cmd)))
147140
eval_proc = subprocess.Popen(

examples/swebench/tests/test_swebench.py

Lines changed: 38 additions and 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@
22
from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
33
from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult
44
from eval_protocol.pytest import evaluation_test
5-
from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env
6-
7-
# from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader
5+
from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
86
from eval_protocol.utils.evaluation_row_utils import create_rows_from_indices
97

108

@@ -23,39 +21,59 @@ def rows() -> List[EvaluationRow]:
2321
model_base_url="https://tracing.fireworks.ai",
2422
timeout_seconds=1800,
2523
disable_elastic_search_setup=True,
26-
elastic_search_config=create_elasticsearch_config_from_env(),
2724
),
2825
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
2926
max_concurrent_rollouts=3,
3027
)
3128
async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow:
32-
"""Evaluate SWE-bench instance by reading results from Elasticsearch."""
29+
"""Evaluate SWE-bench instance by reading results from Fireworks tracing logs."""
30+
import logging
31+
32+
logger = logging.getLogger(__name__)
33+
3334
rollout_id = row.execution_metadata.rollout_id
35+
logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}")
36+
3437
if not rollout_id:
38+
logger.warning("[DEBUG] No rollout_id")
3539
return row
3640

3741
try:
38-
from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient
42+
from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter
3943

40-
es_config = create_elasticsearch_config_from_env()
41-
es_client = ElasticsearchClient(es_config)
44+
adapter = FireworksTracingAdapter(base_url="https://tracing.fireworks.ai")
45+
logger.info("[DEBUG] Created adapter for https://tracing.fireworks.ai")
4246

43-
# Get all logs for this rollout and find EVAL_RESULT message
44-
query = {"match": {"rollout_id": rollout_id}}
45-
search_results = es_client.search(query=query, size=50)
47+
# Fetch logs for this rollout
48+
logger.info(f"[DEBUG] Searching for tag: rollout_id:{rollout_id}")
49+
log_entries = adapter.search_logs(tags=[f"rollout_id:{rollout_id}"], limit=100, hours_back=24)
4650

47-
if search_results and search_results["hits"]["total"]["value"] > 0:
48-
for hit in search_results["hits"]["hits"]:
49-
message = hit["_source"].get("message", "")
51+
logger.info(f"[DEBUG] Received {len(log_entries)} log entries")
52+
if log_entries:
53+
logger.info(f"[DEBUG] Sample messages: {[e.get('message', '')[:50] for e in log_entries[:3]]}")
5054

51-
if message.startswith("EVAL_RESULT:"):
52-
result_json = message.replace("EVAL_RESULT:", "")
55+
# Find EVAL_RESULT message
56+
found = False
57+
for entry in log_entries:
58+
message = entry.get("message", "")
59+
if message.startswith("EVAL_RESULT:"):
60+
logger.info("[DEBUG] Found EVAL_RESULT message!")
61+
result_json = message.replace("EVAL_RESULT:", "")
62+
logger.info(f"[DEBUG] Parsing JSON: {result_json[:100]}...")
63+
64+
if result_json != "null":
5365
row.evaluation_result = EvaluateResult.model_validate_json(result_json)
54-
break
66+
logger.info(
67+
f"[DEBUG] Attached result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}"
68+
)
69+
found = True
70+
break
5571

56-
except Exception as e:
57-
import logging
72+
if not found:
73+
logger.warning(f"[DEBUG] No EVAL_RESULT message found in {len(log_entries)} logs")
5874

59-
logging.getLogger(__name__).warning(f"Could not read results from Elasticsearch: {e}")
75+
except Exception as e:
76+
logger.error(f"[DEBUG] Exception: {e}", exc_info=True)
6077

78+
logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}")
6179
return row

0 commit comments

Comments (0)