Skip to content

Commit e447ad6

Browse files
author
Shrey Modi
committed
remote server changes
1 parent 47ef37b commit e447ad6

File tree

3 files changed

+145
-49
lines changed

3 files changed

+145
-49
lines changed

examples/swebench/server.py

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# Attach Elasticsearch handler to root logger (Eval Protocol UI)
1515
handler = ElasticsearchDirectHttpHandler()
1616
logging.getLogger().addHandler(handler)
17-
rollout_states = {}
17+
# rollout_states = {}
1818

1919

2020
@app.post("/init")
@@ -27,11 +27,11 @@ def init(req: InitRequest):
2727
logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
2828
logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
2929

30-
rollout_states[req.metadata.rollout_id] = {
31-
"terminated": False,
32-
"status": "running",
33-
"instance_id": req.metadata.row_id,
34-
}
30+
# rollout_states[req.metadata.rollout_id] = {
31+
# "terminated": False,
32+
# "status": "running",
33+
# "instance_id": req.metadata.row_id,
34+
# }
3535

3636
def _worker():
3737
try:
@@ -157,6 +157,7 @@ def _worker():
157157

158158
instance_id = None
159159
resolved = None
160+
exit_reason = None
160161

161162
if preds_path.exists():
162163
try:
@@ -166,7 +167,7 @@ def _worker():
166167
pass
167168

168169
if instance_id:
169-
model_id = req.completion_params.get("model") if req.completion_params else None
170+
model_id = req.model
170171
if model_id:
171172
safe_model = model_id.replace("/", "__").replace(":", "-")
172173
report_path = (
@@ -189,13 +190,15 @@ def _worker():
189190
for status_name, ids in by_status.items():
190191
if instance_id in (ids or []):
191192
resolved = False
193+
exit_reason = status_name
192194
break
193195
except Exception:
194196
pass
195197

196198
results_data = {
197199
"instance_id": instance_id,
198200
"resolved": resolved,
201+
"exit_reason": exit_reason,
199202
"row_id": str(single_index),
200203
}
201204

@@ -204,16 +207,40 @@ def _worker():
204207
results_data = {"error": str(e), "row_id": str(single_index)}
205208
logger.error(f"Rollout error: {e}", extra={"status": Status.rollout_error(str(e))})
206209
finally:
207-
# Log results and mark finished
208-
logger.info("Evaluation results", extra={"results": results_data, "status": Status.rollout_finished()})
210+
# Create and log EvaluateResult in standardized format
211+
from eval_protocol.models import EvaluateResult, MetricResult
212+
213+
if resolved is not None:
214+
reason = f"instance={instance_id}, resolved={resolved}"
215+
if exit_reason:
216+
reason += f", exit_reason={exit_reason}"
217+
218+
eval_result = EvaluateResult(
219+
score=1.0 if resolved else 0.0,
220+
reason=reason,
221+
is_score_valid=True,
222+
metrics={
223+
"resolved": MetricResult(
224+
score=1.0 if resolved else 0.0,
225+
is_score_valid=True,
226+
reason=f"resolved={resolved}",
227+
value=int(resolved),
228+
)
229+
},
230+
)
231+
logger.info(
232+
f"EVAL_RESULT:{eval_result.model_dump_json()}", extra={"status": Status.rollout_finished()}
233+
)
234+
else:
235+
logger.info("EVAL_RESULT:null", extra={"status": Status.rollout_finished()})
209236

210237
threading.Thread(target=_worker, daemon=True).start()
211238
return {"status": "accepted"}
212239

213240

214-
@app.get("/status")
215-
def status(rollout_id: str):
216-
return rollout_states.get(rollout_id, {"terminated": False})
241+
# @app.get("/status")
242+
# def status(rollout_id: str):
243+
# return rollout_states.get(rollout_id, {"terminated": False})
217244

218245

219246
def main():

examples/swebench/tests/test_swebench.py

Lines changed: 106 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def rows() -> List[EvaluationRow]:
3232
generators=[rows],
3333
),
3434
rollout_processor=RemoteRolloutProcessor(
35-
remote_base_url="http://127.0.0.1:3000",
35+
remote_base_url="http://35.209.134.123:3000",
3636
model_base_url="https://tracing.fireworks.ai",
3737
timeout_seconds=1800,
3838
output_data_loader=default_fireworks_output_data_loader,
@@ -42,49 +42,123 @@ def rows() -> List[EvaluationRow]:
4242
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
4343
max_concurrent_rollouts=3,
4444
)
45-
async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow:
46-
"""Evaluate SWE-bench instance by reading results from Elasticsearch."""
47-
import logging
45+
# async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow:
46+
# """Evaluate SWE-bench instance by reading results from Elasticsearch."""
47+
# import logging
48+
# logger = logging.getLogger(__name__)
49+
50+
# rollout_id = row.execution_metadata.rollout_id
51+
# logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}")
52+
53+
# if not rollout_id:
54+
# logger.warning("[DEBUG] No rollout_id, returning early")
55+
# return row
56+
57+
# try:
58+
# from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient
59+
60+
# es_config = create_elasticsearch_config_from_env()
61+
# es_client = ElasticsearchClient(es_config)
62+
# logger.info(f"[DEBUG] ES client created for index: {es_config.index_name}")
63+
64+
# # Search for EVAL_RESULT log by message prefix
65+
# query = {"match": {"rollout_id": rollout_id}}
66+
# search_results = es_client.search(query=query, size=50) # Get more to find EVAL_RESULT
67+
# logger.info(f"[DEBUG] Total logs: {search_results['hits']['total']['value']}")
68+
69+
# # Filter for EVAL_RESULT in Python
70+
# if search_results and search_results["hits"]["total"]["value"] > 0:
71+
# for hit in search_results["hits"]["hits"]:
72+
# message = hit["_source"].get("message", "")
73+
74+
# if message.startswith("EVAL_RESULT:"):
75+
# logger.info(f"[DEBUG] Found EVAL_RESULT message!")
76+
# result_json = message.replace("EVAL_RESULT:", "")
77+
# row.evaluation_result = EvaluateResult.model_validate_json(result_json)
78+
# logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}")
79+
# break
80+
# else:
81+
# logger.warning("[DEBUG] EVAL_RESULT message not found in logs")
82+
# else:
83+
# logger.warning("[DEBUG] No logs found for rollout")
84+
85+
# logger.info(f"[DEBUG] Searching ES for EVAL_RESULT")
86+
# import asyncio
87+
# search_results = None
88+
# for attempt in range(5):
89+
# search_results = es_client.search(query=query, size=1)
90+
# if search_results and search_results["hits"]["total"]["value"] > 0:
91+
# logger.info(f"[DEBUG] Found result on attempt {attempt + 1}")
92+
# break
93+
# logger.info(f"[DEBUG] Attempt {attempt + 1}: No hits, retrying in 1s...")
94+
# await asyncio.sleep(1)
95+
96+
# logger.info(f"[DEBUG] Final: ES returned {search_results['hits']['total']['value'] if search_results else 0} hits")
97+
# debug_query = {"match": {"rollout_id": rollout_id}}
98+
# debug_results = es_client.search(query=debug_query, size=26)
99+
# logger.info(f"[DEBUG] Total logs for {rollout_id}: {debug_results['hits']['total']['value']}")
48100

49-
logger = logging.getLogger(__name__)
101+
# if debug_results["hits"]["total"]["value"] > 0:
102+
# for hit in debug_results["hits"]["hits"]:
103+
# msg = hit["_source"].get("message", "")[:80]
104+
# logger.info(f"[DEBUG] Sample message: {msg}")
105+
# else:
106+
# logger.warning("[DEBUG] No logs at all for this rollout_id!")
107+
# if search_results and search_results["hits"]["total"]["value"] > 0:
108+
# hit = search_results["hits"]["hits"][0]["_source"]
109+
# message = hit.get("message", "")
110+
# logger.info(f"[DEBUG] Found message: {message[:100]}...")
50111

112+
# if message.startswith("EVAL_RESULT:"):
113+
# result_json = message.replace("EVAL_RESULT:", "")
114+
# logger.info(f"[DEBUG] Parsing EvaluateResult JSON")
115+
116+
# if result_json != "null":
117+
# # Deserialize directly to EvaluateResult
118+
# row.evaluation_result = EvaluateResult.model_validate_json(result_json)
119+
# logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}")
120+
# else:
121+
# logger.warning("[DEBUG] Result was null (no resolved status available)")
122+
# else:
123+
# logger.warning(f"[DEBUG] Message doesn't start with EVAL_RESULT: {message[:50]}")
124+
# else:
125+
# logger.warning("[DEBUG] No EVAL_RESULT found in Elasticsearch")
126+
127+
# except Exception as e:
128+
# logger.error(f"[DEBUG] Exception in test: {e}", exc_info=True)
129+
130+
# logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}")
131+
# return row
132+
133+
134+
async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow:
135+
"""Evaluate SWE-bench instance by reading results from Elasticsearch."""
51136
rollout_id = row.execution_metadata.rollout_id
52137
if not rollout_id:
53138
return row
54139

55-
# Query Elasticsearch for results logged by server
56140
try:
57141
from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient
58142

59143
es_config = create_elasticsearch_config_from_env()
60144
es_client = ElasticsearchClient(es_config)
61145

62-
# Search for results log from this rollout
63-
query = {"bool": {"must": [{"term": {"rollout_id.keyword": rollout_id}}, {"exists": {"field": "results"}}]}}
64-
65-
search_results = es_client.es.search(index=es_config.index_name, query=query, size=1)
66-
67-
if search_results["hits"]["total"]["value"] > 0:
68-
hit = search_results["hits"]["hits"][0]["_source"]
69-
results_data = hit.get("results", {})
70-
resolved = results_data.get("resolved")
71-
instance_id = results_data.get("instance_id")
72-
73-
if resolved is not None:
74-
row.evaluation_result = EvaluateResult(
75-
score=1.0 if resolved else 0.0,
76-
reason=f"instance={instance_id}, resolved={resolved}",
77-
is_score_valid=True,
78-
metrics={
79-
"resolved": MetricResult(
80-
score=1.0 if resolved else 0.0,
81-
is_score_valid=True,
82-
reason=f"resolved={resolved}",
83-
value=int(resolved),
84-
)
85-
},
86-
)
146+
# Get all logs for this rollout and find EVAL_RESULT message
147+
query = {"match": {"rollout_id": rollout_id}}
148+
search_results = es_client.search(query=query, size=50)
149+
150+
if search_results and search_results["hits"]["total"]["value"] > 0:
151+
for hit in search_results["hits"]["hits"]:
152+
message = hit["_source"].get("message", "")
153+
154+
if message.startswith("EVAL_RESULT:"):
155+
result_json = message.replace("EVAL_RESULT:", "")
156+
row.evaluation_result = EvaluateResult.model_validate_json(result_json)
157+
break
158+
87159
except Exception as e:
88-
logger.warning(f"Could not read results from Elasticsearch: {e}")
160+
import logging
161+
162+
logging.getLogger(__name__).warning(f"Could not read results from Elasticsearch: {e}")
89163

90164
return row

pyproject.toml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,6 @@ box2d = [
9797
"gymnasium[box2d]>=0.29.0",
9898
"Pillow",
9999
]
100-
swebench = [
101-
"mini-swe-agent>=1.14.0",
102-
"datasets>=2.0.0",
103-
"litellm>=1.75.0", # Note: Overrides core litellm<1.75.0 for swebench compatibility
104-
]
105100
langfuse = [
106101
"langfuse>=2.0.0",
107102
]

0 commit comments

Comments
 (0)