Skip to content

Commit 47ef37b

Browse files
Author: Shrey Modi
Commit message: added sandboxing of runs and remote server support
1 parent 14c6f46 commit 47ef37b

File tree

2 files changed: 93 additions and 73 deletions.

examples/swebench/server.py

Lines changed: 57 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -56,11 +56,13 @@ def _worker():
5656
script_dir = Path(__file__).parent
5757
env["PYTHONPATH"] = f"{script_dir}:{env.get('PYTHONPATH', '')}"
5858

59-
# Determine output directory (from env or default)
60-
out_dir = os.getcwd()
61-
59+
# Sandbox by invocation_id to isolate concurrent test runs
6260
from pathlib import Path
6361

62+
invocation_id = req.metadata.invocation_id
63+
base_dir = Path(os.getcwd()) / invocation_id
64+
base_dir.mkdir(parents=True, exist_ok=True)
65+
6466
script_path = str((Path(__file__).parent / "run_swe_agent_fw.py").resolve())
6567

6668
# Extract model_kwargs from req.metadata (forwarded from input_metadata)
@@ -89,7 +91,7 @@ def _worker():
8991
str(single_index),
9092
"--exit-immediately",
9193
"--output",
92-
str(out_dir),
94+
str(base_dir),
9395
"--model-class",
9496
"tracing_model.TracingFireworksModel",
9597
]
@@ -103,7 +105,7 @@ def _worker():
103105
import json
104106

105107
# Log path inside row directory for this run
106-
row_dir = Path(out_dir) / f"row_{single_index}"
108+
row_dir = base_dir / f"row_{single_index}"
107109
row_dir.mkdir(parents=True, exist_ok=True)
108110
log_path = row_dir / f"agent_{single_index}.log"
109111

@@ -150,12 +152,60 @@ def _worker():
150152
logger.info(line.rstrip("\n"))
151153
eval_rc = eval_proc.wait()
152154

155+
# Collect evaluation results to send via Elasticsearch
156+
import yaml
157+
158+
instance_id = None
159+
resolved = None
160+
161+
if preds_path.exists():
162+
try:
163+
preds = json.loads(preds_path.read_text())
164+
instance_id = next(iter(preds.keys()), None)
165+
except Exception:
166+
pass
167+
168+
if instance_id:
169+
model_id = req.completion_params.get("model") if req.completion_params else None
170+
if model_id:
171+
safe_model = model_id.replace("/", "__").replace(":", "-")
172+
report_path = (
173+
row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json"
174+
)
175+
176+
if report_path.exists():
177+
try:
178+
report_data = json.loads(report_path.read_text())
179+
resolved = bool(report_data.get(instance_id, {}).get("resolved", False))
180+
except Exception:
181+
pass
182+
183+
if resolved is None:
184+
exit_files = sorted(row_dir.glob("exit_statuses_*.yaml"))
185+
if exit_files:
186+
try:
187+
status_doc = yaml.safe_load(exit_files[-1].read_text()) or {}
188+
by_status = status_doc.get("instances_by_exit_status", {})
189+
for status_name, ids in by_status.items():
190+
if instance_id in (ids or []):
191+
resolved = False
192+
break
193+
except Exception:
194+
pass
195+
196+
results_data = {
197+
"instance_id": instance_id,
198+
"resolved": resolved,
199+
"row_id": str(single_index),
200+
}
201+
153202
except Exception as e:
154203
# Best-effort: mark error but still finish to unblock polling
204+
results_data = {"error": str(e), "row_id": str(single_index)}
155205
logger.error(f"Rollout error: {e}", extra={"status": Status.rollout_error(str(e))})
156206
finally:
157-
# Always mark finished so RemoteRolloutProcessor stops polling
158-
logger.info("Rollout completed", extra={"status": Status.rollout_finished()})
207+
# Log results and mark finished
208+
logger.info("Evaluation results", extra={"results": results_data, "status": Status.rollout_finished()})
159209

160210
threading.Thread(target=_worker, daemon=True).start()
161211
return {"status": "accepted"}
Second changed file (file name not captured in this extract)

Lines changed: 36 additions & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,9 @@
11
from typing import List
2-
import yaml
32
from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
4-
from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult
3+
from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult
54
from eval_protocol.pytest import evaluation_test
6-
from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
5+
from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env
76
from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader
8-
import json
9-
from pathlib import Path
107

118

129
def rows_from_indices(count: int) -> List[EvaluationRow]:
@@ -39,82 +36,55 @@ def rows() -> List[EvaluationRow]:
3936
model_base_url="https://tracing.fireworks.ai",
4037
timeout_seconds=1800,
4138
output_data_loader=default_fireworks_output_data_loader,
39+
disable_elastic_search_setup=True,
40+
elastic_search_config=create_elasticsearch_config_from_env(),
4241
),
4342
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
4443
max_concurrent_rollouts=3,
4544
)
4645
async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow:
47-
"""Evaluate SWE-bench instance by reading harness report or exit status."""
46+
"""Evaluate SWE-bench instance by reading results from Elasticsearch."""
47+
import logging
4848

49-
# Get row_id
50-
try:
51-
row_id = str(row.input_metadata.row_id)
52-
except Exception:
53-
return row
54-
55-
row_dir = Path.cwd() / f"row_{row_id}"
56-
57-
# Find instance_id from preds.json
58-
preds_path = row_dir / "preds.json"
59-
instance_id = None
60-
if preds_path.exists():
61-
try:
62-
preds = json.loads(preds_path.read_text())
63-
instance_id = next(iter(preds.keys()), None)
64-
except Exception:
65-
pass
49+
logger = logging.getLogger(__name__)
6650

67-
if not instance_id:
51+
rollout_id = row.execution_metadata.rollout_id
52+
if not rollout_id:
6853
return row
6954

70-
resolved: bool | None = None
71-
reason_text: str | None = None
55+
# Query Elasticsearch for results logged by server
56+
try:
57+
from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient
7258

73-
# Get model from completion_params and convert to safe directory name (matching SWE-bench convention)
74-
model_id = row.input_metadata.completion_params.get("model") if row.input_metadata.completion_params else None
75-
if not model_id:
76-
return row
77-
safe_model = model_id.replace("/", "__").replace(":", "-")
59+
es_config = create_elasticsearch_config_from_env()
60+
es_client = ElasticsearchClient(es_config)
7861

79-
# Read from report.json (harness ran tests)
80-
report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json"
81-
if report_path.exists():
82-
try:
83-
report_data = json.loads(report_path.read_text())
84-
resolved = bool(report_data.get(instance_id, {}).get("resolved", False))
85-
reason_text = f"harness_resolved={resolved}"
86-
except Exception:
87-
pass
62+
# Search for results log from this rollout
63+
query = {"bool": {"must": [{"term": {"rollout_id.keyword": rollout_id}}, {"exists": {"field": "results"}}]}}
8864

89-
# If no report, check exit status YAML
90-
if resolved is None:
91-
exit_status_files = sorted(row_dir.glob("exit_statuses_*.yaml"))
92-
if exit_status_files:
93-
try:
94-
status_doc = yaml.safe_load(exit_status_files[-1].read_text()) or {}
95-
by_status = status_doc.get("instances_by_exit_status", {})
96-
for status_name, ids in by_status.items():
97-
if instance_id in (ids or []):
98-
resolved = False
99-
reason_text = f"exit_status={status_name}"
100-
break
101-
except Exception:
102-
pass
65+
search_results = es_client.es.search(index=es_config.index_name, query=query, size=1)
10366

104-
# Attach result
105-
if resolved is not None:
106-
row.evaluation_result = EvaluateResult(
107-
score=1.0 if resolved else 0.0,
108-
reason=reason_text or f"resolved={resolved}",
109-
is_score_valid=True,
110-
metrics={
111-
"resolved": MetricResult(
67+
if search_results["hits"]["total"]["value"] > 0:
68+
hit = search_results["hits"]["hits"][0]["_source"]
69+
results_data = hit.get("results", {})
70+
resolved = results_data.get("resolved")
71+
instance_id = results_data.get("instance_id")
72+
73+
if resolved is not None:
74+
row.evaluation_result = EvaluateResult(
11275
score=1.0 if resolved else 0.0,
76+
reason=f"instance={instance_id}, resolved={resolved}",
11377
is_score_valid=True,
114-
reason=reason_text or f"resolved={resolved}",
115-
value=int(resolved),
78+
metrics={
79+
"resolved": MetricResult(
80+
score=1.0 if resolved else 0.0,
81+
is_score_valid=True,
82+
reason=f"resolved={resolved}",
83+
value=int(resolved),
84+
)
85+
},
11686
)
117-
},
118-
)
87+
except Exception as e:
88+
logger.warning(f"Could not read results from Elasticsearch: {e}")
11989

12090
return row

0 commit comments

Comments
 (0)