Skip to content

Commit e08ca9a

Browse files
author
Shrey Modi
committed
addressed comments
1 parent e447ad6 commit e08ca9a

File tree

3 files changed

+64
-109
lines changed

3 files changed

+64
-109
lines changed

eval_protocol/utils/evaluation_row_utils.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from typing import List
1010

1111
from eval_protocol.models import EvaluationRow, Message
12+
from eval_protocol.models import InputMetadata
1213

1314

1415
def serialize_message(msg: Message) -> str:
@@ -134,3 +135,27 @@ def assistant_to_ground_truth(data: List[EvaluationRow]) -> List[EvaluationRow]:
134135
)
135136

136137
return processed_rows
138+
139+
140+
def create_rows_from_indices(count: int, **metadata) -> List[EvaluationRow]:
141+
"""Create evaluation rows with sequential row_ids.
142+
143+
Useful for remote processors where the server determines content based on row_id.
144+
145+
Args:
146+
count: Number of rows to create
147+
**metadata: Additional metadata to include in each row
148+
149+
Returns:
150+
List of EvaluationRows with row_id set to "0", "1", "2", ...
151+
"""
152+
rows = []
153+
for idx in range(count):
154+
row_metadata = {"row_id": str(idx), **metadata}
155+
rows.append(
156+
EvaluationRow(
157+
messages=[],
158+
input_metadata=InputMetadata(**row_metadata),
159+
)
160+
)
161+
return rows

examples/swebench/tests/test_swebench.py

Lines changed: 5 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -3,134 +3,31 @@
33
from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult
44
from eval_protocol.pytest import evaluation_test
55
from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env
6-
from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader
76

8-
9-
def rows_from_indices(count: int) -> List[EvaluationRow]:
10-
out: List[EvaluationRow] = []
11-
for idx in range(count):
12-
out.append(
13-
EvaluationRow(
14-
messages=[],
15-
input_metadata={
16-
"row_id": str(idx),
17-
"instance_index": str(idx),
18-
},
19-
)
20-
)
21-
return out
7+
# from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader
8+
from eval_protocol.utils.evaluation_row_utils import create_rows_from_indices
229

2310

2411
def rows() -> List[EvaluationRow]:
25-
# Generate 10 rows by index; server maps index -> dataset instance via --slice
26-
return rows_from_indices(2)
12+
return create_rows_from_indices(500) # All instances
2713

2814

2915
# -------------------- Harness result attachment (UI pass/fail) --------------------
3016
@evaluation_test(
3117
data_loaders=DynamicDataLoader(
3218
generators=[rows],
3319
),
20+
max_dataset_rows=2,
3421
rollout_processor=RemoteRolloutProcessor(
35-
remote_base_url="http://35.209.134.123:3000",
22+
remote_base_url="http://127.0.0.1:3000",
3623
model_base_url="https://tracing.fireworks.ai",
3724
timeout_seconds=1800,
38-
output_data_loader=default_fireworks_output_data_loader,
3925
disable_elastic_search_setup=True,
4026
elastic_search_config=create_elasticsearch_config_from_env(),
4127
),
4228
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
4329
max_concurrent_rollouts=3,
4430
)
45-
# async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow:
46-
# """Evaluate SWE-bench instance by reading results from Elasticsearch."""
47-
# import logging
48-
# logger = logging.getLogger(__name__)
49-
50-
# rollout_id = row.execution_metadata.rollout_id
51-
# logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}")
52-
53-
# if not rollout_id:
54-
# logger.warning("[DEBUG] No rollout_id, returning early")
55-
# return row
56-
57-
# try:
58-
# from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient
59-
60-
# es_config = create_elasticsearch_config_from_env()
61-
# es_client = ElasticsearchClient(es_config)
62-
# logger.info(f"[DEBUG] ES client created for index: {es_config.index_name}")
63-
64-
# # Search for EVAL_RESULT log by message prefix
65-
# query = {"match": {"rollout_id": rollout_id}}
66-
# search_results = es_client.search(query=query, size=50) # Get more to find EVAL_RESULT
67-
# logger.info(f"[DEBUG] Total logs: {search_results['hits']['total']['value']}")
68-
69-
# # Filter for EVAL_RESULT in Python
70-
# if search_results and search_results["hits"]["total"]["value"] > 0:
71-
# for hit in search_results["hits"]["hits"]:
72-
# message = hit["_source"].get("message", "")
73-
74-
# if message.startswith("EVAL_RESULT:"):
75-
# logger.info(f"[DEBUG] Found EVAL_RESULT message!")
76-
# result_json = message.replace("EVAL_RESULT:", "")
77-
# row.evaluation_result = EvaluateResult.model_validate_json(result_json)
78-
# logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}")
79-
# break
80-
# else:
81-
# logger.warning("[DEBUG] EVAL_RESULT message not found in logs")
82-
# else:
83-
# logger.warning("[DEBUG] No logs found for rollout")
84-
85-
# logger.info(f"[DEBUG] Searching ES for EVAL_RESULT")
86-
# import asyncio
87-
# search_results = None
88-
# for attempt in range(5):
89-
# search_results = es_client.search(query=query, size=1)
90-
# if search_results and search_results["hits"]["total"]["value"] > 0:
91-
# logger.info(f"[DEBUG] Found result on attempt {attempt + 1}")
92-
# break
93-
# logger.info(f"[DEBUG] Attempt {attempt + 1}: No hits, retrying in 1s...")
94-
# await asyncio.sleep(1)
95-
96-
# logger.info(f"[DEBUG] Final: ES returned {search_results['hits']['total']['value'] if search_results else 0} hits")
97-
# debug_query = {"match": {"rollout_id": rollout_id}}
98-
# debug_results = es_client.search(query=debug_query, size=26)
99-
# logger.info(f"[DEBUG] Total logs for {rollout_id}: {debug_results['hits']['total']['value']}")
100-
101-
# if debug_results["hits"]["total"]["value"] > 0:
102-
# for hit in debug_results["hits"]["hits"]:
103-
# msg = hit["_source"].get("message", "")[:80]
104-
# logger.info(f"[DEBUG] Sample message: {msg}")
105-
# else:
106-
# logger.warning("[DEBUG] No logs at all for this rollout_id!")
107-
# if search_results and search_results["hits"]["total"]["value"] > 0:
108-
# hit = search_results["hits"]["hits"][0]["_source"]
109-
# message = hit.get("message", "")
110-
# logger.info(f"[DEBUG] Found message: {message[:100]}...")
111-
112-
# if message.startswith("EVAL_RESULT:"):
113-
# result_json = message.replace("EVAL_RESULT:", "")
114-
# logger.info(f"[DEBUG] Parsing EvaluateResult JSON")
115-
116-
# if result_json != "null":
117-
# # Deserialize directly to EvaluateResult
118-
# row.evaluation_result = EvaluateResult.model_validate_json(result_json)
119-
# logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}")
120-
# else:
121-
# logger.warning("[DEBUG] Result was null (no resolved status available)")
122-
# else:
123-
# logger.warning(f"[DEBUG] Message doesn't start with EVAL_RESULT: {message[:50]}")
124-
# else:
125-
# logger.warning("[DEBUG] No EVAL_RESULT found in Elasticsearch")
126-
127-
# except Exception as e:
128-
# logger.error(f"[DEBUG] Exception in test: {e}", exc_info=True)
129-
130-
# logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}")
131-
# return row
132-
133-
13431
async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow:
13532
"""Evaluate SWE-bench instance by reading results from Elasticsearch."""
13633
rollout_id = row.execution_metadata.rollout_id

examples/swebench/tracing_model.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,38 @@
11
"""
2-
TracingFireworksModel - Routes through tracing using OpenAI SDK.
2+
Custom model classes for integrating mini-swe-agent with eval-protocol's tracing infrastructure.
3+
4+
## Why This File Exists
5+
6+
mini-swe-agent is an autonomous agent that makes 20-100+ LLM API calls per SWE-bench instance
7+
(e.g., reading files, editing code, running tests). To debug agent behavior and display results
8+
in eval-protocol's UI, we need to capture and analyze every LLM call.
9+
10+
This file bridges mini-swe-agent (which uses LitellmModel) with the Fireworks tracing proxy
11+
(which requires specific URL patterns and SDK usage).
12+
13+
## Problem Without This File
14+
15+
By default, mini-swe-agent:
16+
- Calls the Fireworks API directly (no tracing)
17+
- Leaves agent conversations invisible in the eval-protocol UI
18+
- Makes it impossible to debug why the agent failed
19+
- Provides no cost tracking per call
20+
- Lets model names get mangled by litellm routing
21+
22+
## What These Classes Do
23+
24+
### FireworksCompatibleModel (Base)
25+
- Extends mini-swe-agent's LitellmModel
26+
- Handles Fireworks API compatibility:
27+
* Strips non-standard message fields that Fireworks API rejects
28+
* Adds stop sequences to prevent common agent failure modes
29+
* Applies temperature/reasoning overrides from wrapper script
30+
- Used when tracing isn't needed (direct Fireworks API calls)
31+
32+
### TracingFireworksModel (For eval-protocol integration)
33+
- Extends FireworksCompatibleModel
34+
- Routes ALL LLM calls through Fireworks tracing proxy instead of direct API
35+
- Uses OpenAI SDK (not litellm) to preserve full model names
336
"""
437

538
import sys

0 commit comments

Comments
 (0)