@@ -32,7 +32,7 @@ def rows() -> List[EvaluationRow]:
3232 generators = [rows ],
3333 ),
3434 rollout_processor = RemoteRolloutProcessor (
35- remote_base_url = "http://127.0.0.1 :3000" ,
35+ remote_base_url = "http://35.209.134.123 :3000" ,
3636 model_base_url = "https://tracing.fireworks.ai" ,
3737 timeout_seconds = 1800 ,
3838 output_data_loader = default_fireworks_output_data_loader ,
@@ -42,49 +42,123 @@ def rows() -> List[EvaluationRow]:
4242 completion_params = [{"model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-120b" }],
4343 max_concurrent_rollouts = 3 ,
4444)
45- async def test_swebench_remote (row : EvaluationRow ) -> EvaluationRow :
46- """Evaluate SWE-bench instance by reading results from Elasticsearch."""
47- import logging
45+ # async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow:
46+ # """Evaluate SWE-bench instance by reading results from Elasticsearch."""
47+ # import logging
48+ # logger = logging.getLogger(__name__)
49+
50+ # rollout_id = row.execution_metadata.rollout_id
51+ # logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}")
52+
53+ # if not rollout_id:
54+ # logger.warning("[DEBUG] No rollout_id, returning early")
55+ # return row
56+
57+ # try:
58+ # from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient
59+
60+ # es_config = create_elasticsearch_config_from_env()
61+ # es_client = ElasticsearchClient(es_config)
62+ # logger.info(f"[DEBUG] ES client created for index: {es_config.index_name}")
63+
64+ # # Search for EVAL_RESULT log by message prefix
65+ # query = {"match": {"rollout_id": rollout_id}}
66+ # search_results = es_client.search(query=query, size=50) # Get more to find EVAL_RESULT
67+ # logger.info(f"[DEBUG] Total logs: {search_results['hits']['total']['value']}")
68+
69+ # # Filter for EVAL_RESULT in Python
70+ # if search_results and search_results["hits"]["total"]["value"] > 0:
71+ # for hit in search_results["hits"]["hits"]:
72+ # message = hit["_source"].get("message", "")
73+
74+ # if message.startswith("EVAL_RESULT:"):
75+ # logger.info(f"[DEBUG] Found EVAL_RESULT message!")
76+ # result_json = message.replace("EVAL_RESULT:", "")
77+ # row.evaluation_result = EvaluateResult.model_validate_json(result_json)
78+ # logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}")
79+ # break
80+ # else:
81+ # logger.warning("[DEBUG] EVAL_RESULT message not found in logs")
82+ # else:
83+ # logger.warning("[DEBUG] No logs found for rollout")
84+
85+ # logger.info(f"[DEBUG] Searching ES for EVAL_RESULT")
86+ # import asyncio
87+ # search_results = None
88+ # for attempt in range(5):
89+ # search_results = es_client.search(query=query, size=1)
90+ # if search_results and search_results["hits"]["total"]["value"] > 0:
91+ # logger.info(f"[DEBUG] Found result on attempt {attempt + 1}")
92+ # break
93+ # logger.info(f"[DEBUG] Attempt {attempt + 1}: No hits, retrying in 1s...")
94+ # await asyncio.sleep(1)
95+
96+ # logger.info(f"[DEBUG] Final: ES returned {search_results['hits']['total']['value'] if search_results else 0} hits")
97+ # debug_query = {"match": {"rollout_id": rollout_id}}
98+ # debug_results = es_client.search(query=debug_query, size=26)
99+ # logger.info(f"[DEBUG] Total logs for {rollout_id}: {debug_results['hits']['total']['value']}")
48100
49- logger = logging .getLogger (__name__ )
101+ # if debug_results["hits"]["total"]["value"] > 0:
102+ # for hit in debug_results["hits"]["hits"]:
103+ # msg = hit["_source"].get("message", "")[:80]
104+ # logger.info(f"[DEBUG] Sample message: {msg}")
105+ # else:
106+ # logger.warning("[DEBUG] No logs at all for this rollout_id!")
107+ # if search_results and search_results["hits"]["total"]["value"] > 0:
108+ # hit = search_results["hits"]["hits"][0]["_source"]
109+ # message = hit.get("message", "")
110+ # logger.info(f"[DEBUG] Found message: {message[:100]}...")
50111
112+ # if message.startswith("EVAL_RESULT:"):
113+ # result_json = message.replace("EVAL_RESULT:", "")
114+ # logger.info(f"[DEBUG] Parsing EvaluateResult JSON")
115+
116+ # if result_json != "null":
117+ # # Deserialize directly to EvaluateResult
118+ # row.evaluation_result = EvaluateResult.model_validate_json(result_json)
119+ # logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}")
120+ # else:
121+ # logger.warning("[DEBUG] Result was null (no resolved status available)")
122+ # else:
123+ # logger.warning(f"[DEBUG] Message doesn't start with EVAL_RESULT: {message[:50]}")
124+ # else:
125+ # logger.warning("[DEBUG] No EVAL_RESULT found in Elasticsearch")
126+
127+ # except Exception as e:
128+ # logger.error(f"[DEBUG] Exception in test: {e}", exc_info=True)
129+
130+ # logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}")
131+ # return row
132+
133+
async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow:
    """Evaluate a SWE-bench instance by reading results from Elasticsearch.

    Fetches the log entries recorded for this row's rollout and, if one of
    them carries an ``EVAL_RESULT:<json>`` payload, deserializes the JSON
    into an ``EvaluateResult`` and attaches it to the row.

    Args:
        row: Evaluation row produced by the remote rollout processor.

    Returns:
        The same row, with ``row.evaluation_result`` populated when an
        EVAL_RESULT log entry was found; otherwise returned unchanged.
    """
    rollout_id = row.execution_metadata.rollout_id
    if not rollout_id:
        # Without a rollout id there is nothing to look up.
        return row

    try:
        from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient

        es_config = create_elasticsearch_config_from_env()
        es_client = ElasticsearchClient(es_config)

        # Get all logs for this rollout and find the EVAL_RESULT message.
        # size=50 assumes a rollout emits at most ~50 log lines — TODO confirm.
        query = {"match": {"rollout_id": rollout_id}}
        search_results = es_client.search(query=query, size=50)

        if search_results and search_results["hits"]["total"]["value"] > 0:
            prefix = "EVAL_RESULT:"
            for hit in search_results["hits"]["hits"]:
                message = hit["_source"].get("message", "")

                if message.startswith(prefix):
                    # Strip only the leading marker. Using str.replace() here
                    # would also remove the marker substring anywhere inside
                    # the JSON payload itself, corrupting it.
                    result_json = message[len(prefix):]
                    row.evaluation_result = EvaluateResult.model_validate_json(result_json)
                    break

    except Exception as e:
        # Best-effort lookup: a missing/unreachable Elasticsearch must not
        # fail the evaluation, so log a warning and return the row as-is.
        import logging

        logging.getLogger(__name__).warning(f"Could not read results from Elasticsearch: {e}")

    return row
0 commit comments