22from eval_protocol .data_loader .dynamic_data_loader import DynamicDataLoader
33from eval_protocol .models import EvaluationRow , EvaluateResult , MetricResult
44from eval_protocol .pytest import evaluation_test
5- from eval_protocol .pytest .remote_rollout_processor import RemoteRolloutProcessor , create_elasticsearch_config_from_env
6-
7- # from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader
5+ from eval_protocol .pytest .remote_rollout_processor import RemoteRolloutProcessor
86from eval_protocol .utils .evaluation_row_utils import create_rows_from_indices
97
108
@@ -23,39 +21,59 @@ def rows() -> List[EvaluationRow]:
2321 model_base_url = "https://tracing.fireworks.ai" ,
2422 timeout_seconds = 1800 ,
2523 disable_elastic_search_setup = True ,
26- elastic_search_config = create_elasticsearch_config_from_env (),
2724 ),
2825 completion_params = [{"model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-120b" }],
2926 max_concurrent_rollouts = 3 ,
3027)
3128async def test_swebench_remote (row : EvaluationRow ) -> EvaluationRow :
32- """Evaluate SWE-bench instance by reading results from Elasticsearch."""
29+ """Evaluate SWE-bench instance by reading results from Fireworks tracing logs."""
30+ import logging
31+
32+ logger = logging .getLogger (__name__ )
33+
3334 rollout_id = row .execution_metadata .rollout_id
35+ logger .info (f"[DEBUG] Processing rollout_id: { rollout_id } " )
36+
3437 if not rollout_id :
38+ logger .warning ("[DEBUG] No rollout_id" )
3539 return row
3640
3741 try :
38- from eval_protocol .log_utils . elasticsearch_client import ElasticsearchClient
42+ from eval_protocol .adapters . fireworks_tracing import FireworksTracingAdapter
3943
40- es_config = create_elasticsearch_config_from_env ( )
41- es_client = ElasticsearchClient ( es_config )
44+ adapter = FireworksTracingAdapter ( base_url = "https://tracing.fireworks.ai" )
45+ logger . info ( "[DEBUG] Created adapter for https://tracing.fireworks.ai" )
4246
43- # Get all logs for this rollout and find EVAL_RESULT message
44- query = { "match" : { " rollout_id" : rollout_id }}
45- search_results = es_client . search ( query = query , size = 50 )
47+ # Fetch logs for this rollout
48+ logger . info ( f"[DEBUG] Searching for tag: rollout_id: { rollout_id } " )
49+ log_entries = adapter . search_logs ( tags = [ f"rollout_id: { rollout_id } " ], limit = 100 , hours_back = 24 )
4650
47- if search_results and search_results [ "hits" ][ "total" ][ "value" ] > 0 :
48- for hit in search_results [ "hits" ][ "hits" ] :
49- message = hit [ "_source" ] .get (" message" , " " )
51+ logger . info ( f"[DEBUG] Received { len ( log_entries ) } log entries" )
52+ if log_entries :
53+ logger . info ( f"[DEBUG] Sample messages: { [ e .get (' message' , '' )[: 50 ] for e in log_entries [: 3 ]] } " )
5054
51- if message .startswith ("EVAL_RESULT:" ):
52- result_json = message .replace ("EVAL_RESULT:" , "" )
55+ # Find EVAL_RESULT message
56+ found = False
57+ for entry in log_entries :
58+ message = entry .get ("message" , "" )
59+ if message .startswith ("EVAL_RESULT:" ):
60+ logger .info ("[DEBUG] Found EVAL_RESULT message!" )
61+ result_json = message .replace ("EVAL_RESULT:" , "" )
62+ logger .info (f"[DEBUG] Parsing JSON: { result_json [:100 ]} ..." )
63+
64+ if result_json != "null" :
5365 row .evaluation_result = EvaluateResult .model_validate_json (result_json )
54- break
66+ logger .info (
67+ f"[DEBUG] Attached result: score={ row .evaluation_result .score } , reason={ row .evaluation_result .reason } "
68+ )
69+ found = True
70+ break
5571
56- except Exception as e :
57- import logging
72+ if not found :
73+ logger . warning ( f"[DEBUG] No EVAL_RESULT message found in { len ( log_entries ) } logs" )
5874
59- logging .getLogger (__name__ ).warning (f"Could not read results from Elasticsearch: { e } " )
75+ except Exception as e :
76+ logger .error (f"[DEBUG] Exception: { e } " , exc_info = True )
6077
78+ logger .info (f"[DEBUG] Returning row, has evaluation_result: { row .evaluation_result is not None } " )
6179 return row
0 commit comments