|
3 | 3 | from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult |
4 | 4 | from eval_protocol.pytest import evaluation_test |
5 | 5 | from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env |
6 | | -from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader |
7 | 6 |
|
8 | | - |
9 | | -def rows_from_indices(count: int) -> List[EvaluationRow]: |
10 | | - out: List[EvaluationRow] = [] |
11 | | - for idx in range(count): |
12 | | - out.append( |
13 | | - EvaluationRow( |
14 | | - messages=[], |
15 | | - input_metadata={ |
16 | | - "row_id": str(idx), |
17 | | - "instance_index": str(idx), |
18 | | - }, |
19 | | - ) |
20 | | - ) |
21 | | - return out |
| 7 | +# from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader |
| 8 | +from eval_protocol.utils.evaluation_row_utils import create_rows_from_indices |
22 | 9 |
|
23 | 10 |
|
24 | 11 | def rows() -> List[EvaluationRow]: |
25 | | - # Generate 10 rows by index; server maps index -> dataset instance via --slice |
26 | | - return rows_from_indices(2) |
| 12 | + return create_rows_from_indices(500) # All instances |
27 | 13 |
|
28 | 14 |
|
29 | 15 | # -------------------- Harness result attachment (UI pass/fail) -------------------- |
30 | 16 | @evaluation_test( |
31 | 17 | data_loaders=DynamicDataLoader( |
32 | 18 | generators=[rows], |
33 | 19 | ), |
| 20 | + max_dataset_rows=2, |
34 | 21 | rollout_processor=RemoteRolloutProcessor( |
35 | | - remote_base_url="http://35.209.134.123:3000", |
| 22 | + remote_base_url="http://127.0.0.1:3000", |
36 | 23 | model_base_url="https://tracing.fireworks.ai", |
37 | 24 | timeout_seconds=1800, |
38 | | - output_data_loader=default_fireworks_output_data_loader, |
39 | 25 | disable_elastic_search_setup=True, |
40 | 26 | elastic_search_config=create_elasticsearch_config_from_env(), |
41 | 27 | ), |
42 | 28 | completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], |
43 | 29 | max_concurrent_rollouts=3, |
44 | 30 | ) |
45 | | -# async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: |
46 | | -# """Evaluate SWE-bench instance by reading results from Elasticsearch.""" |
47 | | -# import logging |
48 | | -# logger = logging.getLogger(__name__) |
49 | | - |
50 | | -# rollout_id = row.execution_metadata.rollout_id |
51 | | -# logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}") |
52 | | - |
53 | | -# if not rollout_id: |
54 | | -# logger.warning("[DEBUG] No rollout_id, returning early") |
55 | | -# return row |
56 | | - |
57 | | -# try: |
58 | | -# from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient |
59 | | - |
60 | | -# es_config = create_elasticsearch_config_from_env() |
61 | | -# es_client = ElasticsearchClient(es_config) |
62 | | -# logger.info(f"[DEBUG] ES client created for index: {es_config.index_name}") |
63 | | - |
64 | | -# # Search for EVAL_RESULT log by message prefix |
65 | | -# query = {"match": {"rollout_id": rollout_id}} |
66 | | -# search_results = es_client.search(query=query, size=50) # Get more to find EVAL_RESULT |
67 | | -# logger.info(f"[DEBUG] Total logs: {search_results['hits']['total']['value']}") |
68 | | - |
69 | | -# # Filter for EVAL_RESULT in Python |
70 | | -# if search_results and search_results["hits"]["total"]["value"] > 0: |
71 | | -# for hit in search_results["hits"]["hits"]: |
72 | | -# message = hit["_source"].get("message", "") |
73 | | - |
74 | | -# if message.startswith("EVAL_RESULT:"): |
75 | | -# logger.info(f"[DEBUG] Found EVAL_RESULT message!") |
76 | | -# result_json = message.replace("EVAL_RESULT:", "") |
77 | | -# row.evaluation_result = EvaluateResult.model_validate_json(result_json) |
78 | | -# logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}") |
79 | | -# break |
80 | | -# else: |
81 | | -# logger.warning("[DEBUG] EVAL_RESULT message not found in logs") |
82 | | -# else: |
83 | | -# logger.warning("[DEBUG] No logs found for rollout") |
84 | | - |
85 | | -# logger.info(f"[DEBUG] Searching ES for EVAL_RESULT") |
86 | | -# import asyncio |
87 | | -# search_results = None |
88 | | -# for attempt in range(5): |
89 | | -# search_results = es_client.search(query=query, size=1) |
90 | | -# if search_results and search_results["hits"]["total"]["value"] > 0: |
91 | | -# logger.info(f"[DEBUG] Found result on attempt {attempt + 1}") |
92 | | -# break |
93 | | -# logger.info(f"[DEBUG] Attempt {attempt + 1}: No hits, retrying in 1s...") |
94 | | -# await asyncio.sleep(1) |
95 | | - |
96 | | -# logger.info(f"[DEBUG] Final: ES returned {search_results['hits']['total']['value'] if search_results else 0} hits") |
97 | | -# debug_query = {"match": {"rollout_id": rollout_id}} |
98 | | -# debug_results = es_client.search(query=debug_query, size=26) |
99 | | -# logger.info(f"[DEBUG] Total logs for {rollout_id}: {debug_results['hits']['total']['value']}") |
100 | | - |
101 | | -# if debug_results["hits"]["total"]["value"] > 0: |
102 | | -# for hit in debug_results["hits"]["hits"]: |
103 | | -# msg = hit["_source"].get("message", "")[:80] |
104 | | -# logger.info(f"[DEBUG] Sample message: {msg}") |
105 | | -# else: |
106 | | -# logger.warning("[DEBUG] No logs at all for this rollout_id!") |
107 | | -# if search_results and search_results["hits"]["total"]["value"] > 0: |
108 | | -# hit = search_results["hits"]["hits"][0]["_source"] |
109 | | -# message = hit.get("message", "") |
110 | | -# logger.info(f"[DEBUG] Found message: {message[:100]}...") |
111 | | - |
112 | | -# if message.startswith("EVAL_RESULT:"): |
113 | | -# result_json = message.replace("EVAL_RESULT:", "") |
114 | | -# logger.info(f"[DEBUG] Parsing EvaluateResult JSON") |
115 | | - |
116 | | -# if result_json != "null": |
117 | | -# # Deserialize directly to EvaluateResult |
118 | | -# row.evaluation_result = EvaluateResult.model_validate_json(result_json) |
119 | | -# logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}") |
120 | | -# else: |
121 | | -# logger.warning("[DEBUG] Result was null (no resolved status available)") |
122 | | -# else: |
123 | | -# logger.warning(f"[DEBUG] Message doesn't start with EVAL_RESULT: {message[:50]}") |
124 | | -# else: |
125 | | -# logger.warning("[DEBUG] No EVAL_RESULT found in Elasticsearch") |
126 | | - |
127 | | -# except Exception as e: |
128 | | -# logger.error(f"[DEBUG] Exception in test: {e}", exc_info=True) |
129 | | - |
130 | | -# logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}") |
131 | | -# return row |
132 | | - |
133 | | - |
134 | 31 | async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: |
135 | 32 | """Evaluate SWE-bench instance by reading results from Elasticsearch.""" |
136 | 33 | rollout_id = row.execution_metadata.rollout_id |
|
0 commit comments