|
23 | 23 | Status, |
24 | 24 | ) |
25 | 25 | from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper |
| 26 | +from eval_protocol.pytest.dataset_preparation import load_and_prepare_rows |
26 | 27 | from eval_protocol.pytest.evaluation_test_postprocess import postprocess |
27 | 28 | from eval_protocol.pytest.execution import execute_pytest |
28 | 29 | from eval_protocol.pytest.generate_parameter_combinations import ( |
|
60 | 61 | rollout_processor_with_retry, |
61 | 62 | ) |
62 | 63 |
|
63 | | -from ..common_utils import load_jsonl |
64 | | - |
65 | | - |
66 | 64 | def evaluation_test( |
67 | 65 | *, |
68 | 66 | completion_params: Sequence[CompletionParams | None] | None = None, |
@@ -223,43 +221,14 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo |
223 | 221 | log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger) |
224 | 222 |
|
225 | 223 | try: |
226 | | - # Handle dataset loading |
227 | | - data: list[EvaluationRow] = [] |
228 | 224 | # Track all rows processed in the current run for error logging |
229 | 225 | processed_rows_in_run: list[EvaluationRow] = [] |
230 | | - if "dataset_path" in kwargs and kwargs["dataset_path"] is not None: |
231 | | - ds_arg: list[str] = kwargs["dataset_path"] |
232 | | - # Support either a single path or a list of paths; if a list is provided, |
233 | | - # concatenate the rows from each file in order. |
234 | | - data_jsonl: list[dict[str, object]] = [] |
235 | | - for p in ds_arg: |
236 | | - data_jsonl.extend(load_jsonl(p)) |
237 | | - # Apply override for max rows if present |
238 | | - if max_dataset_rows is not None: |
239 | | - data_jsonl = data_jsonl[:max_dataset_rows] |
240 | | - data = dataset_adapter(data_jsonl) |
241 | | - elif "input_messages" in kwargs and kwargs["input_messages"] is not None: |
242 | | - # Support either a single row (List[Message]) or many rows (List[List[Message]]) |
243 | | - im = kwargs["input_messages"] |
244 | | - data = [EvaluationRow(messages=dataset_messages) for dataset_messages in im] |
245 | | - elif "input_rows" in kwargs and kwargs["input_rows"] is not None: |
246 | | - # Deep copy pre-constructed EvaluationRow objects |
247 | | - data = [row.model_copy(deep=True) for row in kwargs["input_rows"]] |
248 | | - else: |
249 | | - raise ValueError("No input dataset, input messages, or input rows provided") |
250 | | - |
251 | | - if preprocess_fn: |
252 | | - data = preprocess_fn(data) |
253 | | - |
254 | | - for row in data: |
255 | | - # generate a stable row_id for each row |
256 | | - if row.input_metadata.row_id is None: |
257 | | - # Generate a stable, deterministic row_id using the row's hash and num_combinations |
258 | | - index = hash(row) |
259 | | - max_index = num_combinations() - 1 |
260 | | - # Ensure index is a non-negative integer within [0, max_index] |
261 | | - index = abs(index) % (max_index + 1) |
262 | | - row.input_metadata.row_id = generate_id(seed=0, index=index) |
| 226 | + data = load_and_prepare_rows( |
| 227 | + kwargs, |
| 228 | + dataset_adapter=dataset_adapter, |
| 229 | + preprocess_fn=preprocess_fn, |
| 230 | + max_dataset_rows=max_dataset_rows, |
| 231 | + ) |
263 | 232 |
|
264 | 233 | completion_params = kwargs["completion_params"] |
265 | 234 | # Create eval metadata with test function info and current commit hash |
|
0 commit comments