From 771bf4c9686eea99d0fd58fc82940b4b438296bd Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Tue, 30 Sep 2025 11:44:36 -0700 Subject: [PATCH 1/5] support only persisting jsonl to local but no upload --- eval_protocol/pytest/handle_persist_flow.py | 252 ++++++++++---------- eval_protocol/pytest/plugin.py | 4 + 2 files changed, 132 insertions(+), 124 deletions(-) diff --git a/eval_protocol/pytest/handle_persist_flow.py b/eval_protocol/pytest/handle_persist_flow.py index ff386873..f9c4317f 100644 --- a/eval_protocol/pytest/handle_persist_flow.py +++ b/eval_protocol/pytest/handle_persist_flow.py @@ -16,9 +16,9 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name: str): try: # Default is to save and upload experiment JSONL files, unless explicitly disabled - should_save_and_upload = os.getenv("EP_NO_UPLOAD") != "1" + should_save = os.getenv("EP_NO_PERSIST_RESULTS_JSONL") != "1" - if should_save_and_upload: + if should_save: current_run_rows = [item for sublist in all_results for item in sublist] if current_run_rows: experiments: dict[str, list[EvaluationRow]] = defaultdict(list) @@ -81,129 +81,133 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name: json.dump(row_data, f, ensure_ascii=False) f.write("\n") - def get_auth_value(key: str) -> str | None: - """Get auth value from config file or environment.""" - try: - config_path = Path.home() / ".fireworks" / "auth.ini" - if config_path.exists(): - config = configparser.ConfigParser() # noqa: F821 - config.read(config_path) - for section in ["DEFAULT", "auth"]: - if config.has_section(section) and config.has_option(section, key): - return config.get(section, key) - except Exception: - pass - return os.getenv(key) - - fireworks_api_key = get_auth_value("FIREWORKS_API_KEY") - fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID") - - if not fireworks_api_key and not fireworks_account_id: - store_experiment_link( - experiment_id, - "No Fireworks API key AND account ID found", - "failure", - ) - continue - elif not fireworks_api_key: - store_experiment_link( - experiment_id, - "No Fireworks API key found", - "failure", - ) - continue - elif not fireworks_account_id: - store_experiment_link( - experiment_id, - "No Fireworks account ID found", - "failure", - ) - continue - - headers = {"Authorization": f"Bearer {fireworks_api_key}", "Content-Type": "application/json"} - - # Make dataset first - dataset_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets" - - dataset_payload = { # pyright: ignore[reportUnknownVariableType] - "dataset": { - "displayName": dataset_name, - "evalProtocol": {}, - "format": "FORMAT_UNSPECIFIED", - "exampleCount": f"{len(exp_rows)}", - }, - "datasetId": dataset_name, - } - - dataset_response = requests.post(dataset_url, json=dataset_payload, headers=headers) # pyright: ignore[reportUnknownArgumentType] - - # Skip if dataset creation failed - if dataset_response.status_code not in [200, 201]: - store_experiment_link( - experiment_id, - f"Dataset creation failed: {dataset_response.status_code} {dataset_response.text}", - "failure", - ) - continue - - dataset_data: dict[str, Any] = dataset_response.json() # pyright: ignore[reportAny, reportExplicitAny] - dataset_id = dataset_data.get("datasetId", dataset_name) # pyright: ignore[reportAny] - - # Upload the JSONL file content - upload_url = ( - f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets/{dataset_id}:upload" - ) - upload_headers = {"Authorization": f"Bearer {fireworks_api_key}"} - - with open(exp_file, "rb") as f: - files = {"file": f} - upload_response = requests.post(upload_url, files=files, headers=upload_headers) - - # Skip if upload failed - if upload_response.status_code not in [200, 201]: - store_experiment_link( - experiment_id, - f"File upload failed: {upload_response.status_code} {upload_response.text}", - "failure", - ) - continue - - # Create evaluation job (optional - don't skip experiment if this fails) - eval_job_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/evaluationJobs" - # Truncate job ID to fit 63 character limit - job_id_base = f"{dataset_name}-job" - if len(job_id_base) > 63: - # Keep the "-job" suffix and truncate the dataset_name part - max_dataset_name_len = 63 - 4 # 4 = len("-job") - truncated_dataset_name = dataset_name[:max_dataset_name_len] - job_id_base = f"{truncated_dataset_name}-job" - - eval_job_payload = { - "evaluationJobId": job_id_base, - "evaluationJob": { - "evaluator": f"accounts/{fireworks_account_id}/evaluators/dummy", - "inputDataset": f"accounts/{fireworks_account_id}/datasets/dummy", - "outputDataset": f"accounts/{fireworks_account_id}/datasets/{dataset_id}", - }, - } - - eval_response = requests.post(eval_job_url, json=eval_job_payload, headers=headers) - - if eval_response.status_code in [200, 201]: - eval_job_data = eval_response.json() # pyright: ignore[reportAny] - job_id = eval_job_data.get("evaluationJobId", job_id_base) # pyright: ignore[reportAny] - - store_experiment_link( - experiment_id, - f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}", - "success", - ) - else: - store_experiment_link( - experiment_id, - f"Job creation failed: {eval_response.status_code} {eval_response.text}", - "failure", + should_upload = os.getenv("EP_NO_UPLOAD") != "1" + + if should_upload: + + def get_auth_value(key: str) -> str | None: + """Get auth value from config file or environment.""" + try: + config_path = Path.home() / ".fireworks" / "auth.ini" + if config_path.exists(): + config = configparser.ConfigParser() # noqa: F821 + config.read(config_path) + for section in ["DEFAULT", "auth"]: + if config.has_section(section) and config.has_option(section, key): + return config.get(section, key) + except Exception: + pass + return os.getenv(key) + + fireworks_api_key = get_auth_value("FIREWORKS_API_KEY") + fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID") + + if not fireworks_api_key and not fireworks_account_id: + store_experiment_link( + experiment_id, + "No Fireworks API key AND account ID found", + "failure", + ) + continue + elif not fireworks_api_key: + store_experiment_link( + experiment_id, + "No Fireworks API key found", + "failure", + ) + continue + elif not fireworks_account_id: + store_experiment_link( + experiment_id, + "No Fireworks account ID found", + "failure", + ) + continue + + headers = {"Authorization": f"Bearer {fireworks_api_key}", "Content-Type": "application/json"} + + # Make dataset first + dataset_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets" + + dataset_payload = { # pyright: ignore[reportUnknownVariableType] + "dataset": { + "displayName": dataset_name, + "evalProtocol": {}, + "format": "FORMAT_UNSPECIFIED", + "exampleCount": f"{len(exp_rows)}", + }, + "datasetId": dataset_name, + } + + dataset_response = requests.post(dataset_url, json=dataset_payload, headers=headers) # pyright: ignore[reportUnknownArgumentType] + + # Skip if dataset creation failed + if dataset_response.status_code not in [200, 201]: + store_experiment_link( + experiment_id, + f"Dataset creation failed: {dataset_response.status_code} {dataset_response.text}", + "failure", + ) + continue + + dataset_data: dict[str, Any] = dataset_response.json() # pyright: ignore[reportAny, reportExplicitAny] + dataset_id = dataset_data.get("datasetId", dataset_name) # pyright: ignore[reportAny] + + # Upload the JSONL file content + upload_url = ( + f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets/{dataset_id}:upload" ) + upload_headers = {"Authorization": f"Bearer {fireworks_api_key}"} + + with open(exp_file, "rb") as f: + files = {"file": f} + upload_response = requests.post(upload_url, files=files, headers=upload_headers) + + # Skip if upload failed + if upload_response.status_code not in [200, 201]: + store_experiment_link( + experiment_id, + f"File upload failed: {upload_response.status_code} {upload_response.text}", + "failure", + ) + continue + + # Create evaluation job (optional - don't skip experiment if this fails) + eval_job_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/evaluationJobs" + # Truncate job ID to fit 63 character limit + job_id_base = f"{dataset_name}-job" + if len(job_id_base) > 63: + # Keep the "-job" suffix and truncate the dataset_name part + max_dataset_name_len = 63 - 4 # 4 = len("-job") + truncated_dataset_name = dataset_name[:max_dataset_name_len] + job_id_base = f"{truncated_dataset_name}-job" + + eval_job_payload = { + "evaluationJobId": job_id_base, + "evaluationJob": { + "evaluator": f"accounts/{fireworks_account_id}/evaluators/dummy", + "inputDataset": f"accounts/{fireworks_account_id}/datasets/dummy", + "outputDataset": f"accounts/{fireworks_account_id}/datasets/{dataset_id}", + }, + } + + eval_response = requests.post(eval_job_url, json=eval_job_payload, headers=headers) + + if eval_response.status_code in [200, 201]: + eval_job_data = eval_response.json() # pyright: ignore[reportAny] + job_id = eval_job_data.get("evaluationJobId", job_id_base) # pyright: ignore[reportAny] + + store_experiment_link( + experiment_id, + f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}", + "success", + ) + else: + store_experiment_link( + experiment_id, + f"Job creation failed: {eval_response.status_code} {eval_response.text}", + "failure", + ) except Exception as e: # Do not fail evaluation if experiment JSONL writing fails diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index 030c367e..43bea278 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -258,6 +258,10 @@ def pytest_configure(config) -> None: if threshold_env is not None: os.environ["EP_PASSED_THRESHOLD"] = threshold_env + if config.getoption("--ep-no-persist-results-jsonl"): + # flag to turn off persisting results as jsonl files + os.environ["EP_NO_PERSIST_RESULTS_JSONL"] = "1" + if config.getoption("--ep-no-upload"): os.environ["EP_NO_UPLOAD"] = "1" From aef04219415c40a89403d7048c0cedfdcba9fee1 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Tue, 30 Sep 2025 13:02:11 -0700 Subject: [PATCH 2/5] add --- eval_protocol/pytest/plugin.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index 43bea278..382dedd8 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -133,6 +133,12 @@ def pytest_addoption(parser) -> None: default=None, help=("If set, use this base URL for remote rollout processing. Example: http://localhost:8000"), ) + group.addoption( + "--ep-no-persist-results-jsonl", + action="store_true", + default=False, + help=("Disable persisting results as jsonl files. Default: false (results are persisted by default)."), + ) def _normalize_max_rows(val: Optional[str]) -> Optional[str]: From acbd83cc48c4db3aa32f830717e7edc43af912df Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Tue, 30 Sep 2025 13:37:38 -0700 Subject: [PATCH 3/5] add --- eval_protocol/pytest/handle_persist_flow.py | 3 ++- eval_protocol/pytest/plugin.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/eval_protocol/pytest/handle_persist_flow.py b/eval_protocol/pytest/handle_persist_flow.py index f9c4317f..bee57067 100644 --- a/eval_protocol/pytest/handle_persist_flow.py +++ b/eval_protocol/pytest/handle_persist_flow.py @@ -83,7 +83,8 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name: should_upload = os.getenv("EP_NO_UPLOAD") != "1" - if should_upload: + if not should_upload: + continue def get_auth_value(key: str) -> str | None: """Get auth value from config file or environment.""" diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index 382dedd8..74f4dd66 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -264,7 +264,7 @@ def pytest_configure(config) -> None: if threshold_env is not None: os.environ["EP_PASSED_THRESHOLD"] = threshold_env - if config.getoption("--ep-no-persist-results-jsonl"): + if config.getoption("--ep-no-save-results-in-jsonl"): # flag to turn off persisting results as jsonl files os.environ["EP_NO_PERSIST_RESULTS_JSONL"] = "1" From 6939f6621a8f632d2d3618bcdb28f1998e3c4af0 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Tue, 30 Sep 2025 13:39:42 -0700 Subject: [PATCH 4/5] update --- eval_protocol/pytest/handle_persist_flow.py | 245 ++++++++++---------- 1 file changed, 122 insertions(+), 123 deletions(-) diff --git a/eval_protocol/pytest/handle_persist_flow.py b/eval_protocol/pytest/handle_persist_flow.py index bee57067..a2a7c815 100644 --- a/eval_protocol/pytest/handle_persist_flow.py +++ b/eval_protocol/pytest/handle_persist_flow.py @@ -82,133 +82,132 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name: f.write("\n") should_upload = os.getenv("EP_NO_UPLOAD") != "1" - if not should_upload: continue - def get_auth_value(key: str) -> str | None: - """Get auth value from config file or environment.""" - try: - config_path = Path.home() / ".fireworks" / "auth.ini" - if config_path.exists(): - config = configparser.ConfigParser() # noqa: F821 - config.read(config_path) - for section in ["DEFAULT", "auth"]: - if config.has_section(section) and config.has_option(section, key): - return config.get(section, key) - except Exception: - pass - return os.getenv(key) - - fireworks_api_key = get_auth_value("FIREWORKS_API_KEY") - fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID") - - if not fireworks_api_key and not fireworks_account_id: - store_experiment_link( - experiment_id, - "No Fireworks API key AND account ID found", - "failure", - ) - continue - elif not fireworks_api_key: - store_experiment_link( - experiment_id, - "No Fireworks API key found", - "failure", - ) - continue - elif not fireworks_account_id: - store_experiment_link( - experiment_id, - "No Fireworks account ID found", - "failure", - ) - continue - - headers = {"Authorization": f"Bearer {fireworks_api_key}", "Content-Type": "application/json"} - - # Make dataset first - dataset_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets" - - dataset_payload = { # pyright: ignore[reportUnknownVariableType] - "dataset": { - "displayName": dataset_name, - "evalProtocol": {}, - "format": "FORMAT_UNSPECIFIED", - "exampleCount": f"{len(exp_rows)}", - }, - "datasetId": dataset_name, - } - - dataset_response = requests.post(dataset_url, json=dataset_payload, headers=headers) # pyright: ignore[reportUnknownArgumentType] - - # Skip if dataset creation failed - if dataset_response.status_code not in [200, 201]: - store_experiment_link( - experiment_id, - f"Dataset creation failed: {dataset_response.status_code} {dataset_response.text}", - "failure", - ) - continue - - dataset_data: dict[str, Any] = dataset_response.json() # pyright: ignore[reportAny, reportExplicitAny] - dataset_id = dataset_data.get("datasetId", dataset_name) # pyright: ignore[reportAny] - - # Upload the JSONL file content - upload_url = ( - f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets/{dataset_id}:upload" + def get_auth_value(key: str) -> str | None: + """Get auth value from config file or environment.""" + try: + config_path = Path.home() / ".fireworks" / "auth.ini" + if config_path.exists(): + config = configparser.ConfigParser() # noqa: F821 + config.read(config_path) + for section in ["DEFAULT", "auth"]: + if config.has_section(section) and config.has_option(section, key): + return config.get(section, key) + except Exception: + pass + return os.getenv(key) + + fireworks_api_key = get_auth_value("FIREWORKS_API_KEY") + fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID") + + if not fireworks_api_key and not fireworks_account_id: + store_experiment_link( + experiment_id, + "No Fireworks API key AND account ID found", + "failure", + ) + continue + elif not fireworks_api_key: + store_experiment_link( + experiment_id, + "No Fireworks API key found", + "failure", + ) + continue + elif not fireworks_account_id: + store_experiment_link( + experiment_id, + "No Fireworks account ID found", + "failure", + ) + continue + + headers = {"Authorization": f"Bearer {fireworks_api_key}", "Content-Type": "application/json"} + + # Make dataset first + dataset_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets" + + dataset_payload = { # pyright: ignore[reportUnknownVariableType] + "dataset": { + "displayName": dataset_name, + "evalProtocol": {}, + "format": "FORMAT_UNSPECIFIED", + "exampleCount": f"{len(exp_rows)}", + }, + "datasetId": dataset_name, + } + + dataset_response = requests.post(dataset_url, json=dataset_payload, headers=headers) # pyright: ignore[reportUnknownArgumentType] + + # Skip if dataset creation failed + if dataset_response.status_code not in [200, 201]: + store_experiment_link( + experiment_id, + f"Dataset creation failed: {dataset_response.status_code} {dataset_response.text}", + "failure", + ) + continue + + dataset_data: dict[str, Any] = dataset_response.json() # pyright: ignore[reportAny, reportExplicitAny] + dataset_id = dataset_data.get("datasetId", dataset_name) # pyright: ignore[reportAny] + + # Upload the JSONL file content + upload_url = ( + f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets/{dataset_id}:upload" + ) + upload_headers = {"Authorization": f"Bearer {fireworks_api_key}"} + + with open(exp_file, "rb") as f: + files = {"file": f} + upload_response = requests.post(upload_url, files=files, headers=upload_headers) + + # Skip if upload failed + if upload_response.status_code not in [200, 201]: + store_experiment_link( + experiment_id, + f"File upload failed: {upload_response.status_code} {upload_response.text}", + "failure", + ) + continue + + # Create evaluation job (optional - don't skip experiment if this fails) + eval_job_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/evaluationJobs" + # Truncate job ID to fit 63 character limit + job_id_base = f"{dataset_name}-job" + if len(job_id_base) > 63: + # Keep the "-job" suffix and truncate the dataset_name part + max_dataset_name_len = 63 - 4 # 4 = len("-job") + truncated_dataset_name = dataset_name[:max_dataset_name_len] + job_id_base = f"{truncated_dataset_name}-job" + + eval_job_payload = { + "evaluationJobId": job_id_base, + "evaluationJob": { + "evaluator": f"accounts/{fireworks_account_id}/evaluators/dummy", + "inputDataset": f"accounts/{fireworks_account_id}/datasets/dummy", + "outputDataset": f"accounts/{fireworks_account_id}/datasets/{dataset_id}", + }, + } + + eval_response = requests.post(eval_job_url, json=eval_job_payload, headers=headers) + + if eval_response.status_code in [200, 201]: + eval_job_data = eval_response.json() # pyright: ignore[reportAny] + job_id = eval_job_data.get("evaluationJobId", job_id_base) # pyright: ignore[reportAny] + + store_experiment_link( + experiment_id, + f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}", + "success", + ) + else: + store_experiment_link( + experiment_id, + f"Job creation failed: {eval_response.status_code} {eval_response.text}", + "failure", ) - upload_headers = {"Authorization": f"Bearer {fireworks_api_key}"} - - with open(exp_file, "rb") as f: - files = {"file": f} - upload_response = requests.post(upload_url, files=files, headers=upload_headers) - - # Skip if upload failed - if upload_response.status_code not in [200, 201]: - store_experiment_link( - experiment_id, - f"File upload failed: {upload_response.status_code} {upload_response.text}", - "failure", - ) - continue - - # Create evaluation job (optional - don't skip experiment if this fails) - eval_job_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/evaluationJobs" - # Truncate job ID to fit 63 character limit - job_id_base = f"{dataset_name}-job" - if len(job_id_base) > 63: - # Keep the "-job" suffix and truncate the dataset_name part - max_dataset_name_len = 63 - 4 # 4 = len("-job") - truncated_dataset_name = dataset_name[:max_dataset_name_len] - job_id_base = f"{truncated_dataset_name}-job" - - eval_job_payload = { - "evaluationJobId": job_id_base, - "evaluationJob": { - "evaluator": f"accounts/{fireworks_account_id}/evaluators/dummy", - "inputDataset": f"accounts/{fireworks_account_id}/datasets/dummy", - "outputDataset": f"accounts/{fireworks_account_id}/datasets/{dataset_id}", - }, - } - - eval_response = requests.post(eval_job_url, json=eval_job_payload, headers=headers) - - if eval_response.status_code in [200, 201]: - eval_job_data = eval_response.json() # pyright: ignore[reportAny] - job_id = eval_job_data.get("evaluationJobId", job_id_base) # pyright: ignore[reportAny] - - store_experiment_link( - experiment_id, - f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}", - "success", - ) - else: - store_experiment_link( - experiment_id, - f"Job creation failed: {eval_response.status_code} {eval_response.text}", - "failure", - ) except Exception as e: # Do not fail evaluation if experiment JSONL writing fails From 091a474ad6d2f78fdef63f2ca45a53ee29a117df Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Tue, 30 Sep 2025 13:54:53 -0700 Subject: [PATCH 5/5] output result to a custom dir --- eval_protocol/pytest/handle_persist_flow.py | 5 ++++- eval_protocol/pytest/plugin.py | 13 ++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/eval_protocol/pytest/handle_persist_flow.py b/eval_protocol/pytest/handle_persist_flow.py index a2a7c815..e2f2a93d 100644 --- a/eval_protocol/pytest/handle_persist_flow.py +++ b/eval_protocol/pytest/handle_persist_flow.py @@ -16,7 +16,8 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name: str): try: # Default is to save and upload experiment JSONL files, unless explicitly disabled - should_save = os.getenv("EP_NO_PERSIST_RESULTS_JSONL") != "1" + custom_output_dir = os.getenv("EP_OUTPUT_DIR") + should_save = os.getenv("EP_NO_UPLOAD") != "1" or custom_output_dir is not None if should_save: current_run_rows = [item for sublist in all_results for item in sublist] @@ -27,6 +28,8 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name: experiments[row.execution_metadata.experiment_id].append(row) eval_protocol_dir = find_eval_protocol_dir() + if custom_output_dir: + eval_protocol_dir = custom_output_dir exp_dir = pathlib.Path(eval_protocol_dir) / "experiment_results" exp_dir.mkdir(parents=True, exist_ok=True) diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index 74f4dd66..d0c4af4d 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -134,10 +134,9 @@ def pytest_addoption(parser) -> None: help=("If set, use this base URL for remote rollout processing. Example: http://localhost:8000"), ) group.addoption( - "--ep-no-persist-results-jsonl", - action="store_true", - default=False, - help=("Disable persisting results as jsonl files. Default: false (results are persisted by default)."), + "--ep-output-dir", + default=None, + help=("If set, save evaluation results to this directory in jsonl format."), ) @@ -264,9 +263,9 @@ def pytest_configure(config) -> None: if threshold_env is not None: os.environ["EP_PASSED_THRESHOLD"] = threshold_env - if config.getoption("--ep-no-save-results-in-jsonl"): - # flag to turn off persisting results as jsonl files - os.environ["EP_NO_PERSIST_RESULTS_JSONL"] = "1" + if config.getoption("--ep-output-dir"): + # set this to save eval results to the target dir in jsonl format + os.environ["EP_OUTPUT_DIR"] = config.getoption("--ep-output-dir") if config.getoption("--ep-no-upload"): os.environ["EP_NO_UPLOAD"] = "1"