From 771bf4c9686eea99d0fd58fc82940b4b438296bd Mon Sep 17 00:00:00 2001
From: Yinghan Ma <yinghan.ma@fireworks.ai>
Date: Tue, 30 Sep 2025 11:44:36 -0700
Subject: [PATCH 1/5] support persisting results JSONL locally without upload

---
 eval_protocol/pytest/handle_persist_flow.py | 252 ++++++++++----------
 eval_protocol/pytest/plugin.py              |   4 +
 2 files changed, 132 insertions(+), 124 deletions(-)

diff --git a/eval_protocol/pytest/handle_persist_flow.py b/eval_protocol/pytest/handle_persist_flow.py
index ff386873..f9c4317f 100644
--- a/eval_protocol/pytest/handle_persist_flow.py
+++ b/eval_protocol/pytest/handle_persist_flow.py
@@ -16,9 +16,9 @@
 def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name: str):
     try:
         # Default is to save and upload experiment JSONL files, unless explicitly disabled
-        should_save_and_upload = os.getenv("EP_NO_UPLOAD") != "1"
+        should_save = os.getenv("EP_NO_PERSIST_RESULTS_JSONL") != "1"
 
-        if should_save_and_upload:
+        if should_save:
             current_run_rows = [item for sublist in all_results for item in sublist]
             if current_run_rows:
                 experiments: dict[str, list[EvaluationRow]] = defaultdict(list)
@@ -81,129 +81,133 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
                             json.dump(row_data, f, ensure_ascii=False)
                             f.write("\n")
 
-                    def get_auth_value(key: str) -> str | None:
-                        """Get auth value from config file or environment."""
-                        try:
-                            config_path = Path.home() / ".fireworks" / "auth.ini"
-                            if config_path.exists():
-                                config = configparser.ConfigParser()  # noqa: F821
-                                config.read(config_path)
-                                for section in ["DEFAULT", "auth"]:
-                                    if config.has_section(section) and config.has_option(section, key):
-                                        return config.get(section, key)
-                        except Exception:
-                            pass
-                        return os.getenv(key)
-
-                    fireworks_api_key = get_auth_value("FIREWORKS_API_KEY")
-                    fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID")
-
-                    if not fireworks_api_key and not fireworks_account_id:
-                        store_experiment_link(
-                            experiment_id,
-                            "No Fireworks API key AND account ID found",
-                            "failure",
-                        )
-                        continue
-                    elif not fireworks_api_key:
-                        store_experiment_link(
-                            experiment_id,
-                            "No Fireworks API key found",
-                            "failure",
-                        )
-                        continue
-                    elif not fireworks_account_id:
-                        store_experiment_link(
-                            experiment_id,
-                            "No Fireworks account ID found",
-                            "failure",
-                        )
-                        continue
-
-                    headers = {"Authorization": f"Bearer {fireworks_api_key}", "Content-Type": "application/json"}
-
-                    # Make dataset first
-                    dataset_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets"
-
-                    dataset_payload = {  # pyright: ignore[reportUnknownVariableType]
-                        "dataset": {
-                            "displayName": dataset_name,
-                            "evalProtocol": {},
-                            "format": "FORMAT_UNSPECIFIED",
-                            "exampleCount": f"{len(exp_rows)}",
-                        },
-                        "datasetId": dataset_name,
-                    }
-
-                    dataset_response = requests.post(dataset_url, json=dataset_payload, headers=headers)  # pyright: ignore[reportUnknownArgumentType]
-
-                    # Skip if dataset creation failed
-                    if dataset_response.status_code not in [200, 201]:
-                        store_experiment_link(
-                            experiment_id,
-                            f"Dataset creation failed: {dataset_response.status_code} {dataset_response.text}",
-                            "failure",
-                        )
-                        continue
-
-                    dataset_data: dict[str, Any] = dataset_response.json()  # pyright: ignore[reportAny, reportExplicitAny]
-                    dataset_id = dataset_data.get("datasetId", dataset_name)  # pyright: ignore[reportAny]
-
-                    # Upload the JSONL file content
-                    upload_url = (
-                        f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets/{dataset_id}:upload"
-                    )
-                    upload_headers = {"Authorization": f"Bearer {fireworks_api_key}"}
-
-                    with open(exp_file, "rb") as f:
-                        files = {"file": f}
-                        upload_response = requests.post(upload_url, files=files, headers=upload_headers)
-
-                    # Skip if upload failed
-                    if upload_response.status_code not in [200, 201]:
-                        store_experiment_link(
-                            experiment_id,
-                            f"File upload failed: {upload_response.status_code} {upload_response.text}",
-                            "failure",
-                        )
-                        continue
-
-                    # Create evaluation job (optional - don't skip experiment if this fails)
-                    eval_job_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/evaluationJobs"
-                    # Truncate job ID to fit 63 character limit
-                    job_id_base = f"{dataset_name}-job"
-                    if len(job_id_base) > 63:
-                        # Keep the "-job" suffix and truncate the dataset_name part
-                        max_dataset_name_len = 63 - 4  # 4 = len("-job")
-                        truncated_dataset_name = dataset_name[:max_dataset_name_len]
-                        job_id_base = f"{truncated_dataset_name}-job"
-
-                    eval_job_payload = {
-                        "evaluationJobId": job_id_base,
-                        "evaluationJob": {
-                            "evaluator": f"accounts/{fireworks_account_id}/evaluators/dummy",
-                            "inputDataset": f"accounts/{fireworks_account_id}/datasets/dummy",
-                            "outputDataset": f"accounts/{fireworks_account_id}/datasets/{dataset_id}",
-                        },
-                    }
-
-                    eval_response = requests.post(eval_job_url, json=eval_job_payload, headers=headers)
-
-                    if eval_response.status_code in [200, 201]:
-                        eval_job_data = eval_response.json()  # pyright: ignore[reportAny]
-                        job_id = eval_job_data.get("evaluationJobId", job_id_base)  # pyright: ignore[reportAny]
-
-                        store_experiment_link(
-                            experiment_id,
-                            f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}",
-                            "success",
-                        )
-                    else:
-                        store_experiment_link(
-                            experiment_id,
-                            f"Job creation failed: {eval_response.status_code} {eval_response.text}",
-                            "failure",
+                    should_upload = os.getenv("EP_NO_UPLOAD") != "1"
+
+                    if should_upload:
+
+                        def get_auth_value(key: str) -> str | None:
+                            """Get auth value from config file or environment."""
+                            try:
+                                config_path = Path.home() / ".fireworks" / "auth.ini"
+                                if config_path.exists():
+                                    config = configparser.ConfigParser()  # noqa: F821
+                                    config.read(config_path)
+                                    for section in ["DEFAULT", "auth"]:
+                                        if config.has_section(section) and config.has_option(section, key):
+                                            return config.get(section, key)
+                            except Exception:
+                                pass
+                            return os.getenv(key)
+
+                        fireworks_api_key = get_auth_value("FIREWORKS_API_KEY")
+                        fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID")
+
+                        if not fireworks_api_key and not fireworks_account_id:
+                            store_experiment_link(
+                                experiment_id,
+                                "No Fireworks API key AND account ID found",
+                                "failure",
+                            )
+                            continue
+                        elif not fireworks_api_key:
+                            store_experiment_link(
+                                experiment_id,
+                                "No Fireworks API key found",
+                                "failure",
+                            )
+                            continue
+                        elif not fireworks_account_id:
+                            store_experiment_link(
+                                experiment_id,
+                                "No Fireworks account ID found",
+                                "failure",
+                            )
+                            continue
+
+                        headers = {"Authorization": f"Bearer {fireworks_api_key}", "Content-Type": "application/json"}
+
+                        # Make dataset first
+                        dataset_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets"
+
+                        dataset_payload = {  # pyright: ignore[reportUnknownVariableType]
+                            "dataset": {
+                                "displayName": dataset_name,
+                                "evalProtocol": {},
+                                "format": "FORMAT_UNSPECIFIED",
+                                "exampleCount": f"{len(exp_rows)}",
+                            },
+                            "datasetId": dataset_name,
+                        }
+
+                        dataset_response = requests.post(dataset_url, json=dataset_payload, headers=headers)  # pyright: ignore[reportUnknownArgumentType]
+
+                        # Skip if dataset creation failed
+                        if dataset_response.status_code not in [200, 201]:
+                            store_experiment_link(
+                                experiment_id,
+                                f"Dataset creation failed: {dataset_response.status_code} {dataset_response.text}",
+                                "failure",
+                            )
+                            continue
+
+                        dataset_data: dict[str, Any] = dataset_response.json()  # pyright: ignore[reportAny, reportExplicitAny]
+                        dataset_id = dataset_data.get("datasetId", dataset_name)  # pyright: ignore[reportAny]
+
+                        # Upload the JSONL file content
+                        upload_url = (
+                            f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets/{dataset_id}:upload"
                         )
+                        upload_headers = {"Authorization": f"Bearer {fireworks_api_key}"}
+
+                        with open(exp_file, "rb") as f:
+                            files = {"file": f}
+                            upload_response = requests.post(upload_url, files=files, headers=upload_headers)
+
+                        # Skip if upload failed
+                        if upload_response.status_code not in [200, 201]:
+                            store_experiment_link(
+                                experiment_id,
+                                f"File upload failed: {upload_response.status_code} {upload_response.text}",
+                                "failure",
+                            )
+                            continue
+
+                        # Create evaluation job (optional - don't skip experiment if this fails)
+                        eval_job_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/evaluationJobs"
+                        # Truncate job ID to fit 63 character limit
+                        job_id_base = f"{dataset_name}-job"
+                        if len(job_id_base) > 63:
+                            # Keep the "-job" suffix and truncate the dataset_name part
+                            max_dataset_name_len = 63 - 4  # 4 = len("-job")
+                            truncated_dataset_name = dataset_name[:max_dataset_name_len]
+                            job_id_base = f"{truncated_dataset_name}-job"
+
+                        eval_job_payload = {
+                            "evaluationJobId": job_id_base,
+                            "evaluationJob": {
+                                "evaluator": f"accounts/{fireworks_account_id}/evaluators/dummy",
+                                "inputDataset": f"accounts/{fireworks_account_id}/datasets/dummy",
+                                "outputDataset": f"accounts/{fireworks_account_id}/datasets/{dataset_id}",
+                            },
+                        }
+
+                        eval_response = requests.post(eval_job_url, json=eval_job_payload, headers=headers)
+
+                        if eval_response.status_code in [200, 201]:
+                            eval_job_data = eval_response.json()  # pyright: ignore[reportAny]
+                            job_id = eval_job_data.get("evaluationJobId", job_id_base)  # pyright: ignore[reportAny]
+
+                            store_experiment_link(
+                                experiment_id,
+                                f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}",
+                                "success",
+                            )
+                        else:
+                            store_experiment_link(
+                                experiment_id,
+                                f"Job creation failed: {eval_response.status_code} {eval_response.text}",
+                                "failure",
+                            )
 
     except Exception as e:
         # Do not fail evaluation if experiment JSONL writing fails
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
index 030c367e..43bea278 100644
--- a/eval_protocol/pytest/plugin.py
+++ b/eval_protocol/pytest/plugin.py
@@ -258,6 +258,10 @@ def pytest_configure(config) -> None:
     if threshold_env is not None:
         os.environ["EP_PASSED_THRESHOLD"] = threshold_env
 
+    if config.getoption("--ep-no-persist-results-jsonl"):
+        # flag to turn off persisting results as jsonl files
+        os.environ["EP_NO_PERSIST_RESULTS_JSONL"] = "1"
+
     if config.getoption("--ep-no-upload"):
         os.environ["EP_NO_UPLOAD"] = "1"
 

From aef04219415c40a89403d7048c0cedfdcba9fee1 Mon Sep 17 00:00:00 2001
From: Yinghan Ma <yinghan.ma@fireworks.ai>
Date: Tue, 30 Sep 2025 13:02:11 -0700
Subject: [PATCH 2/5] add --ep-no-persist-results-jsonl pytest option

---
 eval_protocol/pytest/plugin.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
index 43bea278..382dedd8 100644
--- a/eval_protocol/pytest/plugin.py
+++ b/eval_protocol/pytest/plugin.py
@@ -133,6 +133,12 @@ def pytest_addoption(parser) -> None:
         default=None,
         help=("If set, use this base URL for remote rollout processing. Example: http://localhost:8000"),
     )
+    group.addoption(
+        "--ep-no-persist-results-jsonl",
+        action="store_true",
+        default=False,
+        help=("Disable persisting results as jsonl files. Default: false (results are persisted by default)."),
+    )
 
 
 def _normalize_max_rows(val: Optional[str]) -> Optional[str]:

From acbd83cc48c4db3aa32f830717e7edc43af912df Mon Sep 17 00:00:00 2001
From: Yinghan Ma <yinghan.ma@fireworks.ai>
Date: Tue, 30 Sep 2025 13:37:38 -0700
Subject: [PATCH 3/5] skip upload via early continue; rename persist flag lookup

---
 eval_protocol/pytest/handle_persist_flow.py | 3 ++-
 eval_protocol/pytest/plugin.py              | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/eval_protocol/pytest/handle_persist_flow.py b/eval_protocol/pytest/handle_persist_flow.py
index f9c4317f..bee57067 100644
--- a/eval_protocol/pytest/handle_persist_flow.py
+++ b/eval_protocol/pytest/handle_persist_flow.py
@@ -83,7 +83,8 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
 
                     should_upload = os.getenv("EP_NO_UPLOAD") != "1"
 
-                    if should_upload:
+                    if not should_upload:
+                        continue
 
                         def get_auth_value(key: str) -> str | None:
                             """Get auth value from config file or environment."""
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
index 382dedd8..74f4dd66 100644
--- a/eval_protocol/pytest/plugin.py
+++ b/eval_protocol/pytest/plugin.py
@@ -264,7 +264,7 @@ def pytest_configure(config) -> None:
     if threshold_env is not None:
         os.environ["EP_PASSED_THRESHOLD"] = threshold_env
 
-    if config.getoption("--ep-no-persist-results-jsonl"):
+    if config.getoption("--ep-no-save-results-in-jsonl"):
         # flag to turn off persisting results as jsonl files
         os.environ["EP_NO_PERSIST_RESULTS_JSONL"] = "1"
 

From 6939f6621a8f632d2d3618bcdb28f1998e3c4af0 Mon Sep 17 00:00:00 2001
From: Yinghan Ma <yinghan.ma@fireworks.ai>
Date: Tue, 30 Sep 2025 13:39:42 -0700
Subject: [PATCH 4/5] dedent upload logic after early-continue guard

---
 eval_protocol/pytest/handle_persist_flow.py | 245 ++++++++++----------
 1 file changed, 122 insertions(+), 123 deletions(-)

diff --git a/eval_protocol/pytest/handle_persist_flow.py b/eval_protocol/pytest/handle_persist_flow.py
index bee57067..a2a7c815 100644
--- a/eval_protocol/pytest/handle_persist_flow.py
+++ b/eval_protocol/pytest/handle_persist_flow.py
@@ -82,133 +82,132 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
                             f.write("\n")
 
                     should_upload = os.getenv("EP_NO_UPLOAD") != "1"
-
                     if not should_upload:
                         continue
 
-                        def get_auth_value(key: str) -> str | None:
-                            """Get auth value from config file or environment."""
-                            try:
-                                config_path = Path.home() / ".fireworks" / "auth.ini"
-                                if config_path.exists():
-                                    config = configparser.ConfigParser()  # noqa: F821
-                                    config.read(config_path)
-                                    for section in ["DEFAULT", "auth"]:
-                                        if config.has_section(section) and config.has_option(section, key):
-                                            return config.get(section, key)
-                            except Exception:
-                                pass
-                            return os.getenv(key)
-
-                        fireworks_api_key = get_auth_value("FIREWORKS_API_KEY")
-                        fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID")
-
-                        if not fireworks_api_key and not fireworks_account_id:
-                            store_experiment_link(
-                                experiment_id,
-                                "No Fireworks API key AND account ID found",
-                                "failure",
-                            )
-                            continue
-                        elif not fireworks_api_key:
-                            store_experiment_link(
-                                experiment_id,
-                                "No Fireworks API key found",
-                                "failure",
-                            )
-                            continue
-                        elif not fireworks_account_id:
-                            store_experiment_link(
-                                experiment_id,
-                                "No Fireworks account ID found",
-                                "failure",
-                            )
-                            continue
-
-                        headers = {"Authorization": f"Bearer {fireworks_api_key}", "Content-Type": "application/json"}
-
-                        # Make dataset first
-                        dataset_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets"
-
-                        dataset_payload = {  # pyright: ignore[reportUnknownVariableType]
-                            "dataset": {
-                                "displayName": dataset_name,
-                                "evalProtocol": {},
-                                "format": "FORMAT_UNSPECIFIED",
-                                "exampleCount": f"{len(exp_rows)}",
-                            },
-                            "datasetId": dataset_name,
-                        }
-
-                        dataset_response = requests.post(dataset_url, json=dataset_payload, headers=headers)  # pyright: ignore[reportUnknownArgumentType]
-
-                        # Skip if dataset creation failed
-                        if dataset_response.status_code not in [200, 201]:
-                            store_experiment_link(
-                                experiment_id,
-                                f"Dataset creation failed: {dataset_response.status_code} {dataset_response.text}",
-                                "failure",
-                            )
-                            continue
-
-                        dataset_data: dict[str, Any] = dataset_response.json()  # pyright: ignore[reportAny, reportExplicitAny]
-                        dataset_id = dataset_data.get("datasetId", dataset_name)  # pyright: ignore[reportAny]
-
-                        # Upload the JSONL file content
-                        upload_url = (
-                            f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets/{dataset_id}:upload"
+                    def get_auth_value(key: str) -> str | None:
+                        """Get auth value from config file or environment."""
+                        try:
+                            config_path = Path.home() / ".fireworks" / "auth.ini"
+                            if config_path.exists():
+                                config = configparser.ConfigParser()  # noqa: F821
+                                config.read(config_path)
+                                for section in ["DEFAULT", "auth"]:
+                                    if config.has_section(section) and config.has_option(section, key):
+                                        return config.get(section, key)
+                        except Exception:
+                            pass
+                        return os.getenv(key)
+
+                    fireworks_api_key = get_auth_value("FIREWORKS_API_KEY")
+                    fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID")
+
+                    if not fireworks_api_key and not fireworks_account_id:
+                        store_experiment_link(
+                            experiment_id,
+                            "No Fireworks API key AND account ID found",
+                            "failure",
+                        )
+                        continue
+                    elif not fireworks_api_key:
+                        store_experiment_link(
+                            experiment_id,
+                            "No Fireworks API key found",
+                            "failure",
+                        )
+                        continue
+                    elif not fireworks_account_id:
+                        store_experiment_link(
+                            experiment_id,
+                            "No Fireworks account ID found",
+                            "failure",
+                        )
+                        continue
+
+                    headers = {"Authorization": f"Bearer {fireworks_api_key}", "Content-Type": "application/json"}
+
+                    # Make dataset first
+                    dataset_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets"
+
+                    dataset_payload = {  # pyright: ignore[reportUnknownVariableType]
+                        "dataset": {
+                            "displayName": dataset_name,
+                            "evalProtocol": {},
+                            "format": "FORMAT_UNSPECIFIED",
+                            "exampleCount": f"{len(exp_rows)}",
+                        },
+                        "datasetId": dataset_name,
+                    }
+
+                    dataset_response = requests.post(dataset_url, json=dataset_payload, headers=headers)  # pyright: ignore[reportUnknownArgumentType]
+
+                    # Skip if dataset creation failed
+                    if dataset_response.status_code not in [200, 201]:
+                        store_experiment_link(
+                            experiment_id,
+                            f"Dataset creation failed: {dataset_response.status_code} {dataset_response.text}",
+                            "failure",
+                        )
+                        continue
+
+                    dataset_data: dict[str, Any] = dataset_response.json()  # pyright: ignore[reportAny, reportExplicitAny]
+                    dataset_id = dataset_data.get("datasetId", dataset_name)  # pyright: ignore[reportAny]
+
+                    # Upload the JSONL file content
+                    upload_url = (
+                        f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets/{dataset_id}:upload"
+                    )
+                    upload_headers = {"Authorization": f"Bearer {fireworks_api_key}"}
+
+                    with open(exp_file, "rb") as f:
+                        files = {"file": f}
+                        upload_response = requests.post(upload_url, files=files, headers=upload_headers)
+
+                    # Skip if upload failed
+                    if upload_response.status_code not in [200, 201]:
+                        store_experiment_link(
+                            experiment_id,
+                            f"File upload failed: {upload_response.status_code} {upload_response.text}",
+                            "failure",
+                        )
+                        continue
+
+                    # Create evaluation job (optional - don't skip experiment if this fails)
+                    eval_job_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/evaluationJobs"
+                    # Truncate job ID to fit 63 character limit
+                    job_id_base = f"{dataset_name}-job"
+                    if len(job_id_base) > 63:
+                        # Keep the "-job" suffix and truncate the dataset_name part
+                        max_dataset_name_len = 63 - 4  # 4 = len("-job")
+                        truncated_dataset_name = dataset_name[:max_dataset_name_len]
+                        job_id_base = f"{truncated_dataset_name}-job"
+
+                    eval_job_payload = {
+                        "evaluationJobId": job_id_base,
+                        "evaluationJob": {
+                            "evaluator": f"accounts/{fireworks_account_id}/evaluators/dummy",
+                            "inputDataset": f"accounts/{fireworks_account_id}/datasets/dummy",
+                            "outputDataset": f"accounts/{fireworks_account_id}/datasets/{dataset_id}",
+                        },
+                    }
+
+                    eval_response = requests.post(eval_job_url, json=eval_job_payload, headers=headers)
+
+                    if eval_response.status_code in [200, 201]:
+                        eval_job_data = eval_response.json()  # pyright: ignore[reportAny]
+                        job_id = eval_job_data.get("evaluationJobId", job_id_base)  # pyright: ignore[reportAny]
+
+                        store_experiment_link(
+                            experiment_id,
+                            f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}",
+                            "success",
+                        )
+                    else:
+                        store_experiment_link(
+                            experiment_id,
+                            f"Job creation failed: {eval_response.status_code} {eval_response.text}",
+                            "failure",
                         )
-                        upload_headers = {"Authorization": f"Bearer {fireworks_api_key}"}
-
-                        with open(exp_file, "rb") as f:
-                            files = {"file": f}
-                            upload_response = requests.post(upload_url, files=files, headers=upload_headers)
-
-                        # Skip if upload failed
-                        if upload_response.status_code not in [200, 201]:
-                            store_experiment_link(
-                                experiment_id,
-                                f"File upload failed: {upload_response.status_code} {upload_response.text}",
-                                "failure",
-                            )
-                            continue
-
-                        # Create evaluation job (optional - don't skip experiment if this fails)
-                        eval_job_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/evaluationJobs"
-                        # Truncate job ID to fit 63 character limit
-                        job_id_base = f"{dataset_name}-job"
-                        if len(job_id_base) > 63:
-                            # Keep the "-job" suffix and truncate the dataset_name part
-                            max_dataset_name_len = 63 - 4  # 4 = len("-job")
-                            truncated_dataset_name = dataset_name[:max_dataset_name_len]
-                            job_id_base = f"{truncated_dataset_name}-job"
-
-                        eval_job_payload = {
-                            "evaluationJobId": job_id_base,
-                            "evaluationJob": {
-                                "evaluator": f"accounts/{fireworks_account_id}/evaluators/dummy",
-                                "inputDataset": f"accounts/{fireworks_account_id}/datasets/dummy",
-                                "outputDataset": f"accounts/{fireworks_account_id}/datasets/{dataset_id}",
-                            },
-                        }
-
-                        eval_response = requests.post(eval_job_url, json=eval_job_payload, headers=headers)
-
-                        if eval_response.status_code in [200, 201]:
-                            eval_job_data = eval_response.json()  # pyright: ignore[reportAny]
-                            job_id = eval_job_data.get("evaluationJobId", job_id_base)  # pyright: ignore[reportAny]
-
-                            store_experiment_link(
-                                experiment_id,
-                                f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}",
-                                "success",
-                            )
-                        else:
-                            store_experiment_link(
-                                experiment_id,
-                                f"Job creation failed: {eval_response.status_code} {eval_response.text}",
-                                "failure",
-                            )
 
     except Exception as e:
         # Do not fail evaluation if experiment JSONL writing fails

From 091a474ad6d2f78fdef63f2ca45a53ee29a117df Mon Sep 17 00:00:00 2001
From: Yinghan Ma <yinghan.ma@fireworks.ai>
Date: Tue, 30 Sep 2025 13:54:53 -0700
Subject: [PATCH 5/5] output result to a custom dir

---
 eval_protocol/pytest/handle_persist_flow.py |  5 ++++-
 eval_protocol/pytest/plugin.py              | 13 ++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/eval_protocol/pytest/handle_persist_flow.py b/eval_protocol/pytest/handle_persist_flow.py
index a2a7c815..e2f2a93d 100644
--- a/eval_protocol/pytest/handle_persist_flow.py
+++ b/eval_protocol/pytest/handle_persist_flow.py
@@ -16,7 +16,8 @@
 def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name: str):
     try:
         # Default is to save and upload experiment JSONL files, unless explicitly disabled
-        should_save = os.getenv("EP_NO_PERSIST_RESULTS_JSONL") != "1"
+        custom_output_dir = os.getenv("EP_OUTPUT_DIR")
+        should_save = os.getenv("EP_NO_UPLOAD") != "1" or custom_output_dir is not None
 
         if should_save:
             current_run_rows = [item for sublist in all_results for item in sublist]
@@ -27,6 +28,8 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
                         experiments[row.execution_metadata.experiment_id].append(row)
 
                 eval_protocol_dir = find_eval_protocol_dir()
+                if custom_output_dir:
+                    eval_protocol_dir = custom_output_dir
                 exp_dir = pathlib.Path(eval_protocol_dir) / "experiment_results"
                 exp_dir.mkdir(parents=True, exist_ok=True)
 
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
index 74f4dd66..d0c4af4d 100644
--- a/eval_protocol/pytest/plugin.py
+++ b/eval_protocol/pytest/plugin.py
@@ -134,10 +134,9 @@ def pytest_addoption(parser) -> None:
         help=("If set, use this base URL for remote rollout processing. Example: http://localhost:8000"),
     )
     group.addoption(
-        "--ep-no-persist-results-jsonl",
-        action="store_true",
-        default=False,
-        help=("Disable persisting results as jsonl files. Default: false (results are persisted by default)."),
+        "--ep-output-dir",
+        default=None,
+        help=("If set, save evaluation results to this directory in jsonl format."),
     )
 
 
@@ -264,9 +263,9 @@ def pytest_configure(config) -> None:
     if threshold_env is not None:
         os.environ["EP_PASSED_THRESHOLD"] = threshold_env
 
-    if config.getoption("--ep-no-save-results-in-jsonl"):
-        # flag to turn off persisting results as jsonl files
-        os.environ["EP_NO_PERSIST_RESULTS_JSONL"] = "1"
+    if config.getoption("--ep-output-dir"):
+        # set this to save eval results to the target dir in jsonl format
+        os.environ["EP_OUTPUT_DIR"] = config.getoption("--ep-output-dir")
 
     if config.getoption("--ep-no-upload"):
         os.environ["EP_NO_UPLOAD"] = "1"