# handle_persist_flow.py
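"""Persist evaluation results and upload them to Fireworks.

After an evaluation run, rows are grouped by experiment, written to local JSONL
files, and uploaded to the Fireworks datasets API; a dashboard link (or a
failure reason) for each experiment is recorded via store_experiment_link.
"""
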
from collections import defaultdict
import configparser
import json
import os
import re
from pathlib import Path
from typing import Any

import requests

from eval_protocol.directory_utils import find_eval_protocol_dir
from eval_protocol.models import EvaluationRow
from eval_protocol.pytest.store_experiment_link import store_experiment_link


def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name: str):
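    """Persist evaluation rows to per-experiment JSONL files and upload them to Fireworks.

    Rows are grouped by execution_metadata.experiment_id and written under
    <eval_protocol_dir>/experiment_results/<experiment_id>.jsonl. Each file is
    then registered as a Fireworks dataset, uploaded, and attached to an
    evaluation job; the resulting dashboard link (or a failure reason) is stored
    via store_experiment_link. Setting EP_NO_UPLOAD=1 disables the whole flow,
    and errors never propagate to the caller.
    """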
    try:
        # Default is to save and upload experiment JSONL files, unless explicitly disabled
        should_save_and_upload = os.getenv("EP_NO_UPLOAD") != "1"
        if should_save_and_upload:
            current_run_rows = [item for sublist in all_results for item in sublist]
            if current_run_rows:
                experiments: dict[str, list[EvaluationRow]] = defaultdict(list)
                for row in current_run_rows:
                    if row.execution_metadata and row.execution_metadata.experiment_id:
                        experiments[row.execution_metadata.experiment_id].append(row)
                eval_protocol_dir = find_eval_protocol_dir()
                exp_dir = Path(eval_protocol_dir) / "experiment_results"
                exp_dir.mkdir(parents=True, exist_ok=True)
                # Create one JSONL file per experiment_id
                for experiment_id, exp_rows in experiments.items():
                    if not experiment_id or not exp_rows:
                        continue
                    # Generate the dataset name, sanitized for Fireworks API compatibility;
                    # the API allows only lowercase a-z, 0-9, and hyphen (-)
                    safe_experiment_id = re.sub(r"[^a-zA-Z0-9-]", "-", experiment_id).lower()
                    safe_test_func_name = re.sub(r"[^a-zA-Z0-9-]", "-", test_func_name).lower()
                    dataset_name = f"{safe_test_func_name}-{safe_experiment_id}"
                    if len(dataset_name) > 63:
                        dataset_name = dataset_name[:63]
                    # Fireworks also requires that the last character of the id not be '-'
                    dataset_name = dataset_name.rstrip("-")
                    # Ensure the name is non-empty after stripping; fall back to safe_test_func_name
                    if not dataset_name:
                        dataset_name = safe_test_func_name[:63].rstrip("-") or "dataset"
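                    # Worked example (illustrative values): test_func_name="test_math_eval"
                    # and experiment_id="Exp_2024/08/abc" yield
                    # dataset_name="test-math-eval-exp-2024-08-abc".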
                    exp_file = exp_dir / f"{experiment_id}.jsonl"
                    with open(exp_file, "w", encoding="utf-8") as f:
                        for row in exp_rows:
                            row_data = row.model_dump(exclude_none=True, mode="json")
                            if row.evaluation_result:
                                row_data["evals"] = {"score": row.evaluation_result.score}
                                row_data["eval_details"] = {
                                    "score": row.evaluation_result.score,
                                    "is_score_valid": row.evaluation_result.is_score_valid,
                                    "reason": row.evaluation_result.reason or "",
                                    "metrics": {
                                        name: metric.model_dump() if metric else {}
                                        for name, metric in (row.evaluation_result.metrics or {}).items()
                                    },
                                }
                            else:
                                # Default values if there is no evaluation result
                                row_data["evals"] = {"score": 0}
                                row_data["eval_details"] = {
                                    "score": 0,
                                    "is_score_valid": False,
                                    "reason": "No evaluation result",
                                    "metrics": {},
                                }
                            json.dump(row_data, f, ensure_ascii=False)
                            f.write("\n")
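
                    # Each emitted line is one JSON object, roughly of this shape
                    # (values illustrative; the exact field set depends on EvaluationRow):
                    #   {"execution_metadata": {"experiment_id": "..."}, ...,
                    #    "evals": {"score": 1.0},
                    #    "eval_details": {"score": 1.0, "is_score_valid": true,
                    #                     "reason": "", "metrics": {}}}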
                    def get_auth_value(key: str) -> str | None:
                        """Get an auth value from ~/.fireworks/auth.ini, falling back to the environment."""
                        try:
                            config_path = Path.home() / ".fireworks" / "auth.ini"
                            if config_path.exists():
                                config = configparser.ConfigParser()
                                config.read(config_path)
                                for section in ["DEFAULT", "auth"]:
                                    # has_section() never reports the DEFAULT section,
                                    # so let it through explicitly
                                    if section == config.default_section or config.has_section(section):
                                        if config.has_option(section, key):
                                            return config.get(section, key)
                        except Exception:
                            pass
                        return os.getenv(key)
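
                    # Credentials are read from ~/.fireworks/auth.ini when present,
                    # e.g. (illustrative contents; a [DEFAULT] or [auth] section works):
                    #   [auth]
                    #   FIREWORKS_API_KEY = fw-xxxxxxxx
                    #   FIREWORKS_ACCOUNT_ID = my-account
                    # and otherwise from environment variables of the same names.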
                    fireworks_api_key = get_auth_value("FIREWORKS_API_KEY")
                    fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID")
                    if not fireworks_api_key and not fireworks_account_id:
                        store_experiment_link(
                            experiment_id,
                            "No Fireworks API key or account ID found",
                            "failure",
                        )
                        continue
                    elif not fireworks_api_key:
                        store_experiment_link(
                            experiment_id,
                            "No Fireworks API key found",
                            "failure",
                        )
                        continue
                    elif not fireworks_account_id:
                        store_experiment_link(
                            experiment_id,
                            "No Fireworks account ID found",
                            "failure",
                        )
                        continue
                    headers = {"Authorization": f"Bearer {fireworks_api_key}", "Content-Type": "application/json"}
                    # Create the dataset first
                    dataset_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets"
                    dataset_payload = {  # pyright: ignore[reportUnknownVariableType]
                        "dataset": {
                            "displayName": dataset_name,
                            "evalProtocol": {},
                            "format": "FORMAT_UNSPECIFIED",
                            "exampleCount": f"{len(exp_rows)}",
                        },
                        "datasetId": dataset_name,
                    }
                    dataset_response = requests.post(dataset_url, json=dataset_payload, headers=headers)  # pyright: ignore[reportUnknownArgumentType]
                    # Skip this experiment if dataset creation failed
                    if dataset_response.status_code not in [200, 201]:
                        store_experiment_link(
                            experiment_id,
                            f"Dataset creation failed: {dataset_response.status_code} {dataset_response.text}",
                            "failure",
                        )
                        continue
                    dataset_data: dict[str, Any] = dataset_response.json()  # pyright: ignore[reportAny, reportExplicitAny]
                    dataset_id = dataset_data.get("datasetId", dataset_name)  # pyright: ignore[reportAny]
                    # Upload the JSONL file content
                    upload_url = (
                        f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/datasets/{dataset_id}:upload"
                    )
                    upload_headers = {"Authorization": f"Bearer {fireworks_api_key}"}
                    with open(exp_file, "rb") as f:
                        files = {"file": f}
                        upload_response = requests.post(upload_url, files=files, headers=upload_headers)
                    # Skip this experiment if the upload failed
                    if upload_response.status_code not in [200, 201]:
                        store_experiment_link(
                            experiment_id,
                            f"File upload failed: {upload_response.status_code} {upload_response.text}",
                            "failure",
                        )
                        continue
                    # Create the evaluation job (optional - don't skip the experiment if this fails)
                    eval_job_url = f"https://api.fireworks.ai/v1/accounts/{fireworks_account_id}/evaluationJobs"
                    # Truncate the job ID to fit the 63-character limit
                    job_id_base = f"{dataset_name}-job"
                    if len(job_id_base) > 63:
                        # Keep the "-job" suffix and truncate the dataset_name part
                        max_dataset_name_len = 63 - 4  # 4 == len("-job")
                        truncated_dataset_name = dataset_name[:max_dataset_name_len]
                        job_id_base = f"{truncated_dataset_name}-job"
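                    # Worked example: a 63-char dataset_name would give a 67-char job id,
                    # so it is rebuilt from the first 59 chars of dataset_name plus "-job".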
                    eval_job_payload = {
                        "evaluationJobId": job_id_base,
                        "evaluationJob": {
                            "evaluator": f"accounts/{fireworks_account_id}/evaluators/dummy",
                            "inputDataset": f"accounts/{fireworks_account_id}/datasets/dummy",
                            "outputDataset": f"accounts/{fireworks_account_id}/datasets/{dataset_id}",
                        },
                    }
                    eval_response = requests.post(eval_job_url, json=eval_job_payload, headers=headers)
                    if eval_response.status_code in [200, 201]:
                        eval_job_data = eval_response.json()  # pyright: ignore[reportAny]
                        job_id = eval_job_data.get("evaluationJobId", job_id_base)  # pyright: ignore[reportAny]
                        store_experiment_link(
                            experiment_id,
                            f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}",
                            "success",
                        )
                    else:
                        store_experiment_link(
                            experiment_id,
                            f"Job creation failed: {eval_response.status_code} {eval_response.text}",
                            "failure",
                        )
    except Exception as e:
        # Never fail the evaluation run because persisting or uploading results failed
        print(f"Warning: Failed to persist results: {e}")
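

# Usage sketch (illustrative): handle_persist_flow is meant to be called once
# after an evaluation run, with the nested row lists produced by the test
# harness. `collect_results` below is a hypothetical placeholder, not a real
# helper from this package:
#
#     all_results: list[list[EvaluationRow]] = collect_results()
#     handle_persist_flow(all_results, test_func_name="test_my_eval")
#
# Export EP_NO_UPLOAD=1 to skip both the local JSONL write and the Fireworks upload.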