import argparse
import importlib.util
import inspect
import json
import os
import pkgutil
import re
import runpy
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Iterable, Optional

import pytest

from eval_protocol.evaluation import create_evaluation


@dataclass
class DiscoveredTest:
    module_path: str
    module_name: str
    qualname: str
    file_path: str
    lineno: int | None
    has_parametrize: bool
    param_count: int
    nodeids: list[str]


def _iter_python_files(root: str) -> Iterable[str]:
    # Don't follow symlinks to avoid infinite loops
    for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
        # Skip common virtualenv and node paths
        if any(
            skip in dirpath
            for skip in [
                "/.venv",
                "/venv",
                "/node_modules",
                "/.git",
                "/dist",
                "/build",
                "/__pycache__",
                ".egg-info",
                "/vendor",
            ]
        ):
            continue
        # Also skip specific directories by modifying dirnames in-place
        dirnames[:] = [
            d
            for d in dirnames
            if not d.startswith(".") and d not in ["venv", "node_modules", "__pycache__", "dist", "build", "vendor"]
        ]
        for name in filenames:
            # Skip setup-style files, test discovery scripts, __main__.py, and hidden files
            if (
                name.endswith(".py")
                and not name.startswith(".")
                and not name.startswith("test_discovery")
                and name not in ["setup.py", "versioneer.py", "conf.py", "__main__.py"]
            ):
                yield os.path.join(dirpath, name)
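

# Usage sketch: list(_iter_python_files(".")) yields the candidate .py files
# under the current tree, with virtualenv, build, VCS, and vendor directories
# pruned before descent.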


def _is_eval_protocol_test(obj: Any) -> bool:
    # The evaluation_test decorator returns a dual_mode_wrapper with _origin_func and pytest marks
    if not callable(obj):
        return False
    origin = getattr(obj, "_origin_func", None)
    if origin is None:
        return False
    # Must have pytest marks from evaluation_test
    marks = getattr(obj, "pytestmark", [])
    return len(marks) > 0
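

# For reference, a minimal sketch of the kind of test this predicate matches,
# assuming the eval_protocol decorator API (argument names are illustrative):
#
#   @evaluation_test(...)
#   def test_my_eval(row: EvaluationRow) -> EvaluationRow:
#       ...
#
# The decorator wraps the function, keeps the original under _origin_func, and
# attaches pytest marks, which are exactly the two attributes checked above.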


def _extract_param_info_from_marks(obj: Any) -> tuple[bool, int, list[str]]:
    """Extract parametrization info from pytest marks.

    Returns:
        (has_parametrize, param_count, param_ids)
    """
    marks = getattr(obj, "pytestmark", [])
    has_parametrize = False
    total_combinations = 0
    all_param_ids: list[str] = []
    for m in marks:
        if getattr(m, "name", "") == "parametrize":
            has_parametrize = True
            # The data is in kwargs for eval_protocol's parametrization
            kwargs = getattr(m, "kwargs", {})
            argnames = kwargs.get("argnames", m.args[0] if m.args else "")
            argvalues = kwargs.get("argvalues", m.args[1] if len(m.args) > 1 else [])
            ids = kwargs.get("ids", [])
            # Count this dimension of parameters
            if isinstance(argvalues, (list, tuple)):
                count = len(argvalues)
                total_combinations = count  # For now, just use the count from this mark
                # Use provided IDs
                if ids and isinstance(ids, (list, tuple)):
                    all_param_ids = list(ids[:count])
                else:
                    # Generate IDs based on argnames
                    if isinstance(argnames, str) and "," not in argnames:
                        # Single parameter
                        all_param_ids = [f"{argnames}={i}" for i in range(count)]
                    else:
                        # Multiple parameters
                        all_param_ids = [f"variant_{i}" for i in range(count)]
    return has_parametrize, total_combinations, all_param_ids
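

# Example: for a mark equivalent to
#   pytest.mark.parametrize(argnames="model", argvalues=["a", "b"], ids=["fast", "slow"])
# this returns (True, 2, ["fast", "slow"]); without explicit ids it would
# return (True, 2, ["model=0", "model=1"]).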


def _discover_tests(root: str) -> list[DiscoveredTest]:
    abs_root = os.path.abspath(root)
    if abs_root not in sys.path:
        sys.path.insert(0, abs_root)
    discovered: list[DiscoveredTest] = []
    # Collect all test functions from Python files
    for file_path in _iter_python_files(root):
        try:
            unique_name = "ep_upload_" + re.sub(r"[^a-zA-Z0-9_]", "_", os.path.abspath(file_path))
            spec = importlib.util.spec_from_file_location(unique_name, file_path)
            if spec and spec.loader:
                module = importlib.util.module_from_spec(spec)
                sys.modules[spec.name] = module
                spec.loader.exec_module(module)  # type: ignore[attr-defined]
            else:
                continue
        except Exception:
            continue
        for name, obj in inspect.getmembers(module):
            if _is_eval_protocol_test(obj):
                origin = getattr(obj, "_origin_func", obj)
                try:
                    src_file = inspect.getsourcefile(origin) or file_path
                    _, lineno = inspect.getsourcelines(origin)
                except Exception:
                    src_file, lineno = file_path, None
                # Extract parametrization info from marks
                has_parametrize, param_count, param_ids = _extract_param_info_from_marks(obj)
                # Generate synthetic nodeids for display
                base_nodeid = f"{os.path.basename(file_path)}::{name}"
                if has_parametrize and param_ids:
                    nodeids = [f"{base_nodeid}[{pid}]" for pid in param_ids]
                else:
                    nodeids = [base_nodeid]
                discovered.append(
                    DiscoveredTest(
                        module_path=module.__name__,
                        module_name=module.__name__,
                        qualname=f"{module.__name__}.{name}",
                        file_path=os.path.abspath(src_file),
                        lineno=lineno,
                        has_parametrize=has_parametrize,
                        param_count=param_count,
                        nodeids=nodeids,
                    )
                )
    # Deduplicate by qualname (in case the same test appears multiple times)
    by_qual: dict[str, DiscoveredTest] = {}
    for t in discovered:
        existing = by_qual.get(t.qualname)
        if not existing or t.param_count > existing.param_count:
            by_qual[t.qualname] = t
    return sorted(by_qual.values(), key=lambda x: (x.file_path, x.lineno or 0))


def _parse_entry(entry: str, cwd: str) -> tuple[str, str]:
    # Accept 'module:function' or 'path::function'
    entry = entry.strip()
    if "::" in entry:
        path_part, func = entry.split("::", 1)
        abs_path = os.path.abspath(os.path.join(cwd, path_part))
        return abs_path, func
    elif ":" in entry:
        module, func = entry.split(":", 1)
        return module, func
    else:
        raise ValueError("--entry must be in 'module:function' or 'path::function' format")
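

# Examples (paths are illustrative):
#   _parse_entry("my_pkg.my_module:test_accuracy", cwd)   -> ("my_pkg.my_module", "test_accuracy")
#   _parse_entry("evals/test_math.py::test_add", "/repo") -> ("/repo/evals/test_math.py", "test_add")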


def _generate_ts_mode_code_from_entry(entry: str, cwd: str) -> tuple[str, str, str]:
    target, func = _parse_entry(entry, cwd)
    # Check if target looks like a file path
    if "/" in target or "\\" in target or os.path.exists(target):
        # It's a file path - convert to absolute and load as module
        if not os.path.isabs(target):
            target = os.path.abspath(os.path.join(cwd, target))
        if not target.endswith(".py"):
            target = target + ".py"
        if not os.path.isfile(target):
            raise ValueError(f"File not found: {target}")
        # Import module from file path
        spec = importlib.util.spec_from_file_location(Path(target).stem, target)
        if not spec or not spec.loader:
            raise ValueError(f"Unable to load module from path: {target}")
        module = importlib.util.module_from_spec(spec)
        sys.modules[spec.name] = module
        spec.loader.exec_module(module)  # type: ignore[attr-defined]
        module_name = spec.name
    else:
        # Treat as module path (e.g., "my_package.my_module")
        module_name = target
        module = importlib.import_module(module_name)
    if not hasattr(module, func):
        raise ValueError(f"Function '{func}' not found in module '{module_name}'")
    qualname = f"{module_name}.{func}"
    code, file_name = _generate_ts_mode_code(
        DiscoveredTest(
            module_path=module_name,
            module_name=module_name,
            qualname=qualname,
            file_path=getattr(module, "__file__", module_name),
            lineno=None,
            has_parametrize=False,
            param_count=0,
            nodeids=[],
        )
    )
    return code, file_name, qualname


def _generate_ts_mode_code(test: DiscoveredTest) -> tuple[str, str]:
    # Generate a minimal main.py that imports the test module and calls the function
    module = test.module_name
    func = test.qualname.split(".")[-1]
    code = f"""
from typing import Any, Dict, List, Optional, Union

from eval_protocol.models import EvaluationRow, Message
from {module} import {func} as _ep_test


def evaluate(messages: List[Dict[str, Any]], ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs):
    row = EvaluationRow(messages=[Message(**m) for m in messages], ground_truth=ground_truth)
    result = _ep_test(row)  # Supports sync/async via decorator's dual-mode
    if hasattr(result, "__await__"):
        import asyncio

        result = asyncio.get_event_loop().run_until_complete(result)
    if result.evaluation_result is None:
        return {{"score": 0.0, "reason": "No evaluation_result set"}}
    out = {{
        "score": float(result.evaluation_result.score or 0.0),
        "reason": result.evaluation_result.reason,
        "metrics": {{k: (v.model_dump() if hasattr(v, "model_dump") else v) for k, v in (result.evaluation_result.metrics or {{}}).items()}},
    }}
    return out
"""
    return (code, "main.py")
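

# For a hypothetical DiscoveredTest with module_name="test_math" and
# qualname="test_math.test_add", the generated main.py starts roughly as:
#
#   from test_math import test_add as _ep_test
#
#   def evaluate(messages, ground_truth=None, tools=None, **kwargs):
#       ...
#
# i.e. a thin adapter that converts the platform's evaluate() contract into a
# single-row call of the decorated test.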


def _normalize_evaluator_id(evaluator_id: str) -> str:
    """
    Normalize an evaluator ID to meet Fireworks requirements:
    - Only lowercase a-z, 0-9, and hyphen (-)
    - Maximum 63 characters
    """
    # Convert to lowercase
    normalized = evaluator_id.lower()
    # Replace underscores with hyphens
    normalized = normalized.replace("_", "-")
    # Remove any characters that aren't alphanumeric or hyphen
    normalized = re.sub(r"[^a-z0-9-]", "", normalized)
    # Collapse runs of consecutive hyphens into one
    normalized = re.sub(r"-+", "-", normalized)
    # Remove leading/trailing hyphens
    normalized = normalized.strip("-")
    # Ensure it starts with a letter (Fireworks requirement)
    if normalized and not normalized[0].isalpha():
        normalized = "eval-" + normalized
    # Truncate to 63 characters
    if len(normalized) > 63:
        normalized = normalized[:63].rstrip("-")
    return normalized
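

# Examples:
#   _normalize_evaluator_id("Test_GPQA__v2!") -> "test-gpqa-v2"
#   _normalize_evaluator_id("123_eval")       -> "eval-123-eval"  (prefixed because it starts with a digit)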


def _format_test_choice(test: DiscoveredTest, idx: int) -> str:
    """Format a test as a choice string for display."""
    # Shorten the qualname for display
    name = test.qualname.split(".")[-1]
    location = f"{Path(test.file_path).name}:{test.lineno}" if test.lineno else Path(test.file_path).name
    if test.has_parametrize and test.param_count > 1:
        return f"{name} ({test.param_count} variants) - {location}"
    else:
        return f"{name} - {location}"


def _prompt_select_interactive(tests: list[DiscoveredTest]) -> list[DiscoveredTest]:
    """Interactive selection with arrow keys using questionary."""
    try:
        import questionary
        from questionary import Style

        # Custom style similar to Vercel CLI
        custom_style = Style(
            [
                ("qmark", "fg:#673ab7 bold"),
                ("question", "bold"),
                ("answer", "fg:#f44336 bold"),
                ("pointer", "fg:#673ab7 bold"),
                ("highlighted", "fg:#673ab7 bold"),
                ("selected", "fg:#cc5454"),
                ("separator", "fg:#cc5454"),
                ("instruction", ""),
                ("text", ""),
            ]
        )
        # If there is only one test, offer to auto-select it
        if len(tests) == 1:
            print(f"\nFound 1 test: {_format_test_choice(tests[0], 1)}")
            confirm = questionary.confirm("Upload this test?", default=True, style=custom_style).ask()
            if confirm:
                return tests
            else:
                return []
        # Create choices with nice formatting
        choices = []
        for idx, test in enumerate(tests, 1):
            choice_text = _format_test_choice(test, idx)
            choices.append({"name": choice_text, "value": idx - 1})
        print("\n")
        print("💡 Tip: Use ↑/↓ arrows to navigate, press ENTER to select\n")
        selected_index = questionary.select(
            "Select evaluation test to upload:",
            choices=choices,
            style=custom_style,
        ).ask()
        if selected_index is None:  # User pressed Ctrl+C
            print("\nUpload cancelled.")
            return []
        print(f"\n✓ Selected: {_format_test_choice(tests[selected_index], selected_index + 1)}")
        return [tests[selected_index]]
    except ImportError:
        # Fall back to the simpler prompt implementation
        return _prompt_select_fallback(tests)
    except KeyboardInterrupt:
        print("\n\nUpload cancelled.")
        return []


def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest]:
    """Fallback prompt selection for when questionary is not available."""
    print("\n" + "=" * 80)
    print("Discovered evaluation tests:")
    print("=" * 80)
    print("\nTip: Install questionary for better UX: pip install questionary\n")
    for idx, t in enumerate(tests, 1):
        loc = f"{t.file_path}:{t.lineno}" if t.lineno else t.file_path
        print(f"  [{idx}] {t.qualname}")
        print(f"      Location: {loc}")
        if t.has_parametrize and t.nodeids:
            print(f"      Parameterized: {t.param_count} variant(s)")
            # Show the first few variants as examples
            example_nodeids = t.nodeids[:3]
            for nodeid in example_nodeids:
                # Extract just the parameter part for display
                if "[" in nodeid:
                    param_part = nodeid.split("[", 1)[1].rstrip("]")
                    print(f"        - {param_part}")
            if len(t.nodeids) > 3:
                print(f"        ... and {len(t.nodeids) - 3} more")
        else:
            print("      Type: Single test (no parametrization)")
        print()
    print("=" * 80)
    try:
        choice = input("Enter numbers to upload (comma or space-separated), or 'all': ").strip()
    except KeyboardInterrupt:
        print("\n\nUpload cancelled.")
        return []
    if choice.lower() in ("all", "a", "*"):
        return tests
    indices: list[int] = []
    for token in re.split(r"[\s,]+", choice):
        if token.isdigit():
            n = int(token)
            if 1 <= n <= len(tests):
                indices.append(n - 1)
    indices = sorted(set(indices))
    return [tests[i] for i in indices]


def _prompt_select(tests: list[DiscoveredTest], non_interactive: bool) -> list[DiscoveredTest]:
    """Prompt user to select tests to upload."""
    if non_interactive:
        return tests
    return _prompt_select_interactive(tests)


def upload_command(args: argparse.Namespace) -> int:
    root = os.path.abspath(getattr(args, "path", "."))
    entries_arg = getattr(args, "entry", None)
    if entries_arg:
        entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
        selected_specs: list[tuple[str, str, str, str]] = []
        for e in entries:
            code, file_name, qualname = _generate_ts_mode_code_from_entry(e, root)
            # For --entry mode, extract the file path from the entry
            file_path = e.split("::")[0] if "::" in e else ""
            selected_specs.append((code, file_name, qualname, file_path))
    else:
        print("Scanning for evaluation tests...")
        tests = _discover_tests(root)
        if not tests:
            print("No evaluation tests found.")
            print("\nHint: Make sure your tests use the @evaluation_test decorator.")
            return 1
        selected_tests = _prompt_select(tests, non_interactive=bool(getattr(args, "yes", False)))
        if not selected_tests:
            print("No tests selected.")
            return 1
        # Warn about parameterized tests
        parameterized_tests = [t for t in selected_tests if t.has_parametrize]
        if parameterized_tests:
            print("\nNote: Parameterized tests will be uploaded as a single evaluator that")
            print("      handles all parameter combinations. The evaluator will work with")
            print("      the same logic regardless of which model/parameters are used.")
        selected_specs = []
        for t in selected_tests:
            code, file_name = _generate_ts_mode_code(t)
            # Store test info for better ID generation
            selected_specs.append((code, file_name, t.qualname, t.file_path))
    base_id = getattr(args, "id", None)
    display_name = getattr(args, "display_name", None)
    description = getattr(args, "description", None)
    force = bool(getattr(args, "force", False))
    exit_code = 0
    for i, (code, file_name, qualname, source_file_path) in enumerate(selected_specs):
        # Use ts_mode to upload the evaluator.
        # Generate a short default ID from just the test function name.
        if base_id:
            evaluator_id = base_id
            if len(selected_specs) > 1:
                evaluator_id = f"{base_id}-{i + 1}"
        else:
            # Extract just the test function name from the qualname
            test_func_name = qualname.split(".")[-1]
            # Extract the source file name (e.g., "test_gpqa.py" -> "test_gpqa")
            if source_file_path:
                source_file_name = Path(source_file_path).stem
            else:
                source_file_name = "eval"
            # Create a shorter ID: filename-testname
            evaluator_id = f"{source_file_name}-{test_func_name}"
        # Normalize the evaluator ID to meet Fireworks requirements
        evaluator_id = _normalize_evaluator_id(evaluator_id)
        print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
        try:
            create_evaluation(
                evaluator_id=evaluator_id,
                python_code_to_evaluate=code,
                python_file_name_for_code=file_name,
                criterion_name_for_code=qualname,
                criterion_description_for_code=description or f"Evaluator for {qualname}",
                display_name=display_name or evaluator_id,
                description=description or f"Evaluator for {qualname}",
                force=force,
            )
            # Print a success message with a Fireworks dashboard link
            print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
            print("📊 View in Fireworks Dashboard:")
            print(f"   https://app.fireworks.ai/dashboard/evaluators/{evaluator_id}")
            print()
        except Exception as e:
            print(f"Failed to upload {qualname}: {e}")
            exit_code = 2
    return exit_code
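

# A minimal sketch of wiring upload_command into a CLI, assuming this module is
# registered as an argparse subcommand (the flag names mirror the getattr()
# lookups above; the project's real entry point may differ):
#
#   parser = argparse.ArgumentParser(prog="ep")
#   sub = parser.add_subparsers(dest="command")
#   upload = sub.add_parser("upload")
#   upload.add_argument("path", nargs="?", default=".")
#   upload.add_argument("--entry")
#   upload.add_argument("--id")
#   upload.add_argument("--display-name", dest="display_name")
#   upload.add_argument("--description")
#   upload.add_argument("--force", action="store_true")
#   upload.add_argument("--yes", action="store_true")
#   upload.set_defaults(func=upload_command)
#
#   args = parser.parse_args()
#   sys.exit(args.func(args))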