Skip to content
Closed
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
0a00bcd
Refactor Fireworks client integration
Jan 7, 2026
d465a89
remove launch.json
Jan 7, 2026
348bb58
Add .vscode/launch.json to .gitignore
Jan 7, 2026
acaa901
Enhance environment variable loading in auth module
Jan 7, 2026
4b71ddb
Add evaluator version creation in evaluation module
Jan 7, 2026
3dbcd59
test
Jan 8, 2026
532e071
REVERT this later
Jan 8, 2026
5e7a5fa
Merge branch 'main' into dhuang/dxe-478-implement-evaluator-versions
Jan 8, 2026
060d72c
fix mock tests
Jan 9, 2026
bc31c9f
Add error handling for evaluator creation in evaluation module
Jan 9, 2026
ea08062
Support EP_REMOTE_API_KEY
Jan 9, 2026
f246087
Merge branch 'main' into dhuang/dxe-478-implement-evaluator-versions
Jan 9, 2026
6b53ac1
include launch.json.backup
Jan 12, 2026
ec0c8ca
rename to .example and add docker run extra arg
Jan 12, 2026
fc036f5
use ignore-docker by default
Jan 12, 2026
4566584
delete backup
Jan 13, 2026
f103b69
ignore-docker by default in dev
Jan 13, 2026
9c3e417
Refactor evaluator function calls to use Fireworks directly for metho…
Jan 13, 2026
ea673f4
use in-flight SDK version
Jan 13, 2026
26fbc2d
Enhance evaluator handling by returning version ID on creation and up…
Jan 13, 2026
4702307
update
Jan 13, 2026
9d1bc74
use published a22 of fireworks-ai
Jan 13, 2026
3314bec
uv lock
Jan 13, 2026
66f191a
Refactor dotenv handling in auth module and integrate environment var…
Jan 14, 2026
165afe1
add create rft launch configuration
Jan 14, 2026
838c7a5
Refactor dotenv handling in auth module and integrate environment var…
Jan 14, 2026
71599e6
Merge branch 'pass-dot-env-to-docker-container' into dhuang/dxe-478-i…
Jan 14, 2026
0144c9f
actually not necessary for local test since local-test mounts the wor…
Jan 14, 2026
c8774a6
increase sql retries
Jan 14, 2026
2076f0a
Refactor dotenv loading to use explicit paths in CLI and API modules
Jan 14, 2026
8acdc35
Merge branch 'main' into dhuang/dxe-478-implement-evaluator-versions
Jan 14, 2026
432a649
Refactor dotenv loading to use explicit paths in CLI and API modules
Jan 14, 2026
ab04086
Merge branch 'ensure-explicit-dotenv' into dhuang/dxe-478-implement-e…
Jan 14, 2026
3c2db59
"ep create evj"
Jan 14, 2026
17eb18f
use SDK for Dataset API calls
Jan 14, 2026
1fd66f7
Implement evaluator upload and status polling in create commands
Jan 15, 2026
fc4f913
Add secret management for uploads in CLI
Jan 15, 2026
2f88428
handle existing secrets with caution
Jan 15, 2026
c6a8c51
Integrate secrets upload handling in CLI commands
Jan 15, 2026
a2165fb
Remove unused `_to_pyargs_nodeid` function from `upload.py` to enhanc…
Jan 15, 2026
1445d75
increase sql retries
Jan 14, 2026
7969a6e
Refactor secret loading in CLI to use python-dotenv
Jan 15, 2026
d4a445b
make connection more robust
Jan 16, 2026
b3adfee
Merge branch 'increase-sql-retries' into dhuang/dxe-478-implement-eva…
Jan 16, 2026
37f4856
passes
Jan 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,5 @@ package.json
tau2-bench
*.err
eval-protocol

.vscode/launch.json
39 changes: 0 additions & 39 deletions .vscode/launch.json

This file was deleted.

11 changes: 9 additions & 2 deletions eval_protocol/adapters/fireworks_tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,17 +253,24 @@ def __init__(
project_id: Optional[str] = None,
base_url: str = "https://tracing.fireworks.ai",
timeout: int = 300,
api_key: Optional[str] = None,
):
"""Initialize the Fireworks Tracing adapter.

Args:
project_id: Optional project ID. If not provided, uses the default project configured on the server.
base_url: The base URL of the tracing proxy (default: https://tracing.fireworks.ai)
timeout: Request timeout in seconds (default: 300)
api_key: Optional API key. If not provided, falls back to FIREWORKS_API_KEY environment variable.
"""
self.project_id = project_id
self.base_url = base_url.rstrip("/")
self.timeout = timeout
self._api_key = api_key

def _get_api_key(self) -> Optional[str]:
"""Get the API key, preferring instance-level key over environment variable."""
return self._api_key or os.environ.get("FIREWORKS_API_KEY")

def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
"""Fetch logs from Fireworks tracing gateway /logs endpoint.
Expand All @@ -276,7 +283,7 @@ def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -
from ..common_utils import get_user_agent

headers = {
"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}",
"Authorization": f"Bearer {self._get_api_key()}",
"User-Agent": get_user_agent(),
}
params: Dict[str, Any] = {"tags": tags, "limit": limit, "hours_back": hours_back, "program": "eval_protocol"}
Expand Down Expand Up @@ -407,7 +414,7 @@ def get_evaluation_rows(
from ..common_utils import get_user_agent

headers = {
"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}",
"Authorization": f"Bearer {self._get_api_key()}",
"User-Agent": get_user_agent(),
}

Expand Down
30 changes: 29 additions & 1 deletion eval_protocol/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,30 @@
from typing import Optional

import requests
from dotenv import find_dotenv, load_dotenv

logger = logging.getLogger(__name__)

# --- Load .env files ---
# Runs once, when this module is imported. A .env.dev file takes precedence;
# .env is only searched for when no .env.dev was found.
# load_dotenv is called with override=False so that variables already present
# in the environment (e.g. exported in the shell) are NOT replaced by values
# from the file.
_ENV_DEV_PATH = find_dotenv(filename=".env.dev", raise_error_if_not_found=False, usecwd=True)
if not _ENV_DEV_PATH:
    _ENV_PATH = find_dotenv(filename=".env", raise_error_if_not_found=False, usecwd=True)
    if not _ENV_PATH:
        # Neither file exists: leave the environment untouched.
        logger.debug(
            "eval_protocol.auth: No .env.dev or .env file found. Relying on shell/existing environment variables."
        )
    else:
        load_dotenv(dotenv_path=_ENV_PATH, override=False)
        logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_ENV_PATH}")
else:
    load_dotenv(dotenv_path=_ENV_DEV_PATH, override=False)
    logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_ENV_DEV_PATH}")
# --- End .env loading ---


def get_fireworks_api_key() -> Optional[str]:
"""
Expand Down Expand Up @@ -73,6 +94,8 @@ def verify_api_key_and_get_account_id(
Args:
api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
If api_base is api.fireworks.ai, it is used directly. Otherwise, defaults to
dev.api.fireworks.ai for the verification call.

Returns:
The resolved account id if verification succeeds and the header is present; otherwise None.
Expand All @@ -81,7 +104,12 @@ def verify_api_key_and_get_account_id(
resolved_key = api_key or get_fireworks_api_key()
if not resolved_key:
return None
resolved_base = api_base or get_fireworks_api_base()
provided_base = api_base or get_fireworks_api_base()
# Use api.fireworks.ai if explicitly provided, otherwise fall back to dev
if "api.fireworks.ai" in provided_base:
resolved_base = provided_base
else:
resolved_base = "https://dev.api.fireworks.ai"
Comment thread
cursor[bot] marked this conversation as resolved.

from .common_utils import get_user_agent

Expand Down
13 changes: 3 additions & 10 deletions eval_protocol/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
import sys
from pathlib import Path

from fireworks import Fireworks

from .cli_commands.common import setup_logging
from .cli_commands.utils import add_args_from_callable_signature
from .fireworks_client import create_fireworks_client

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -81,14 +80,9 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
"--env-file",
help="Path to .env file containing secrets to upload (default: .env in current directory)",
)
upload_parser.add_argument(
"--force",
action="store_true",
help="Overwrite existing evaluator with the same ID",
)

# Auto-generate flags from SDK Fireworks().evaluators.create() signature
create_evaluator_fn = Fireworks().evaluators.create
create_evaluator_fn = create_fireworks_client().evaluators.create
Comment thread
dphuang2 marked this conversation as resolved.
Outdated

upload_skip_fields = {
"__top_level__": {
Expand Down Expand Up @@ -137,7 +131,6 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse

rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
rft_parser.add_argument("--dry-run", action="store_true", help="Print planned SDK call without sending")
rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing --env-file argument for rft and evj commands

Medium Severity

Both create_rft_command and create_evj_command access env_file via getattr(args, "env_file", None) and pass it to handle_secrets_upload, but neither rft_parser nor evj_parser defines the --env-file CLI argument. Users cannot specify a custom env file path for these commands, and the code always receives None. The upload command correctly defines this argument but it's missing from the create subcommands.

Additional Locations (2)

Fix in Cursor Fix in Web

rft_parser.add_argument("--skip-validation", action="store_true", help="Skip local dataset/evaluator validation")
rft_parser.add_argument(
"--ignore-docker",
Expand Down Expand Up @@ -198,7 +191,7 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
"loss_config.method": "RL loss method for underlying trainers. One of {grpo,dapo}.",
}

create_rft_job_fn = Fireworks().reinforcement_fine_tuning_jobs.create
create_rft_job_fn = create_fireworks_client().reinforcement_fine_tuning_jobs.create

add_args_from_callable_signature(
rft_parser,
Expand Down
71 changes: 31 additions & 40 deletions eval_protocol/cli_commands/create_rft.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pydantic import ValidationError

from ..auth import get_fireworks_api_base, get_fireworks_api_key
from ..fireworks_client import create_fireworks_client
from ..common_utils import get_user_agent, load_jsonl
from ..fireworks_rft import (
create_dataset_from_jsonl,
Expand All @@ -35,8 +36,6 @@
)
from .local_test import run_evaluator_test

from fireworks import Fireworks


def _extract_dataset_adapter(
test_file_path: str, test_func_name: str
Expand Down Expand Up @@ -568,37 +567,35 @@ def _upload_and_ensure_evaluator(
evaluator_resource_name: str,
api_key: str,
api_base: str,
force: bool,
) -> bool:
"""Ensure the evaluator exists and is ACTIVE, uploading it if needed."""
# Optional short-circuit: if evaluator already exists and not forcing, skip upload path
if not force:
try:
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
"User-Agent": get_user_agent(),
}
resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
if resp.ok:
state = resp.json().get("state", "STATE_UNSPECIFIED")
print(f"✓ Evaluator exists (state: {state}). Skipping upload (use --force to overwrite).")
# Poll for ACTIVE before proceeding
print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
if not _poll_evaluator_status(
evaluator_resource_name=evaluator_resource_name,
api_key=api_key,
api_base=api_base,
timeout_minutes=10,
):
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
print("\n❌ Evaluator is not ready within the timeout period.")
print(f"📊 Please check the evaluator status at: {dashboard_url}")
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
return False
return True
except requests.exceptions.RequestException:
pass
# Check if evaluator already exists
try:
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
"User-Agent": get_user_agent(),
}
resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
if resp.ok:
state = resp.json().get("state", "STATE_UNSPECIFIED")
print(f"✓ Evaluator exists (state: {state}). Skipping upload.")
# Poll for ACTIVE before proceeding
print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
if not _poll_evaluator_status(
evaluator_resource_name=evaluator_resource_name,
api_key=api_key,
api_base=api_base,
timeout_minutes=10,
):
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
print("\n❌ Evaluator is not ready within the timeout period.")
print(f"📊 Please check the evaluator status at: {dashboard_url}")
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
return False
return True
Comment thread
cursor[bot] marked this conversation as resolved.
Outdated
except requests.exceptions.RequestException:
pass

# Ensure evaluator exists by invoking the upload flow programmatically
Comment thread
cursor[bot] marked this conversation as resolved.
try:
Expand All @@ -623,14 +620,10 @@ def _upload_and_ensure_evaluator(
id=evaluator_id,
display_name=None,
description=None,
force=force, # Pass through the --force flag
yes=True,
env_file=None, # Add the new env_file parameter
env_file=None,
)

if force:
print(f"🔄 Force flag enabled - will overwrite existing evaluator '{evaluator_id}'")

rc = upload_command(upload_args)
if rc == 0:
print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
Expand Down Expand Up @@ -672,7 +665,7 @@ def _create_rft_job(
) -> int:
"""Build and submit the RFT job request (via Fireworks SDK)."""

signature = inspect.signature(Fireworks().reinforcement_fine_tuning_jobs.create)
signature = inspect.signature(create_fireworks_client().reinforcement_fine_tuning_jobs.create)
Comment thread
cursor[bot] marked this conversation as resolved.

# Build top-level SDK kwargs
sdk_kwargs: Dict[str, Any] = {
Expand Down Expand Up @@ -711,7 +704,7 @@ def _create_rft_job(
return 0

try:
fw: Fireworks = Fireworks(api_key=api_key, base_url=api_base)
fw: Fireworks = create_fireworks_client(api_key=api_key, base_url=api_base)
job: ReinforcementFineTuningJob = fw.reinforcement_fine_tuning_jobs.create(account_id=account_id, **sdk_kwargs)
job_name = job.name
print(f"\n✅ Created Reinforcement Fine-tuning Job: {job_name}")
Expand Down Expand Up @@ -739,7 +732,6 @@ def create_rft_command(args) -> int:
evaluator_arg: Optional[str] = getattr(args, "evaluator", None)
non_interactive: bool = bool(getattr(args, "yes", False))
dry_run: bool = bool(getattr(args, "dry_run", False))
force: bool = bool(getattr(args, "force", False))
skip_validation: bool = bool(getattr(args, "skip_validation", False))
ignore_docker: bool = bool(getattr(args, "ignore_docker", False))
docker_build_extra: str = getattr(args, "docker_build_extra", "") or ""
Expand Down Expand Up @@ -817,7 +809,6 @@ def create_rft_command(args) -> int:
evaluator_resource_name=evaluator_resource_name,
api_key=api_key,
api_base=api_base,
force=force,
):
return 1

Expand Down
2 changes: 0 additions & 2 deletions eval_protocol/cli_commands/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ def upload_command(args: argparse.Namespace) -> int:
base_id = getattr(args, "id", None)
display_name = getattr(args, "display_name", None)
description = getattr(args, "description", None)
force = bool(getattr(args, "force", False))
env_file = getattr(args, "env_file", None)

# Load secrets from .env file and ensure they're available on Fireworks
Expand Down Expand Up @@ -382,7 +381,6 @@ def upload_command(args: argparse.Namespace) -> int:
evaluator_id=evaluator_id,
display_name=display_name or evaluator_id,
description=description or f"Evaluator for {qualname}",
force=force,
entry_point=entry_point,
)
name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
Expand Down
Loading
Loading