
Commit f33eb10

fix multi metrics issue
1 parent: 4fb2911

File tree

- eval_protocol/cli_commands/upload.py
- eval_protocol/evaluation.py

2 files changed: +16 −38 lines


eval_protocol/cli_commands/upload.py

Lines changed: 11 additions & 27 deletions
```diff
@@ -528,9 +528,7 @@ def upload_command(args: argparse.Namespace) -> int:
     fw_api_key_value = get_fireworks_api_key()
     if not fw_account_id and fw_api_key_value:
         # Attempt to verify and resolve account id from server headers
-        resolved = verify_api_key_and_get_account_id(
-            api_key=fw_api_key_value, api_base=get_fireworks_api_base()
-        )
+        resolved = verify_api_key_and_get_account_id(api_key=fw_api_key_value, api_base=get_fireworks_api_base())
         if resolved:
             fw_account_id = resolved
             # Propagate to environment so downstream calls use it if needed
@@ -593,31 +591,17 @@ def upload_command(args: argparse.Namespace) -> int:
 
         print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
         try:
-            # Upload full directory of the test as multi-metric if the dir contains multiple files
+            # Always treat as a single evaluator (single-metric) even if folder has helper modules
             test_dir = os.path.dirname(source_file_path) if source_file_path else root
-            # Use multi_metrics if multiple .py files exist at the root dir; otherwise treat as single metric dir
-            py_files = [f for f in os.listdir(test_dir) if f.endswith(".py")]
-            if len(py_files) > 1:
-                result = create_evaluation(
-                    evaluator_id=evaluator_id,
-                    multi_metrics=True,
-                    folder=test_dir,
-                    display_name=display_name or evaluator_id,
-                    description=description or f"Evaluator for {qualname}",
-                    force=force,
-                    entry_point=entry_point,
-                )
-            else:
-                # Single metric mode: metric name derived from folder name; include all files recursively
-                metric_name = os.path.basename(test_dir) or "metric"
-                result = create_evaluation(
-                    evaluator_id=evaluator_id,
-                    metric_folders=[f"{metric_name}={test_dir}"],
-                    display_name=display_name or evaluator_id,
-                    description=description or f"Evaluator for {qualname}",
-                    force=force,
-                    entry_point=entry_point,
-                )
+            metric_name = os.path.basename(test_dir) or "metric"
+            result = create_evaluation(
+                evaluator_id=evaluator_id,
+                metric_folders=[f"{metric_name}={test_dir}"],
+                display_name=display_name or evaluator_id,
+                description=description or f"Evaluator for {qualname}",
+                force=force,
+                entry_point=entry_point,
+            )
             name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
 
             # Print success message with Fireworks dashboard link
```
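After this change, `upload_command` stops counting `.py` files and always uploads in single-metric mode: the metric name is derived from the folder containing the test file, and the whole folder is passed as one `metric_folders` entry. A minimal sketch of the resulting call, using only the keyword arguments visible in the diff (the path, IDs, and import location are illustrative assumptions, not taken from the source):

```python
import os

from eval_protocol.evaluation import create_evaluation  # assumed import path

# Illustrative inputs; upload_command derives these from the parsed CLI args.
source_file_path = "/repo/tests/my_eval/test_scoring.py"
evaluator_id = "my-eval"

test_dir = os.path.dirname(source_file_path)          # "/repo/tests/my_eval"
metric_name = os.path.basename(test_dir) or "metric"  # "my_eval"

# Single-metric upload: one "name=path" spec; helper modules in the folder
# ride along instead of flipping the upload into multi-metric mode.
result = create_evaluation(
    evaluator_id=evaluator_id,
    metric_folders=[f"{metric_name}={test_dir}"],
    display_name=evaluator_id,
    description=f"Evaluator for {evaluator_id}",
    force=False,
    entry_point=None,  # assumption: None when no explicit entry point is given
)
```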

eval_protocol/evaluation.py

Lines changed: 5 additions & 11 deletions
```diff
@@ -362,20 +362,17 @@ def preview(self, sample_file, max_samples=5):
         logger.error("Authentication error: Missing Fireworks Account ID or API Key.")
         raise ValueError("Missing Fireworks Account ID or API Key.")
 
-        # Determine multiMetrics for payload based on ts_mode_config or original flag
-        payload_multi_metrics = True
-        payload_rollup_settings = {"skipRollup": True}
+        # Do not set multiMetrics/rollupSettings in preview payload; keep minimal
 
         # For preview, evaluator_id might not be as critical for shim's env var name,
         # but pass it for consistency. Use display_name as a proxy if no specific ID.
         preview_evaluator_id_for_shim = self.display_name or "preview_evaluator"
         evaluator_payload_data = {
             "displayName": self.display_name or "Preview Evaluator",
             "description": self.description or "Preview Evaluator",
-            "multiMetrics": payload_multi_metrics,
+            # multiMetrics omitted intentionally
             "criteria": self._construct_criteria(criteria_data={}),
-            "requirements": self._get_combined_requirements(),  # Changed to use combined requirements
-            "rollupSettings": payload_rollup_settings,
+            "requirements": self._get_combined_requirements(),
         }
 
         sample_strings = [json.dumps(sample) for sample in samples]
@@ -525,19 +522,16 @@ def create(self, evaluator_id, display_name=None, description=None, force=False)
         self.display_name = display_name or evaluator_id
         self.description = description or f"Evaluator created from {evaluator_id}"
 
-        # Determine multiMetrics for payload
-        payload_multi_metrics = True
-        payload_rollup_settings = {"skipRollup": True}
+        # Do not set multiMetrics/rollupSettings; server will infer when needed
 
         payload_data = {
             "evaluator": {
                 "displayName": self.display_name,
                 "description": self.description,
-                "multiMetrics": payload_multi_metrics,  # How results are structured
+                # multiMetrics omitted intentionally
                 # "rewardFunctionMode": self.reward_function_mode,  # How input is processed by user func
                 "criteria": self._construct_criteria(criteria_data={}),
                 "requirements": "",
-                "rollupSettings": payload_rollup_settings,
             },
             "evaluatorId": evaluator_id,
         }
```
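The net effect in evaluation.py: both `preview` and `create` previously forced `multiMetrics: true` and `rollupSettings: {"skipRollup": true}` into every payload; after this commit those keys are simply absent, and per the new in-code comments the server infers the behavior when needed. A rough sketch of the payload `create` now builds (values are placeholders, and `criteria` stands in for the result of `self._construct_criteria(criteria_data={})`):

```python
evaluator_id = "my-eval"
display_name = "My Eval"
description = f"Evaluator created from {evaluator_id}"

# Minimal payload after the fix: no "multiMetrics", no "rollupSettings".
payload_data = {
    "evaluator": {
        "displayName": display_name,
        "description": description,
        "criteria": {},      # placeholder for self._construct_criteria(criteria_data={})
        "requirements": "",
    },
    "evaluatorId": evaluator_id,
}
```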
