Update scoring scripts: raise NUM_BASE_WORKLOADS to 9 and remove test-split scoring

aahladc · aahladc · commit 27e3cbd8498f · 2026-03-18T16:41:01.000-07:00
diff --git a/scoring/performance_profile.py b/scoring/performance_profile.py
@@ -59,7 +59,7 @@
 # workloads and rules for the scoring to be correct.
 # We do not use the workload registry since it contains test and development
 # workloads as well.
-NUM_BASE_WORKLOADS = 8
+NUM_BASE_WORKLOADS = 9
 NUM_VARIANT_WORKLOADS = 0
 NUM_TRIALS = 5
 NUM_STUDIES = 3
diff --git a/scoring/score_submissions.py b/scoring/score_submissions.py
@@ -75,10 +75,10 @@
 FLAGS = flags.FLAGS
 
 
-def get_summary_df(workload, workload_df, include_test_split=False):
+def get_summary_df(workload, workload_df):
   print(f' WORKLOAD: {workload}')
   validation_metric, validation_target = (
-    scoring_utils.get_workload_metrics_and_targets(workload, split='validation')
+    scoring_utils.get_workload_metrics_and_targets(workload)
   )
 
   is_minimized = performance_profile.check_if_minimized(validation_metric)
@@ -145,51 +145,18 @@ def delta(series):
 
   summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload)
 
-  # test metrics
-  if include_test_split:
-    test_metric, test_target = scoring_utils.get_workload_metrics_and_targets(
-      workload, split='test'
-    )
-
-    summary_df['test target metric name'] = test_metric
-    summary_df['test target metric value'] = test_target
-
-    summary_df['test target reached'] = (
-      workload_df[test_metric]
-      .apply(lambda x: target_op(x, test_target))
-      .apply(np.any)
-    )
-    summary_df['best metric value on test'] = workload_df[test_metric].apply(
-      lambda x: best_op(x)
-    )
-    workload_df['index best eval on test'] = workload_df[test_metric].apply(
-      lambda x: idx_op(x)
-    )
-    summary_df['time to best eval on test (s)'] = workload_df.apply(
-      lambda x: x['accumulated_submission_time'][x['index best eval on test']],
-      axis=1,
-    )
-    summary_df['time to target on test (s)'] = summary_df.apply(
-      lambda x: x['time to best eval on test (s)']
-      if x['test target reached']
-      else np.inf,
-      axis=1,
-    )
-
   return summary_df
 
 
-def get_submission_summary(df, include_test_split=False):
+def get_submission_summary(df):
   """Summarizes the submission results into metric and time tables
   organized by workload.
   """
 
   dfs = []
   print(df)
   for workload, group in df.groupby('workload'):
-    summary_df = get_summary_df(
-      workload, group, include_test_split=include_test_split
-    )
+    summary_df = get_summary_df(workload, group)
     dfs.append(summary_df)
 
   df = pd.concat(dfs)
diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py
@@ -214,7 +214,7 @@ def get_experiment_df(experiment_dir):
 
 
 ## Get workload properties
-def get_workload_metrics_and_targets(workload, split='validation'):
+def get_workload_metrics_and_targets(workload):
   """Returns workload target metric name and value."""
   workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1)
   framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2)
@@ -233,12 +233,8 @@ def get_workload_metrics_and_targets(workload, split='validation'):
     workload_init_kwargs=workload_init_kwargs,
   )
   metric_name = workload_obj.target_metric_name
-  if split == 'validation':
-    metric = f'validation/{metric_name}'
-    target = workload_obj.validation_target_value
-  elif split == 'test':
-    metric = f'test/{metric_name}'
-    target = workload_obj.test_target_value
+  metric = f'validation/{metric_name}'
+  target = workload_obj.validation_target_value
   return metric, target