Skip to content

Commit c0d4087

Browse files
committed
Prepare scoring scripts for a100 runs.
1. Update base workloads to 9 (with finewebedu). 2. Remove all logic related to test targets, since they are no longer used. Work only with validation targets. 3. Fix step time computation.
1 parent b4734a4 commit c0d4087

File tree

3 files changed

+15
-49
lines changed

3 files changed

+15
-49
lines changed

scoring/performance_profile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
# workloads and rules for the scoring to be correct.
6060
# We do not use the workload registry since it contains test and development
6161
# workloads as well.
62-
NUM_BASE_WORKLOADS = 8
62+
NUM_BASE_WORKLOADS = 9
6363
NUM_VARIANT_WORKLOADS = 0
6464
NUM_TRIALS = 5
6565
NUM_STUDIES = 3

scoring/score_submissions.py

Lines changed: 11 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,10 @@
7575
FLAGS = flags.FLAGS
7676

7777

78-
def get_summary_df(workload, workload_df, include_test_split=False):
78+
def get_summary_df(workload, workload_df):
7979
print(f' WORKLOAD: {workload}')
8080
validation_metric, validation_target = (
81-
scoring_utils.get_workload_metrics_and_targets(workload, split='validation')
81+
scoring_utils.get_workload_metrics_and_targets(workload)
8282
)
8383

8484
is_minimized = performance_profile.check_if_minimized(validation_metric)
@@ -127,7 +127,7 @@ def get_summary_df(workload, workload_df, include_test_split=False):
127127

128128
# compute the step times
129129
def delta(series):
130-
return series.shift(1, fill_value=0) - series
130+
return series.apply(lambda x: np.diff(x, prepend=0))
131131

132132
accumulated_time_intervals = delta(workload_df['accumulated_submission_time'])
133133
step_intervals = delta(workload_df['global_step'])
@@ -136,57 +136,27 @@ def delta(series):
136136
f'WARNING: The number of evals may be too low to calculate reliable step time for {workload}'
137137
)
138138

139-
summary_df['step_time (s)'] = np.median(
140-
(accumulated_time_intervals / step_intervals).iloc[0]
141-
)
142-
143-
summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload)
144-
145-
# test metrics
146-
if include_test_split:
147-
test_metric, test_target = scoring_utils.get_workload_metrics_and_targets(
148-
workload, split='test'
139+
# Flatten all intervals from all trials and take the global median
140+
with np.errstate(divide='ignore', invalid='ignore'):
141+
all_ratios = np.concatenate(
142+
(accumulated_time_intervals / step_intervals).values
149143
)
144+
summary_df['step_time (s)'] = np.nanmedian(all_ratios)
150145

151-
summary_df['test target metric name'] = test_metric
152-
summary_df['test target metric value'] = test_target
153-
154-
summary_df['test target reached'] = (
155-
workload_df[test_metric]
156-
.apply(lambda x: target_op(x, test_target))
157-
.apply(np.any)
158-
)
159-
summary_df['best metric value on test'] = workload_df[test_metric].apply(
160-
lambda x: best_op(x)
161-
)
162-
workload_df['index best eval on test'] = workload_df[test_metric].apply(
163-
lambda x: idx_op(x)
164-
)
165-
summary_df['time to best eval on test (s)'] = workload_df.apply(
166-
lambda x: x['accumulated_submission_time'][x['index best eval on test']],
167-
axis=1,
168-
)
169-
summary_df['time to target on test (s)'] = summary_df.apply(
170-
lambda x: x['time to best eval on test (s)']
171-
if x['test target reached']
172-
else np.inf,
173-
axis=1,
174-
)
146+
summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload)
175147

176148
return summary_df
177149

178150

179-
def get_submission_summary(df, include_test_split=False):
151+
def get_submission_summary(df):
180152
"""Summarizes the submission results into metric and time tables
181153
organized by workload.
182154
"""
183155

184156
dfs = []
185157
print(df)
186158
for workload, group in df.groupby('workload'):
187-
summary_df = get_summary_df(
188-
workload, group, include_test_split=include_test_split
189-
)
159+
summary_df = get_summary_df(workload, group)
190160
dfs.append(summary_df)
191161

192162
df = pd.concat(dfs)

scoring/scoring_utils.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ def get_experiment_df(experiment_dir):
214214

215215

216216
## Get workload properties
217-
def get_workload_metrics_and_targets(workload, split='validation'):
217+
def get_workload_metrics_and_targets(workload):
218218
"""Returns workload target metric name and value."""
219219
workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1)
220220
framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2)
@@ -233,12 +233,8 @@ def get_workload_metrics_and_targets(workload, split='validation'):
233233
workload_init_kwargs=workload_init_kwargs,
234234
)
235235
metric_name = workload_obj.target_metric_name
236-
if split == 'validation':
237-
metric = f'validation/{metric_name}'
238-
target = workload_obj.validation_target_value
239-
elif split == 'test':
240-
metric = f'test/{metric_name}'
241-
target = workload_obj.test_target_value
236+
metric = f'validation/{metric_name}'
237+
target = workload_obj.validation_target_value
242238
return metric, target
243239

244240

0 commit comments

Comments (0)