Skip to content

Commit c0d4087

Browse files
committed
Prepare scoring scripts for a100 runs.
1. Update base workloads to 9 (with finewebedu). 2. Remove all logic related to test targets, since they are no longer used. Work only with validation targets. 3. Fix step time computation.
1 parent b4734a4 commit c0d4087

File tree

3 files changed

+15
-49
lines changed

3 files changed

+15
-49
lines changed

scoring/performance_profile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
# workloads and rules for the scoring to be correct.
6060
# We do not use the workload registry since it contains test and development
6161
# workloads as well.
62-
NUM_BASE_WORKLOADS = 8
62+
NUM_BASE_WORKLOADS = 9
6363
NUM_VARIANT_WORKLOADS = 0
6464
NUM_TRIALS = 5
6565
NUM_STUDIES = 3

scoring/score_submissions.py

Lines changed: 11 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,10 @@
7575
FLAGS = flags.FLAGS
7676

7777

78-
def get_summary_df(workload, workload_df, include_test_split=False):
78+
def get_summary_df(workload, workload_df):
7979
print(f' WORKLOAD: {workload}')
8080
validation_metric, validation_target = (
81-
scoring_utils.get_workload_metrics_and_targets(workload, split='validation')
81+
scoring_utils.get_workload_metrics_and_targets(workload)
8282
)
8383

8484
is_minimized = performance_profile.check_if_minimized(validation_metric)
@@ -127,7 +127,7 @@ def get_summary_df(workload, workload_df, include_test_split=False):
127127

128128
# compute the step times
129129
def delta(series):
130-
return series.shift(1, fill_value=0) - series
130+
return series.apply(lambda x: np.diff(x, prepend=0))
131131

132132
accumulated_time_intervals = delta(workload_df['accumulated_submission_time'])
133133
step_intervals = delta(workload_df['global_step'])
@@ -136,57 +136,27 @@ def delta(series):
136136
f'WARNING: The number of evals may be too low to calculate reliable step time for {workload}'
137137
)
138138

139-
summary_df['step_time (s)'] = np.median(
140-
(accumulated_time_intervals / step_intervals).iloc[0]
141-
)
142-
143-
summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload)
144-
145-
# test metrics
146-
if include_test_split:
147-
test_metric, test_target = scoring_utils.get_workload_metrics_and_targets(
148-
workload, split='test'
139+
# Flatten all intervals from all trials and take the global median
140+
with np.errstate(divide='ignore', invalid='ignore'):
141+
all_ratios = np.concatenate(
142+
(accumulated_time_intervals / step_intervals).values
149143
)
144+
summary_df['step_time (s)'] = np.nanmedian(all_ratios)
150145

151-
summary_df['test target metric name'] = test_metric
152-
summary_df['test target metric value'] = test_target
153-
154-
summary_df['test target reached'] = (
155-
workload_df[test_metric]
156-
.apply(lambda x: target_op(x, test_target))
157-
.apply(np.any)
158-
)
159-
summary_df['best metric value on test'] = workload_df[test_metric].apply(
160-
lambda x: best_op(x)
161-
)
162-
workload_df['index best eval on test'] = workload_df[test_metric].apply(
163-
lambda x: idx_op(x)
164-
)
165-
summary_df['time to best eval on test (s)'] = workload_df.apply(
166-
lambda x: x['accumulated_submission_time'][x['index best eval on test']],
167-
axis=1,
168-
)
169-
summary_df['time to target on test (s)'] = summary_df.apply(
170-
lambda x: x['time to best eval on test (s)']
171-
if x['test target reached']
172-
else np.inf,
173-
axis=1,
174-
)
146+
summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload)
175147

176148
return summary_df
177149

178150

179-
def get_submission_summary(df, include_test_split=False):
151+
def get_submission_summary(df):
180152
"""Summarizes the submission results into metric and time tables
181153
organized by workload.
182154
"""
183155

184156
dfs = []
185157
print(df)
186158
for workload, group in df.groupby('workload'):
187-
summary_df = get_summary_df(
188-
workload, group, include_test_split=include_test_split
189-
)
159+
summary_df = get_summary_df(workload, group)
190160
dfs.append(summary_df)
191161

192162
df = pd.concat(dfs)

scoring/scoring_utils.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ def get_experiment_df(experiment_dir):
214214

215215

216216
## Get workload properties
217-
def get_workload_metrics_and_targets(workload, split='validation'):
217+
def get_workload_metrics_and_targets(workload):
218218
"""Returns workload target metric name and value."""
219219
workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1)
220220
framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2)
@@ -233,12 +233,8 @@ def get_workload_metrics_and_targets(workload, split='validation'):
233233
workload_init_kwargs=workload_init_kwargs,
234234
)
235235
metric_name = workload_obj.target_metric_name
236-
if split == 'validation':
237-
metric = f'validation/{metric_name}'
238-
target = workload_obj.validation_target_value
239-
elif split == 'test':
240-
metric = f'test/{metric_name}'
241-
target = workload_obj.test_target_value
236+
metric = f'validation/{metric_name}'
237+
target = workload_obj.validation_target_value
242238
return metric, target
243239

244240

0 commit comments

Comments (0)