Commit 5d63c77

add bert pretraining
1 parent b6dbf3d commit 5d63c77

7 files changed

Lines changed: 873 additions & 0 deletions

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
Adapted from ml-daikon-input-programs/accelerate/complete_nlp_example
Lines changed: 374 additions & 0 deletions
@@ -0,0 +1,374 @@
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time

import evaluate
import torch
from accelerate import Accelerator, DataLoaderConfiguration, DistributedType
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)

MD_BATCH_FILE_NAME = "iteration_times.txt"
with open(MD_BATCH_FILE_NAME, "w") as f:
    f.write("")

########################################################################
# This is a fully working simple example to use Accelerate
#
# This example trains a Bert base model on GLUE MRPC
# in any of the following settings (with the same script):
#   - single CPU or single GPU
#   - multi GPUs (using PyTorch distributed mode)
#   - (multi) TPUs
#   - fp16 (mixed-precision) or fp32 (normal precision)
#
# This example also demonstrates the checkpointing and sharding capabilities
#
# To run it in each of these various modes, follow the instructions
# in the readme for examples:
# https://github.com/huggingface/accelerate/tree/main/examples
#
########################################################################
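# Example launch commands (illustrative only; the script file name below is an
# assumption, and the canonical instructions are in the readme linked above):
#   accelerate launch complete_nlp_example.py
#   accelerate launch complete_nlp_example.py --mixed_precision fp16 --checkpointing_steps epoch
#   accelerate launch complete_nlp_example.py --cpu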
MAX_GPU_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32


def training_function(config, args):
    # Initialize accelerator
    dataloader_config = DataLoaderConfiguration(
        use_stateful_dataloader=args.use_stateful_dataloader
    )
    if args.with_tracking:
        accelerator = Accelerator(
            cpu=args.cpu,
            mixed_precision=args.mixed_precision,
            dataloader_config=dataloader_config,
            log_with="all",
            project_dir=args.project_dir,
        )
    else:
        accelerator = Accelerator(
            cpu=args.cpu,
            mixed_precision=args.mixed_precision,
            dataloader_config=dataloader_config,
        )

    if hasattr(args.checkpointing_steps, "isdigit"):
        if args.checkpointing_steps == "epoch":
            checkpointing_steps = args.checkpointing_steps
        elif args.checkpointing_steps.isdigit():
            checkpointing_steps = int(args.checkpointing_steps)
        else:
            raise ValueError(
                f"Argument `checkpointing_steps` must be either a number or `epoch`. `{args.checkpointing_steps}` passed."
            )
    else:
        checkpointing_steps = None
    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])

    # We need to initialize the trackers we use, and also store our configuration
    if args.with_tracking:
        run = os.path.split(__file__)[-1].split(".")[0]
        accelerator.init_trackers(run, config)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    datasets = load_dataset("glue", "mrpc")
    metric = evaluate.load("glue", "mrpc")

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        outputs = tokenizer(
            examples["sentence1"],
            examples["sentence2"],
            truncation=True,
            max_length=None,
        )
        return outputs

    # Apply the method we just defined to all the examples in all the splits of the dataset,
    # starting with the main process first:
    with accelerator.main_process_first():
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # We also rename the 'label' column to 'labels', which is the expected name for labels by the models of the
    # transformers library
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
    if (
        batch_size > MAX_GPU_BATCH_SIZE
        and accelerator.distributed_type != DistributedType.XLA
    ):
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        max_length = (
            128 if accelerator.distributed_type == DistributedType.XLA else None
        )
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
        elif accelerator.mixed_precision != "no":
            pad_to_multiple_of = 8
        else:
            pad_to_multiple_of = None

        return tokenizer.pad(
            examples,
            padding="longest",
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors="pt",
        )

    # Instantiate dataloaders.
    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=batch_size,
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"],
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=EVAL_BATCH_SIZE,
    )

    set_seed(seed)

    # Instantiate the model (we build the model here so that the seed also controls new weight initialization)
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-cased", return_dict=True
    )

    # We could avoid this line since the accelerator is set with `device_placement=True` (default value).
    # Note that if you are placing tensors on devices manually, this line absolutely needs to be before the optimizer
    # creation, otherwise training will not work on TPU (`accelerate` will kindly throw an error to make us aware of that).
    model = model.to(accelerator.device)

    # Instantiate optimizer
    optimizer = AdamW(params=model.parameters(), lr=lr)

    # Instantiate scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=(len(train_dataloader) * num_epochs)
        // gradient_accumulation_steps,
    )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = (
        accelerator.prepare(
            model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
        )
    )

    # We need to keep track of how many total steps we have iterated over
    overall_step = 0
    # We also need to keep track of the starting epoch so files are named properly
    starting_epoch = 0

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
            accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
            accelerator.load_state(args.resume_from_checkpoint)
            path = os.path.basename(args.resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
            dirs.sort(key=os.path.getctime)
            path = dirs[
                -1
            ]  # Sorts folders by date modified, most recent checkpoint is the last
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            resume_step = None
        else:
            resume_step = int(training_difference.replace("step_", ""))
            starting_epoch = resume_step // len(train_dataloader)
            resume_step -= starting_epoch * len(train_dataloader)

    # Now we train the model
    for epoch in range(starting_epoch, num_epochs):
        model.train()
        if args.with_tracking:
            total_loss = 0
        if (
            args.resume_from_checkpoint
            and epoch == starting_epoch
            and resume_step is not None
        ):
            # We need to skip steps until we reach the resumed step
            if not args.use_stateful_dataloader:
                active_dataloader = accelerator.skip_first_batches(
                    train_dataloader, resume_step
                )
            else:
                active_dataloader = train_dataloader
            overall_step += resume_step
        else:
            # After the first iteration though, we need to go back to the original dataloader
            active_dataloader = train_dataloader
        for step, batch in enumerate(active_dataloader):
            BATCH_START = time.perf_counter()

            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps
            # We keep track of the loss at each epoch
            if args.with_tracking:
                total_loss += loss.detach().float()
            accelerator.backward(loss)
            if step % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            overall_step += 1

            if isinstance(checkpointing_steps, int):
                output_dir = f"step_{overall_step}"
                if overall_step % checkpointing_steps == 0:
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
                    accelerator.save_state(output_dir)

            BATCH_END = time.perf_counter()
            with open(MD_BATCH_FILE_NAME, "a") as f:
                f.write("%s\n" % (BATCH_END - BATCH_START))
        model.eval()
        for step, batch in enumerate(eval_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = accelerator.gather_for_metrics(
                (predictions, batch["labels"])
            )
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
        if args.with_tracking:
            accelerator.log(
                {
                    "accuracy": eval_metric["accuracy"],
                    "f1": eval_metric["f1"],
                    "train_loss": total_loss.item() / len(train_dataloader),
                    "epoch": epoch,
                },
                step=epoch,
            )

        if checkpointing_steps == "epoch":
            output_dir = f"epoch_{epoch}"
            if args.output_dir is not None:
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

    accelerator.end_training()


def main():
    parser = argparse.ArgumentParser(description="Simple example of training script.")
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument(
        "--cpu", action="store_true", help="If passed, will train on the CPU."
    )
    parser.add_argument(
        "--checkpointing_steps",
        type=str,
        default=None,
        help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="If the training should continue from a checkpoint folder.",
    )
    parser.add_argument(
        "--use_stateful_dataloader",
        action="store_true",
        help="If the dataloader should be a resumable stateful dataloader.",
    )
    parser.add_argument(
        "--with_tracking",
        action="store_true",
        help="Whether to load in all available experiment trackers from the environment and use them for logging.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=".",
        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
    )
    parser.add_argument(
        "--project_dir",
        type=str,
        default="logs",
        help="Location where to store experiment tracking logs and relevant project information",
    )
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 16}
    training_function(config, args)


if __name__ == "__main__":
    main()
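The script overwrites iteration_times.txt at startup and appends one wall-clock duration per training batch. A minimal sketch of how those timings could be summarized after a run (a separate helper, not part of this commit; the file name summarize_iteration_times.py is illustrative and only the Python standard library is assumed):

# summarize_iteration_times.py -- illustrative helper, not part of this commit.
# Reads the per-batch timings written by the training script above and prints
# simple summary statistics.
import statistics

with open("iteration_times.txt") as f:
    times = [float(line) for line in f if line.strip()]

if times:
    print(f"batches: {len(times)}")
    print(f"mean:    {statistics.mean(times):.4f} s")
    print(f"median:  {statistics.median(times):.4f} s")
    print(f"max:     {max(times):.4f} s")
else:
    print("no timings recorded")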

0 commit comments