Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions StructBERT/race.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# RACE

## RACE
Please download all the related files from this link:
https://www.cs.cmu.edu/~glai1/data/race/

Including: RACE
--train
--dev
--test

## Guide

1. Install Transformers(Version - 4.21.2)
```
pip install transformers
```
2. Adjust the hyperparameters in the ```run_race.sh```.

Then, run the following code:

```
bash run_race.sh
```

## Example Usage

```
race.py
--model_path "Path to pretrained models"
--train_file "Path to RACE file"
--epoch "Number of epochs"
--learning_rate "Learning rate for Adam"
--weight_decay "Weight decay for training"
--batch_size "Batch size for training"
--step_size "Step size for learning rate scheduler"
--gamma "Multiplicative factor of learning rate decay"
```
109 changes: 109 additions & 0 deletions StructBERT/race.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team and Alibaba-inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import glob
import torch
import argparse
from tqdm import tqdm
from transformers import BertTokenizerFast, BertForMultipleChoice, AdamW

def parse_args():
    """Parse command-line arguments for RACE fine-tuning.

    Returns:
        argparse.Namespace holding the model/data paths and the
        training hyperparameters (all optional ones have defaults).
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--model_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained models",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        required=True,
        help="Input RACE dataset",
    )
    # Optional parameters
    # Fixed user-facing typo: "epoches" -> "epochs".
    parser.add_argument("--epoch", default=3, type=int, help="Number of training epochs")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="Learning rate for Adam")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay for training")
    parser.add_argument("--batch_size", default=16, type=int, help="Batch size for training")
    parser.add_argument("--step_size", default=500, type=int, help="Step size for learning rate scheduler")
    parser.add_argument("--gamma", default=0.5, type=float, help="Multiplicative factor of learning rate decay")

    return parser.parse_args()

def convert_race(paths, tokenizer_name='bert-base-uncased'):
    """Read RACE example files and tokenize every question/option pair.

    Args:
        paths: directories containing RACE files; each *txt file holds one
            JSON object with 'article', 'questions', 'options', 'answers'.
        tokenizer_name: name or path of the tokenizer to load. Defaults to
            'bert-base-uncased' (the previously hard-coded value), so
            existing callers are unaffected.

    Returns:
        Tuple (input_ids, attention_mask, token_type_ids, labels) of
        tensors; the first three are (num_questions, num_options, 320).
    """
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)
    input_ids, attention_mask, segment_id, labels = [], [], [], []
    for path in paths:
        filenames = glob.glob(path + "/*txt")
        for filename in filenames:
            with open(filename, 'r') as file:
                raw = json.load(file)
                for i in range(len(raw['answers'])):
                    cur_input_ids, cur_attention_mask, cur_segment_id = [], [], []
                    # The multiple-choice head expects one encoded sequence
                    # per candidate option: (article, question + option).
                    for option in range(len(raw['options'][i])):
                        encodings = tokenizer(raw['article'], raw['questions'][i]+raw['options'][i][option], truncation=True, padding='max_length', max_length=320)
                        cur_input_ids.append(encodings.input_ids)
                        cur_attention_mask.append(encodings.attention_mask)
                        cur_segment_id.append(encodings.token_type_ids)

                    # Gold answers are letters 'A'..'D'; map to indices 0..3.
                    labels.append(ord(raw['answers'][i]) - ord('A'))
                    input_ids.append(cur_input_ids)
                    attention_mask.append(cur_attention_mask)
                    segment_id.append(cur_segment_id)

    return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(segment_id), torch.tensor(labels)

def train(args):
    """Fine-tune a BERT multiple-choice model on the RACE dataset.

    Args:
        args: parsed CLI namespace (see parse_args); supplies the model
            path, dataset root, and training hyperparameters.
    """
    # Bug fix: --model_path is required but was ignored (the model always
    # loaded 'bert-base-uncased'); load the user-supplied weights instead.
    model = BertForMultipleChoice.from_pretrained(args.model_path)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.train()
    # Standard BERT fine-tuning recipe: no weight decay on biases/LayerNorm.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    # RACE ships "middle" and "high" school splits; train on both.
    total_input_ids, total_attention_mask, total_segment_id, total_labels = convert_race([args.train_file+"/train/middle", args.train_file+"/train/high"])
    train_dataset = torch.utils.data.TensorDataset(total_input_ids, total_attention_mask, total_segment_id, total_labels)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)

    for epoch in range(args.epoch):
        losses = []
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            loss = model(
                input_ids=batch[0].to(device),
                attention_mask=batch[1].to(device),
                token_type_ids=batch[2].to(device),
                labels=batch[3].to(device))[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            # .item() already detaches from the graph; the original wrapped
            # this in torch.no_grad() unnecessarily.
            losses.append(loss.item())
        print("Epoch: {} Loss: {}".format(epoch, sum(losses)/len(losses)))

if __name__ == "__main__":
    # Script entry point: parse CLI flags and run RACE fine-tuning.
    args = parse_args()
    train(args)
27 changes: 27 additions & 0 deletions StructBERT/run_race.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Fine-tune StructBERT on RACE. Positional overrides:
#   $1 data dir, $2 model dir, $3 lr, $4 weight decay, $5 batch size,
#   $6 scheduler step size, $7 gamma, $8 epochs
echo "Loading params"
training=${1:-"./RACE"}
model=${2:-"./model"}

lr=${3:-5e-5}
weight_decay=${4:-0.0}
batch_size=${5:-8}
step_size=${6:-500}
gamma=${7:-0.5}
epoch=${8:-3}

echo "Running"
CMD="race.py"
CMD+=" --model_path=$model"
CMD+=" --train_file=$training"
CMD+=" --batch_size=$batch_size"
CMD+=" --learning_rate=$lr"
CMD+=" --weight_decay=$weight_decay"
# Bug fix: step_size was read from $6 but never forwarded to the trainer.
CMD+=" --step_size=$step_size"
CMD+=" --gamma=$gamma"
CMD+=" --epoch=$epoch"

CMD="python3 $CMD"
echo "$CMD"

$CMD

echo "Finished Fine-tuning"
27 changes: 27 additions & 0 deletions StructBERT/run_squad.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Fine-tune StructBERT on SQuAD v2.0. Positional overrides:
#   $1 train JSON, $2 model dir, $3 lr, $4 weight decay, $5 batch size,
#   $6 scheduler step size, $7 gamma, $8 epochs
echo "Loading params"
training=${1:-"./train-v2.0.json"}
model=${2:-"./model"}

lr=${3:-5e-5}
weight_decay=${4:-0.0}
batch_size=${5:-16}
step_size=${6:-500}
gamma=${7:-0.5}
epoch=${8:-3}

echo "Running"
CMD="squad.py"
CMD+=" --model_path=$model"
CMD+=" --train_file=$training"
CMD+=" --batch_size=$batch_size"
CMD+=" --learning_rate=$lr"
CMD+=" --weight_decay=$weight_decay"
# Bug fix: step_size was read from $6 but never forwarded to the trainer.
CMD+=" --step_size=$step_size"
CMD+=" --gamma=$gamma"
CMD+=" --epoch=$epoch"

CMD="python3 $CMD"
echo "$CMD"

$CMD

echo "Finished Fine-tuning"
36 changes: 36 additions & 0 deletions StructBERT/squad.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Squad

## SQUAD V2.0
Please download all the related files from this link:
https://rajpurkar.github.io/SQuAD-explorer/

Including: train-v2.0.json, dev-v2.0.json, evaluate.py

## Guide

1. Install Transformers(Version - 4.21.2)
```
pip install transformers
```
2. Adjust the hyperparameters in the ```run_squad.sh```.

Then, run the following code:

```
bash run_squad.sh
```
3. Use the fine-tuned model to generate predictions and get exact_match and f1 scores with ```evaluate.py```.

## Example Usage

```
squad.py
--model_path "Path to pretrained models"
--train_file "Path to SQuAD v2.0 train JSON file"
--epoch "Number of epochs"
--learning_rate "Learning rate for Adam"
--weight_decay "Weight decay for training"
--batch_size "Batch size for training"
--step_size "Step size for learning rate scheduler"
--gamma "Multiplicative factor of learning rate decay"
```
136 changes: 136 additions & 0 deletions StructBERT/squad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team and Alibaba-inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import torch
import argparse
from tqdm import tqdm
from transformers import BertTokenizerFast, BertForQuestionAnswering, AdamW

class squad_data(torch.utils.data.Dataset):
    """Dataset wrapping tokenized SQuAD encodings.

    Accepts any mapping from feature name (e.g. 'input_ids',
    'attention_mask', 'start', 'end') to a list of per-example values —
    a HuggingFace BatchEncoding or a plain dict.
    """

    def __init__(self, data):
        # Stored as-is; examples are tensorized lazily in __getitem__.
        self.data = data

    def __getitem__(self, idx):
        # One tensor per feature for example `idx`.
        return {key: torch.tensor(val[idx]) for key, val in self.data.items()}

    def __len__(self):
        # Robustness fix: subscript access works for plain dicts as well as
        # BatchEncoding; the original `self.data.input_ids` attribute access
        # raised AttributeError for anything but BatchEncoding.
        return len(self.data["input_ids"])

def parse_args():
    """Parse command-line arguments for SQuAD v2.0 fine-tuning.

    Returns:
        argparse.Namespace holding the model/data paths and the
        training hyperparameters (all optional ones have defaults).
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--model_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained models",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        required=True,
        help="Input training JSON SQUAD V2.0 file",
    )
    # Optional parameters
    # Fixed user-facing typo: "epoches" -> "epochs".
    parser.add_argument("--epoch", default=3, type=int, help="Number of training epochs")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="Learning rate for Adam")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay for training")
    parser.add_argument("--batch_size", default=16, type=int, help="Batch size for training")
    parser.add_argument("--step_size", default=500, type=int, help="Step size for learning rate scheduler")
    parser.add_argument("--gamma", default=0.5, type=float, help="Multiplicative factor of learning rate decay")

    return parser.parse_args()

def parse_squad(path):
    """Load a SQuAD-format JSON file and flatten it.

    Returns three parallel lists — one context, question, and answer dict
    per annotated answer. Questions without answers contribute nothing.
    """
    with open(path, 'rb') as f:
        dataset = json.load(f)
    contexts, questions, answers = [], [], []
    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                question_text = qa['question']
                for ans in qa['answers']:
                    contexts.append(passage_text)
                    questions.append(question_text)
                    answers.append(ans)
    return contexts, questions, answers

def data_preprocess(encodings, answers, max_len):
    """Convert character-level answer spans to token positions, in place.

    Args:
        encodings: fast-tokenizer batch output; must support
            char_to_token(i, char_pos) -> int | None and update(dict).
        answers: list of dicts with 'answer_start' and 'text'; each dict
            gains an 'answer_end' key (exclusive char index) as a side effect.
        max_len: fallback token position used when the answer was lost to
            truncation (HF convention: point at the model's max length).

    Adds parallel 'start' and 'end' token-position lists to `encodings`.
    """
    for answer in answers:
        # Character span is [answer_start, answer_end).
        answer['answer_end'] = answer['answer_start'] + len(answer['text'])
    starts = []
    ends = []
    for i, answer in enumerate(answers):
        start_tok = encodings.char_to_token(i, answer['answer_start'])
        end_tok = encodings.char_to_token(i, answer['answer_end'])
        # Bug fix: the original tested truthiness (`if not starts[-1]`),
        # which wrongly treated a legitimate token position 0 as "not
        # found". char_to_token returns None as its sentinel.
        if start_tok is None:
            start_tok = max_len
        # answer_end is exclusive and may land on whitespace or truncated
        # text; walk backwards until a character maps to a token.
        offset = 1
        while end_tok is None and answer['answer_end'] - offset >= 0:
            end_tok = encodings.char_to_token(i, answer['answer_end'] - offset)
            offset += 1
        if end_tok is None:
            # Guard against the original's infinite loop when no character
            # of the answer maps to any token (fully truncated span).
            end_tok = max_len
        starts.append(start_tok)
        ends.append(end_tok)
    encodings.update({'start': starts, 'end': ends})

def train(args, train_dataset):
    """Fine-tune a BERT question-answering model on prepared SQuAD data.

    Args:
        args: parsed CLI namespace (see parse_args).
        train_dataset: dataset yielding dicts with 'input_ids',
            'attention_mask', 'start', and 'end' tensors (see squad_data).
    """
    # Bug fix: --model_path is required but was ignored (the model always
    # loaded 'bert-base-uncased'); load the user-supplied weights instead.
    model = BertForQuestionAnswering.from_pretrained(args.model_path)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.train()
    # Standard BERT fine-tuning recipe: no weight decay on biases/LayerNorm.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)

    for epoch in range(args.epoch):
        losses = []
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start'].to(device)
            end_positions = batch['end'].to(device)
            loss = model(input_ids, attention_mask=attention_mask,
                         start_positions=start_positions,
                         end_positions=end_positions)[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            # .item() already detaches from the graph; the original wrapped
            # this in torch.no_grad() unnecessarily.
            losses.append(loss.item())
        print("Epoch: {} Loss: {}".format(epoch, sum(losses)/len(losses)))

if __name__ == "__main__":
    # Entry point: load SQuAD, tokenize, map answer spans to token
    # positions, then fine-tune.
    args = parse_args()
    train_contexts, train_questions, train_answers = parse_squad(args.train_file)
    # NOTE(review): the tokenizer is hard-coded to 'bert-base-uncased' and
    # does not use --model_path — confirm this matches the intended model.
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
    data_preprocess(train_encodings, train_answers, tokenizer.model_max_length)
    train_dataset = squad_data(train_encodings)
    train(args, train_dataset)