Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions StructBERT/race.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# RACE

## RACE
Please download all the related files from this link:
https://www.cs.cmu.edu/~glai1/data/race/

Including: RACE
--train
--dev
--test

## Guide

1. Install Transformers(Version - 4.21.2)
```
pip install transformers
```
2. Adjust the hyperparameters in the ```run_race.sh```.

Then, run the following code:

```
bash run_race.sh
```

## Example Usage

```
race.py
--model_path "Path to pretrained models"
--train_file "Path to RACE file"
--epoch "Number of epochs"
--learning_rate "Learning rate for Adam"
--weight_decay "Weight decay for training"
--batch_size "Batch size for training"
--step_size "Step size for learning rate scheduler"
--gamma "Multiplicative factor of learning rate decay"
```
109 changes: 109 additions & 0 deletions StructBERT/race.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team and Alibaba-inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import glob
import torch
import argparse
from tqdm import tqdm
from transformers import BertTokenizerFast, BertForMultipleChoice, AdamW

def parse_args():
    """Parse command-line arguments for RACE fine-tuning.

    Returns:
        argparse.Namespace holding the model/data paths and the
        training hyperparameters (all optional ones have defaults).
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--model_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained models",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        required=True,
        help="Input RACE dataset",
    )
    # Optional parameters
    # Fixed user-facing typo: "epoches" -> "epochs".
    parser.add_argument("--epoch", default=3, type=int, help="Number of training epochs")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="Learning rate for Adam")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay for training")
    parser.add_argument("--batch_size", default=16, type=int, help="Batch size for training")
    parser.add_argument("--step_size", default=500, type=int, help="Step size for learning rate scheduler")
    parser.add_argument("--gamma", default=0.5, type=float, help="Multiplicative factor of learning rate decay")

    return parser.parse_args()

def convert_race(paths, tokenizer_name='bert-base-uncased'):
    """Read RACE example files and tokenize every question/option pair.

    Args:
        paths: directories containing RACE files; each *txt file holds one
            JSON object with 'article', 'questions', 'options', 'answers'.
        tokenizer_name: name or path of the tokenizer to load. Defaults to
            'bert-base-uncased' (the previously hard-coded value), so
            existing callers are unaffected.

    Returns:
        Tuple (input_ids, attention_mask, token_type_ids, labels) of
        tensors; the first three are (num_questions, num_options, 320).
    """
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)
    input_ids, attention_mask, segment_id, labels = [], [], [], []
    for path in paths:
        filenames = glob.glob(path + "/*txt")
        for filename in filenames:
            with open(filename, 'r') as file:
                raw = json.load(file)
                for i in range(len(raw['answers'])):
                    cur_input_ids, cur_attention_mask, cur_segment_id = [], [], []
                    # The multiple-choice head expects one encoded sequence
                    # per candidate option: (article, question + option).
                    for option in range(len(raw['options'][i])):
                        encodings = tokenizer(raw['article'], raw['questions'][i]+raw['options'][i][option], truncation=True, padding='max_length', max_length=320)
                        cur_input_ids.append(encodings.input_ids)
                        cur_attention_mask.append(encodings.attention_mask)
                        cur_segment_id.append(encodings.token_type_ids)

                    # Gold answers are letters 'A'..'D'; map to indices 0..3.
                    labels.append(ord(raw['answers'][i]) - ord('A'))
                    input_ids.append(cur_input_ids)
                    attention_mask.append(cur_attention_mask)
                    segment_id.append(cur_segment_id)

    return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(segment_id), torch.tensor(labels)

def train(args):
    """Fine-tune a BERT multiple-choice model on the RACE dataset.

    Args:
        args: parsed CLI namespace (see parse_args); supplies the model
            path, dataset root, and training hyperparameters.
    """
    # Bug fix: --model_path is required but was ignored (the model always
    # loaded 'bert-base-uncased'); load the user-supplied weights instead.
    model = BertForMultipleChoice.from_pretrained(args.model_path)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.train()
    # Standard BERT fine-tuning recipe: no weight decay on biases/LayerNorm.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    # RACE ships "middle" and "high" school splits; train on both.
    total_input_ids, total_attention_mask, total_segment_id, total_labels = convert_race([args.train_file+"/train/middle", args.train_file+"/train/high"])
    train_dataset = torch.utils.data.TensorDataset(total_input_ids, total_attention_mask, total_segment_id, total_labels)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)

    for epoch in range(args.epoch):
        losses = []
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            loss = model(
                input_ids=batch[0].to(device),
                attention_mask=batch[1].to(device),
                token_type_ids=batch[2].to(device),
                labels=batch[3].to(device))[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            # .item() already detaches from the graph; the original wrapped
            # this in torch.no_grad() unnecessarily.
            losses.append(loss.item())
        print("Epoch: {} Loss: {}".format(epoch, sum(losses)/len(losses)))

if __name__ == "__main__":
    # Script entry point: parse CLI flags and run RACE fine-tuning.
    args = parse_args()
    train(args)
27 changes: 27 additions & 0 deletions StructBERT/run_race.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Fine-tune StructBERT on RACE. Positional overrides:
#   $1 data dir, $2 model dir, $3 lr, $4 weight decay, $5 batch size,
#   $6 scheduler step size, $7 gamma, $8 epochs
echo "Loading params"
training=${1:-"./RACE"}
model=${2:-"./model"}

lr=${3:-5e-5}
weight_decay=${4:-0.0}
batch_size=${5:-8}
step_size=${6:-500}
gamma=${7:-0.5}
epoch=${8:-3}

echo "Running"
CMD="race.py"
CMD+=" --model_path=$model"
CMD+=" --train_file=$training"
CMD+=" --batch_size=$batch_size"
CMD+=" --learning_rate=$lr"
CMD+=" --weight_decay=$weight_decay"
# Bug fix: step_size was read from $6 but never forwarded to the trainer.
CMD+=" --step_size=$step_size"
CMD+=" --gamma=$gamma"
CMD+=" --epoch=$epoch"

CMD="python3 $CMD"
echo "$CMD"

$CMD

echo "Finished Fine-tuning"
27 changes: 27 additions & 0 deletions StructBERT/run_squad.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Fine-tune StructBERT on SQuAD v2.0. Positional overrides:
#   $1 train JSON, $2 model dir, $3 lr, $4 weight decay, $5 batch size,
#   $6 scheduler step size, $7 gamma, $8 epochs
echo "Loading params"
training=${1:-"./train-v2.0.json"}
model=${2:-"./model"}

lr=${3:-5e-5}
weight_decay=${4:-0.0}
batch_size=${5:-16}
step_size=${6:-500}
gamma=${7:-0.5}
epoch=${8:-3}

echo "Running"
CMD="squad.py"
CMD+=" --model_path=$model"
CMD+=" --train_file=$training"
CMD+=" --batch_size=$batch_size"
CMD+=" --learning_rate=$lr"
CMD+=" --weight_decay=$weight_decay"
# Bug fix: step_size was read from $6 but never forwarded to the trainer.
CMD+=" --step_size=$step_size"
CMD+=" --gamma=$gamma"
CMD+=" --epoch=$epoch"

CMD="python3 $CMD"
echo "$CMD"

$CMD

echo "Finished Fine-tuning"
36 changes: 36 additions & 0 deletions StructBERT/squad.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Squad

## SQUAD V2.0
Please download all the related files from this link:
https://rajpurkar.github.io/SQuAD-explorer/

Including: train-v2.0.json, dev-v2.0.json, evaluate.py

## Guide

1. Install Transformers(Version - 4.21.2)
```
pip install transformers
```
2. Adjust the hyperparameters in the ```run_squad.sh```.

Then, run the following code:

```
bash run_squad.sh
```
3. Use the fine-tuned model to generate predictions and get exact_match and f1 scores with ```evaluate.py```.

## Example Usage

```
squad.py
--model_path "Path to pretrained models"
--train_file "Path to SQuAD v2.0 train JSON file"
--epoch "Number of epochs"
--learning_rate "Learning rate for Adam"
--weight_decay "Weight decay for training"
--batch_size "Batch size for training"
--step_size "Step size for learning rate scheduler"
--gamma "Multiplicative factor of learning rate decay"
```
136 changes: 136 additions & 0 deletions StructBERT/squad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team and Alibaba-inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import torch
import argparse
from tqdm import tqdm
from transformers import BertTokenizerFast, BertForQuestionAnswering, AdamW

class squad_data(torch.utils.data.Dataset):
    """Dataset wrapping tokenized SQuAD encodings.

    Accepts any mapping from feature name (e.g. 'input_ids',
    'attention_mask', 'start', 'end') to a list of per-example values —
    a HuggingFace BatchEncoding or a plain dict.
    """

    def __init__(self, data):
        # Stored as-is; examples are tensorized lazily in __getitem__.
        self.data = data

    def __getitem__(self, idx):
        # One tensor per feature for example `idx`.
        return {key: torch.tensor(val[idx]) for key, val in self.data.items()}

    def __len__(self):
        # Robustness fix: subscript access works for plain dicts as well as
        # BatchEncoding; the original `self.data.input_ids` attribute access
        # raised AttributeError for anything but BatchEncoding.
        return len(self.data["input_ids"])

def parse_args():
    """Parse command-line arguments for SQuAD v2.0 fine-tuning.

    Returns:
        argparse.Namespace holding the model/data paths and the
        training hyperparameters (all optional ones have defaults).
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--model_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained models",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        required=True,
        help="Input training JSON SQUAD V2.0 file",
    )
    # Optional parameters
    # Fixed user-facing typo: "epoches" -> "epochs".
    parser.add_argument("--epoch", default=3, type=int, help="Number of training epochs")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="Learning rate for Adam")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay for training")
    parser.add_argument("--batch_size", default=16, type=int, help="Batch size for training")
    parser.add_argument("--step_size", default=500, type=int, help="Step size for learning rate scheduler")
    parser.add_argument("--gamma", default=0.5, type=float, help="Multiplicative factor of learning rate decay")

    return parser.parse_args()

def parse_squad(path):
    """Load a SQuAD-format JSON file and flatten it.

    Returns three parallel lists — one context, question, and answer dict
    per annotated answer. Questions without answers contribute nothing.
    """
    with open(path, 'rb') as f:
        dataset = json.load(f)
    contexts, questions, answers = [], [], []
    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                question_text = qa['question']
                for ans in qa['answers']:
                    contexts.append(passage_text)
                    questions.append(question_text)
                    answers.append(ans)
    return contexts, questions, answers

def data_preprocess(encodings, answers, max_len):
    """Convert character-level answer spans to token positions, in place.

    Args:
        encodings: fast-tokenizer batch output; must support
            char_to_token(i, char_pos) -> int | None and update(dict).
        answers: list of dicts with 'answer_start' and 'text'; each dict
            gains an 'answer_end' key (exclusive char index) as a side effect.
        max_len: fallback token position used when the answer was lost to
            truncation (HF convention: point at the model's max length).

    Adds parallel 'start' and 'end' token-position lists to `encodings`.
    """
    for answer in answers:
        # Character span is [answer_start, answer_end).
        answer['answer_end'] = answer['answer_start'] + len(answer['text'])
    starts = []
    ends = []
    for i, answer in enumerate(answers):
        start_tok = encodings.char_to_token(i, answer['answer_start'])
        end_tok = encodings.char_to_token(i, answer['answer_end'])
        # Bug fix: the original tested truthiness (`if not starts[-1]`),
        # which wrongly treated a legitimate token position 0 as "not
        # found". char_to_token returns None as its sentinel.
        if start_tok is None:
            start_tok = max_len
        # answer_end is exclusive and may land on whitespace or truncated
        # text; walk backwards until a character maps to a token.
        offset = 1
        while end_tok is None and answer['answer_end'] - offset >= 0:
            end_tok = encodings.char_to_token(i, answer['answer_end'] - offset)
            offset += 1
        if end_tok is None:
            # Guard against the original's infinite loop when no character
            # of the answer maps to any token (fully truncated span).
            end_tok = max_len
        starts.append(start_tok)
        ends.append(end_tok)
    encodings.update({'start': starts, 'end': ends})

def train(args, train_dataset):
    """Fine-tune a BERT question-answering model on prepared SQuAD data.

    Args:
        args: parsed CLI namespace (see parse_args).
        train_dataset: dataset yielding dicts with 'input_ids',
            'attention_mask', 'start', and 'end' tensors (see squad_data).
    """
    # Bug fix: --model_path is required but was ignored (the model always
    # loaded 'bert-base-uncased'); load the user-supplied weights instead.
    model = BertForQuestionAnswering.from_pretrained(args.model_path)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.train()
    # Standard BERT fine-tuning recipe: no weight decay on biases/LayerNorm.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)

    for epoch in range(args.epoch):
        losses = []
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start'].to(device)
            end_positions = batch['end'].to(device)
            loss = model(input_ids, attention_mask=attention_mask,
                         start_positions=start_positions,
                         end_positions=end_positions)[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            # .item() already detaches from the graph; the original wrapped
            # this in torch.no_grad() unnecessarily.
            losses.append(loss.item())
        print("Epoch: {} Loss: {}".format(epoch, sum(losses)/len(losses)))

if __name__ == "__main__":
    # Entry point: load SQuAD, tokenize, map answer spans to token
    # positions, then fine-tune.
    args = parse_args()
    train_contexts, train_questions, train_answers = parse_squad(args.train_file)
    # NOTE(review): the tokenizer is hard-coded to 'bert-base-uncased' and
    # does not use --model_path — confirm this matches the intended model.
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
    data_preprocess(train_encodings, train_answers, tokenizer.model_max_length)
    train_dataset = squad_data(train_encodings)
    train(args, train_dataset)