-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathht5.py
More file actions
336 lines (287 loc) · 18 KB
/
ht5.py
File metadata and controls
336 lines (287 loc) · 18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
from transformers import T5ForConditionalGeneration, T5Tokenizer, TrainingArguments, get_linear_schedule_with_warmup, get_scheduler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from datasets import load_dataset, load_metric, list_metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch
import os
from tqdm.auto import tqdm
import pandas as pd
import argparse
import time
import pickle
import numpy as np
import re
#import utility_hs as util
# Command-line arguments.
# NOTE: inside a Jupyter notebook argparse is not usable; replace the parser
# lines below with plain assignments (e.g.
# traindata = "english_dataset/english_dataset.tsv") and drop the "args."
# prefix wherever it is used.
parser = argparse.ArgumentParser(description='Hate Speech Model')

# --datatype: hasoc, ...
parser.add_argument('--datatype', type=str, default='hasoc', help='data of choice')

# The HASOC 2019/2020/2021 train/dev/test files follow one naming scheme, so the
# nine path options are registered in a loop (same flags, defaults, help text
# and registration order as spelling each one out).
for _year in ('19', '20', '21'):
    for _split, _split_desc in (('train', 'training'), ('dev', 'dev'), ('test', 'test')):
        parser.add_argument(
            f'--has{_year}_{_split}data',
            type=str,
            default=f'/home/shared_data/h/has{_year}_{_split}data.csv',
            help=f'location of the {_split_desc} data',
        )

# Additional datasets
parser.add_argument('--task_pref', type=str, default="classification: ", help='Task prefix')
parser.add_argument('--datayear', type=str, default="2021", help='Data year')
parser.add_argument('--taskno', type=str, default="1", help='Task Number')

# Output locations: checkpoint, pickle, saved-model folder, logs, submission.
parser.add_argument('--savet', type=str, default='t5large2_hasoc21a.pt',
                    help='filename of the model checkpoint')
parser.add_argument('--pikle', type=str, default='t5large_hasoc21a.pkl',
                    help='pickle filename of the model checkpoint')
parser.add_argument('--msave', type=str, default='t5large2_hasoc21a',
                    help='folder to save the finetuned model')
parser.add_argument('--ofile1', type=str, default='outputfile_', help='output file')
parser.add_argument('--ofile2', type=str, default='outputfile_', help='output file')
parser.add_argument('--submission1', type=str, default='submitfile_task1a.csv',
                    help='submission file')

# Optimisation hyper-parameters.
parser.add_argument('--seed', type=int, default=11, help='random seed')
# Author's note on --lr: bestloss at 0.0002; dpred- 1; weighted F1: 0.9386351943374753,
# micro F1: 0.9376623376623376; test weighted F1: 0.8210865645981863,
# micro F1: 0.8227946916471507
parser.add_argument('--lr', type=float, default=0.0005, help='initial learning rate')
parser.add_argument('--epochs', type=int, default=6, help='upper epoch limit')
# Smaller batch size so the big model fits on the GPU.
parser.add_argument('--batch_size', type=int, default=8, metavar='N', help='batch size')

args = parser.parse_args()
def preprocess_pandas(data, columns):
    """Return a cleaned copy of ``data`` with a normalised ``text`` column.

    The ``text`` column is processed in this order: e-mail addresses, IP
    addresses and URLs are removed; the characters ``# , @ & < > / -`` are
    removed; every remaining character that is not a word character,
    whitespace or one of ``#@/:%.,_-`` (emojis, other punctuation) is removed;
    square brackets are removed; newlines and tabs become spaces; runs of
    spaces are collapsed; the text is lower-cased and stripped; digits are
    removed. Finally rows whose cleaned text duplicates an earlier row, or
    whose text is missing, are dropped.

    Args:
        data: DataFrame containing a ``text`` column. It is not modified.
        columns: Unused; accepted so existing callers keep working.

    Returns:
        A new DataFrame with the cleaned ``text`` column.
    """
    # Work on a copy so the caller's frame is never mutated behind its back.
    df_ = data.copy()
    text = df_['text']

    text = text.replace(r'[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)  # remove emails
    text = text.replace(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)  # remove IP address
    text = text.replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)  # remove URLs

    # regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which silently turns these two
    # character-class substitutions into no-ops.
    text = text.str.replace(r'[#,@&<>/-]', '', regex=True)  # remove special characters
    text = text.str.replace(r'[^\w\s#@/:%.,_-]', '', flags=re.UNICODE, regex=True)  # remove emojis+

    # Plain (literal) replacements.
    text = text.str.replace('[', '', regex=False)
    text = text.str.replace(']', '', regex=False)
    text = text.str.replace('\n', ' ', regex=False)
    text = text.str.replace('\t', ' ', regex=False)

    text = text.str.replace(r' {2,}', ' ', regex=True)  # collapse 2 or more spaces
    text = text.str.lower()
    text = text.str.strip()
    text = text.replace(r'\d', '', regex=True)  # remove numbers
    df_['text'] = text

    # drop_duplicates/dropna return new frames; the results have to be kept.
    df_ = df_.drop_duplicates(subset=['text'], keep='first')
    # Only a missing text makes a row unusable here; restricting dropna to that
    # column avoids discarding rows that merely lack a value elsewhere
    # (e.g. an unlabelled test split).
    df_ = df_.dropna(subset=['text'])
    return df_
def data_augment(data, columns, type='drop'):
    """Placeholder for training-data augmentation; currently returns ``data`` as is.

    None of the strategies below is implemented yet, so ``columns`` and
    ``type`` are accepted but ignored and the very same DataFrame object that
    was passed in is returned.

    Planned strategies (``type``): drop, replace, generate
        1st type: delete 2 tokens
            get rows from (rows, labels) > 5 tokens
            if the 1st & last tokens != ['HOF words'] then drop them
            Add these new (rows, labels) to the training data
        2nd type: replace 2 tokens
            get rows from (rows, labels) > 2 tokens
            if the 1st & last tokens != ['HOF words'] then replace them using contextual embeddings
            Add these new (rows, labels) to the training data
        3rd type: add 2 new tokens
            for all (rows, labels) add 2 tokens at the end
            if any of the 2 tokens == ['HOF words'] then change label appropriately
            Add these new (rows, labels) to the training data

    Args:
        data: DataFrame to augment.
        columns: Column names of ``data`` (unused for now).
        type: Name of the augmentation strategy (unused for now).

    Returns:
        ``data`` itself, unmodified.
    """
    # The throw-away empty DataFrame the stub used to build first was dead code.
    return data
def f1_score_func(preds, labels):
    """Score decoded predictions against the gold labels with three F1 variants.

    A prediction is kept only when it is non-empty and at most one character
    long; anything else (empty or lengthy output) is counted as label '0'.

    Returns:
        Tuple of ``f1_score`` results computed with ``average=None``,
        ``average="weighted"`` and ``average="macro"``, in that order.
    """
    cleaned = [p if p != '' and len(p) <= 1 else '0' for p in preds]
    per_class = f1_score(labels, cleaned, average=None)
    weighted = f1_score(labels, cleaned, average="weighted")
    macro = f1_score(labels, cleaned, average="macro")
    return per_class, weighted, macro
# def accuracy_score_func(preds, labels):
# preds_flat = np.argmax(preds, axis=1).flatten()
# labels_flat = labels.flatten()
# return accuracy_score(labels_flat, preds_flat, normalize='False')
def confusion_matrix_func(preds, labels):
    """Print the confusion matrix of ``labels`` versus the sanitised ``preds``.

    Empty predictions and predictions longer than one character are replaced
    by '0' before the matrix is computed. Nothing is returned.
    """
    cleaned = []
    for pred in preds:
        usable = pred != '' and len(pred) <= 1
        cleaned.append(pred if usable else '0')
    print(confusion_matrix(labels, cleaned))
def train(train_data, train_tags):
    """Run one training epoch and return the mean batch loss.

    Uses the module-level ``tokenizer``, ``model``, ``optimizer``,
    ``scheduler``, ``device`` and ``args`` that the script section sets up.

    Args:
        train_data: List of input texts.
        train_tags: List of target label strings, aligned with ``train_data``.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    print("Training...")

    # Tokenise the whole split at once (padded, truncated) and keep the
    # resulting tensors on the target device.
    encoded_inputs = tokenizer(train_data, padding=True, truncation=True, return_tensors='pt')
    encoded_targets = tokenizer(train_tags, padding=True, truncation=True, return_tensors='pt')
    dataset = TensorDataset(
        encoded_inputs.input_ids.to(device),
        encoded_inputs.attention_mask.to(device),
        encoded_targets.input_ids.to(device),
    )
    loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=args.batch_size)

    model.train()  # Turn on training mode
    running_loss, n_batches = 0, 0
    for ids, mask, targets in tqdm(loader):
        optimizer.zero_grad()
        loss = model(input_ids=ids, attention_mask=mask, labels=targets).loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch
        running_loss += loss.item()
        n_batches += 1

    return running_loss / n_batches
def evaluate(val_data, val_tags):
    """Run one evaluation pass and return loss, decoded predictions and gold tags.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and ``args``.
    The gold labels are passed to the model, and the prediction for every
    position is the arg-max over the returned logits, decoded back to text
    with special tokens skipped.

    Args:
        val_data: List of input texts.
        val_tags: List of gold label strings, aligned with ``val_data``.

    Returns:
        Tuple ``(mean batch loss, list of decoded predictions, val_tags)``.
    """
    print("Evaluation...")

    # Tokenise the whole split at once (padded, no truncation).
    encoded_inputs = tokenizer(val_data, padding=True, return_tensors='pt')
    target_ids = tokenizer(val_tags, padding=True, return_tensors='pt').input_ids
    dataset = TensorDataset(
        encoded_inputs.input_ids.to(device),
        encoded_inputs.attention_mask.to(device),
        target_ids.to(device),
    )
    # Sequential order keeps predictions aligned with val_tags.
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=args.batch_size)

    decoded = []
    total_loss, n_batches = 0, 0
    model.eval()  # Turn on evaluation mode
    for ids, mask, gold in tqdm(loader):
        with torch.no_grad():
            outputs = model(input_ids=ids, attention_mask=mask, labels=gold)
            best_token_ids = torch.argmax(outputs.logits, dim=-1)
            decoded.extend(tokenizer.batch_decode(best_token_ids, skip_special_tokens=True))
            total_loss += outputs.loss.item()
        n_batches += 1

    return total_loss / n_batches, decoded, val_tags
def get_data(datatype, datayear='2020', combined_traindata=False):
    """Load the train, dev and test splits of the selected dataset.

    File locations come from the module-level ``args`` (``--hasNN_*data``).

    Args:
        datatype: Dataset family. Only ``'hasoc'`` is implemented.
        datayear: ``'2020'`` selects the HASOC 2020 files; any other value
            selects the HASOC 2021 files.
        combined_traindata: When true, train and dev are the concatenation of
            the 2019, 2020 and 2021 train/dev files, while the test split is
            still chosen by ``datayear``.

    Returns:
        Tuple ``(traindata, devdata, testdata)`` of DataFrames, each with
        fully duplicated rows removed (first occurrence kept).

    Raises:
        ValueError: If ``datatype`` is not ``'hasoc'``. Other dataset branches
            never produced the three splits, so the call used to die with an
            ``UnboundLocalError`` at the return statement after a needless
            CSV read.
    """
    if datatype != 'hasoc':
        raise ValueError(
            f"Unsupported datatype {datatype!r}: only 'hasoc' is implemented"
        )

    use_2020 = datayear == '2020'
    if combined_traindata:
        train_paths = (args.has19_traindata, args.has20_traindata, args.has21_traindata)
        dev_paths = (args.has19_devdata, args.has20_devdata, args.has21_devdata)
        traindata = pd.concat([pd.read_csv(path) for path in train_paths])
        devdata = pd.concat([pd.read_csv(path) for path in dev_paths])
        testdata = pd.read_csv(args.has20_testdata if use_2020 else args.has21_testdata)
    elif use_2020:
        traindata = pd.read_csv(args.has20_traindata)
        devdata = pd.read_csv(args.has20_devdata)
        testdata = pd.read_csv(args.has20_testdata)
    else:
        traindata = pd.read_csv(args.has21_traindata)
        devdata = pd.read_csv(args.has21_devdata)
        testdata = pd.read_csv(args.has21_testdata)

    return (
        traindata.drop_duplicates(keep='first'),
        devdata.drop_duplicates(keep='first'),
        testdata.drop_duplicates(keep='first'),
    )
# def random_seeding(seed_value, device): # set for reproducibility
# #numpy.random.seed(seed_value)
# #random.seed(seed_value)
# torch.manual_seed(seed_value)
# if device == "cuda": torch.cuda.manual_seed_all(seed_value)
if __name__ == '__main__':
    # Script entry point: fine-tune t5-large on the selected HASOC task, log
    # per-epoch metrics to "<outfile>t5base.txt" and, for every year except
    # 2021, score the best checkpoint on the test split.

    # Fail fast, before the large model is downloaded/loaded: without a known
    # task number no label column is selected and training would only crash
    # later with a NameError.
    if args.taskno not in ('1', '2'):
        raise ValueError(f"Unsupported --taskno {args.taskno!r}: expected '1' or '2'")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = T5ForConditionalGeneration.from_pretrained("t5-large").to(device)
    tokenizer = T5Tokenizer.from_pretrained("t5-large")
    tokenizer.pad_token = tokenizer.eos_token  # to avoid an error
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # get_data has the following args: (datatype, datayear='2020', combined_traindata=False)
    # datayear: 2020 or 2021
    traindata, devdata, testdata = get_data(args.datatype, args.datayear, combined_traindata=False)

    # Comment out the below if preprocessing not needed
    traindata = preprocess_pandas(traindata, list(traindata.columns))
    valdata = preprocess_pandas(devdata, list(devdata.columns))
    test_data = preprocess_pandas(testdata, list(testdata.columns))
    print(traindata['task_1'].value_counts())  # count the total of each category

    # Add task prefix for T5 better performance
    traindata['text'] = args.task_pref + traindata['text']
    valdata['text'] = args.task_pref + valdata['text']
    test_data['text'] = args.task_pref + test_data['text']
    train_data = traindata['text'].values.tolist()
    val_data = valdata['text'].values.tolist()
    test_data_texts = test_data['text'].values.tolist()

    # Test labels are only needed when the test split is scored in this script;
    # HASOC 2021 test inference is done elsewhere.
    score_test_split = args.datatype == 'hasoc' and not args.datayear == '2021'

    # Map the raw labels of the chosen task to index strings ('0', '1', ...),
    # in order of first appearance in the training data, e.g.
    # task 1: NOT/HOF, task 2: NONE/PRFN/OFFN/HATE.
    task_col = 'task_' + args.taskno
    label_dict = {}  # For associating raw labels with indices/nos
    for index, possible_label in enumerate(traindata[task_col].unique()):
        label_dict[possible_label] = index
    print(label_dict)  # sanity check

    traindata[task_col] = traindata[task_col].replace(label_dict).apply(str)
    valdata[task_col] = valdata[task_col].replace(label_dict).apply(str)
    if score_test_split:
        test_data[task_col] = test_data[task_col].replace(label_dict).apply(str)
        test_data_labels = test_data[task_col].values.tolist()
    train_tags = traindata[task_col].values.tolist()
    val_tags = valdata[task_col].values.tolist()
    outfile = (args.ofile1 if args.taskno == '1' else args.ofile2) + 'task' + args.taskno + '_'

    # train() calls scheduler.step() once per batch, so the schedule length is
    # the number of batches per epoch times the number of epochs (not the
    # number of examples, which would stretch the linear decay batch_size-fold).
    steps_per_epoch = (len(train_data) + args.batch_size - 1) // args.batch_size
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=steps_per_epoch * args.epochs)

    best_loss = None
    best_state = None  # CPU snapshot of the weights with the lowest validation loss
    for epoch in range(1, args.epochs + 1):
        train_loss = train(train_data, train_tags)
        val_loss, predictions, true_vals = evaluate(val_data, val_tags)
        val_f1, val_f1_w, val_f1_macro = f1_score_func(predictions, true_vals)

        report = ('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f} '.format(epoch, train_loss, val_loss)
                  + f'F1: {val_f1}, weighted F1: {val_f1_w}, macro F1: {val_f1_macro}')
        print(report)
        with open(outfile + 't5base.txt', "a+") as f:
            f.write(report + "\n")

        # "is None" rather than truthiness: a validation loss of exactly 0.0
        # must not be mistaken for "no best loss yet".
        if best_loss is None or val_loss < best_loss:
            # The raw torch.save checkpoint (args.savet) stays disabled to save
            # disk space, so the file is no longer opened for writing either:
            # that only truncated an existing checkpoint to zero bytes.
            if args.datatype == 'hasoc' and args.datayear == '2021':  # save model for hasoc 2021 inference
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(args.msave)  # transformers save
                tokenizer.save_pretrained(args.msave)
            if score_test_split:
                # Keep a real copy of the weights. Binding another name to
                # `model` would keep following later epochs, and the test split
                # would then be scored with the last epoch instead of the best.
                best_state = {name: tensor.detach().cpu().clone()
                              for name, tensor in model.state_dict().items()}
            best_loss = val_loss

    # {'HOF': 0, 'NOT': 1} - hasoc 2021
    # Hasoc 2021 test set will be run according to Hasoc format in order to prepare, so...
    if score_test_split:
        if best_state is not None:
            model.load_state_dict(best_state)  # restore the best epoch before testing
        eval_loss, predictions, true_vals = evaluate(test_data_texts, test_data_labels)
        eval_f1, eval_f1_w, eval_f1_macro = f1_score_func(predictions, true_vals)
        test_report = ('Test Loss: {:.4f} '.format(eval_loss)
                       + f'F1: {eval_f1}, weighted F1: {eval_f1_w}, macro F1: {eval_f1_macro}')
        print(test_report)
        with open(outfile + 't5base.txt', "a+") as f:
            f.write(test_report + "\n")