# import libraries
import json
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
# AdamW now lives in torch.optim (transformers.AdamW is deprecated)
from torch.optim import AdamW, lr_scheduler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
# use the GPU if one is available, otherwise fall back to the CPU
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
# initialize the label encoder
le = LabelEncoder()
# load the intents
with open('intents_negotiation.json') as data_file:
    intents = json.load(data_file)
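# The intents file is assumed to follow the usual chatbot-intents schema that
# the code below reads (a list of tags, each with example patterns), e.g.:
# {
#   "intents": [
#     {"tag": "greeting", "patterns": ["Hi", "Hello there"]},
#     ...
#   ]
# }
# ("greeting" and the patterns above are illustrative, not from the real file.)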
# Load the DistilBert tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Import the DistilBert pretrained model
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
# freeze all BERT parameters; this prevents updating them during fine-tuning,
# so only the classifier head defined below is trained
for param in bert.parameters():
    param.requires_grad = False
# Hyperparameters
epochs = 150
batch_size = 16
max_seq_len = 8
def split_data(intents):
    '''Flatten the intents JSON into a dataframe with columns (text, label).'''
    classes = []
    documents = []
    for intent in intents['intents']:
        for pattern in intent['patterns']:
            # add (pattern, tag) pairs to the corpus
            documents.append((pattern, intent['tag']))
        # record each tag once in the classes list
        if intent['tag'] not in classes:
            classes.append(intent['tag'])
    df = pd.DataFrame(documents, columns=["text", "label"])
    return df
def encoding_train_labels(data):
    '''Encode the training labels as integers with the shared LabelEncoder.'''
    data['label'] = le.fit_transform(data['label'])
    # to check the class distribution: data['label'].value_counts(normalize=True)
    return data['label']
def encoding_train_texts(data):
    '''Tokenize the training texts and wrap everything in a DataLoader.'''
    train_labels = encoding_train_labels(data)
    train_text = data['text']
    # tokenize and encode the sequences in the training set
    tokens_train = tokenizer(
        train_text.tolist(),
        max_length=max_seq_len,
        padding='max_length',  # pad_to_max_length is deprecated
        truncation=True,
        return_token_type_ids=False
    )
    # tensors for the train set
    train_seq = torch.tensor(tokens_train['input_ids'])
    train_mask = torch.tensor(tokens_train['attention_mask'])
    train_y = torch.tensor(train_labels.tolist())
    # wrap tensors
    train_data = TensorDataset(train_seq, train_mask, train_y)
    # sampler for shuffling the data during training
    train_sampler = RandomSampler(train_data)
    # DataLoader for the train set
    train_set_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return train_set_dataloader
'''
To choose max_seq_len, inspect the lengths of the messages in the train set:
    seq_len = [len(i.split()) for i in train_text]
    pd.Series(seq_len).hist(bins=10)
Based on the histogram we selected 8 as the max length, which is why
max_seq_len is set to 8.
'''
class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        # dropout layer
        self.dropout = nn.Dropout(0.2)
        # relu activation function
        self.relu = nn.ReLU()
        # dense layers; the final output size (8) must equal the number of intent tags
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 8)
        # log-softmax activation (pairs with the NLLLoss used in train())
        self.softmax = nn.LogSoftmax(dim=1)

    # define the forward pass
    def forward(self, sent_id, mask):
        # pass the inputs to BERT and keep the hidden state of the [CLS] token
        cls_hs = self.bert(sent_id, attention_mask=mask)[0][:, 0]
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        # output layer
        x = self.fc3(x)
        # apply log-softmax activation
        x = self.softmax(x)
        return x
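# A minimal sanity check for BERT_Arch (illustrative; the sample sentence is
# arbitrary, not from the intents file). One padded sequence should yield
# log-probabilities of shape (1, 8):
# toks = tokenizer(["hello"], max_length=max_seq_len, padding='max_length',
#                  truncation=True, return_token_type_ids=False)
# out = BERT_Arch(bert)(torch.tensor(toks['input_ids']),
#                       torch.tensor(toks['attention_mask']))
# assert out.shape == (1, 8)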
def classes_to_tensors(intents):
    '''Compute balanced class weights and return them as a float tensor on the device.'''
    train_labels = encoding_train_labels(split_data(intents))
    class_wts = compute_class_weight(class_weight="balanced",
                                     classes=np.unique(train_labels),
                                     y=train_labels)
    return torch.tensor(class_wts, dtype=torch.float).to(device)
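# For example, with labels [0, 0, 1] the "balanced" weights are
# n_samples / (n_classes * bincount) = [3/(2*2), 3/(2*1)] = [0.75, 1.5],
# so the rarer class contributes more to the loss.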
def train(model, optimizer):
    '''Run one training epoch; return the average loss and the predictions.'''
    model.train()
    total_loss = 0
    # a learning rate scheduler could be used to achieve better results:
    # lr_sch = lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)
    weights = classes_to_tensors(intents)
    # negative log-likelihood loss (the model outputs log-probabilities)
    cross_entropy = nn.NLLLoss(weight=weights)
    # define the dataloader
    train_dataloader = encoding_train_texts(split_data(intents))
    # empty list to save model predictions
    total_preds = []
    # iterate over batches
    for step, batch in enumerate(train_dataloader):
        # progress update after every 50 batches
        if step % 50 == 0 and step != 0:
            print('  Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))
        # push the batch to the device
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch
        # get model predictions for the current batch
        preds = model(sent_id, mask)
        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)
        # add on to the total loss
        total_loss = total_loss + loss.item()
        # backward pass to calculate the gradients
        loss.backward()
        # clip the gradients to 1.0; this helps prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update parameters
        optimizer.step()
        # clear calculated gradients
        optimizer.zero_grad()
        # lr_sch.step()  # if a scheduler is used
        # predictions may be stored on the GPU, so push them to the CPU
        preds = preds.detach().cpu().numpy()
        # append the model predictions
        total_preds.append(preds)
    # compute the average training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)
    # predictions have shape (no. of batches, batch size, no. of classes);
    # reshape them to (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)
    # return the loss and predictions
    return avg_loss, total_preds
def train_run():
    '''Train the model for `epochs` epochs and save the weights.'''
    # build the model once and push it to the device
    model = BERT_Arch(bert).to(device)
    # define the optimizer
    optimizer = AdamW(model.parameters(), lr=1e-3)
    # summary(model)  # from torchinfo import summary
    # make cuDNN deterministic to help reproducibility (set random seeds as well)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # store the training loss of each epoch
    train_losses = []
    for epoch in range(epochs):
        train_loss, _ = train(model, optimizer)
        print('\n Epoch {:} / {:} / loss {:}'.format(epoch + 1, epochs, train_loss))
        train_losses.append(train_loss)
    print(f'\nTraining Loss: {train_loss:.3f}')
    torch.save(model.state_dict(), "model.h5")
def get_prediction(message, max_seq_len):
    '''Classify a single message and return the predicted intent tag.'''
    # load the trained model weights
    model = BERT_Arch(bert)
    model.load_state_dict(torch.load("model.h5", map_location=device))
    model = model.to(device)
    # refit the label encoder so predictions can be decoded back to tags
    encoding_train_labels(split_data(intents))
    # keep letters and spaces only
    message = re.sub(r'[^a-zA-Z ]+', '', message)
    test_text = [message]
    model.eval()
    tokens_test_data = tokenizer(
        test_text,
        max_length=max_seq_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False
    )
    test_seq = torch.tensor(tokens_test_data['input_ids'])
    test_mask = torch.tensor(tokens_test_data['attention_mask'])
    with torch.no_grad():
        preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()
    preds = np.argmax(preds, axis=1)
    return le.inverse_transform(preds)[0]
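# Minimal usage sketch, assuming intents_negotiation.json is present; the
# sample message below is illustrative, not taken from the intents file.
if __name__ == "__main__":
    train_run()
    print(get_prediction("I want a better price", max_seq_len))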