preprocess.py
# standard library
import argparse as ap
import os
import pickle

# third party
import numpy as np
import pandas as pd
from keras.utils import to_categorical

# local
import utils
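
# NOTE: the helper functions below are assumed from how they are called in this
# script; their actual definitions live in utils.py:
#   utils.extract_features(df, istest=False)  -> DataFrame with feature columns added
#   utils.tokens2sequences(df, istest=False)  -> DataFrame of token (and label) sequences
#   utils.find_seq_len(train, dev, test)      -> length of the longest sequence
#   utils.pad(seqs, seq_length, padtok, padlab, istest=False)
#       -> (padded_seqs, padded_labels), or padded_seqs alone when istest=True
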
def main(args):
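    """Read the WNUT train/dev/test files, extract features, convert tokens
    to padded integer sequences, and pickle the arrays for model training."""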
    wnuttrain = args.data_dir + '/wnuttrain.txt'
    train = pd.read_table(wnuttrain, header=None, names=['token', 'label', 'bio_only', 'upos'])
    wnutdev = args.data_dir + '/wnutdev.txt'
    dev = pd.read_table(wnutdev, header=None, names=['token', 'label', 'bio_only', 'upos'])
    wnuttest = args.data_dir + '/wnuttest.txt'
    test = pd.read_table(wnuttest, header=None, names=['token', 'upos'])
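    # train and dev carry gold labels; the test file has only tokens and POS tags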
    # extract features
    train_copy = utils.extract_features(train)
    dev_copy = utils.extract_features(dev)
    test_copy = utils.extract_features(test, istest=True)
    # TODO: this will depend on whether or not the data needs to be cleaned;
    # for now, just using pickles during development to save time
    train_seqs = utils.tokens2sequences(train_copy)
    dev_seqs = utils.tokens2sequences(dev_copy)
    test_seqs = utils.tokens2sequences(test_copy, istest=True)
    os.makedirs(args.data_dir + '/seqs', exist_ok=True)  # make sure the output directory exists
    train_seqs.to_pickle(args.data_dir + '/seqs/train_seqs.pkl')
    dev_seqs.to_pickle(args.data_dir + '/seqs/dev_seqs.pkl')
    test_seqs.to_pickle(args.data_dir + '/seqs/test_seqs.pkl')
    # train_seqs = pd.read_pickle(args.data_dir + '/seqs/train_seqs.pkl')
    # dev_seqs = pd.read_pickle(args.data_dir + '/seqs/dev_seqs.pkl')
    # test_seqs = pd.read_pickle(args.data_dir + '/seqs/test_seqs.pkl')
    # use the longest sequence across all splits as the fixed sequence length
    seq_length = utils.find_seq_len(train_seqs, dev_seqs, test_seqs)
    token_vocab = train.token.unique().tolist()
    oov = len(token_vocab)  # OOV (out-of-vocabulary) index = vocab length (i.e. max index + 1)
    with open(args.data_dir + '/token_vocab.pkl', 'wb') as f:
        pickle.dump(token_vocab, f)
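    # the saved vocabulary can presumably be reloaded at prediction time to map
    # tokens to the same indices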
    # a new dummy padding token index, one more than OOV
    padtok = oov + 1
    print('The padding token index is %i' % padtok)
    padlab = 3
    print('The padding label index is %i' % padlab)
    train_seqs_padded, train_labs_padded = utils.pad(train_seqs, seq_length, padtok, padlab)
    dev_seqs_padded, dev_labs_padded = utils.pad(dev_seqs, seq_length, padtok, padlab)
    test_seqs_padded = utils.pad(test_seqs, seq_length, padtok, padlab, istest=True)
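    # with istest=True there are no labels, so utils.pad is assumed to return
    # only the padded sequences here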
    # convert the labels to one-hot encoding
    n_labs = 4  # 3 labels: B, I, O (0, 1, 2) + the pad label (3)
    train_labs_onehot = [to_categorical(i, num_classes=n_labs) for i in train_labs_padded]
    dev_labs_onehot = [to_categorical(i, num_classes=n_labs) for i in dev_labs_padded]
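    # each padded label sequence becomes a (seq_length, n_labs) one-hot matrix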
    X = np.array(train_seqs_padded)
    y = np.array(train_labs_onehot)
    train_pkl = {'X': X, 'y': y}
    dev_X = np.array(dev_seqs_padded)
    dev_y = np.array(dev_labs_onehot)
    dev_pkl = {'dev_X': dev_X, 'dev_y': dev_y}
    test_X = np.array(test_seqs_padded)
    test_pkl = {'test_X': test_X, 'test_y': None}  # no gold labels for test
    ############### writing to file ################
    os.makedirs(args.data_dir + '/padded', exist_ok=True)  # make sure the output directory exists
    with open(args.data_dir + '/padded/train.pkl', 'wb') as f:
        pickle.dump(train_pkl, f)
    with open(args.data_dir + '/padded/dev.pkl', 'wb') as f:
        pickle.dump(dev_pkl, f)
    with open(args.data_dir + '/padded/test.pkl', 'wb') as f:
        pickle.dump(test_pkl, f)

if __name__ == "__main__":
    p = ap.ArgumentParser()
    p.add_argument('--output-dir', required=True,
                   help='output directory for the label list, word files, and pickles')
    p.add_argument('--data-dir', required=True,
                   help="current model's location (generated from a bash script using the job ID)")
    p.add_argument('--clean', action='store_true',
                   help='extra preprocessing step that scrubs handles and URLs from the data and replaces them with a dummy token')
    # p.add_argument('--dataset', required=True,
    #                help='keyword for dataset to use')
    args = p.parse_args()
    main(args)
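
# Example invocation (hypothetical paths):
#   python preprocess.py --data-dir data/wnut --output-dir out/preprocessed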