import gensim
import numpy as np
import pickle
import string
import sys
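
# This script (Python 2) reads a labeled text dataset line by line, looks up
# a word2vec embedding for every in-vocabulary, non-stop word, builds a
# normalized bag-of-words per document, and pickles the results.
# Expected input: one document per line, class label first, separated from
# the text by a tab, e.g. "1<TAB>the cat sat on the mat".
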
# read datasets line by line
def read_line_by_line(dataset_name,C,model,vec_size):
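    """Return (X, BOW_X, y, C, the_words) for the documents in dataset_name.

    X[i] is a vec_size x n_i matrix of word vectors for document i, BOW_X[i]
    the matching normalized bag-of-words weights, y[i] the numeric class
    label, C the array of class IDs seen so far, and the_words[i] the words
    kept for document i.
    """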
    # get stop words (except for twitter!)
    stop = set()
    for line in open('stop_words.txt'):
        line = line.strip()
        if line != '':
            stop.add(line)
    f = open(dataset_name)
    if len(C) == 0:
        C = np.array([], dtype=np.object)
    num_lines = sum(1 for line in open(dataset_name))
    y = np.zeros((num_lines,))
    X = np.zeros((num_lines,), dtype=np.object)
    BOW_X = np.zeros((num_lines,), dtype=np.object)
    count = 0
    the_words = np.zeros((num_lines,), dtype=np.object)
    for line in f:
        line = line.strip()
        # strip punctuation (Python 2 str.translate)
        line = line.translate(string.maketrans("", ""), string.punctuation)
        T = line.split('\t')
        classID = T[0]
        # map the class ID to a numeric label, extending C on first sight
        if classID in C:
            IXC = np.where(C == classID)
            y[count] = IXC[0] + 1
        else:
            C = np.append(C, classID)
            y[count] = len(C)
        W = line.split()
        F = np.zeros((vec_size, len(W) - 1))
        inner = 0
        word_order = np.empty((len(W) - 1,), dtype=np.object)
        word_order.fill('')
        bow_x = np.zeros((len(W) - 1,))
        for word in W[1:len(W)]:  # W[0] is the class label
            try:
                test = model[word]  # raises KeyError if out of vocabulary
                if word in stop:
                    # stop word: skip without advancing inner, so its slot
                    # is reused by the next kept word
                    continue
                if word in word_order:
                    # repeated word: just bump its count
                    IXW = np.where(word_order == word)
                    bow_x[IXW] += 1
                else:
                    word_order[inner] = word
                    bow_x[inner] += 1
                    F[:, inner] = model[word]
            except KeyError:
                # out-of-vocabulary word: leave this slot empty
                word_order[inner] = ''
            inner = inner + 1
        # drop the all-zero columns left by stop words, repeats, and OOV words
        Fs = F.T[~np.all(F.T == 0, axis=1)]
        word_orders = word_order[word_order != '']
        bow_xs = bow_x[bow_x != 0]
        # NOTE: a document containing only stop/OOV words leaves bow_xs empty;
        # an earlier revision skipped such documents here
        nbow_xs = normalize_bow(bow_xs)  # Added by MH
        X[count] = Fs.T
        the_words[count] = word_orders
        BOW_X[count] = nbow_xs
        count = count + 1
    return (X, BOW_X, y, C, the_words)

def save_data(dataset, save_file_data, save_file_labels, model, vec_size):
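    """Extract features from dataset and pickle them to the given files."""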
    # 2. read document data
    (X, BOW_X, y, C, words) = read_line_by_line(dataset, [], model, vec_size)
    # 3. save pickle of extracted variables (binary mode for pickle)
    with open(save_file_data, 'wb') as f:
        pickle.dump([X, BOW_X, C, words], f)
    with open(save_file_labels, 'wb') as labels_file:
        pickle.dump(y, labels_file)

# Take a bag of words and normalize its entries so that they sum to 1
def normalize_bow(bow):
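    # e.g. counts [2., 1., 1.] become weights [0.5, 0.25, 0.25]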
    return bow / float(np.sum(bow))

def main():
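    """Load the word2vec model and process the train/test files given on the command line."""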
    # 0. load word2vec model (trained on Google News); to load the original
    # binary release instead:
    # model = gensim.models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    model = gensim.models.Word2Vec.load("GoogleNews_vectors")
    vec_size = 300
    # handle input argument errors
    if len(sys.argv) != 3:  # wrong number of arguments
        print "Usage: python process_data.py [train_data_text_file] [test_data_text_file]"
        sys.exit(1)
    # 1. specify train/test datasets
    train_dataset = sys.argv[1]  # e.g. 'twitter.txt'
    test_dataset = sys.argv[2]   # e.g. 'twitter_test.txt'
    train_name = train_dataset.split(".")[0]
    test_name = test_dataset.split(".")[0]
    train_data_file = train_name + ".pk"
    test_data_file = test_name + ".pk"
    train_labels_file = train_name + "_labels.pk"
    test_labels_file = test_name + "_labels.pk"
    # 2. read and save document data
    save_data(train_dataset, train_data_file, train_labels_file, model, vec_size)
    save_data(test_dataset, test_data_file, test_labels_file, model, vec_size)

if __name__ == "__main__":
    main()
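
# A minimal sketch of loading the pickled output back (file names assume an
# input like 'twitter.txt'; the list order matches save_data above):
#
#   with open('twitter.pk', 'rb') as f:
#       X, BOW_X, C, words = pickle.load(f)
#   with open('twitter_labels.pk', 'rb') as f:
#       y = pickle.load(f)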