# neural_classifier.py
from __future__ import absolute_import
import theano
import numpy
import os
import cPickle as pkl
import time
from os import listdir
from os.path import isfile, join
from postmunge import PostmungedTextIterator
from keras.models import Sequential, load_model, Model
from keras.layers import Embedding, Dense, MaxoutDense, Input, merge, Flatten
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.core import Dropout, Activation
from keras.layers.recurrent import LSTM
from keras.layers.pooling import GlobalMaxPooling1D
from keras.optimizers import Nadam
from keras import backend as K
from keras.layers.advanced_activations import PReLU, LeakyReLU
# TODO: Add evaluation on the testing set.
# WARNING: This code will barely run on CPU. GPU use advised.


def prepare_data(seqs_x, seqs_y, maxlen=None, class_weights=[1, 1], batch_size=32):
    """Pad a batch of (post, subreddit, parent) triples into fixed-size int32
    arrays. When maxlen is given, overlong posts and parents are truncated so
    the two sides share the per-sample length budget."""
    # seqs_x: a list of (post token ids, subreddit id, parent token ids) triples
    lengths_x = [(len(s[0]), len(s[2])) for s in seqs_x]
    if maxlen is not None:
        # Spread the batch's total length budget across its samples, then split
        # it between the post and its parent, truncating whichever side overruns.
        maxlen = (batch_size * maxlen) / len(seqs_x)
        post_lengths = map(lambda x: x[0], lengths_x)
        parent_lengths = map(lambda x: x[1], lengths_x)
        maxlen_post = numpy.max(post_lengths) + 1
        maxlen_parent = numpy.max(parent_lengths) + 1
        if maxlen_post < maxlen / 2:
            maxlen_parent = maxlen - maxlen_post
        elif maxlen_parent < maxlen / 2:
            maxlen_post = maxlen - maxlen_parent
        else:
            maxlen_post = maxlen / 2 + 1
            maxlen_parent = maxlen / 2 + 1
        new_seqs_x = []
        new_lengths_x = []
        for (l_x1, l_x2), s_x in zip(lengths_x, seqs_x):
            if l_x1 < maxlen_post and l_x2 < maxlen_parent:
                new_seqs_x.append(s_x)
                new_lengths_x.append((l_x1, l_x2))
            else:
                new_text_length = min(maxlen_post, l_x1)
                new_parent_length = min(maxlen_parent, l_x2)
                new_seqs_x.append((s_x[0][:new_text_length], s_x[1], s_x[2][:new_parent_length]))
                new_lengths_x.append((new_text_length, new_parent_length))
        lengths_x = new_lengths_x
        seqs_x = new_seqs_x
        if len(lengths_x) < 1:
            return None, None
    n_samples = len(seqs_x)
    post_lengths = map(lambda x: x[0], lengths_x)
    parent_lengths = map(lambda x: x[1], lengths_x)
    maxlen_post = numpy.max(post_lengths) + 1
    maxlen_parent = numpy.max(parent_lengths) + 1
    seqs_y = numpy.asarray(seqs_y)
    x_text = numpy.zeros((n_samples, maxlen_post)).astype('int32')
    x_parent = numpy.zeros((n_samples, maxlen_parent)).astype('int32')
    x_sr = numpy.zeros((n_samples,)).astype("int32")
    for idx, s_x in enumerate(seqs_x):
        x_text[idx, :lengths_x[idx][0]] = s_x[0]
        x_parent[idx, :lengths_x[idx][1]] = s_x[2]
        x_sr[idx] = s_x[1]
    return [x_text, x_sr, x_parent], seqs_y
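
# Worked example of prepare_data (hypothetical token ids, not repository data):
# each element of seqs_x is a (post token ids, subreddit id, parent token ids)
# triple, and the batch is zero-padded to the longest sequence on each side.
#
#   batch_x = [([4, 9, 2], 7, [5, 5]),
#              ([3, 1], 2, [8, 8, 8, 6])]
#   batch_y = [0, 1]
#   x, y = prepare_data(batch_x, batch_y)
#   # x[0].shape == (2, 4)  post matrix (longest post + 1 columns)
#   # x[1].shape == (2,)    subreddit ids
#   # x[2].shape == (2, 5)  parent matrix (longest parent + 1 columns)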


def get_class_weights(inputfile):
    """Count the two classes in the training TSV and weight the positive
    ("removecomment") class by the ratio of negatives to positives."""
    class1 = 0
    class2 = 0
    with open(inputfile, "rb") as f:
        for example in f:
            if "removecomment" in example.strip().split("\t")[-1]:
                class2 += 1
            else:
                class1 += 1
    # float() avoids Python 2 integer division truncating the weight.
    return {0: 1, 1: float(class1) / class2}
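
# Example of the resulting weighting (hypothetical counts): if the training TSV
# held 9 kept comments and 3 rows whose final column contains "removecomment",
# get_class_weights would return {0: 1, 1: 3.0}, making each removed comment
# count three times as much as a kept one when passed as class_weight.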


def build_model(dim=256, word_dim=256, subreddit_dim=64, vocab_size=30000, n_subreddits=1000, maxlen=None, use_dropout=False):
    """This network structure is based on Recurrent Convolutional Neural
    Networks for Text Classification, Lai et al., 2015."""
    # Branch 1: the comment text itself.
    input_text = Input(shape=(None,), dtype='int32', name='text_input')
    text_model = Embedding(vocab_size, word_dim, mask_zero=False)(input_text)
    text_model = Bidirectional(LSTM(dim, return_sequences=True), merge_mode="concat")(text_model)
    text_model = LeakyReLU(0.2)(text_model)
    if use_dropout:
        text_model = Dropout(0.2)(text_model)
    text_model = Bidirectional(LSTM(dim, return_sequences=True), merge_mode="concat")(text_model)
    text_model = LeakyReLU(0.2)(text_model)
    if use_dropout:
        text_model = Dropout(0.2)(text_model)
    text_model = GlobalMaxPooling1D()(text_model)
    text_model = LeakyReLU(0.2)(text_model)
    # Branch 2: the parent comment, encoded the same way.
    input_parent = Input(shape=(None,), dtype='int32', name='parent_input')
    parent_model = Embedding(vocab_size, word_dim, mask_zero=False)(input_parent)
    parent_model = Bidirectional(LSTM(dim, return_sequences=True), merge_mode="concat")(parent_model)
    parent_model = LeakyReLU(0.2)(parent_model)
    if use_dropout:
        parent_model = Dropout(0.2)(parent_model)
    parent_model = Bidirectional(LSTM(dim, return_sequences=True), merge_mode="concat")(parent_model)
    parent_model = LeakyReLU(0.2)(parent_model)
    if use_dropout:
        parent_model = Dropout(0.2)(parent_model)
    parent_model = GlobalMaxPooling1D()(parent_model)
    parent_model = LeakyReLU(0.2)(parent_model)
    # Branch 3: a learned embedding for the subreddit id.
    input_subreddit = Input(shape=(1,), dtype='int32', name='subreddit_input')
    sr_embedding = Embedding(n_subreddits, subreddit_dim, mask_zero=False)(input_subreddit)
    sr_flattened = Flatten()(sr_embedding)
    # Concatenate the three branches and classify with dense layers.
    model = merge([sr_flattened, text_model, parent_model], mode="concat", concat_axis=1)
    if use_dropout:
        model = Dropout(0.2)(model)
    model = Dense(dim)(model)
    if use_dropout:
        model = Dropout(0.5)(model)
    model = LeakyReLU(0.2)(model)
    model = Dense(dim / 2)(model)
    model = LeakyReLU(0.2)(model)
    if use_dropout:
        model = Dropout(0.5)(model)
    modelout = MaxoutDense(1, nb_feature=5)(model)
    modelout = Activation("sigmoid")(modelout)
    model = Model(input=[input_text, input_subreddit, input_parent], output=[modelout])
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
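
# Minimal usage sketch of build_model (dummy shapes and values, assuming the
# Keras 1.x functional API imported above; not part of the training script):
#
#   m = build_model(dim=64, word_dim=32, subreddit_dim=8, vocab_size=100, n_subreddits=4)
#   x_text = numpy.random.randint(1, 100, size=(2, 10)).astype('int32')
#   x_sr = numpy.random.randint(0, 4, size=(2,)).astype('int32')
#   x_parent = numpy.random.randint(1, 100, size=(2, 12)).astype('int32')
#   probs = m.predict([x_text, x_sr, x_parent])  # shape (2, 1): removal probabilities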


def train(word_dim=256,  # word vector dimensionality
          dim=512,  # the number of LSTM units
          patience=2,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          vocab_size=30000,  # vocabulary size
          n_subreddits=8,  # number of subreddits to track specifically
          subreddit_dim=128,  # subreddit vector dimensionality
          maxlen=300,  # maximum length of the description
          batch_size=96,
          valid_batch_size=96,
          savedir="./",
          validFreq=100000,
          saveFreq=25000,  # save the parameters after every saveFreq updates
          dataset="./reddit_comment_training.tsv",
          test_dataset="./reddit_comment_testing.tsv",
          valid_dataset="./reddit_comment_valid.tsv",
          dictionary="./reddit_comment_training.tsv_worddict.pkl",
          sr_dictionary="./reddit_comment_training.tsv_srdict.pkl",
          legal_subreddits=None,  # e.g. ["science"]
          use_dropout=True,
          reload=False,
          overwrite=False):
    # The dataset this model was built for is heavily imbalanced, so class
    # weightings could be generated to equalize the importance of the classes:
    #class_weights = get_class_weights(dataset)
    #print class_weights
    train = PostmungedTextIterator(dataset, dictionary, sr_dictionary, n_words_source=vocab_size, n_subreddits=n_subreddits, batch_size=batch_size, shuffle=False, legal_subreddits=legal_subreddits, maxlen=maxlen)
    valid = PostmungedTextIterator(valid_dataset, dictionary, sr_dictionary, n_words_source=vocab_size, n_subreddits=n_subreddits, batch_size=batch_size, shuffle=False, legal_subreddits=legal_subreddits, maxlen=maxlen)
    print "Building the model"
    model = build_model(dim=dim, word_dim=word_dim, vocab_size=vocab_size, n_subreddits=n_subreddits, subreddit_dim=subreddit_dim, use_dropout=use_dropout)
    print "Model built"
    # Initialization
    uidx = 0
    scores = []
    history_errs = []
    ud_start = time.time()
    estop = False
    if reload:
        print "Attempting to reload"
        modelfiles = [(join(savedir, f), int(f.split(".")[-2].replace("iter", ""))) for f in listdir(savedir) if isfile(join(savedir, f)) and "model" in f and ".h5" in f and not ".png" in f]
        most_recent_model = ("", 0)
        for modelfile in modelfiles:
            if modelfile[1] >= most_recent_model[1]:
                most_recent_model = modelfile
        if os.path.isfile(most_recent_model[0]):
            print "Loading from model", most_recent_model[0]
            model = load_model(most_recent_model[0])
            uidx = most_recent_model[1] + 1  # Adding one avoids repeating a validation error calculation after many reloads.
        else:
            print "Failed to load model -- no acceptable models found"
    for eidx in xrange(max_epochs):
        n_samples = 0
        for x, y in train:
            n_samples += len(x)
            x, y = prepare_data(x, y, maxlen=maxlen)
            if x is None:
                print 'Minibatch with zero samples under length ', maxlen
                uidx -= 1
                continue
            score = model.train_on_batch(x, y)  #, class_weight = class_weights)
            scores.append(score)
            # Check for bad numbers; if one is encountered, just reload the model
            # from the most recent save. Dropout's randomness should ensure that
            # this will eventually progress past the NaN, if it is enabled,
            # although it may take several reloaded attempts in some cases.
            if numpy.isnan(score) or numpy.isinf(score):
                print 'NaN detected'
                if use_dropout:
                    print "Attempting to reload model to reset to pre-NaN state"
                    # This is a slightly modified version of the reload code, above.
                    modelfiles = [(join(savedir, f), int(f.split(".")[-2].replace("iter", ""))) for f in listdir(savedir) if isfile(join(savedir, f)) and "model" in f and ".h5" in f]
                    most_recent_model = ("", 0)  # reset here; it is unbound if reload was False
                    for modelfile in modelfiles:
                        if modelfile[1] >= most_recent_model[1]:
                            most_recent_model = modelfile
                    if os.path.isfile(most_recent_model[0]):
                        print "Loading from model", most_recent_model[0]
                        # We need to rebuild the model to clean out its gradients
                        # (as they are likely NaN in some position).
                        model = build_model(dim=dim, word_dim=word_dim, vocab_size=vocab_size, n_subreddits=n_subreddits, subreddit_dim=subreddit_dim, use_dropout=use_dropout)
                        model.load_weights(most_recent_model[0])
                        uidx = most_recent_model[1] + 1
                        continue
                    else:
                        print "Failed to find a valid model to reload."
                        return 1., 1., 1.
                else:
                    print "Dropout not enabled, so model is deterministic. Terminating."
                    return 1., 1., 1.
            # Display a brief status update.
            if numpy.mod(uidx, dispFreq) == 0:
                ud = (time.time() - ud_start) / dispFreq
                try:
                    reportString = 'Epoch ' + str(eidx).strip() + ' Update ' + str(uidx).strip() + " Loss: " + str(numpy.mean(scores)) + " Average time taken: " + str(ud)
                    print reportString
                except:
                    print "Exception encountered while printing report. Continuing training."
                ud_start = time.time()
                scores = []
            # Save the best model so far; in addition, save the latest model
            # into a separate file with the iteration number for external eval.
            if numpy.mod(uidx, saveFreq) == 0:
                # save with uidx
                print 'Saving the model at iteration {}...'.format(uidx),
                saveto_uidx = join(savedir, 'model.iter{}.h5'.format(uidx))
                model.save(saveto_uidx)
                print 'Done'
            # Validate the model on the validation set and early stop if necessary.
            if numpy.mod(uidx, validFreq) == 0:
                valid_errs = []
                for x, y in valid:
                    x, y = prepare_data(x, y, maxlen=maxlen)
                    v_err = model.evaluate(x, y, batch_size=len(x[0]))
                    valid_errs.append(v_err)
                valid_err = numpy.mean(valid_errs)
                history_errs.append(valid_err)
                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    print "New best valid error!"
                    history_errs = [valid_err]
                print 'Valid ', valid_err
                if len(history_errs) > patience:
                    estop = True
                    print "Halting training: early stopping patience exceeded!"
            # Finish after this many updates.
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break
            uidx += 1
        print 'Seen %d samples' % n_samples
        if estop:
            break
    # When stopping training, compute the validation error one more time and
    # then save the final model to a special file.
    valid_errs = []
    for x, y in valid:
        x, y = prepare_data(x, y)
        v_err = model.evaluate(x, y, batch_size=len(x[0]))
        valid_errs.append(v_err)
    valid_err = numpy.mean(valid_errs)
    print 'Valid ', valid_err
    model.save(join(savedir, "final.h5"))
    return valid_err


if __name__ == '__main__':
    train()
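
# To train on a different corpus, the defaults above can be overridden, e.g.
# (these file paths are hypothetical):
#
#   train(dataset="./my_training.tsv",
#         valid_dataset="./my_valid.tsv",
#         dictionary="./my_training.tsv_worddict.pkl",
#         sr_dictionary="./my_training.tsv_srdict.pkl",
#         batch_size=64, reload=True)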