# NeuralNetClassifier.py
from numpy import *
from numpy.linalg import norm
from numpy.random import *
class NeuralNetClassifier(object):
    """
    Neural network classifier with one hidden (projection) layer of
    adaptable weights. The default hidden activation is the rectifier;
    a softmax (multinomial logistic) classifier is learned on top of it.
    """
    def __init__(self, layer_one_size=10, activation=None):
        """
        Instantiate a single-hidden-layer neural network.
        layer_one_size - number of hidden units.
        activation - function taking a matrix of row-vector observations and
        a weight matrix; it must compute the activations and their gradient
        and return both. Defaults to the rectifier.
        """
        self.W_hid = array([])  # basis-mapping (hidden) layer
        self.W_out = array([])  # logistic classifier (output) layer
        self.layer_one_size = layer_one_size
        if activation is None:
            activation = self.rectifier
        self.activation = activation
    def add_bias(self, X):
        # Append a dummy column of ones so the model can fit non-centered data.
        return hstack((X, ones((X.shape[0], 1))))
    def fit(self, X, Y, itrs=100, learn_rate=0.1, reg=0.1,
            momentum=0.5, report_cost=False, batch_size=-1):
        """
        Fit the model.
        If report_cost is True, return the loss and the L2 norm of the
        gradient at each iteration (both lists are empty otherwise).
        X - observation matrix (examples by features)
        Y - one-hot target matrix (examples by classes)
        itrs - number of gradient-descent iterations to run
        learn_rate - step size for gradient descent
        reg - L2 regularization penalty
        momentum - fraction of the previous gradient used in the update step
        report_cost - if True, record the loss at each step (expensive)
        batch_size - size of the minibatches used in training; -1 means full batch
        """
        if batch_size == -1:
            batch_size = X.shape[0]
        Xb = self.add_bias(X)
        # Ideally these would be initialized based on the square root of the
        # fan-in, but a small uniform range should be OK here.
        self.W_hid = uniform(-0.1, 0.1, (self.layer_one_size, Xb.shape[1]))
        self.W_out = uniform(-0.1, 0.1, (Y.shape[1], self.layer_one_size))
        # set up for learning
        costs = []
        grad_norms = []
        layer_grads = [zeros(self.W_hid.shape), zeros(self.W_out.shape)]
        layer_grads_prev = [zeros(self.W_hid.shape), zeros(self.W_out.shape)]
        # learn.
        for i in range(itrs):
            minibatch_inds = self.batch_inds(batch_size, X.shape[0])  # draw a minibatch
            layer_grads = self.grad(Xb[minibatch_inds, :], Y[minibatch_inds, :], reg)  # compute gradients (uses backprop)
            # update the weights
            self.W_hid = self.W_hid - learn_rate*(layer_grads[0] + momentum*layer_grads_prev[0])
            self.W_out = self.W_out - learn_rate*(layer_grads[1] + momentum*layer_grads_prev[1])
            # update the momentum term
            layer_grads_prev = layer_grads
            if report_cost:
                costs.append(self.loss(X, Y, reg))
                grad_norms.append(norm(layer_grads[0]) + norm(layer_grads[1]))
        return costs, grad_norms
    def batch_inds(self, batch_size, data_size):
        # Given a batch size and the number of examples, sample a random minibatch of indices.
        inds = permutation(data_size)[:batch_size]
        return inds
    def softmax(self, X, W):
        # Softmax activation function. Returns the class probabilities and the
        # elementwise gradient. Pre-activations are clipped to avoid overflow in exp.
        Z = dot(X, W.T)
        Z = maximum(Z, -1e3)
        Z = minimum(Z, 1e3)
        numerator = exp(Z)
        S = numerator / sum(numerator, axis=1).reshape((-1, 1))
        grad = S*(1 - S)
        return S, grad
    def predict(self, X, add_bias=True):
        """
        If the model has been trained, make predictions on an observation
        matrix (observations by features). Returns the predicted class probabilities.
        """
        if add_bias:
            X = self.add_bias(X)
        # map to our learned basis; ignore the gradient
        X2, dX2 = self.activation(X, self.W_hid)
        # make a prediction on top
        Y, dY = self.softmax(X2, self.W_out)
        return Y
    def rectifier(self, X, W):
        # Rectifier (ReLU) activation function.
        # Returns max(0, XW^T) and the gradient with respect to the pre-activation.
        Z = dot(X, W.T)
        act = maximum(0, Z)
        grad = greater(act, 0)
        return act, grad
    def grad(self, X, Y, reg):
        """
        Returns a list. The first element is the gradient with respect to the
        layer-1 (hidden) weights, and the second element is the gradient with
        respect to the layer-2 (output) weights.
        """
        layers = []  # will hold the gradients of each layer of weights
        # feed-forward pass
        X_2, X_2_grad = self.activation(X, self.W_hid)
        Yh, dYh = self.softmax(X_2, self.W_out)
        # now compute gradients (backprop)
        delta = Y - Yh  # take advantage of the cancellation of terms in CE loss + softmax
        # gradient is averaged over all training examples and classes
        W_out_grad = -dot(delta.T, X_2)/X.shape[0]/Y.shape[1]
        layers.append(W_out_grad + reg*self.W_out)  # include regularization
        # propagate our delta backwards, using the chain rule
        delta = dot(delta, self.W_out)*X_2_grad
        # again, average over all examples and classes; add a regularization term
        W_hid_grad = -dot(delta.T, X)/X.shape[0]/Y.shape[1]
        layers.append(W_hid_grad + reg*self.W_hid)
        # backprop is...backwards. Reverse it.
        return list(reversed(layers))
    def loss(self, X, Y, reg, add_bias=True):
        # Regularized cross-entropy loss. Used internally for reporting.
        Yh = self.predict(X, add_bias)
        reg_W_hid = 0.5*reg*sum(sum(self.W_hid**2))
        reg_W_out = 0.5*reg*sum(sum(self.W_out**2))
        return mean(mean(-Y*log(Yh))) + reg_W_out + reg_W_hid
    # These were used during development.
    # def grad_check(self, X, Y, reg):
    #     inds = [(0, 0), (2, 2), (1, 2), (0, 2)]
    #     layer = 1
    #     X = self.add_bias(X)
    #     layer_grads = self.grad(X, Y, reg)
    #     for ind in inds:
    #         grad_calc = layer_grads[layer][ind]
    #         grad_numer = self.numeric_grad(X, Y, reg, ind)
    #         print('calculated grad:', grad_calc, 'numeric grad:', grad_numer)
    #         print('ratio:', grad_calc / grad_numer, 'diff:', grad_calc - grad_numer)
    # def numeric_grad(self, X, Y, reg, index=(0, 0)):
    #     # Compute the numeric gradient for a given layer and index.
    #     # W_copy = copy(self.W_hid)
    #     W_copy = copy(self.W_out)
    #     # central difference method
    #     ep = 1e-5
    #     # self.W_hid[index] += ep
    #     self.W_out[index] += ep
    #     left_loss = self.loss(X, Y, reg, add_bias=False)
    #     # self.W_hid[index] -= 2*ep
    #     self.W_out[index] -= 2*ep
    #     right_loss = self.loss(X, Y, reg, add_bias=False)
    #     grad = (left_loss - right_loss)/2/ep
    #     # self.W_hid = W_copy
    #     self.W_out = W_copy
    #     return grad
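
# A minimal usage sketch, not part of the original module: it fits the
# classifier to a small synthetic two-class problem and reports the final
# loss and training accuracy. The Gaussian-blob data, hyperparameter values,
# and the accuracy computation below are illustrative assumptions only.
if __name__ == "__main__":
    seed(0)
    n, d = 200, 5
    # two Gaussian blobs, one per class
    X = vstack((randn(n, d) + 1.0, randn(n, d) - 1.0))
    labels = concatenate((zeros(n, dtype=int), ones(n, dtype=int)))
    # one-hot targets, as expected by fit()
    Y = zeros((2*n, 2))
    Y[arange(2*n), labels] = 1.0
    clf = NeuralNetClassifier(layer_one_size=20)
    costs, grad_norms = clf.fit(X, Y, itrs=200, learn_rate=0.1, reg=0.01,
                                momentum=0.5, report_cost=True, batch_size=50)
    preds = argmax(clf.predict(X), axis=1)
    print('final loss:', costs[-1])
    print('training accuracy:', mean(preds == labels))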