Spam-Classification/bayes.py at master · nathanah/Spam-Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
import numpy as np
import random
import copy
import math
import sys

"""
    Read in the datafile and apply min-max normalization on it.

    Params:
        file: the name of the file to be read in. Passed through cmd line args

    Returns:
        arraying containing X and Y vectors
"""
def readFile(file):

    file = open(file, 'r')
    data = file.read()
    data = data.split('\n')
    data.pop(len(data)-1)

    #Split each sample into its features and read as correct data type
    for i in range(len(data)):
        #Split features
        data[i] = data[i].split(",")

        #Cast each feature
        for j in range(57):
            data[i][j] = float(data[i][j])

        #Read in class label as it, since it's 1/0
        data[i][57] = int(data[i][57])

    data = np.array(data)

    #min-max norm
    for j in range(57):
        min = data[0][j]
        max = data[0][j]
        for i in range(len(data)):
            if(data[i][j] < min):
                min = data[i][j]

            if(data[i][j] > max):
                max = data[i][j]

        for i in range(len(data)):
            data[i][j] = (data[i][j] - min) / (max - min)

    #Shuffle data:
    np.random.shuffle(data)

    #Split X and Y
    x = data[:,:57]
    y = data[:,57]
    return [x, y]


"""
    This function transforms the continuous values of our data into discrete buckets with labels.

    Params:
        buckets: the number of buckets we are dividing the data into (quantiles)
        X: set of samples to be bucketed
        Y: set of labels for the samples

    Side-effect:
        data is transformed and no longer the same as before it was passed in
"""
def bucket(buckets, X, Y):

    #Go through each column except for the label
    for j in range(57):

        #Sort the data on the given column
        indices = np.argsort(X[:, j])
        X = X[indices]
        Y = Y[indices]

        #Variable to store the lower bound of the current quantile
        lower = 0

        #Go through each quantile labeling it with the proper bucket
        for b in range(buckets):

            #Get the upper cutoff of the current quantile.
            upper = (b+1) * (len(X) / buckets)
            upper = int(upper)

            #Label all samples in range with current bucket
            for i in range(lower, upper):
                X[i][j] = b

            #Upper cutoff becomes lower bound.
            lower = upper

    return [X, Y]

"""
    Bucketing method that classify a feature into 1/0 depend which mean it is closer to.
    Isolated calculating the averages to only the training set.
    This we are not using the testing set to label certain values as correlated to spam or not.
    This can be thought of using K-means for k=2 on each feature separately.
    We calculate the center of that feature for each class, and label which class the sample belongs to for that feature.

    Param:
        trainingX: Our set of samples for training
        trainingY: Our set of labels for training
        testingX: Our set of samples to be used for verifying
        testingY: Our set of labels to be use for verifying

    Process;
        1. Pass through each feature calculating the spam/not-spam means
        2. Replace value of the feature for its class label
"""
def bucket_closerMean(trainingX, trainingY, testingX, testingY):

    # Go through each feature (i,j) = (feature, sample)
    # date[sample, feature] = data[j, i]
    for i in range(57):

        #Count number of spam/not-spam samples
        count_spam = 0
        count_notSpam = 0

        #Store summed value of each feature
        sum_spam = 0
        sum_nonSpam = 0

        #Store mean of each feature (center of cluster)
        mean_spam = 0
        mean_nonSpam = 0

        #Sum up the value of the feature, counting labels for spam/not-spam
        #Looks only at the training set so it doesn't make assumptions on the testing set
        for j in range(len(trainingX)):
            if trainingY[j] == 1:
                count_spam += 1
                sum_spam += trainingX[j][i]
            else:
                count_notSpam += 1
                sum_nonSpam += trainingX[j][i]

        #Calculate the mean
        mean_spam = sum_spam / count_spam
        mean_nonSpam = sum_nonSpam / count_notSpam

        #Reassign values for the training set
        for j in range(len(trainingX)):
            if abs(trainingX[j][i] - mean_spam) > abs(trainingX[j][i] - mean_nonSpam):
                trainingX[j][i] = 0
            else:
                trainingX[j][i] = 1

        #Reassign values for the testing set using the training set
        for j in range(len(testingX)):
            if abs(testingX[j][i] - mean_spam) > abs(testingX[j][i] - mean_nonSpam):
                testingX[j][i] = 0
            else:
                testingX[j][i] = 1

"""
    Prior is defined as the probability of any sample being in the class.
    We can calculate this by just counting up the number of samples that are spam.
    P(not spam) = (1 - P(spam))

    Params:
        y: the labels for all of our data

    Returns:
        Probability of any sample being spam (the prior)
"""
def prior(Y):

    spam = 0

    for i in range(len(Y)):
        if Y[i] == 1:
            spam += 1

    prior = spam / len(Y)
    return prior

"""
    Likelihood is the probability P(xj | spam/not-spam)
    We can calculate this by looking looking at the number of elements in that class that have the certain feature we are looking for.

    That is: (count of v in c) / (total number of samples in c)

    Params:
        x: our set of all samples X
        y: our set of all labels Y
        j: the column we are looking at
        v: the value we are looking for in the column
        c: what class we are looking at

    Returns:
        P(xj | spam/not-spam)
"""
def likelihood():

    #Memoized so we build the likelihood table as we test (saves time)
    memory = np.zeros(shape=(2, 57), dtype="object")

    def inner(X, Y, j, v, c):

        #Memory is formatted as:
        #memory[class][feature] stores an array [[value, probability], [value, probabliity]] pairs.

        #No values stored for that class and feature pair
        if(memory[c][j] == 0):
            memory[c][j] = []

        index = 0

        #See if value is in the memo
        for i in range(len(memory[c][j])):
            if(memory[c][j][i][0] == v):
                index = i
                break

        #No pairs stored or no matching pair was found
        #we need to calculate likelihood and store
        if(len(memory[c][j]) == 0 or memory[c][j][index][0] != v):

            #Count of occurences of that value
            count = 0

            #Total count of that class
            total = 0

            #Iterate through each sample
            for i in range(len(X)):

                #If the classes are the same update total number of that class
                if Y[i] == c :
                    total +=1

                    #If the values match update our count of value
                    if(X[i][j] == v):
                        count += 1

            likelihood = count / total

            memory[c][j].append([v, likelihood])

            return likelihood


        #Otherwise return the stored value
        return memory[c][j][index][1]

    return inner

"""
    Likelihood is the probability P(xj | spam/not-spam)
    We can calculate this by assuming a gaussian distribution and using the likelihood function for that

    Params:
        x: our set of all samples X
        y: our set of all labels Y
        j: the column we are looking at
        v: the value we are looking for in the column
        c: what class we are looking at

    Returns:
        P(xj | spam/not-spam)
"""
def gauss_likelihood():

    #Memoized so we build the likelihood table as we test (saves time)
    memory = np.zeros(shape=(2, 57), dtype="object")

    def inner(X, Y, j, v, c):

        #Memory is formatted as:
        #memory[class][feature] stores a [sigma, mu] pair.

        #No values stored for that class and feature pair
        if(memory[c][j] == 0):
            memory[c][j] = []

            numC = 0

            total = 0

            #Calculate mean
            for i in range(len(X)):
                if(Y[i] == c):
                    numC += 1
                    total += X[i][j]

            mu = total / numC

            total = 0

            #Calculate sigma
            for i in range(len(X)):
                if(Y[i] == c):
                    total += (X[i][j] - mu)**2

            sigma = (total / (numC - 1))**(1/2)

            memory[c][j] =  [sigma, mu]

        return (1/((2*math.pi)*memory[c][j][0])) * math.exp(-((v - memory[c][j][1])**2) / (2 * (memory[c][j][0] ** 2)))

    return inner

"""
    Posterior is the probability P(Spam | x). We calculate this with Bayes theroem. And by doing a trick of P(Spam | X) / P(!Spam | X) and seeing if it is >= 1.

    Params:
        x: our set of all samples X to train from
        y: our set of all labels Y to train from
        s: the sample we are looking at
        prior: the prior probabliity of a sample being spam
        likelihood_func: the memoized function for calculating likelihood
"""
def posterior(X, Y, s, prior, likelihood_func):

    #Get the probability that it is spam. Multiply by a large number to avoid precision problems
    numerator = (prior)

    #Go through each feature and calculate likelihood, multiplying it
    for j in range(57):
        v = s[j]
        numerator *= likelihood_func(X, Y, j, v, 1)

    #Get the probability that it is not spam
    denominator = (1 - prior)

    #Go through each feature and calculate likelihood, multiplying it
    for j in range(57):
        v = s[j]
        denominator *= likelihood_func(X, Y, j, v, 0)

    #Fix divide by 0 error
    if(denominator == 0):
        if(numerator > 0):
            return 1
        else:
            return 0

    return (numerator) / (denominator)

"""
    Runs a test of the model by training it using the training set, and testing it with the testing set.

    params:
        trainingX: the training set of samples
        trainingY: the labels associated with the training samples
        testingX: the testing set of samples
        testingY: the labels associiated with the testing samples

    returns:
        Array with the format:
        dat[0]: the number of testing samples classified TP
        dat[1]: the number of testing samples classified TN
        dat[2]: the number of testing samples classified FP
        dat[3]: the number of testing samples classified FN
        dat[4]: testing spam classification accuracy
"""
def test(trainingX, trainingY, testingX, testingY, method):

    #Track spam statistics
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    p = prior(trainingY)

    likelihood_func = likelihood()
    if(method == 0):
        likelihood_func = gauss_likelihood()

    #Go through each sample
    for i in range(len(testingX)):
        predict = posterior(trainingX, trainingY, testingX[i], p, likelihood_func)
        #If the prediction is >= it is spam
        if(predict >= 1):

            #If the sample is spam, guessed correctly (TP)
            if(testingY[i] == 1):
                TP += 1
            #If the sample is not spam, guessed wrong (FP)
            else:
                FP += 1
        #If the prediction is < 1 it is not spam
        else:

            #If the sample is spam, guessed incorrectly (FN)
            if(testingY[i] == 1):
                FN += 1
            #If the sample is not spam, guessed correctly (TN)
            else:
                TN += 1

    accuracy = (TN + TP) / (TN + TP + FP + FN)
    dat = [TP, TN, FP, FN, accuracy]

    print("TP:", TP)
    print("TN:", TN)
    print("FP:", FP)
    print("FN:", FN)
    print("Accuracy", accuracy)
    return(dat)

"""
    Runs k-fold cross validation. Holds 1 fold as the the testing set, all rest are used for training.

    Params:
        data: Array containing our X and Y matrices.
        folds: The number of folds to split our data on.
        method: integer to represent what bucketing method we are using:
            <=0: No bucketing method (use raw data after outlier detection)
            1: Using mean bucketing method
            >= 2: The number of buckets specified for simple quantile bucketing

    Returns:
        An array of testing results in the form:
        accuracies[k]: an array containing testing information for the kth fold
            accuracies[k][0]: the number of testing samples classified TP
            accuracies[k][1]: the number of testing samples classified TN
            accuracies[k][2]: the number of testing samples classified FP
            accuracies[k][3]: the number of testing samples classified FN
            accuracies[k][4]: spam classification testing accuracy

"""
def kFold(data, folds, method, transformation):

    #Lower bound of our testing set
    lower = 0

    trainingStatistics = []
    testingStatistics = []

    for k in range(folds):

        print("Fold:", k + 1)

        #Copy our set so the original data isn't modified for other trainings
        X = copy.deepcopy(data[0])
        Y = copy.deepcopy(data[1])

        upper = int((k + 1) * (len(X)/folds))

        #Split training and Testing sets
        testingX = X[lower:upper]
        testingY = Y[lower:upper]
        #Get everything not in our current segment as the training set
        trainingX = np.concatenate([X[:lower], X[upper:]])
        trainingY = np.concatenate([Y[:lower], Y[upper:]])

        if(transformation == 1):
            #Using mean buckets method transform the testing data based on the training data only.
            bucket_closerMean(trainingX, trainingY, testingX, testingY)
        if(transformation >= 2):
            #Using a simple bucket method, bucket training and testing data
            bucketed = bucket(transformation, trainingX, trainingY)
            trainingX = bucketed[0]
            trainingY = bucketed[1]
            bucketed = bucket(transformation, testingX, testingY)
            testingX = bucketed[0]
            testingY = bucketed[1]

        print("Evaluating Training Set:")
        trainingStatistics.append(test(trainingX, trainingY, trainingX, trainingY, method))
        print("Evaluating Testing Set:")
        testingStatistics.append(test(trainingX, trainingY, testingX, testingY, method))
        print()
        lower = upper

    sum = 0
    for i in range(len(trainingStatistics)):
        sum += trainingStatistics[i][4]
    print("Average Training misclassification:", 1 -(sum / len(trainingStatistics)))

    sum = 0
    max = testingStatistics[0][4]
    for i in range(len(testingStatistics)):
        sum += testingStatistics[i][4]

        if(max < testingStatistics[i][4]):
            max = testingStatistics[i][4]

    print("Average Testing Misclassification:", 1 - (sum / len(testingStatistics)))
    print("Min Testing Misclassification:", 1 - max)

    return [trainingStatistics, testingStatistics]

def main():

    #Wants arguments in form: bayes.py filename method transformation
    if(len(sys.argv) != 4):

        print("Program usage:")
        print("  python bayes.py filepath method transformation")
        print()
        print("Options for method:")
        print("  0: assume gaussian distribution")
        print("  1: discrete variable calculation")
        print()
        print("Options for transformation: ")
        print("<=0: no transformation of data")
        print("  1: transform data using a mean-bucketed method")
        print(">=2: transform data using specified number of quantiles")
        print()
        print("ex:\n  \"python bayes.py spambase.data 1 3\"\n  will use 3 buckets and calculate likelihood using a discrete function")

        return

    #Read in data and preprocess it. Replace the parameter with whatever path
    data = readFile(sys.argv[1])

    #Look at the method description above k-fold for usage
    #Use statistics to get information about the testing for each fold
    kFold(data, 10, int(sys.argv[2]), int(sys.argv[3]))

main()