-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbayes_classifier.py
More file actions
100 lines (75 loc) · 3.07 KB
/
bayes_classifier.py
File metadata and controls
100 lines (75 loc) · 3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import libGeneral
import sqlite3
import math
# ----------------------------------------------------------------------
# Module-level configuration and shared state (evaluated at import time).
# ----------------------------------------------------------------------
DATABASE_NAME = "ex4.db"       # SQLite database file opened just below
TEST_PATH = "u4_test/"         # not referenced in the visible part of this file
GLOBAL_INDEX = "globalIndex"   # not referenced in the visible part of this file
RELPROB_PATH = "bayes_model/"  # prefix of the per-class model files read when classifying

# A single connection shared by every function in this module.
connection = sqlite3.connect(DATABASE_NAME)

# Both values are obtained from libGeneral using the open connection, so they
# must be computed after `connection` exists.
classes = libGeneral.determineClasses(connection)
aPrioriDictionary = libGeneral.calculateAPrioriDictionary(connection)
# ----------------------------------------------------------------------
def classify(wordVector):
    """Classify a word vector and return the selected class with its score.

    wordVector is expected in the format
        key:   word
        value: frequency

    Every class name in the module-level ``classes`` is scored with
    classifyWordVectorForClass(), using that class's entry in
    ``aPrioriDictionary`` as the a-priori probability. The scores are then
    passed through libGeneral.normalizeDictionary() and the entry with the
    largest normalized value is picked.

    Returns a two-element list ``[className, value]``, e.g.
    ``["Mystery", 0.5]``: "Mystery" is the selected class with a normalized
    value of 0.5 (read as 50%, presuming normalizeDictionary scales the
    values to sum to 1 -- TODO confirm in libGeneral).

    NOTE(review): classifyWordVectorForClass() accumulates *negated*
    logarithms, so for probabilities below 1 a larger raw score corresponds
    to a less likely class. Confirm that taking the maximum after
    normalization is the intended selection rule.
    """
    probs = {}
    for className in classes:
        probs[className] = classifyWordVectorForClass(
            wordVector, className, aPrioriDictionary[className])

    # Normalize the per-class scores before picking the maximum.
    probs = libGeneral.normalizeDictionary(probs)
    maxKey = libGeneral.getKeyFromMaxValueFromDictionary(probs)
    maxValue = libGeneral.getMaxValueFromDictionary(probs)
    return [maxKey, maxValue]
def classifyWordVectorForClass(testWordVector,className,classProb):
    """Score a test word vector against the stored model of one class.

    testWordVector -- mapping word -> frequency of the document to classify
    className      -- class whose model is loaded from RELPROB_PATH + className
    classProb      -- a-priori probability of that class

    For every word of testWordVector that also occurs in the stored model the
    score is decreased by

        log(p(word|class) ** frequency * classProb)

    which is evaluated in the log domain as
    ``frequency * log(p) + log(classProb)``. Doing so avoids the floating
    point underflow of ``p ** frequency`` that would otherwise turn the
    product into 0.0 and make the word drop out of the score.

    Words whose values cannot be evaluated (non-numeric entry, probability
    <= 0, ...) are skipped, keeping the scoring best-effort. Returns the
    accumulated negated log score as a float; 0.0 when no word contributed or
    when classProb itself is unusable.
    """
    trainedVector = libGeneral.readDictionaryFromDisk(RELPROB_PATH + className)

    try:
        logPrior = math.log(float(classProb))
    except (TypeError, ValueError, OverflowError):
        # Without a usable prior no term can be evaluated, so nothing would
        # ever be added to the score.
        return 0.0

    logScore = 0.0
    for key in testWordVector:
        # Membership is tested on the mapping itself (no intermediate key list).
        if key not in trainedVector:
            continue
        try:
            wordProb = float(trainedVector[key])
            frequency = float(testWordVector[key])
            # -log(p(d|c) * p(c)) contribution of this word
            logScore -= frequency * math.log(wordProb) + logPrior
        except (TypeError, ValueError, OverflowError):
            # Best effort: ignore words with unusable values, but let
            # KeyboardInterrupt / SystemExit and real bugs propagate.
            continue
    return logScore
def calculateModel(TFIDF):
    """Build and store one word model per class from the training data.

    TFIDF -- when truthy, the summed class vector is passed through
             libGeneral.calculateTfIdfVector() before it is normalized.

    For every distinct CLASS in table TRAINING, the WORD_VECTOR of all rows
    of that class with FOR_TESTING = 0 are parsed and merged into a single
    dictionary. That dictionary is normalized with
    libGeneral.normalizeDictionary() and written to RELPROB_PATH + class
    name, which is the location classifyWordVectorForClass() reads from.

    Returns None. Side effects: one model file per class and one progress
    line per class on stdout.
    """
    classCursor = connection.cursor()
    classCursor.execute("SELECT DISTINCT CLASS FROM TRAINING;")

    for classRow in classCursor.fetchall():
        classname = classRow[0]

        vectorCursor = connection.cursor()
        sql = "SELECT WORD_VECTOR FROM TRAINING WHERE CLASS = ? AND FOR_TESTING = 0;"
        vectorCursor.execute(sql, [classname])

        classDictionary = {}
        for vectorRow in vectorCursor.fetchall():
            currentWordVector = libGeneral.makeDictionaryFromString(vectorRow[0])
            for key in currentWordVector:
                # First occurrence stores the parsed value unchanged, later
                # ones are combined with +=, so the value type produced by
                # makeDictionaryFromString is preserved.
                if key in classDictionary:
                    classDictionary[key] += currentWordVector[key]
                else:
                    classDictionary[key] = currentWordVector[key]

        if TFIDF:
            classDictionary = libGeneral.calculateTfIdfVector(classDictionary, connection, classname)
        classDictionary = libGeneral.normalizeDictionary(classDictionary)

        # Write to the same prefix the classifier reads its models from.
        libGeneral.writeDictionaryToDisk(classDictionary, RELPROB_PATH + classname)
        # Single parenthesized argument: same output on Python 2 and Python 3.
        print("Model: " + classname + " ::: DONE!")