-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
116 lines (106 loc) · 3.95 KB
/
main.py
File metadata and controls
116 lines (106 loc) · 3.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import tkinter
import prepare
import scraper
# Per-tag denominators behind the fallback values in MINIMAL_PROB.
# NOTE(review): presumably the total word count of each tag's training
# corpus -- confirm against whatever produced the <tag>.csv files.
_FALLBACK_DENOMINATORS = {
    'teen_f': 26694629,
    'teen_m': 26829390,
    'adult_f': 40553782,
    'adult_m': 39002136,
    'mature_f': 16526313,
    'mature_m': 15941756,
}

# Class labels the classifier can emit, in evaluation order.
TAGS = list(_FALLBACK_DENOMINATORS)

# Multiplier applied by analyze() when a word is missing from a tag's
# vocabulary, so that an unseen word does not zero out the whole product.
MINIMAL_PROB = {
    tag: 1 / denominator
    for tag, denominator in _FALLBACK_DENOMINATORS.items()
}
class MainWin:
    """Tkinter front end for the naive Bayes blog classifier.

    The window holds an instruction line (row 0), a "userid" entry with a
    start button (row 1), and, from row 2 downwards, the output of the most
    recent classification run.

    Attributes:
        master: the Tk container all widgets are created in.
        userid: the user id captured by the last call to ``update``.
        entry: the ``tkinter.Entry`` the user types the id into.
        start_button: the button that triggers ``gen_tag``.
    """

    def __init__(self, master):
        """Build the static widgets inside *master* and lay them out."""
        self.master = master
        self.master.title("Naive bayes blog classifier")
        self.userid = ""
        # Labels created by the last gen_tag() run; they are destroyed at the
        # start of the next run so that stale output never shows through.
        self._output_widgets = []

        # NOTE(review): `text` is not a documented Entry option; it is kept
        # exactly as written -- confirm whether a placeholder was intended.
        self.entry = tkinter.Entry(self.master, text="enter a medium userID")
        self.start_button = tkinter.Button(
            self.master, text="start", command=self.gen_tag)
        instruction = tkinter.Label(
            master, text="enter a medium userid, then press start button")
        label1 = tkinter.Label(self.master, text="userid: ")

        instruction.grid(row=0, columnspan=3)
        label1.grid(row=1, column=0)
        self.entry.grid(row=1, column=1)
        self.start_button.grid(row=1, column=2)

    def update(self):
        """Copy the entry text into ``self.userid``.

        Surrounding whitespace is dropped, so a blank or whitespace-only
        entry counts as "nothing entered".

        Returns:
            True if a non-empty user id was captured, False otherwise (in
            which case ``self.userid`` is reset to the empty string).
        """
        self.userid = self.entry.get().strip()
        return bool(self.userid)

    @staticmethod
    def _most_common_tag(results):
        """Return the tag occurring most often in *results*.

        Ties go to the tag that appears first in *results*. Returns None
        when *results* is empty instead of raising.
        """
        score = {}
        for tag in results:
            score[tag] = score.get(tag, 0) + 1
        if not score:
            return None
        return max(score, key=score.get)

    def _clear_output(self):
        """Destroy every label left over from the previous run."""
        for widget in self._output_widgets:
            widget.destroy()
        self._output_widgets = []

    def _add_output_label(self, text, **grid_options):
        """Create a label showing *text*, grid it, and track it for cleanup."""
        label = tkinter.Label(self.master, text=text)
        label.grid(**grid_options)
        self._output_widgets.append(label)

    def gen_tag(self):
        """Classify the entered user's posts and display the guesses.

        Calls the module-level ``analyze`` with the entered user id, shows
        one "guessN" row per returned tag, then a final row with the tag
        that was guessed most often. Each guess is also printed to stdout.

        Returns:
            True when at least one guess was produced and displayed; False
            when no user id was entered or ``analyze`` returned nothing.
        """
        self._clear_output()

        if not self.update():
            self._add_output_label(
                "error occured,please try again", row=2, columnspan=3)
            return False

        results = analyze(self.userid)
        print("{0} guesses generated from the recent {1} posts of {2}".format(
            len(results), len(results), self.userid))

        # There may be fewer than ten posts, so show whatever came back.
        for i, tag in enumerate(results):
            print("post number: {0}, tag generated:{1}".format(i, tag))
            self._add_output_label("guess" + str(i), row=i + 2, column=0)
            self._add_output_label(tag, row=i + 2, column=1)

        winner = self._most_common_tag(results)
        if winner is None:
            # Nothing was classified; report it rather than crash in max().
            self._add_output_label(
                "no posts could be classified for this userid",
                row=2, columnspan=3)
            return False

        # Pin both summary labels to the same row: without an explicit row
        # each grid() call would claim a fresh row of its own.
        summary_row = len(results) + 2
        self._add_output_label("most likey: ", row=summary_row, column=0)
        self._add_output_label(winner, row=summary_row, column=1)
        return True
def analyze(userid):
    """Guess a tag for every saved post of *userid*.

    Steps:
      1. ``scraper.get_rec10(userid)`` is called first (per the original
         comment it saves the user's 10 most recent posts under
         ``./<userid>/``).
      2. For each tag in ``TAGS`` the file ``<tag>.csv`` is read from the
         current working directory; every line with exactly two
         comma-separated fields is taken as ``word,value``.
      3. Every ``.txt`` file found under the directory *userid* is passed to
         ``prepare.prepare`` to obtain its word list (the original comment
         says the 30 most used words). For each tag the per-word values are
         multiplied together, using ``MINIMAL_PROB[tag]`` for words missing
         from that tag's vocabulary; the tag with the largest product wins.

    Args:
        userid: user id; also the name of the directory that is scanned.

    Returns:
        A list with one tag name per ``.txt`` file, in ``os.walk`` order.
        Empty when no ``.txt`` file is found.

    Raises:
        OSError: if a vocabulary file cannot be opened.
        ValueError: if the second field of a two-field vocabulary line is
            not a number.
    """
    scraper.get_rec10(userid)

    # Load one word -> value table per tag.
    tag_vocab = {}
    for tag in TAGS:
        vocab = {}
        # `with` guarantees the file is closed even if a line fails to parse.
        with open(tag + '.csv') as vocab_file:
            for line in vocab_file:
                fields = line.rstrip('\n').split(',')
                if len(fields) != 2:
                    continue
                word, freq = fields
                vocab[word] = float(freq)
        tag_vocab[tag] = vocab

    results = []
    for root, _dirs, files in os.walk(userid):
        for filename in files:
            if filename.split('.')[-1] != 'txt':
                continue
            # Join with `root`, not `userid`, so that a file sitting in a
            # subdirectory resolves to its real location.
            hot_words = prepare.prepare(os.path.join(root, filename))

            probability = {}
            for tag in TAGS:
                vocab = tag_vocab[tag]
                fallback = MINIMAL_PROB[tag]
                likelihood = 1.0
                for word in hot_words:
                    likelihood *= vocab.get(word, fallback)
                probability[tag] = likelihood
            results.append(max(probability, key=probability.get))
    return results
# Script entry: create the Tk root window, build the classifier UI inside it,
# then hand control to Tk's event loop.
# NOTE(review): these statements run on import as well as when the file is
# executed directly; consider an `if __name__ == "__main__":` guard if this
# module is ever imported from elsewhere.
GUI_ROOT = tkinter.Tk()
CLASSIFIER_GUI = MainWin(GUI_ROOT)
GUI_ROOT.mainloop()