-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbayes.py
More file actions
124 lines (114 loc) · 4.5 KB
/
bayes.py
File metadata and controls
124 lines (114 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Import the required libraries
from pre_processing import normalization as nm
import os
import math
import numpy as np
from prettytable import PrettyTable
from tqdm import tqdm
def get_class_id(path):
    """Map consecutive integer class ids to class names found in *path*.

    Each directory entry is treated as a per-class file; its name with the
    last four characters removed (the ``.txt`` suffix) becomes the class name.

    Args:
        path: Directory whose entries are named ``<class name>.txt``.

    Returns:
        dict mapping ``0 .. n-1`` to class names.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # id -> class mapping reproducible across runs and platforms.
    return {
        class_id: file_name[:-4]  # drop the trailing ".txt"
        for class_id, file_name in enumerate(sorted(os.listdir(path)))
    }
def print_prf_matrix(Dic, precision, recall, f1):
    """Build a table of per-class precision, recall and F1 score.

    Args:
        Dic: Mapping from class id (``0 .. len(Dic)-1``) to class name.
        precision: Per-class precision, indexable by class id.
        recall: Per-class recall, indexable by class id.
        f1: Per-class F1 score, indexable by class id.

    Returns:
        A ``PrettyTable`` with one row per class; the caller prints it.
    """
    report = PrettyTable(['Class', 'Precision', 'Recall', 'F1-score'])
    rows = (
        [Dic[class_id], precision[class_id], recall[class_id], f1[class_id]]
        for class_id in range(len(Dic))
    )
    for row in rows:
        report.add_row(row)
    return report
def class_to_txt(dataChoice):
    """Dump every class directory under ``<dataChoice>/v_train`` as one vector file.

    For each entry of ``<dataChoice>/v_train`` the result of
    ``nm.merge_all_vec_in_sort`` on that entry is written with
    ``nm.dic_to_txt`` to ``<dataChoice>/bayes/v_class/<entry>.txt``.

    Args:
        dataChoice: Root directory of the data set.
    """
    load_dir = dataChoice + '/v_train'
    store_dir = dataChoice + '/bayes/v_class'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(store_dir, exist_ok=True)
    for class_name in os.listdir(load_dir):
        class_vector = nm.merge_all_vec_in_sort(load_dir + '/' + class_name)
        nm.dic_to_txt(class_vector, store_dir + '/' + class_name + '.txt')
def prior_to_txt(dataChoice, Dic, N):
    """Compute the prior of every class and write them to ``bayes/prior.txt``.

    The prior of class ``c`` is ``Nc / N`` where ``Nc`` is the value returned
    by ``nm.get_sum_words`` for that class's vector file (presumably its total
    word count -- confirm against ``pre_processing.normalization``).

    Args:
        dataChoice: Root directory of the data set.
        Dic: Mapping from class id to class name.
        N: Normalising total used as the denominator of every prior.
    """
    class_vec_dir = dataChoice + '/bayes/v_class'
    prior = {
        class_id: nm.get_sum_words(class_vec_dir + '/' + Dic[class_id] + '.txt', 0) / N
        for class_id in Dic
    }
    nm.dic_to_txt(prior, dataChoice + '/bayes/prior.txt')
def condprob_to_npy(dataChoice, Dic, V, lamb):
    """Compute smoothed conditional probabilities and save them as ``.npy``.

    For every class ``c`` and vocabulary term ``t`` the value
    ``(Tct + lamb) / (ck + len(V) * lamb)`` is stored at
    ``condprob[int(t)][c]``, where ``Tct`` is the entry for ``t`` in the
    class vector file (0 when absent) and ``ck`` is the value returned by
    ``nm.get_sum_words`` for that file. The matrix is written to
    ``<dataChoice>/bayes/condprob.npy``.

    Args:
        dataChoice: Root directory of the data set.
        Dic: Mapping from class id to class name; ids index the columns.
        V: Vocabulary whose keys convert with ``int()`` to row indices
            (assumed to cover ``0 .. len(V)-1`` -- confirm with the caller).
        lamb: Additive smoothing constant.
    """
    vocab_size = len(V)
    class_count = len(Dic)
    condprob = [[0] * class_count for _ in range(vocab_size)]
    for c in Dic:
        vc_path = dataChoice + '/bayes/v_class' + '/' + Dic[c] + '.txt'
        ck = nm.get_sum_words(vc_path, 0)
        vc = nm.txt_to_dic(vc_path)
        # The denominator does not depend on the term, so compute it once.
        denominator = ck + vocab_size * lamb
        for t in V:
            # Only a missing term means "count 0"; any other error must surface.
            try:
                term_count = vc[t]
            except KeyError:
                term_count = 0
            condprob[int(t)][c] = (int(term_count) + lamb) / denominator
    np.save(dataChoice + '/bayes/condprob.npy', np.array(condprob))
def naive_bayes_train(dataChoice, lamb):
    """Train the naive Bayes model and persist its parameters to disk.

    Runs three stages in order, printing a progress line after each:
    class vectors (``class_to_txt``), prior probabilities (``prior_to_txt``)
    and conditional probabilities (``condprob_to_npy``).

    Args:
        dataChoice: Root directory of the data set.
        lamb: Additive smoothing constant forwarded to ``condprob_to_npy``.
    """
    class_to_txt(dataChoice)
    print('Finished getting vectors of classes.')

    class_vec_dir = dataChoice + '/bayes/v_class'
    class_names = get_class_id(class_vec_dir)
    total = nm.get_sum_words(class_vec_dir, 0)
    vocabulary = nm.txt_to_dic(dataChoice + '/allDic.txt')

    prior_to_txt(dataChoice, class_names, total)
    print('Finished getting prior probability.')

    condprob_to_npy(dataChoice, class_names, vocabulary, lamb)
    print('Finished getting conditional probability')
def naive_bayes_predict(d, prior, condprob, Dic, V):
    """Predict the class of one document with the trained naive Bayes model.

    The score of class ``c`` is ``log(prior[str(c)])`` plus
    ``log(condprob[int(t)][c])`` for every vocabulary term ``t`` that is a
    key of the document vector loaded from *d*.

    Args:
        d: Path of the document vector file, read with ``nm.txt_to_dic``.
        prior: Mapping from class id (as a string) to prior probability.
        condprob: Matrix indexed as ``condprob[term index][class id]``.
        Dic: Mapping from class id to class name.
        V: Vocabulary; its keys convert with ``int()`` to row indices.

    Returns:
        Position of the best-scoring class in the iteration order of *Dic*
        (equal to the class id when ids are ``0 .. n-1`` in order); the first
        one wins on ties.
    """
    doc_terms = nm.txt_to_dic(d).keys()
    # Which vocabulary terms occur in the document is independent of the
    # class, so resolve it once instead of rescanning V for every class.
    # Iterating V (not the document) keeps the original summation order.
    present_rows = [int(t) for t in V if t in doc_terms]
    scores = []
    for c in Dic:
        class_score = math.log(prior[str(c)])
        for row in present_rows:
            class_score += math.log(condprob[row][c])
        scores.append(class_score)
    return scores.index(max(scores))
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is zero."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def naive_bayes_test(dataChoice, lamb):
    """Train the model, classify the test set and print per-class metrics.

    Every file under ``<dataChoice>/v_test/<class name>`` is classified with
    ``naive_bayes_predict``. Per-class precision, recall and F1 are printed
    as a table, followed by the macro-averaged F1 as a percentage.

    Args:
        dataChoice: Root directory of the data set.
        lamb: Additive smoothing constant used for training.
    """
    print('\r' + '==================== Naive Bayes ====================')
    naive_bayes_train(dataChoice, lamb)
    prior = nm.value_to_float(nm.txt_to_dic(dataChoice + '/bayes/prior.txt'))
    # NOTE(review): allow_pickle=True can execute arbitrary code from a
    # tampered file. The file is written by condprob_to_npy just above; a
    # plain numeric array should load with allow_pickle=False -- confirm and
    # tighten.
    condprob = np.load(dataChoice + '/bayes/condprob.npy', allow_pickle=True).tolist()
    Dic = get_class_id(dataChoice + '/bayes/v_class')
    V = nm.txt_to_dic(dataChoice + '/allDic.txt')
    test_dir = dataChoice + '/v_test'

    class_count = len(Dic)
    true_pos = dict.fromkeys(range(class_count), 0)
    false_neg = dict.fromkeys(range(class_count), 0)
    false_pos = dict.fromkeys(range(class_count), 0)

    for i in tqdm(Dic, desc='Test progress'):  # tqdm renders a progress bar
        print(i+1, ':', 'Test', Dic[i], '...')
        cur_path = test_dir + '/' + Dic[i]
        for file_name in os.listdir(cur_path):
            pc = naive_bayes_predict(cur_path + '/' + file_name, prior, condprob, Dic, V)
            if pc == i:
                true_pos[i] += 1
            else:
                # A miss is a false negative for the true class i and a
                # false positive for the predicted class pc.
                false_neg[i] += 1
                false_pos[pc] += 1

    precision = {}
    recall = {}
    f1 = {}
    for i in range(class_count):
        # A class that was never predicted, or has no test files, would
        # divide by zero; score it 0.0 instead of aborting the evaluation.
        precision[i] = _safe_ratio(true_pos[i], true_pos[i] + false_pos[i])
        recall[i] = _safe_ratio(true_pos[i], true_pos[i] + false_neg[i])
        f1[i] = _safe_ratio(2 * precision[i] * recall[i], precision[i] + recall[i])

    print('Lambda: ', lamb)
    print(print_prf_matrix(Dic, precision, recall, f1))
    print('Macro-F1: ', _safe_ratio(sum(f1.values()), class_count) * 100, '%')