ML_test/perceptron.py at main · 1NormalGuy/ML_test · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import numpy as np
from pre_processing import normalization as nm
import os
from prettytable import PrettyTable
import math
from tqdm import tqdm

# Collect the vectors of all training documents
def get_list_of_vectors():
    """Load every document found under ``dataof2/v_train``.

    Each entry of ``dataof2/v_train`` is treated as a directory, and every
    file inside it is parsed with ``nm.txt_to_dic``.

    Returns:
        A list holding one ``nm.txt_to_dic`` result per file, in
        ``os.listdir`` order.
    """
    root = 'dataof2/v_train'
    vectors = []
    for class_dir in os.listdir(root):
        class_path = os.path.join(root, class_dir)
        vectors.extend(
            nm.txt_to_dic(os.path.join(class_path, fname))
            for fname in os.listdir(class_path)
        )
    return vectors


# Compute the inverse document frequency (IDF)
def get_idf():
    """Compute a smoothed, base-10 IDF value for every vocabulary entry.

    The documents come from ``get_list_of_vectors()`` and the vocabulary is
    read from ``data_2/allDic.txt``.  For each vocabulary key the value is
    ``log10(N / (df + 1))`` where ``N`` is the number of documents and ``df``
    the number of documents containing that key.

    Returns:
        A dict mapping each vocabulary key to its IDF value.
    """
    documents = get_list_of_vectors()
    vocabulary = nm.txt_to_dic('data_2/allDic.txt')
    doc_count = len(documents)
    idf = {}
    for term in vocabulary:
        # Document frequency: how many documents mention this term.
        containing = sum(1 for doc in documents if term in doc)
        # The +1 keeps the denominator non-zero for unseen terms.
        idf[term] = math.log(float(doc_count) / (containing + 1), 10)
    return idf

# Build the confusion matrix table (the caller prints it)
def print_confusion_matrix(categories, tp, tn, fp, fn):
    """Return a ``PrettyTable`` laying out a 2x2 confusion matrix.

    Rows are the actual class, columns the predicted class.  The first
    row/column belongs to ``categories[0]`` and the second to
    ``categories[1]``, giving the layout::

        categories[0] | tn | fp
        categories[1] | fn | tp

    Nothing is printed here; the table object is returned.
    """
    first, second = categories[0], categories[1]
    matrix = PrettyTable(['Actual \\ Predict', first, second])
    for row in ([first, tn, fp], [second, fn, tp]):
        matrix.add_row(row)
    return matrix

# Build the vector of one document
def get_d_array(path, vAll):
    """Turn the document stored at ``path`` into a dense NumPy vector.

    Args:
        path: File parsed with ``nm.txt_to_dic``.
        vAll: Vocabulary; its length fixes the vector size and its keys are
            the candidates looked up in the document.

    Returns:
        A float array of length ``len(vAll)``.  For every vocabulary key that
        also appears in the document, position ``int(key)`` holds
        ``int(document[key])``; every other position stays 0.
    """
    counts = nm.txt_to_dic(path)
    vector = np.zeros(len(vAll))
    for key in vAll:
        if key not in counts.keys():
            continue
        # Keys double as positions in the vector.
        vector[int(key)] = int(counts[key])
    return vector

# Load the document vectors and labels of one data split
def get_data_and_labels(dataChoice, vAll, categories, subset='v_test'):
    """Load document vectors and their +1/-1 labels from one data split.

    Args:
        dataChoice: Root directory of the data set.
        vAll: Vocabulary handed through to ``get_d_array``.
        categories: Two class directory names.  Documents under
            ``categories[0]`` are labelled ``1`` and documents under
            ``categories[1]`` are labelled ``-1``.
        subset: Sub-directory of ``dataChoice`` to read.  Defaults to
            ``'v_test'`` so existing callers keep their behaviour.

    Returns:
        ``(data, labels)``: two parallel lists with one document vector and
        one label per file.  Directories that match neither category are
        ignored.
    """
    path = dataChoice + '/' + subset
    # Insert the positive class last so it wins if both names are equal,
    # matching the precedence of an if/elif chain.
    label_of = {categories[1]: -1, categories[0]: 1}
    data = []
    labels = []
    for class_dir in os.listdir(path):
        label = label_of.get(class_dir)
        if label is None:
            continue
        class_path = os.path.join(path, class_dir)
        for fname in os.listdir(class_path):
            data.append(get_d_array(os.path.join(class_path, fname), vAll))
            labels.append(label)
    return data, labels


# Train the perceptron model
def perceptron_train(dataChoice, categories, max_iter, a):
    """Fit a perceptron on the training split of ``dataChoice``.

    The vocabulary is read from ``dataChoice/allDic.txt`` and the training
    documents from ``dataChoice/v_train``.  Weights start at zero; whenever a
    sample satisfies ``y * (x . w + b) <= 0`` the model is updated with
    ``w += a * y * x`` and ``b += a * y``.

    Args:
        dataChoice: Root directory of the data set.
        categories: Two class names; ``categories[0]`` is the ``+1`` class and
            ``categories[1]`` the ``-1`` class.
        max_iter: Maximum number of passes over the training data.
        a: Learning rate.

    Returns:
        ``(w, b, vAll)``: the weight vector, the bias and the vocabulary.
    """
    vAll = nm.txt_to_dic(dataChoice + '/allDic.txt')
    # Fit on the training split so that the 'v_test' documents scored by
    # perceptron_test remain unseen during training.
    data, labels = get_data_and_labels(
        dataChoice, vAll, categories, subset='v_train')
    w = np.zeros(len(vAll))
    b = 0
    for _ in range(max_iter):
        updated = False
        for x, y in zip(data, labels):
            if y * (np.dot(x, w) + b) <= 0:
                w = w + a * y * x
                b += a * y
                updated = True
        if not updated:
            # A full pass without mistakes cannot change w or b in any later
            # pass either, so the remaining iterations are skipped.
            break
    return w, b, vAll

# Predict the class of a document
def perceptron_predict(w, b, d):
    """Return the sign of the perceptron activation ``d . w + b``.

    Args:
        w: Weight vector.
        b: Bias.
        d: Document vector.

    Returns:
        The ``np.sign`` of the activation: ``1`` or ``-1``, or ``0`` when the
        activation is exactly zero.
    """
    activation = np.dot(d, w) + b
    return np.sign(activation)

# Turn a ratio into a percentage without dividing by zero
def _percentage(numerator, denominator):
    """Return ``numerator / denominator * 100``, or ``0.0`` if the denominator is 0."""
    if denominator == 0:
        return 0.0
    return float(numerator) / denominator * 100


# Evaluate the perceptron model
def perceptron_test(dataChoice, categories, max_iter, a):
    """Train a perceptron and print its scores on ``dataChoice/v_test``.

    ``categories[0]`` is the positive (``+1``) class and ``categories[1]`` the
    negative (``-1``) class.  Every file in the two matching test directories
    is classified exactly once; directories with any other name are skipped.
    A prediction of ``0`` counts as a mistake for either class.

    Prints the hyper-parameters, the confusion matrix, precision, recall and
    F1-score.  A metric whose denominator is zero is reported as ``0.0``.

    Args:
        dataChoice: Root directory of the data set.
        categories: The two class directory names.
        max_iter: Number of training iterations passed to ``perceptron_train``.
        a: Learning rate passed to ``perceptron_train``.
    """
    print('\r'+'==================== Perceptron ====================')
    w, b, vAll = perceptron_train(dataChoice, categories, max_iter, a)
    tp = tn = fp = fn = 0
    classDic = {1: categories[0], -1: categories[1]}
    test_root = dataChoice + '/v_test'

    # Gather (file path, expected label) pairs for the two classes only.
    samples = []
    for cname in os.listdir(test_root):
        if cname == classDic[1]:
            expected = 1
        elif cname == classDic[-1]:
            expected = -1
        else:
            continue
        dir_path = os.path.join(test_root, cname)
        samples.extend(
            (os.path.join(dir_path, fname), expected)
            for fname in os.listdir(dir_path)
        )

    # One pass over the test set: the model is already trained, so scoring
    # the same files again would only multiply the counts.
    for doc_path, expected in tqdm(samples, desc='Testing progress'):
        result = perceptron_predict(w, b, get_d_array(doc_path, vAll))
        if expected == 1:
            if result == 1:
                tp += 1
            else:
                fn += 1
        else:
            if result == -1:
                tn += 1
            else:
                fp += 1

    print('\r')
    print('Iteration Times: ', max_iter)
    print('Learning Rate: ', a)
    print("Confusion matrix: ")
    print(print_confusion_matrix([classDic[-1], classDic[1]], tp, tn, fp, fn))
    print("Precision: ", _percentage(tp, tp + fp), "%")
    print("Recall: ", _percentage(tp, tp + fn), "%")
    print('F1-score: ', _percentage(2*tp, 2*tp + fp + fn), "%")