utils.py
import copy
import numpy as np
import pandas as pd
import scipy.io as si
import matplotlib.pyplot as plt
from sklearn.metrics.cluster import normalized_mutual_info_score as NMI
from sklearn.metrics.cluster import adjusted_rand_score as ARI
from random import random
import joblib
from coclust.clustering.spherical_kmeans import SphericalKmeans
from Cluster_Ensembles import CSPA, HGPA, MCLA
from cluster_ensembles_sample import cluster_ensembles

def sparsity(mat):
    """Fraction of zero entries in ``mat``, rounded to 3 decimals."""
    return np.round(1 - (np.count_nonzero(mat) / float(mat.size)), 3)

def balance(class_):
    """Ratio between the smallest and the largest class size (1.0 = perfectly balanced)."""
    element_level = []
    for i in np.unique(class_):
        element_level.append(list(class_).count(i))
    min_ = min(element_level)
    max_ = max(element_level)
    return np.round(min_ / max_, 3)

def create_data_table(data: dict):
    """Build a summary table (documents, terms, sparsity, classes, balance) for each dataset."""
    nbr_doc = []
    nbr_terms = []
    sparsities = []
    cl = []
    blces = []
    dataset_name = [*data.keys()]
    for name, d in data.items():
        nbr_doc.append(d[0].shape[0])
        nbr_terms.append(d[0].shape[1])
        sparsities.append(sparsity(d[0]))
        cl.append(len(np.unique(d[1])))
        blces.append(balance(d[1]))
    table_dict = {"dataset": dataset_name, "# documents": nbr_doc, "# terms": nbr_terms,
                  "# classes": cl, "sparsity": sparsities, "smallest/largest class": blces}
    table_frame = pd.DataFrame(table_dict)
    return table_frame

def load_data(data_names, dir_path='.'):
    """Load each ``<name>.mat`` file and return {name: [doc-term matrix, labels, features, label names]}."""
    data = {}
    for db in data_names:
        temp = si.loadmat(f'{dir_path}/{db}.mat')
        data[db] = [temp['mat'].toarray(), temp['labels'][0], temp['fea'], temp['label_names']]
    return data

def find_partitions(data, algo,minimized,method="CI", n=5, runs=15):
results = []
for _ in range(runs):
copy_algo = algo
copy_algo.fit(data)
results.append(copy_algo)
# joblib.dump(results, f'./{name}.joblib')
if method == "CI":
vals = [part.criterion for part in results]
elif method == "CM":
vals = [part.modularity for part in results]
else:
vals = [random() for _ in results]
if minimized:
ind = np.array(vals).argsort()[:n]
else:
ind = np.array(vals).argsort()[-n:]
best_part = np.take(results, ind)
row_labels = np.array([item.row_labels_ for item in best_part])
return row_labels
def co_association(label):
    """Co-association matrix of one partition: entry (i, j) is 1 if points i and j share a cluster."""
    return (label == label[:, np.newaxis]) * 1

def total_association(labels):
    """Sum the co-association matrices of several partitions (evidence-accumulation consensus matrix)."""
    return np.sum([co_association(label) for label in labels], axis=0)

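# Illustrative sketch (not part of the original module): with two partitions of four
# points, total_association counts how often each pair of points is clustered together:
#   total_association(np.array([[0, 0, 1, 1],
#                                [0, 1, 1, 1]]))
# yields a 4x4 matrix whose (2, 3) entry is 2 (always together), whose (0, 1) entry
# is 1 (together in one run), and whose (0, 2) entry is 0 (never together).
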
def find_asso_nmi_ari(algo, pred_lab, reel_lab):
    """Cluster the total association matrix with ``algo`` and score it against the true labels."""
    association = total_association(pred_lab)
    algo.fit(association)
    asso_result = [NMI(reel_lab, algo.row_labels_), ARI(reel_lab, algo.row_labels_)]
    return asso_result

def evalute_(label_true, pred_labels):
    """Return NMI and ARI scores of each predicted partition against the true labels."""
    nmi_ = {}
    ari_ = {}
    for i, label in enumerate(pred_labels):
        nmi_[f'best{i+1}'] = NMI(label_true, label)
        ari_[f'best{i+1}'] = ARI(label_true, label)
    return nmi_, ari_

def sk_best_partitions(data, k, n=5, runs=20):
    """Run SphericalKmeans ``runs`` times and return the ``n`` fitted models with the highest criterion."""
    partitions = []
    for i in range(runs):
        sk = SphericalKmeans(k)
        sk.fit(data)
        partitions.append(sk)
    best_index = [part.criterion for part in partitions]
    ind = np.array(best_index).argsort()[-n:]  # indices of the n largest criteria
    best_part = np.take(partitions, ind)
    row_lab = np.array([item.labels_ for item in best_part])
    return best_part, row_lab

### DEPRECATED
# def coc_best_partitions(data,n_row,n_col,n=5, runs=20):
# partitions = []
# for i in range(runs):
# coc = CoclustInfo(n_row_clusters=n_row, n_col_clusters=n_col)
# coc.fit(data)
# partitions.append(coc)
# best_index = [ part.criterion for part in partitions]
# ind = np.array(best_index).argsort()[:n]
# best_part = np.take(partitions, ind)
# row_lab = np.array([item.row_labels_ for item in best_part])
# return best_part, row_lab
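
# Illustrative usage sketch (an assumption, not part of the original module): the dataset
# name 'classic3' and the cluster count are hypothetical; any .mat file with the fields
# expected by load_data would work the same way.
if __name__ == "__main__":
    data = load_data(['classic3'], dir_path='.')       # {name: [matrix, labels, features, label names]}
    mat, labels = data['classic3'][0], data['classic3'][1]
    print(create_data_table(data))                      # summary of the loaded dataset(s)
    best_models, best_labels = sk_best_partitions(mat, k=3, n=5, runs=20)
    consensus = total_association(best_labels)          # evidence-accumulation matrix of the best runs
    print(evalute_(labels, best_labels))                # per-run NMI and ARI scores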