forked from liuhuanyong/SinglepassTextCluster
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsinglepass_cluster_tfidf.py
More file actions
68 lines (62 loc) · 2.83 KB
/
singlepass_cluster_tfidf.py
File metadata and controls
68 lines (62 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# coding: utf-8
# File: singlepass_cluster_tfidf.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 21-09-04
import numpy as np
from gensim import corpora, models, matutils
class SingelPassClusterTfidf():
'''
1.利用tfidf vec计算cossim
'''
def tfidf_vec(self, corpus, pivot=10, slope=0.25):
dictionary = corpora.Dictionary(corpus) # 形成词典映射
self.dict_size = len(dictionary)
print('dictionary size:{}'.format(len(dictionary)))
corpus = [dictionary.doc2bow(text) for text in corpus] # 词的向量表示
tfidf = models.TfidfModel(corpus, pivot=pivot, slope=slope)
corpus_tfidf = tfidf[corpus]
return corpus_tfidf
def get_max_similarity(self, cluster_cores, vector):
max_value = 0
max_index = -1
for k, core in cluster_cores.items():
similarity = matutils.cossim(vector, core)
if similarity > max_value:
max_value = similarity
max_index = k
return max_index, max_value
def single_pass(self, corpus_vec, corpus, theta):
clusters = {}
cluster_cores = {}
cluster_text = {}
num_topic = 0
cnt = 0
for vector, text in zip(corpus_vec, corpus):
if num_topic == 0:
clusters.setdefault(num_topic, []).append(vector)
cluster_cores[num_topic] = vector
cluster_text.setdefault(num_topic, []).append(text)
num_topic += 1
else:
max_index, max_value = self.get_max_similarity(cluster_cores, vector)
if max_value > theta:
clusters[max_index].append(vector)
text_matrix = matutils.corpus2dense(clusters[max_index], num_terms=self.dict_size,
num_docs=len(clusters[max_index])).T # 稀疏转稠密
core = np.mean(text_matrix, axis=0) # 更新簇中心
core = matutils.any2sparse(core) # 将稠密向量core转为稀疏向量
cluster_cores[max_index] = core
cluster_text[max_index].append(text)
else: # 创建一个新簇
clusters.setdefault(num_topic, []).append(vector)
cluster_cores[num_topic] = vector
cluster_text.setdefault(num_topic, []).append(text)
num_topic += 1
cnt += 1
if cnt % 100 == 0:
print('processing {}...'.format(cnt))
return clusters, cluster_text
def fit_transform(self, corpus, raw_data, theta=0.6):
tfidf_vec = self.tfidf_vec(corpus) # tfidf_vec是稀疏向量
clusters, cluster_text = self.single_pass(tfidf_vec, raw_data, theta)
return clusters, cluster_text