forked from liuhuanyong/SinglepassTextCluster
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdoc_vector.py
More file actions
42 lines (35 loc) · 1.39 KB
/
doc_vector.py
File metadata and controls
42 lines (35 loc) · 1.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# coding: utf-8
# File: doc_vector.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 21-09-04
import jieba.posseg as pseg
import os
import gensim, logging
# Configure the root logger at import time: timestamped records at INFO level
# and above. NOTE(review): presumably enabled so gensim's loading progress is
# visible -- confirm; importing this module changes global logging config.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import numpy as np
class Doc2vec:
    """Sentence vectors built by average-pooling per-character embeddings.

    The embeddings are loaded from a word2vec-format file via gensim. A
    sentence is represented as the mean of the vectors of its characters,
    and two sentences are compared with cosine similarity.
    """

    # Dimensionality used when the loaded model does not expose
    # ``vector_size``; 200 is the value the vectors were assumed to have.
    _DEFAULT_DIM = 200

    def __init__(self, embedding_path=None):
        """Load the embedding model.

        Args:
            embedding_path: Optional path to a word2vec-format vector file.
                Defaults to ``model/token_vector.bin`` next to this module.
        """
        if embedding_path is None:
            # os.path.dirname is separator-agnostic, unlike splitting on '/'.
            base_path = os.path.dirname(os.path.abspath(__file__))
            embedding_path = os.path.join(base_path, 'model', 'token_vector.bin')
        self.embedding_path = embedding_path
        # The file is read in word2vec *text* format (binary=False) despite
        # its ``.bin`` extension.
        self.model = gensim.models.KeyedVectors.load_word2vec_format(self.embedding_path, binary=False)

    def _vector_size(self):
        """Return the embedding dimensionality of the loaded model."""
        return getattr(self.model, 'vector_size', self._DEFAULT_DIM)

    def get_wordvector(self, word):
        """Return the embedding for ``word``.

        Args:
            word: Token (here, a single character) to look up.

        Returns:
            The model's vector for ``word``, or an all-zero vector when the
            token is not in the vocabulary.
        """
        try:
            return self.model[word]
        except KeyError:
            # Out-of-vocabulary token: contribute nothing to the average.
            return np.zeros(self._vector_size())

    def get_sentvector(self, text):
        """Vectorize a sentence by average pooling over its characters.

        Args:
            text: The sentence; it is iterated element by element, so a
                string is split into individual characters.

        Returns:
            The mean of the character vectors, or an all-zero vector when
            ``text`` is empty (instead of NaNs from a division by zero).
        """
        chars = list(text)
        sent_vector = np.zeros(self._vector_size())
        if not chars:
            return sent_vector
        for char in chars:
            sent_vector += self.get_wordvector(char)
        return sent_vector / len(chars)

    def similarity_cosine(self, vector1, vector2):
        """Compute the cosine similarity between two sentence vectors.

        Args:
            vector1: First vector.
            vector2: Second vector, same shape as ``vector1``.

        Returns:
            The cosine of the angle between the vectors as a float, or
            ``0.0`` when either vector has zero norm (the cosine is
            undefined there and would otherwise come out as NaN).
        """
        v1 = np.asarray(vector1, dtype=float)
        v2 = np.asarray(vector2, dtype=float)
        dot = np.sum(v1 * v2)
        norm_product = np.sqrt(np.sum(v1 ** 2)) * np.sqrt(np.sum(v2 ** 2))
        if norm_product == 0:
            return 0.0
        return float(dot / norm_product)