-
Notifications
You must be signed in to change notification settings - Fork 206
Expand file tree
/
Copy pathtf_idf_sklearn.py
More file actions
122 lines (107 loc) · 5.01 KB
/
tf_idf_sklearn.py
File metadata and controls
122 lines (107 loc) · 5.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -*- coding:utf-8 -*-
'''
-------------------------------------------------
Description : tf-idf-sklearn
Author : machinelp
Date : 2020-06-03
-------------------------------------------------
'''
import os
import jieba
import pickle
import numpy as np
from .stop_words import StopWords
from textmatch.config.config import cfg
from textmatch.utils.logging import logging
from gensim import corpora, models, similarities
from textmatch.config.constant import Constant as const
from textmatch.models.model_base.model_base import ModelBase
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
class TfIdf(ModelBase):
    '''TF-IDF embedding model backed by sklearn's CountVectorizer + TfidfTransformer.

    Builds (or loads from pickle) a fitted vectorizer/transformer pair,
    pre-computes normalized TF-IDF vectors for a reference corpus, and scores
    new queries against that corpus by dot product in `predict`.
    '''

    def __init__(self,
                 dic_path=const.TFIDF_DIC_PATH,
                 tfidf_model_path=const.TFIDF_MODEL_PATH,
                 tfidf_index_path=const.TFIDF_INDEX_PATH,
                 stop_word=StopWords):
        '''
        Args:
            dic_path: pickle path for the fitted CountVectorizer (vocabulary).
            tfidf_model_path: pickle path for the fitted TfidfTransformer.
            tfidf_index_path: reserved path; kept for interface compatibility
                (never read or written inside this class).
            stop_word: stop-word helper class; instantiated here.
        '''
        self.dic_path = dic_path
        self.tfidf_model_path = tfidf_model_path
        self.tfidf_index_path = tfidf_index_path
        # Ensure each parent directory exists before we later pickle into it.
        # Guard against a path with no directory component ('' would crash makedirs).
        for per_path in (self.dic_path, self.tfidf_model_path, self.tfidf_index_path):
            parent_dir = '/'.join(per_path.split('/')[:-1])
            if parent_dir and not os.path.exists(parent_dir):
                os.makedirs(parent_dir)
        self.stop_word = stop_word()
        # token_pattern keeps single-character tokens (sklearn's default
        # requires >= 2 word chars, which would drop single CJK characters).
        # Raw string is byte-identical to the original '(?u)\\b\\w\\w*\\b'.
        self.vectorizer = CountVectorizer(stop_words=None,
                                          max_df=cfg.emb.MAX_DF,
                                          min_df=cfg.emb.MIN_DF,
                                          max_features=cfg.emb.MAX_FEATURES,
                                          token_pattern=r'(?u)\b\w\w*\b')
        self.transformer = TfidfTransformer()

    def init(self, words_list=None, update=True):
        '''Load the cached model, or (re)build it from `words_list`.

        Fix vs. original: the guard used `~os.path.exists(...)` — bitwise NOT
        on a bool is always truthy (`~True == -2`, `~False == -1`) — and
        `!= None` comparisons; both replaced with proper boolean logic.

        Args:
            words_list: reference corpus (list of str) to embed; may be None
                when only loading a cached model.
            update: if False and both pickles exist, load them instead of
                refitting.

        Returns:
            self, so calls can be chained.
        '''
        if os.path.exists(self.dic_path) and os.path.exists(self.tfidf_model_path) and not update:
            # Reuse the previously fitted vectorizer/transformer.
            with open(self.dic_path, 'rb') as f:
                self.vectorizer = pickle.load(f)
            with open(self.tfidf_model_path, 'rb') as f:
                self.transformer = pickle.load(f)
        else:
            try:
                logging.info('[Tfidf] start build tfidf model.')
                if words_list is None:
                    # Original logged this with a wrong '[Bow]' tag and then
                    # still hit an unbound local; now we log and skip the build.
                    logging.error('[Tfidf] words_list is None')
                else:
                    self._gen_model(self._seg_word(words_list))
                    logging.info('[Tfidf] build tfidf model success.')
            except Exception as e:
                logging.error('[Tfidf] build tfidf model error,error info: {} '.format(e))
        if words_list is not None:
            # Pre-compute normalized embeddings of the reference corpus once;
            # `predict` dots queries against this matrix.
            self.words_list_pre = np.array(
                [self._normalize(self._predict(per_word))[0] for per_word in words_list]
            )
        return self

    # seg word
    def _seg_word(self, words_list, jieba_flag=cfg.emb.JIEBA_FLAG, del_stopword=cfg.emb.DEL_STOPWORD):
        '''Tokenize every string in `words_list`; return space-joined token strings.

        Args:
            words_list: iterable of raw strings.
            jieba_flag: if true, segment with jieba; otherwise split into
                individual characters.
            del_stopword: if true, filter tokens through self.stop_word.

        Returns:
            list[str] — one space-joined token string per *valid* input;
            empty or non-str entries are silently skipped, so the result may
            be shorter than the input.
        '''
        segmented = []
        for words in words_list:
            if not isinstance(words, str) or words == '':
                continue
            # A plain str iterates per character, matching the original
            # character-split behavior for the non-jieba path.
            tokens = jieba.cut(words) if jieba_flag else words
            if del_stopword:
                tokens = self.stop_word.del_stopwords(tokens)
            segmented.append(' '.join(tokens))
        return segmented

    def fit(self, word_list):
        '''Fit vectorizer + TF-IDF transformer on a raw (unsegmented) corpus.'''
        self._gen_model(self._seg_word(word_list))

    # build dic
    def _gen_dic(self, word_list):
        '''Fit the CountVectorizer on segmented text, persist it, return the count matrix.'''
        dic = self.vectorizer.fit_transform(word_list)
        with open(self.dic_path, 'wb') as f:
            pickle.dump(self.vectorizer, f)
        return dic

    # build tf-idf model
    def _gen_model(self, word_list):
        '''Fit the TfidfTransformer on the count matrix and persist it.'''
        # Return value intentionally discarded; fitting is the side effect we need.
        self.transformer.fit_transform(self._gen_dic(word_list))
        with open(self.tfidf_model_path, 'wb') as f:
            pickle.dump(self.transformer, f)

    def _predict(self, words):
        '''Return the (1, vocab_size) float TF-IDF embedding of one string.

        An invalid input (empty / non-str) yields an all-zero row, because
        _seg_word returns an empty list and the axis-0 sum collapses it.
        '''
        tf_idf_embedding = self.transformer.transform(
            self.vectorizer.transform(self._seg_word([words])))
        row = tf_idf_embedding.toarray().sum(axis=0)
        return row[np.newaxis, :].astype(float)

    def predict(self, words):
        '''Score `words` against the pre-embedded corpus.

        Returns:
            1-D np.ndarray of dot-product similarities, one per corpus entry
            (cosine similarity, since both sides are normalized).
        '''
        pre = self._normalize(self._predict(words))
        # No defensive copy needed — np.dot does not mutate its operands.
        return np.dot(self.words_list_pre, pre[0])