-
Notifications
You must be signed in to change notification settings - Fork 206
Expand file tree
/
Copy pathtf_idf_sklearn.py
More file actions
122 lines (107 loc) · 5.01 KB
/
tf_idf_sklearn.py
File metadata and controls
122 lines (107 loc) · 5.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -*- coding:utf-8 -*-
'''
-------------------------------------------------
Description : tf-idf-sklearn
Author : machinelp
Date : 2020-06-03
-------------------------------------------------
'''
import os
import jieba
import pickle
import numpy as np
from .stop_words import StopWords
from textmatch.config.config import cfg
from textmatch.utils.logging import logging
from gensim import corpora, models, similarities
from textmatch.config.constant import Constant as const
from textmatch.models.model_base.model_base import ModelBase
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
class TfIdf(ModelBase):
    '''TF-IDF embedding model backed by sklearn's CountVectorizer + TfidfTransformer.

    Builds (or loads from pickle) a fitted vectorizer/transformer pair,
    pre-computes normalized TF-IDF vectors for a reference corpus, and scores
    new queries against that corpus by dot product in `predict`.
    '''

    def __init__(self,
                 dic_path=const.TFIDF_DIC_PATH,
                 tfidf_model_path=const.TFIDF_MODEL_PATH,
                 tfidf_index_path=const.TFIDF_INDEX_PATH,
                 stop_word=StopWords):
        '''
        Args:
            dic_path: pickle path for the fitted CountVectorizer (vocabulary).
            tfidf_model_path: pickle path for the fitted TfidfTransformer.
            tfidf_index_path: reserved path; kept for interface compatibility
                (never read or written inside this class).
            stop_word: stop-word helper class; instantiated here.
        '''
        self.dic_path = dic_path
        self.tfidf_model_path = tfidf_model_path
        self.tfidf_index_path = tfidf_index_path
        # Ensure each parent directory exists before we later pickle into it.
        # Guard against a path with no directory component ('' would crash makedirs).
        for per_path in (self.dic_path, self.tfidf_model_path, self.tfidf_index_path):
            parent_dir = '/'.join(per_path.split('/')[:-1])
            if parent_dir and not os.path.exists(parent_dir):
                os.makedirs(parent_dir)
        self.stop_word = stop_word()
        # token_pattern keeps single-character tokens (sklearn's default
        # requires >= 2 word chars, which would drop single CJK characters).
        # Raw string is byte-identical to the original '(?u)\\b\\w\\w*\\b'.
        self.vectorizer = CountVectorizer(stop_words=None,
                                          max_df=cfg.emb.MAX_DF,
                                          min_df=cfg.emb.MIN_DF,
                                          max_features=cfg.emb.MAX_FEATURES,
                                          token_pattern=r'(?u)\b\w\w*\b')
        self.transformer = TfidfTransformer()

    def init(self, words_list=None, update=True):
        '''Load the cached model, or (re)build it from `words_list`.

        Fix vs. original: the guard used `~os.path.exists(...)` — bitwise NOT
        on a bool is always truthy (`~True == -2`, `~False == -1`) — and
        `!= None` comparisons; both replaced with proper boolean logic.

        Args:
            words_list: reference corpus (list of str) to embed; may be None
                when only loading a cached model.
            update: if False and both pickles exist, load them instead of
                refitting.

        Returns:
            self, so calls can be chained.
        '''
        if os.path.exists(self.dic_path) and os.path.exists(self.tfidf_model_path) and not update:
            # Reuse the previously fitted vectorizer/transformer.
            with open(self.dic_path, 'rb') as f:
                self.vectorizer = pickle.load(f)
            with open(self.tfidf_model_path, 'rb') as f:
                self.transformer = pickle.load(f)
        else:
            try:
                logging.info('[Tfidf] start build tfidf model.')
                if words_list is None:
                    # Original logged this with a wrong '[Bow]' tag and then
                    # still hit an unbound local; now we log and skip the build.
                    logging.error('[Tfidf] words_list is None')
                else:
                    self._gen_model(self._seg_word(words_list))
                    logging.info('[Tfidf] build tfidf model success.')
            except Exception as e:
                logging.error('[Tfidf] build tfidf model error,error info: {} '.format(e))
        if words_list is not None:
            # Pre-compute normalized embeddings of the reference corpus once;
            # `predict` dots queries against this matrix.
            self.words_list_pre = np.array(
                [self._normalize(self._predict(per_word))[0] for per_word in words_list]
            )
        return self

    # seg word
    def _seg_word(self, words_list, jieba_flag=cfg.emb.JIEBA_FLAG, del_stopword=cfg.emb.DEL_STOPWORD):
        '''Tokenize every string in `words_list`; return space-joined token strings.

        Args:
            words_list: iterable of raw strings.
            jieba_flag: if true, segment with jieba; otherwise split into
                individual characters.
            del_stopword: if true, filter tokens through self.stop_word.

        Returns:
            list[str] — one space-joined token string per *valid* input;
            empty or non-str entries are silently skipped, so the result may
            be shorter than the input.
        '''
        segmented = []
        for words in words_list:
            if not isinstance(words, str) or words == '':
                continue
            # A plain str iterates per character, matching the original
            # character-split behavior for the non-jieba path.
            tokens = jieba.cut(words) if jieba_flag else words
            if del_stopword:
                tokens = self.stop_word.del_stopwords(tokens)
            segmented.append(' '.join(tokens))
        return segmented

    def fit(self, word_list):
        '''Fit vectorizer + TF-IDF transformer on a raw (unsegmented) corpus.'''
        self._gen_model(self._seg_word(word_list))

    # build dic
    def _gen_dic(self, word_list):
        '''Fit the CountVectorizer on segmented text, persist it, return the count matrix.'''
        dic = self.vectorizer.fit_transform(word_list)
        with open(self.dic_path, 'wb') as f:
            pickle.dump(self.vectorizer, f)
        return dic

    # build tf-idf model
    def _gen_model(self, word_list):
        '''Fit the TfidfTransformer on the count matrix and persist it.'''
        # Return value intentionally discarded; fitting is the side effect we need.
        self.transformer.fit_transform(self._gen_dic(word_list))
        with open(self.tfidf_model_path, 'wb') as f:
            pickle.dump(self.transformer, f)

    def _predict(self, words):
        '''Return the (1, vocab_size) float TF-IDF embedding of one string.

        An invalid input (empty / non-str) yields an all-zero row, because
        _seg_word returns an empty list and the axis-0 sum collapses it.
        '''
        tf_idf_embedding = self.transformer.transform(
            self.vectorizer.transform(self._seg_word([words])))
        row = tf_idf_embedding.toarray().sum(axis=0)
        return row[np.newaxis, :].astype(float)

    def predict(self, words):
        '''Score `words` against the pre-embedded corpus.

        Returns:
            1-D np.ndarray of dot-product similarities, one per corpus entry
            (cosine similarity, since both sides are normalized).
        '''
        pre = self._normalize(self._predict(words))
        # No defensive copy needed — np.dot does not mutate its operands.
        return np.dot(self.words_list_pre, pre[0])