-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathw2v_modules.py
More file actions
120 lines (89 loc) · 3.28 KB
/
w2v_modules.py
File metadata and controls
120 lines (89 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__synopsis__ : Tools for Word2Vec operations
__description__ :
__project__ : my_modules
__author__ : 'Samujjwal Ghosh'
__version__ :
__date__ : June 2018
__copyright__ : "Copyright (c) 2018"
__license__ : "Python"; (Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html)
__classes__ :
__variables__ :
__methods__ :
TODO : 1.
"""
import os
import numpy as np
from collections import OrderedDict
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import my_modules as mm
def open_word2vec(w2v_bin_path, binary=True):
    """Load pre-trained word vectors from disk.

    The original word2vec format is tried first; if that raises for any
    reason, the file is re-read as a natively saved gensim object.

    Args:
        w2v_bin_path: Path of the vector file to load.
        binary: Whether the word2vec-format file is binary (True) or plain
            text (False). Only relevant to the first, word2vec-format attempt.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format`` or, on
        fallback, by ``KeyedVectors.load``.

    Raises:
        Exception: Whatever ``KeyedVectors.load`` raises when the fallback
            fails as well.
    """
    try:
        # Honour the caller's flag; it used to be overridden with True.
        model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=binary)
    except Exception:  ## Loading a different format.
        print('Loading original word2vec format failed. Trying Gensim format.')
        # KeyedVectors.load takes no ``binary`` argument; passing one made
        # this fallback fail before it even touched the file.
        model = KeyedVectors.load(w2v_bin_path)
    return model
def init_w2v(
    w2v_bin_path='/home/Embeddings/',
    w2v_bin_file='GoogleNews-vectors-negative300.bin',  ## Alternate: w2v_bin_file=glove.840B.300d.txt
):
    """Load the embedding file found at ``w2v_bin_path/w2v_bin_file``.

    Args:
        w2v_bin_path: Directory that holds the embedding file.
        w2v_bin_file: File name of the embedding inside that directory.

    Returns:
        The model returned by ``open_word2vec``, or ``None`` when loading
        fails (the failure is reported on stdout).
    """
    w2v_path = os.path.join(w2v_bin_path, w2v_bin_file)
    try:
        return open_word2vec(w2v_path)
    except Exception as e:
        print("Failed to load Word2Vec binary file from:", w2v_path)
        print('Failure reason:', e)
        # Previously the function fell through to ``return w2v`` with ``w2v``
        # unbound, hiding the real error behind an UnboundLocalError.
        return None
def gen_txt_w2v(train, w2v):
    """Build one summed word vector per tweet.

    Args:
        train: Mapping of tweet id -> record; every record must provide a
            'parsed_tweet' string, which is tokenised on single spaces.
        w2v: Word-vector model supporting ``w2v.wv.vocab`` membership tests
            and ``w2v[word]`` lookups (the API style used across this file).

    Returns:
        OrderedDict mapping each tweet id, in input order, to the
        element-wise sum of the vectors of its in-vocabulary words. A tweet
        without any known word maps to an all-zero vector.
    """
    # Size the accumulator from the model so embeddings that are not
    # 300-dimensional (e.g. create_w2v's default size of 1000) do not fail on
    # a shape mismatch; 300 is kept as the fallback for models that do not
    # expose ``vector_size``.
    dim = getattr(w2v, 'vector_size', 300)
    train_vec = OrderedDict()
    for tweet_id, val in train.items():
        s_vec = np.zeros(dim)
        for word in val['parsed_tweet'].split(" "):
            # Out-of-vocabulary tokens (including '' from repeated spaces)
            # are skipped silently.
            if word in w2v.wv.vocab:
                s_vec = np.add(s_vec, w2v[word])
        train_vec[tweet_id] = s_vec
    return train_vec
def cosine_sim(w2v, w1, w2):
    """Return the similarity the model reports for the words ``w1`` and ``w2``.

    Thin wrapper that delegates to ``w2v.similarity(w1, w2)``.
    """
    score = w2v.similarity(w1, w2)
    return score
def find_sim(w2v, word, k=5):
    """Look up the ``k`` nearest neighbours of ``word`` in the model.

    Args:
        w2v: Model exposing ``wv.vocab`` and ``most_similar``.
        word: Query word.
        k: Number of neighbours requested (forwarded as ``topn``).

    Returns:
        The result of ``w2v.most_similar`` for the word, or an empty list
        when the word is not in the model's vocabulary. Callers in this file
        unpack the items as (term, value) pairs.
    """
    # Guard clause: unknown words have no neighbours to report.
    if word not in w2v.wv.vocab:
        return []
    return w2v.most_similar(positive=[word], negative=[], topn=k)
def find_sim_list(w2v, words, k=5):
    """Expand a list of words with their nearest neighbours.

    Args:
        w2v: Model passed through to ``find_sim``.
        words: Seed words; the caller's list is not modified.
        k: Number of neighbours requested per seed word, and also the
            maximum length of the returned list.

    Returns:
        The first ``k`` entries of the seed words followed by their
        neighbour terms, after ``mm.remove_dup_list(..., case=True)``.
        NOTE(review): the exact de-duplication and case semantics live in
        ``my_modules`` and are not visible from this file.
    """
    seeds = list(words)
    expanded = list(seeds)
    for word in seeds:
        # find_sim yields (term, value) pairs; keep only the term so the
        # result stays a flat list of words instead of mixing in tuples.
        expanded.extend(term for term, _val in find_sim(w2v, word, k))
    expanded = mm.remove_dup_list(expanded, case=True)
    return expanded[0:k]
def expand_tweet(w2v, tweet, c=3):
    """Interleave every token of a tweet with its similar words.

    Args:
        w2v: Model passed through to ``find_sim``.
        tweet: Tweet text; tokens are obtained by splitting on single spaces.
        c: Number of similar words requested for each token.

    Returns:
        A list in which each original token is immediately followed by the
        terms ``find_sim`` returned for it (none for unknown tokens).
    """
    tokens = []
    for word in tweet.split(" "):
        tokens.append(word)
        # Each neighbour is a (term, value) pair; only the term is kept.
        tokens.extend(term for term, _val in find_sim(w2v, word, c))
    return tokens
def expand_tweets(w2v, dict):
    """Add an 'expanded_tweet' entry to every record of a tweet mapping.

    Args:
        w2v: Model passed through to ``expand_tweet``.
        dict: Mapping of tweet id -> record; each record must provide a
            'parsed_tweet' string. (The parameter name shadows the builtin;
            it is kept so existing keyword callers keep working.)

    Returns:
        The same mapping object, updated in place: each record gains an
        'expanded_tweet' string holding the expanded tokens.
    """
    for val in dict.values():
        # Join with the same single-space separator expand_tweet splits on;
        # joining with '' glued all tokens into one unreadable word.
        val['expanded_tweet'] = " ".join(expand_tweet(w2v, val['parsed_tweet']))
    return dict
def create_w2v(corpus, size=1000, window=5, min_count=3, workers=10):
    """Train a new Word2Vec model on ``corpus``.

    Args:
        corpus: Training sentences, passed as the first argument to
            ``Word2Vec``.
        size: Value forwarded as ``Word2Vec``'s ``size`` keyword.
        window: Value forwarded as the ``window`` keyword.
        min_count: Value forwarded as the ``min_count`` keyword.
        workers: Value forwarded as the ``workers`` keyword.

    Returns:
        The ``Word2Vec`` instance built from these arguments.
    """
    # Pass the hyper-parameters by keyword: Word2Vec's positional order has
    # other parameters in between (e.g. ``alpha``), so the former positional
    # call bound these values to the wrong parameters.
    # NOTE(review): ``size=`` matches the gensim 3.x-style API this file
    # uses elsewhere (``wv.vocab``); confirm against the installed version.
    w2v = Word2Vec(
        corpus,
        size=size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
    return w2v
def main():
    """Script entry point; currently a placeholder that does nothing."""
    return None


if __name__ == "__main__":
    main()