-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_word2vec.py
More file actions
83 lines (64 loc) · 1.96 KB
/
train_word2vec.py
File metadata and controls
83 lines (64 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gensim
import sys
from os import path
import numpy as np
import random
from sklearn.decomposition import PCA
# Pre-trained Google News embeddings, loaded once when this module is
# imported and shared by every function and class in this file.
#
# ``Word2Vec.load_word2vec_format`` is deprecated in gensim 1.x and absent
# from gensim 4; the loader lives on ``KeyedVectors`` there.  Fall back to
# the old location so legacy installs that lack ``KeyedVectors`` still work.
_vector_loader = getattr(gensim.models, 'KeyedVectors', gensim.models.Word2Vec)
model = _vector_loader.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)
def word2vec(statute):
    """Embed a whitespace-separated text as a matrix of word vectors.

    The text is split on whitespace and each token is looked up through
    ``Word``.  Every token that resolves contributes one column, in order
    of appearance.  Tokens whose lookup raises ``KeyError`` (what gensim
    raises for out-of-vocabulary words) are skipped.

    Args:
        statute: The text to embed.

    Returns:
        A 2-D array with one column per resolved token, or ``None`` when
        no token resolves (which includes empty or all-whitespace input).
    """
    columns = []
    for token in statute.split():
        try:
            vec = Word(token).vec
        except KeyError:
            # Best-effort embedding: unknown words are simply left out.
            # Only the lookup failure is swallowed; any other error is a
            # real bug and is allowed to propagate.
            continue
        columns.append(vec.reshape(vec.shape[0], 1))

    if not columns:
        return None
    # Stack once at the end rather than re-copying a growing matrix on
    # every word.
    return np.hstack(columns)
def pca(x, n_components):
    """Project ``x`` onto its leading principal components.

    A fresh ``PCA`` estimator is fitted on ``x`` and the transformed data
    is returned transposed.

    Args:
        x: Data passed straight to ``PCA.fit_transform``.
        n_components: Forwarded to the ``PCA`` constructor.

    Returns:
        The transpose of ``PCA(n_components).fit_transform(x)``.
    """
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(x)
    return projected.transpose()
class Relation:
    """A signed combination of words, e.g. ``king - man + woman``.

    ``pos`` holds the words that are added and ``neg`` the words that are
    subtracted.  Instances are combined with ``+`` and ``-``; both
    operators return a new ``Relation`` and leave their operands untouched.
    """

    def __init__(self, pos, neg=None):
        """Store copies of the positive and negative word lists.

        Args:
            pos: Iterable of words that contribute positively.
            neg: Iterable of words that contribute negatively; omitted or
                ``None`` means no negative words.
        """
        # Copy so that relations never share list objects with each other,
        # with the caller, or with a shared default argument.
        self.pos = list(pos)
        self.neg = [] if neg is None else list(neg)

    def get_similar(self):
        """Return ``model.most_similar`` for this relation's word lists."""
        return model.most_similar(positive=self.pos, negative=self.neg)

    def __str__(self):
        """Render as ``a + b - c - d`` (positives first, then negatives)."""
        text = ' + '.join(self.pos)
        if self.neg:
            text += ' - ' + ' - '.join(self.neg)
        return text

    def __add__(self, other):
        """Return a new relation equal to ``self + other``.

        ``other`` may be a ``Relation`` (its positives and negatives are
        appended to the matching lists) or a ``Word`` (appended to the
        positives).  Any other operand yields ``NotImplemented``.
        """
        if isinstance(other, Relation):
            return Relation(self.pos + other.pos, self.neg + other.neg)
        if isinstance(other, Word):
            return Relation(self.pos + [other.word], self.neg)
        return NotImplemented

    def __sub__(self, other):
        """Return a new relation equal to ``self - other``.

        Subtracting a ``Relation`` flips its signs: its negatives join
        this relation's positives and its positives join the negatives.
        Subtracting a ``Word`` appends it to the negatives.  Any other
        operand yields ``NotImplemented``.
        """
        if isinstance(other, Relation):
            return Relation(self.pos + other.neg, self.neg + other.pos)
        if isinstance(other, Word):
            return Relation(self.pos, self.neg + [other.word])
        return NotImplemented
class Word:
    """A single vocabulary word paired with its embedding vector.

    Two words combine with ``+`` or ``-`` into a ``Relation``.
    """

    def __init__(self, word):
        """Remember ``word`` and look up its vector in the module-level model.

        Whatever exception ``model[word]`` raises for a missing word
        propagates to the caller.
        """
        self.word, self.vec = word, model[word]

    def __str__(self):
        """Return the bare word."""
        return self.word

    def __add__(self, other):
        """Return the relation ``self + other`` (both words positive)."""
        return Relation([self.word, other.word], [])

    def __sub__(self, other):
        """Return the relation ``self - other`` (``other`` is negative)."""
        return Relation([self.word], [other.word])