-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword2vec.py
More file actions
103 lines (72 loc) · 3.26 KB
/
word2vec.py
File metadata and controls
103 lines (72 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import numpy as np
from collections import defaultdict
class OneHotEncoder():
    """Map words to vocabulary indices and one-hot vectors.

    The vocabulary is built once, at construction time, from
    ``tokenized_docs`` (a list of token lists). ``encode`` flattens all
    documents into a single list of one-hot vectors in token order.
    """

    def __init__(self, tokenized_docs):
        # defaultdict trick: the first lookup of an unseen word assigns it
        # the next free index (current dict size).
        self.w2i = defaultdict(lambda: len(self.w2i))
        [self.w2i[w] for d in tokenized_docs for w in d]
        self.i2w = {v: k for k, v in self.w2i.items()}
        self.n_word = len(self.w2i)

    def encode(self, tokenized_docs):
        """Return the flat list of one-hot vectors for every token, in order."""
        result = []
        for d in tokenized_docs:
            for w in d:
                result.append(self.get_one_vector(w))
        return result

    def get_one_vector(self, w):
        """Return a one-hot list of length ``n_word`` with a 1 at ``w``'s index.

        NOTE(review): an out-of-vocabulary word would be auto-indexed by the
        defaultdict and then raise IndexError on assignment — callers only
        pass in-vocabulary words today.
        """
        v = [0] * len(self.w2i)
        v[self.w2i[w]] = 1
        return v

    def decode(self, v):
        """Return the word whose index holds the largest value in ``v``.

        BUG FIX: previously an unimplemented ``pass`` stub returning None.
        Accepts a one-hot vector or any score vector (e.g. a softmax row).
        """
        return self.i2w[int(np.argmax(v))]
class Word2Vec():
    """Tiny word2vec trained with full softmax and batch gradient descent.

    Builds a vocabulary from ``docs``, slides a context window over the
    one-hot-encoded token stream to produce (input, target) pairs, and
    trains two weight matrices: ``W1`` (n_word x embedding_size) holds the
    word embeddings, ``W2`` (embedding_size x n_word) the output weights.
    After construction the trained weights are available as ``self.W1`` /
    ``self.W2`` and the per-epoch losses as ``self.loss_trace``.
    """

    def __init__(self, docs, embedding_size, window=1, learning_rate=0.1, epoch=100):
        self.embedding_size = embedding_size
        self.window = window
        self.learning_rate = learning_rate
        self.epoch = epoch
        self.tokenized_docs = [d.split() for d in docs]
        encode = OneHotEncoder(self.tokenized_docs)
        encoded_docs = encode.encode(self.tokenized_docs)
        W = self._init_weights(encode.n_word)
        X, Y = self._slide_window(encoded_docs)
        # BUG FIX: the trained weights and loss history were previously
        # computed and then discarded; keep them on the instance.
        self.W1, self.W2, self.loss_trace = self._optimize(X, Y, W)

    def _init_weights(self, n_word):
        """Randomly initialize input->hidden (W1) and hidden->output (W2)."""
        W1 = np.random.rand(n_word, self.embedding_size)
        W2 = np.random.rand(self.embedding_size, n_word)
        return W1, W2

    def _slide_window(self, encoded_docs):
        """Build (context, center) one-hot training pairs.

        CBOW uses surrounding words as input and the center word as output;
        skip-gram uses the center word as input and surrounding words as
        output. Here each neighbour within ``window`` of position ``i`` is
        paired with the center word, one pair per neighbour.
        Returns ``(X, Y)`` = (neighbour vectors, repeated center vectors).
        """
        context, center = [], []
        for i, w in enumerate(encoded_docs):
            start_point = max(0, i - self.window)  # clamp at the left edge
            # Python slicing clamps at the right edge, so min() keeps the
            # window from running past the end of the stream.
            end_point = min(len(encoded_docs), i + self.window)
            temp_context = encoded_docs[start_point:end_point + 1]
            temp_center = temp_context.pop(i - start_point)
            for c in temp_context:
                center.append(temp_center)
                context.append(c)
        return np.array(context), np.array(center)

    def _input_to_hidden(self, X, W):
        """Project one-hot inputs to the hidden layer (embedding lookup).

        NOTE: for large data the matrix product should be replaced by a
        direct row lookup, since X is one-hot.
        """
        return np.dot(X, W)

    def _hidden_to_output(self, H, W):
        """Project hidden activations to softmax probabilities over the vocab."""
        return self._softmax(np.dot(H, W))

    def _eval_loss(self, Y, Y_hat):
        """Mean cross-entropy between targets ``Y`` and predictions ``Y_hat``."""
        return -1 / len(Y) * np.sum(Y * np.log(Y_hat))

    def _calc_gradients(self, X, Y, Y_hat, H, W2):
        """Backpropagate mean cross-entropy through softmax and both layers.

        BUG FIX: previously an unimplemented stub returning (0, 0).
        With softmax + cross-entropy the output-layer error is simply
        ``Y_hat - Y``; the chain rule then yields dW2 = H^T dO and
        dW1 = X^T (dO W2^T).
        """
        dO = (Y_hat - Y) / len(Y)
        dW2 = np.dot(H.T, dO)
        dW1 = np.dot(X.T, np.dot(dO, W2.T))
        return dW1, dW2

    def _softmax(self, O):
        """Row-wise, numerically stable softmax.

        BUG FIX: the previous version summed exp(O) over the whole matrix,
        so rows of a batched input did not each sum to 1. Subtracting the
        row max first prevents overflow in exp().
        """
        shifted = O - np.max(O, axis=-1, keepdims=True)
        e = np.exp(shifted)
        return e / np.sum(e, axis=-1, keepdims=True)

    def _optimize(self, X, Y, W):
        """Run ``self.epoch`` steps of batch gradient descent.

        Returns ``(W1, W2, loss_trace)``.
        BUG FIX: the previous version computed gradients every epoch but
        never applied the weight update (left as a ``# update weights``
        comment), so training was a no-op.
        """
        W1, W2 = W
        loss_trace = []
        for e in range(self.epoch):
            H = self._input_to_hidden(X, W1)
            Y_hat = self._hidden_to_output(H, W2)
            loss_trace.append(self._eval_loss(Y, Y_hat))
            dW1, dW2 = self._calc_gradients(X, Y, Y_hat, H, W2)
            W1 = W1 - self.learning_rate * dW1
            W2 = W2 - self.learning_rate * dW2
        return W1, W2, loss_trace
if __name__ == "__main__":
docs = ["you will never know until you try"]
wv = Word2Vec(docs=docs, embedding_size=4)