-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword2vec.py
More file actions
103 lines (72 loc) · 3.26 KB
/
word2vec.py
File metadata and controls
103 lines (72 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import numpy as np
from collections import defaultdict
class OneHotEncoder():
    """Map words to vocabulary indices and one-hot vectors.

    The vocabulary is built once, at construction time, from
    ``tokenized_docs`` (a list of token lists). ``encode`` flattens all
    documents into a single list of one-hot vectors in token order.
    """

    def __init__(self, tokenized_docs):
        # defaultdict trick: the first lookup of an unseen word assigns it
        # the next free index (current dict size).
        self.w2i = defaultdict(lambda: len(self.w2i))
        [self.w2i[w] for d in tokenized_docs for w in d]
        self.i2w = {v: k for k, v in self.w2i.items()}
        self.n_word = len(self.w2i)

    def encode(self, tokenized_docs):
        """Return the flat list of one-hot vectors for every token, in order."""
        result = []
        for d in tokenized_docs:
            for w in d:
                result.append(self.get_one_vector(w))
        return result

    def get_one_vector(self, w):
        """Return a one-hot list of length ``n_word`` with a 1 at ``w``'s index.

        NOTE(review): an out-of-vocabulary word would be auto-indexed by the
        defaultdict and then raise IndexError on assignment — callers only
        pass in-vocabulary words today.
        """
        v = [0] * len(self.w2i)
        v[self.w2i[w]] = 1
        return v

    def decode(self, v):
        """Return the word whose index holds the largest value in ``v``.

        BUG FIX: previously an unimplemented ``pass`` stub returning None.
        Accepts a one-hot vector or any score vector (e.g. a softmax row).
        """
        return self.i2w[int(np.argmax(v))]
class Word2Vec():
    """Tiny word2vec trained with full softmax and batch gradient descent.

    Builds a vocabulary from ``docs``, slides a context window over the
    one-hot-encoded token stream to produce (input, target) pairs, and
    trains two weight matrices: ``W1`` (n_word x embedding_size) holds the
    word embeddings, ``W2`` (embedding_size x n_word) the output weights.
    After construction the trained weights are available as ``self.W1`` /
    ``self.W2`` and the per-epoch losses as ``self.loss_trace``.
    """

    def __init__(self, docs, embedding_size, window=1, learning_rate=0.1, epoch=100):
        self.embedding_size = embedding_size
        self.window = window
        self.learning_rate = learning_rate
        self.epoch = epoch
        self.tokenized_docs = [d.split() for d in docs]
        encode = OneHotEncoder(self.tokenized_docs)
        encoded_docs = encode.encode(self.tokenized_docs)
        W = self._init_weights(encode.n_word)
        X, Y = self._slide_window(encoded_docs)
        # BUG FIX: the trained weights and loss history were previously
        # computed and then discarded; keep them on the instance.
        self.W1, self.W2, self.loss_trace = self._optimize(X, Y, W)

    def _init_weights(self, n_word):
        """Randomly initialize input->hidden (W1) and hidden->output (W2)."""
        W1 = np.random.rand(n_word, self.embedding_size)
        W2 = np.random.rand(self.embedding_size, n_word)
        return W1, W2

    def _slide_window(self, encoded_docs):
        """Build (context, center) one-hot training pairs.

        CBOW uses surrounding words as input and the center word as output;
        skip-gram uses the center word as input and surrounding words as
        output. Here each neighbour within ``window`` of position ``i`` is
        paired with the center word, one pair per neighbour.
        Returns ``(X, Y)`` = (neighbour vectors, repeated center vectors).
        """
        context, center = [], []
        for i, w in enumerate(encoded_docs):
            start_point = max(0, i - self.window)  # clamp at the left edge
            # Python slicing clamps at the right edge, so min() keeps the
            # window from running past the end of the stream.
            end_point = min(len(encoded_docs), i + self.window)
            temp_context = encoded_docs[start_point:end_point + 1]
            temp_center = temp_context.pop(i - start_point)
            for c in temp_context:
                center.append(temp_center)
                context.append(c)
        return np.array(context), np.array(center)

    def _input_to_hidden(self, X, W):
        """Project one-hot inputs to the hidden layer (embedding lookup).

        NOTE: for large data the matrix product should be replaced by a
        direct row lookup, since X is one-hot.
        """
        return np.dot(X, W)

    def _hidden_to_output(self, H, W):
        """Project hidden activations to softmax probabilities over the vocab."""
        return self._softmax(np.dot(H, W))

    def _eval_loss(self, Y, Y_hat):
        """Mean cross-entropy between targets ``Y`` and predictions ``Y_hat``."""
        return -1 / len(Y) * np.sum(Y * np.log(Y_hat))

    def _calc_gradients(self, X, Y, Y_hat, H, W2):
        """Backpropagate mean cross-entropy through softmax and both layers.

        BUG FIX: previously an unimplemented stub returning (0, 0).
        With softmax + cross-entropy the output-layer error is simply
        ``Y_hat - Y``; the chain rule then yields dW2 = H^T dO and
        dW1 = X^T (dO W2^T).
        """
        dO = (Y_hat - Y) / len(Y)
        dW2 = np.dot(H.T, dO)
        dW1 = np.dot(X.T, np.dot(dO, W2.T))
        return dW1, dW2

    def _softmax(self, O):
        """Row-wise, numerically stable softmax.

        BUG FIX: the previous version summed exp(O) over the whole matrix,
        so rows of a batched input did not each sum to 1. Subtracting the
        row max first prevents overflow in exp().
        """
        shifted = O - np.max(O, axis=-1, keepdims=True)
        e = np.exp(shifted)
        return e / np.sum(e, axis=-1, keepdims=True)

    def _optimize(self, X, Y, W):
        """Run ``self.epoch`` steps of batch gradient descent.

        Returns ``(W1, W2, loss_trace)``.
        BUG FIX: the previous version computed gradients every epoch but
        never applied the weight update (left as a ``# update weights``
        comment), so training was a no-op.
        """
        W1, W2 = W
        loss_trace = []
        for e in range(self.epoch):
            H = self._input_to_hidden(X, W1)
            Y_hat = self._hidden_to_output(H, W2)
            loss_trace.append(self._eval_loss(Y, Y_hat))
            dW1, dW2 = self._calc_gradients(X, Y, Y_hat, H, W2)
            W1 = W1 - self.learning_rate * dW1
            W2 = W2 - self.learning_rate * dW2
        return W1, W2, loss_trace
if __name__ == "__main__":
docs = ["you will never know until you try"]
wv = Word2Vec(docs=docs, embedding_size=4)