-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvectorization.py
More file actions
73 lines (54 loc) · 2.19 KB
/
vectorization.py
File metadata and controls
73 lines (54 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import re, os, gensim, nltk
import pandas as pd
from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords
def tokenize(files):
    """Read, clean, and lemmatize a list of Russian text files.

    Parameters
    ----------
    files : iterable of str
        Paths to UTF-8 text files. The basename of each path becomes its tag.

    Returns
    -------
    tuple[list[list[str]], list[str]]
        Per-file lemma lists (first and last tokens dropped, mirroring the
        original behavior) and the matching file-name tags.
    """
    vocabs = []
    tags = []
    # set() gives O(1) membership tests; the original list was O(n) per token
    stopwords_ru = set(stopwords.words("russian"))
    morph = MorphAnalyzer()
    # Keep only Cyrillic words: digits, non-word chars, underscores and latin
    # letters all collapse to a single space. Compiled once, outside the loop.
    cleaner = re.compile(r'[\d\W_a-zA-Z]+')
    for path in files:
        tags.append(path.split('/')[-1])
        # Bug fix: the original opened the file and never closed it; the
        # context manager guarantees the handle is released.
        with open(path, 'r', encoding='utf-8') as fh:
            data = cleaner.sub(' ', fh.read()).lower()
        tokens = []
        # str.split() never yields empty strings, so the original's
        # `if token` guard and `.strip()` were no-ops and are dropped.
        for token in data.split():
            if token not in stopwords_ru:
                # normal_forms() returns lemma candidates, most likely first
                lemma = morph.normal_forms(token)[0]
                if len(lemma) > 2:
                    tokens.append(lemma)
        # Drop the first and last token of every document (original behavior;
        # presumably trims boundary artifacts — TODO confirm with the author).
        vocabs.append(tokens[1:-1])
    return vocabs, tags
def tagged_document(vocabs, tags):
    """Lazily wrap each (token-list, tag) pair as a gensim TaggedDocument.

    Yields one TaggedDocument per entry of ``vocabs``; the matching element
    of ``tags`` is supplied as the document's single tag.
    """
    for idx, tokens in enumerate(vocabs):
        yield gensim.models.doc2vec.TaggedDocument(tokens, [tags[idx]])
if __name__ == "__main__":
    # Bug fix: the original line `folder = $FOLDERPATH` was a literal shell-style
    # template placeholder — a SyntaxError that prevented the file from even
    # importing. Read the data root from the environment instead, defaulting to
    # the current directory. It must contain 'boted/' and 'human/' subfolders.
    folder = os.environ.get('FOLDERPATH', './')

    # 100 bot-generated and 100 human-written samples, named bot0..bot99 /
    # human0..human99 (comprehensions replace the original append loops).
    boted_files = [folder + 'boted/bot' + str(i) + '.txt' for i in range(100)]
    human_files = [folder + 'human/human' + str(i) + '.txt' for i in range(100)]

    human_tokens, human_tags = tokenize(human_files)
    boted_tokens, boted_tags = tokenize(boted_files)
    data_tokens = human_tokens + boted_tokens
    data_tags = human_tags + boted_tags
    data_to_vectorize = list(tagged_document(data_tokens, data_tags))

    # Train Doc2Vec: 40-dim document vectors, tokens seen < 2 times ignored.
    model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)
    model.build_vocab(data_to_vectorize)
    model.train(data_to_vectorize, total_examples=model.corpus_count, epochs=model.epochs)
    model.save('DostModel')

    # Export one CSV row per document: tag, class label, and the 40 dims.
    vectors = []
    for tag in data_tags:
        # Class label = filename stem with digits removed ('bot3.txt' -> 'bot').
        clss = re.sub(r'\d+', '', tag.split('.')[0])
        # Direct key lookup; the original `index_to_key.index(tag)` scan was a
        # redundant O(n) search for the position of a key gensim can look up.
        vec = model.dv[tag]
        row = {'tag': tag, 'class': clss}
        for v in range(40):
            row['v' + str(v)] = vec[v]
        vectors.append(row)
    df = pd.DataFrame.from_dict(vectors)
    df.to_csv(folder + 'vectors.csv')