-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_importer.py
More file actions
113 lines (88 loc) · 3.94 KB
/
data_importer.py
File metadata and controls
113 lines (88 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
##from __future__ import unicode_literals
import csv
from string import punctuation
import re
## Digits that the cleaning step below strips out of every text.
numbers = [0, 9, 8, 7, 6, 5, 4, 3, 2, 1]
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import sys
## Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
## the default encoding can be switched to UTF-8. Implicit str <-> unicode
## conversions further down (for example calling .encode('utf-8') on a byte
## string) depend on this. `reload` is not a builtin in Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
## Danish Snowball stemmer shared by the cleaning step below.
stemmer = SnowballStemmer("danish")
def write_predictions(output):
    """Write prediction results to ``predictions.csv``.

    Each item of ``output`` is an indexable record whose first three
    entries are the sentence/id string, the real code and the predicted
    code. Line breaks are removed from the first entry before writing.

    ``output`` is consumed in a single pass, so one-shot iterables such
    as generators work as well as lists.

    Args:
        output: iterable of ``(id, real_code, pred_code, ...)`` records.

    Raises:
        IndexError: if a record has fewer than three entries.
        AttributeError: if a record's first entry is not a string.
    """
    with open("predictions.csv", 'w') as f:
        writer = csv.writer(f)
        ## Build every row in one pass; walking `output` once per column
        ## would leave the later columns empty for a generator.
        results = [
            (record[0].replace('\n', ''), record[1], record[2])
            for record in output
        ]
        writer.writerows(results)
## Step 1: clean the raw texts.
## Reads (code, text) rows from master_data.csv and writes one cleaned text
## per line to tekst_out2.csv and the matching integer code per line to
## code_out2.csv. Cleaning = lower-case, delete punctuation and digits,
## tokenise, drop Danish stop words, stem.

## Everything deleted from the text: ASCII punctuation plus the digits listed
## in `numbers`. One translate() pass replaces the chained replace() calls.
strip_table = dict(
    (ord(ch), None)
    for ch in punctuation + ''.join(str(num) for num in numbers)
)
## Loop-invariant, so load the stop-word list once instead of once per row.
stop_words_list = set(stopwords.words('danish'))

with open('master_data.csv', 'r') as r, open('tekst_out2.csv', 'w') as text_out, open('code_out2.csv', 'w') as code_out:
    for i, row in enumerate(csv.reader(r), 1):
        raw_text = str(row[1])
        current_code = int(row[0])  ## ValueError on a non-numeric first column
        ## Work on unicode from here on; encode once when writing.
        current_text = raw_text.decode('utf-8').lower().translate(strip_table)
        filtered_text = [
            stemmer.stem(word)
            for word in word_tokenize(current_text, 'danish')
            ## NOTE(review): compares UTF-8 bytes against the stop-word list;
            ## confirm that the installed NLTK returns byte strings here.
            if word.encode('utf-8') not in stop_words_list
        ]
        ## Always write exactly one line per input row, even when every word
        ## was filtered out, so texts and codes stay aligned line by line.
        text_out.write(' '.join(filtered_text).encode('utf-8') + '\n')
        code_out.write('{}'.format(current_code) + '\n')
        if i % 1000 == 0:
            print('Line ' + str(i) + ' Done! On to the next :D')
print('Output file creation done!')
## Step 2: pad every cleaned sentence to the length of the longest one.
## Reads tekst_out2.csv (one sentence per line) and writes text_out.csv with
## the same number of lines, each extended with '<PAD/>' tokens so that all
## lines contain the same number of whitespace-separated tokens.
with open('tekst_out2.csv', 'r') as input_file, open('text_out.csv', 'w') as output_file:
    ## split() drops the trailing newline, so no separate '\n' filtering is
    ## needed and an empty line becomes an empty token list.
    sentences = [line.split() for line in input_file]
    ## max() raises ValueError on an empty sequence; an empty input file
    ## should simply give an empty output file.
    sequence_length = max(len(tokens) for tokens in sentences) if sentences else 0
    for i, tokens in enumerate(sentences):
        num_padding = sequence_length - len(tokens)
        padded_tokens = tokens + ['<PAD/>'] * num_padding
        ## One sentence per line: the merge step pairs these lines with the
        ## lines of code_out2.csv by position.
        output_file.write(' '.join(padded_tokens) + '\n')
        print('Current line: ' + str(i))
##Merge text and codes back together
## Step 3: pairs line N of code_out2.csv with line N of text_out.csv and
## writes them as a (code, text) CSV row to master_out.csv. Pairing stops at
## the end of the shorter file.
with open('text_out.csv', 'r') as text, open('code_out2.csv', 'r') as code, open('master_out.csv', 'w') as master_out:
    master_writer = csv.writer(master_out, delimiter=',')
    for code_line, text_line in zip(code, text):
        ## Use the whole line: indexing a line with [0] yields only its first
        ## character, which would truncate multi-digit codes and every text.
        current_code = int(code_line)  ## int() ignores the trailing newline
        ## Written through unchanged apart from the line break. The cleaning
        ## step already lower-cased the text, and lower-casing here would also
        ## rewrite the '<PAD/>' markers.
        current_text = text_line.rstrip('\r\n')
        master_writer.writerow((current_code, current_text))
print('SUCCESSFULLY CREATED MASTER FILE!')