"""class DataParser = Utility for reading data from files and creating ordered or random batches from them
When created, DataParser loads the data into memory in following form:
self.vocabulary_index = list of used words (ordered by frequency)
self.reverse_vocabulary_index = dict {'word': index of 'word' in self.vocabulary_index}
can be loaded from / saved to a file
provide 'tokenization' = translating lines into lists of indices into self.vocabulary_index
-1 for unknown words
self.train_conjectures
self.val_conjectures
= lists of conjectures,
every conjecture = dict
'name': title of the conjecture (string)
'filename': string like 'e-hol-ml-dataset/train/00042'
'conj': tokenized conecture, i.e. wording translated into indices into self.vocabulary_index
'deps': can be list of tokenized dependencies, currently not used
'+', '-': lists of tokenized positive and negative steps
There are 4 methods for getting data (for user):
draw_random_batch_of_steps
draw_random_batch_of_steps_and_conjectures
draw_batch_of_steps_in_order
draw_batch_of_steps_and_conjectures_in_order
Keyword arguments: split='train' (default) or 'dev', batch_size = 128 (default)
Return values of random versions:
draw_random_batch_of_steps -> ([steps, preselection], labels)
draw_random_batch_of_steps -> ([steps, conjectures, preselection], labels)
steps and conjectures = tree data, for explicit format, see tree_utils.py
preselection = the list of words used in the batch, for explicit format, see tree_utils.py
labels = numpy array of zeros (= unuseful step) and ones (= useful step)
The "in_order" versions can moreover get an argument begin_index, the begin of data is index (0,0).
Then it returns pair (data, end_index), where data are in the format of appropriate "random" version.
If it reaches the end of data, it returns partial or empty result. So it can be in loop like that:
index = (0,0)
while True:
(input_data, labels), index = draw_batch_of_steps_in_order(begin_index = index, 'val', batch_size)
if len(labels) == 0: break
process(input_data, labels)
if len(labels) < batch_size: break
Methods for parsing (used by constructor)
build_vocabulary(fnames = list of filenames from which the words are loaded)
save_vocabulary(filename)
load_vocabulary(filename)
parse_file_list(fnames = list of filenames, each file contains one conjecture)
-> parse_file(filename)
"""
from __future__ import print_function
import os
import sys
import logging
import random
import numpy as np
class DataParser(object):
# discard_unknown = save only conjectures / steps without unknown words
# ignore_deps = do not save lists of dependencies -- 'deps' of a conjecture
def __init__(self, source_dir, encoder, verbose=1, voc_filename=None,
discard_unknown = False, ignore_deps = False, simple_format = False,
check_input = False, divide_test = None, truncate_train = 1, truncate_test = 1,
complete_vocab = False, step_as_index = False, def_fname = None):
random.seed(1337)
self.simple_format = simple_format
self.verbose = verbose
self.check_input = check_input
if divide_test is None:
train_dir = os.path.join(source_dir, 'train')
val_dir = os.path.join(source_dir, 'test')
train_fnames = sorted([
os.path.join(train_dir, fname)
for fname in os.listdir(train_dir)])
val_fnames = sorted([
os.path.join(val_dir, fname)
for fname in os.listdir(val_dir)])
else:
train_fnames = [
os.path.join(source_dir, fname)
for fname in os.listdir(source_dir)]
            random.shuffle(train_fnames)
            # guard against int(divide_test*len) == 0, which would make the
            # slice [-0:] select the whole list as validation data
            num_val = max(1, int(divide_test*len(train_fnames)))
            val_fnames = sorted(train_fnames[-num_val:])
            train_fnames = sorted(train_fnames[:-num_val])
train_fnames = train_fnames[:int(truncate_train*len(train_fnames))]
val_fnames = val_fnames[:int(truncate_test*len(val_fnames))]
if voc_filename and os.path.isfile(voc_filename):
self.vocabulary_index = self.load_vocabulary(voc_filename)
else:
if verbose:
logging.info('Building vocabulary...')
vocab_fnames = train_fnames
if complete_vocab: vocab_fnames = vocab_fnames + val_fnames
if def_fname: vocab_fnames = vocab_fnames + [def_fname]
self.vocabulary_index = self.build_vocabulary(vocab_fnames)
if voc_filename: self.save_vocabulary(voc_filename)
if verbose:
logging.info('Found %s unique tokens.', len(self.vocabulary_index))
self.reverse_vocabulary_index = dict(
[(self.vocabulary_index[key], key) for key in range(len(self.vocabulary_index))])
#if encoder is None: return
if encoder: encoder.set_vocab(self.reverse_vocabulary_index, self.vocabulary_index)
self.encoder = encoder
self.discard_unknown = discard_unknown
self.ignore_deps = ignore_deps
self.step_as_index = step_as_index
self.train_conjectures = self.parse_file_list(train_fnames)
self.val_conjectures = self.parse_file_list(val_fnames)
if verbose: print("Loaded {} training conjectures, {} validation conjectures.".format(
len(self.train_conjectures), len(self.val_conjectures)
))
if def_fname: self.definitions = self.parse_definitions(def_fname)
else: self.definitions = None
        if step_as_index:
            # Collect the distinct training steps and map each one to a unique
            # integer index; steps seen only in the validation set map to -1.
            steps_set = set()
            for conj in self.train_conjectures:
                for step in conj['+']+conj['-']:
                    steps_set.add(step)
            self.max_step_index = len(steps_set)
            # sort before enumerating so the indices are deterministic across runs
            steps_set = dict((step,i) for i,step in enumerate(sorted(steps_set)))
            for conj in self.train_conjectures + self.val_conjectures:
                conj['+'] = [steps_set.get(step, -1) for step in conj['+']]
                conj['-'] = [steps_set.get(step, -1) for step in conj['-']]
        else: self.max_step_index = None
    def save_vocabulary(self, filename):
        # one token per line; the line number (from 0) is the token's index
        with open(filename, 'w') as f:
            for token in self.vocabulary_index: print(token, file=f)
    def load_vocabulary(self, filename):
        with open(filename, 'r') as f:
            return f.read().splitlines()
def build_vocabulary(self, fnames):
vocabulary_freq = dict()
for fname in fnames:
f = open(fname)
for line in f:
if self.simple_format or line[0] == 'P' or line[0] == 'd':
for token in line.rstrip()[2:].split():
if token not in vocabulary_freq:
vocabulary_freq[token] = 1
else: vocabulary_freq[token] += 1
f.close()
vocabulary = sorted([(freq, token) for (token, freq) in vocabulary_freq.items()], reverse=True)
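        # e.g. counts {'f': 10, 'x': 3} (illustrative, not from the dataset)
        # yield ['f', 'x'], so the most frequent token gets index 0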
        # Uncomment the following lines to dump the vocabulary together with
        # its frequencies:
#f = open('vocab_freq', 'w')
#for (freq, token) in vocabulary: print("{} {}".format(freq, token), file=f)
#f.close()
return [token for (freq, token) in vocabulary]
def parse_file_list(self, fnames): # load a list of conjectures into memory
conjectures = []
for fname in fnames:
if self.verbose:
sys.stdout.write("Loading {} ".format(fname))
sys.stdout.flush()
sys.stdout.write('\r')
conjecture = self.parse_file(fname)
if conjecture: conjectures.append(conjecture)
if self.verbose: sys.stdout.write('\n')
return conjectures
    def tokenize(self, line, fname=None):
        line = line.rstrip()[2:]
        tokens = [self.reverse_vocabulary_index.get(tokstr, -1) for tokstr in line.split()]
        if self.check_input:
            try:
                self.encoder([tokens])
            except IOError:
                print("Line: {}".format(line))
                print("File: {}".format(fname))
                raise
        return tokens
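    # Illustrative example (the actual indices depend on the loaded
    # vocabulary): the leading marker and space are stripped, so
    # tokenize("P f x f") might return [12, 3, 12], and a word missing
    # from the vocabulary would appear as -1.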
    def parse_file(self, fname): # parse a single file with a single conjecture
        with open(fname) as f:
            line = f.readline()
            name = line.rstrip()[2:]
            if self.simple_format: prefix_line = line
            else:
                f.readline() # text line
                prefix_line = f.readline()
            conj = self.tokenize(prefix_line, fname)
            if self.discard_unknown and min(conj) < 0: return None
            conjecture = {
                'name': name,
                'filename': fname,
                'deps': [],
                '+': [],
                '-': [],
                'conj': conj,
            }
            while True:
                line = f.readline()
                if not line:
                    break
                marker = line[0]
                if marker == 'D':
                    if self.simple_format: prefix_line = line
                    else:
                        text_line = f.readline()
                        prefix_line = f.readline()
                    if not self.ignore_deps:
                        content = self.tokenize(prefix_line, fname)
                        if not (self.discard_unknown and min(content) < 0):
                            conjecture['deps'].append(content)
                elif marker in {'+', '-'}:
                    if self.simple_format: prefix_line = line
                    else: prefix_line = f.readline()
                    if self.step_as_index:
                        # the raw step string serves as a key here; it is
                        # mapped to an integer index in the constructor
                        content = prefix_line.rstrip()[2:]
                        conjecture[marker].append(content)
                    else:
                        content = self.tokenize(prefix_line, fname)
                        if not (self.discard_unknown and min(content) < 0):
                            conjecture[marker].append(content)
        return conjecture
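    # Input format as inferred from parse_file above (an assumption, not a
    # spec): each content line starts with a one-character marker and a space,
    # and the remainder is the payload (line.rstrip()[2:]). The first line
    # carries the conjecture name; 'D' introduces a dependency and '+'/'-'
    # introduce positive and negative steps. With simple_format=True the
    # payload sits directly on the marker line (the first line doubles as the
    # tokenized conjecture); in the default format the conjecture and each 'D'
    # dependency take an extra human-readable text line followed by a
    # 'P'-prefixed prefix-notation line, while '+'/'-' steps take just one
    # following prefix line.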
    def parse_definitions(self, fname):
        # every line: the defined token followed by the tokens of its
        # definition; assumes all words are in the vocabulary (def_fname is
        # included when the vocabulary is built)
        result = []
        with open(fname) as f:
            for line in f:
                sline = line.rstrip()[2:].split()
                tokens = [self.reverse_vocabulary_index[w] for w in sline]
                result.append((tokens[0], tokens[1:]))
        return result
def draw_batch(self, split, batch_size, get_conjectures = True, only_pos = False, begin_index = None, use_preselection = True, definitions_size = None):
if self.definitions is None: definitions_size = None
in_order = (begin_index is not None)
if split == 'train':
all_conjectures = self.train_conjectures
elif split == 'val':
all_conjectures = self.val_conjectures
else:
raise ValueError('`split` must be in {"train", "val"}.')
# Preparation of steps and conjectures
steps = []
conjectures = []
if in_order:
labels = []
conjecture_index, step_index = begin_index
while len(steps) < batch_size and conjecture_index < len(all_conjectures):
conjecture = all_conjectures[conjecture_index]
if only_pos: conjecture_steps = conjecture['+']
else: conjecture_steps = conjecture['+']+conjecture['-']
if len(conjecture_steps) > step_index:
if only_pos: step_labels = [1] * len(conjecture['+'])
else: step_labels = [1] * len(conjecture['+']) + [0] * len(conjecture['-'])
remaining = batch_size - len(steps)
added_labels = step_labels[step_index: step_index + remaining]
labels += added_labels
steps += conjecture_steps[step_index: step_index + remaining]
if get_conjectures: conjectures += [conjecture['conj']] * len(added_labels)
step_index += remaining
else:
step_index = 0
conjecture_index += 1
labels = np.asarray(labels)
else:
if only_pos: labels = np.ones((batch_size,), int)
else: labels = np.random.randint(0, 2, size=(batch_size,))
while len(steps) < batch_size:
conjecture = random.choice(all_conjectures)
if labels[len(steps)]:
conjecture_steps = conjecture['+']
else:
conjecture_steps = conjecture['-']
if conjecture_steps:
step = random.choice(conjecture_steps)
steps.append(step)
if get_conjectures: conjectures.append(conjecture['conj'])
# Preparation of definitions
if definitions_size is not None:
definitions = []
while len(definitions) < definitions_size: definitions.append(random.choice(self.definitions))
def_tokens = [token for token, definition in definitions]
definitions = [definition for token, definition in definitions]
# Preselection -- the used words
if use_preselection:
all_data = []
if definitions_size is not None: all_data = all_data + [def_tokens] + definitions
if not self.step_as_index: all_data = all_data + steps
if get_conjectures: all_data = all_data + conjectures
preselection = self.encoder.load_preselection(all_data)
if definitions_size is not None:
def_tokens = [preselection.translation[token] for token in def_tokens]
else: preselection = None
# encoding data
if definitions_size is not None:
definitions = self.encoder(definitions, preselection)
def_tokens = np.array(def_tokens)
if get_conjectures:
conjectures = self.encoder(conjectures, preselection)
if self.step_as_index: steps = np.array(steps)
else: steps = self.encoder(steps, preselection)
# Packing data
batch = dict()
batch['steps'] = steps
if get_conjectures: batch['conjectures'] = conjectures
if preselection is not None: batch['preselection'] = preselection.data
if definitions_size is not None:
batch['def_tokens'] = def_tokens
batch['definitions'] = definitions
batch['labels'] = labels
batch['size'] = len(labels)
if in_order: return batch, (conjecture_index, step_index)
else: return batch
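    # The returned batch dict always contains 'steps', 'labels' and 'size',
    # plus 'conjectures', 'preselection' and 'def_tokens'/'definitions' when
    # requested; the four wrappers below unpack the common combinations.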
def draw_random_batch_of_steps(self, split='train', batch_size=128, **kwargs):
batch = self.draw_batch(split, batch_size, get_conjectures = False, **kwargs)
return [batch['steps'], batch['preselection'], batch['labels']]
    def draw_batch_of_steps_in_order(self, begin_index=(0,0), split='train', batch_size=128, **kwargs):
        batch, index = self.draw_batch(split, batch_size, get_conjectures = False, begin_index = begin_index, **kwargs)
        return [batch['steps'], batch['preselection'], batch['labels']], index
    def draw_batch_of_steps_and_conjectures_in_order(self, begin_index=(0,0), split='train', batch_size=128, **kwargs):
        batch, index = self.draw_batch(split, batch_size, get_conjectures = True, begin_index = begin_index, **kwargs)
        return [batch['steps'], batch['conjectures'], batch['preselection'], batch['labels']], index
def draw_random_batch_of_steps_and_conjectures(self, split='train', batch_size=128, **kwargs):
batch = self.draw_batch(split, batch_size, get_conjectures = True, **kwargs)
return [batch['steps'], batch['conjectures'], batch['preselection'], batch['labels']]
if __name__ == "__main__":
# when loaded alone, just test that data can be loaded
parser = DataParser("mizar-dataset", None, simple_format = True, divide_test = 0.1, step_as_index = True)
print(parser.max_step_index)
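    # A minimal smoke test of batch drawing (a sketch: assumes the
    # 'mizar-dataset' directory above exists; with encoder=None we must skip
    # preselection, which step_as_index batches allow):
    batch = parser.draw_batch('train', batch_size=4, get_conjectures=False,
                              use_preselection=False)
    print(batch['steps'], batch['labels'])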