-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_dataset.py
More file actions
70 lines (56 loc) · 2.18 KB
/
test_dataset.py
File metadata and controls
70 lines (56 loc) · 2.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from model.dataset import *
import math
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
def test_penntreebank():
    """Check the batches streamed by ``CustomDataset`` against the raw text.

    The words of ``data/penn/train.txt`` are read independently of the
    dataset, padded with ``'<pad>'`` up to a whole number of batches, and
    compared with every row the ``DataLoader`` yields once its ids have been
    mapped back to words through ``Vocab.id2word``.

    Raises:
        AssertionError: if an input or target row differs from the slice of
            the word stream expected at that (batch, row) position.
    """
    print("-" * 80)
    path = "data/penn/train.txt"

    # CONSTANTS
    batch_size = 64
    bptt = 70

    print("Running Sanity Check: Dataset")

    # Read the treebank text file as one flat list of whitespace-split tokens.
    words = []
    with open(path, 'r') as f:
        for line in f:
            words.extend(line.split())

    treebank = CustomDataset(path, batch_size=batch_size, bptt=bptt, is_stream=True)
    dataloader = DataLoader(treebank, batch_size=batch_size, shuffle=False, num_workers=4)
    vocab = Vocab(file_path="data/penn/train_stream.json")

    # A "line" is a run of bptt consecutive words; batches hold batch_size lines.
    total_num_lines = math.ceil(len(words) / bptt)
    num_batches = math.ceil(total_num_lines / batch_size)
    print("Total number of batches: ", num_batches)

    # Pad the reference stream so that every (batch, row) slot has bptt words.
    words += ['<pad>'] * (num_batches * bptt * batch_size - len(words))

    # Expected layout: row r of batch i is line number (r * num_batches + i)
    # of the stream, i.e. each row index walks its own contiguous run of
    # num_batches lines, one line per batch.
    for i, (batch_x, batch_y) in enumerate(tqdm(dataloader)):
        # batch_x / batch_y are expected to be (batch_size, bptt) id tensors
        # -- TODO confirm against CustomDataset.

        # The final batch is compared over one position fewer.
        # NOTE(review): presumably because the shifted target of the last
        # position would fall outside the stream -- confirm in CustomDataset.
        seq_len = bptt - 1 if i == num_batches - 1 else bptt

        # For each row of the batch
        for batch_row_idx, (x, y) in enumerate(zip(batch_x, batch_y)):
            x = [vocab.id2word[index] for index in x.tolist()]
            y = [vocab.id2word[index] for index in y.tolist()]

            # Targets are the inputs shifted one word to the right.
            start = (batch_row_idx * num_batches + i) * bptt
            gt_x = words[start: start + seq_len]
            gt_y = words[start + 1: start + seq_len + 1]

            # The ground truth fills the "Expected" slot and the dataloader
            # output fills the "got" slot.
            assert (x == gt_x), "\n At batch {}, row {}, \n Expected: \n {} \n\n got: \n {}".format(i, batch_row_idx, gt_x, x)
            assert (y == gt_y), "\n At batch {}, row {}, \n Expected: \n {} \n\n got: \n {}".format(i, batch_row_idx, gt_y, y)

    print("All Sanity Checks Passed!")
    print("-" * 80)
# Run the dataset sanity check when this file is executed as a script.
if __name__ == '__main__':
    test_penntreebank()