# secondOrderRNN/test_dataset.py (noa-codes/secondOrderRNN, master branch)

from model.dataset import *
import math
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

def _mismatch_message(batch_idx, row_idx, expected, actual):
    # Build the assertion failure text for one row of one batch.
    return "\n At batch {}, row {}, \n Expected: \n {} \n\n got: \n {}".format(
        batch_idx, row_idx, expected, actual)


def test_penntreebank():
    """Sanity-check the streamed Penn Treebank batches against the raw text.

    Reads ``data/penn/train.txt`` into a flat token list, builds a
    ``CustomDataset`` in stream mode wrapped in a ``DataLoader``, and loads a
    ``Vocab`` from ``data/penn/train_stream.json``. Every row of every batch
    is decoded through ``vocab.id2word`` and compared with the slice of the
    (padded) token list that this test expects the row to hold.

    Raises:
        AssertionError: on the first row whose decoded inputs or targets
            differ from the reference slice. The message shows the reference
            slice under "Expected" and the decoded loader output under "got".
    """
    print ("-"*80)
    path = "data/penn/train.txt"

    # CONSTANTS
    batch_size = 64
    bptt = 70
    print("Running Sanity Check: Dataset")

    # Read the treebank text file as one flat list of whitespace-split tokens.
    words = []
    with open(path, 'r') as f:
        for line in f:
            words.extend(line.split())

    treebank = CustomDataset(path, batch_size=batch_size, bptt=bptt, is_stream=True)
    dataloader = DataLoader(treebank, batch_size=batch_size, shuffle=False, num_workers=4)
    vocab = Vocab(file_path="data/penn/train_stream.json")

    # Number of bptt-sized chunks in the file, then number of batches needed
    # to hold them at batch_size chunks per batch (both rounded up).
    total_num_lines = math.ceil(len(words) / bptt)
    num_batches = math.ceil(total_num_lines / batch_size)
    print("Total number of batches: ", num_batches)

    # Pad the reference stream so it fills num_batches * batch_size full chunks.
    words += ['<pad>'] * (num_batches * bptt * batch_size - len(words))

    # Expected layout: row r of batch i holds chunk number r * num_batches + i
    # of the padded stream. For example, with 3 batches of 3 rows each, the
    # nine chunks are consumed as
    #   batch 0 -> chunks 0, 3, 6
    #   batch 1 -> chunks 1, 4, 7
    #   batch 2 -> chunks 2, 5, 8
    last_batch = num_batches - 1
    for i, (batch_x, batch_y) in enumerate(tqdm(dataloader)):
        # Rows of the final batch are compared on bptt - 1 tokens only.
        # NOTE(review): presumably because the targets need one token past
        # the inputs and the stream ends there -- confirm against CustomDataset.
        seq_len = bptt - 1 if i == last_batch else bptt

        for batch_row_idx, (x, y) in enumerate(zip(batch_x, batch_y)):
            # Decode token ids back into words for comparison.
            x = [vocab.id2word[index] for index in x.tolist()]
            y = [vocab.id2word[index] for index in y.tolist()]

            # Offset of this row's chunk in the padded stream; the targets
            # are the same window shifted right by one token.
            start = (batch_row_idx * num_batches + i) * bptt
            gt_x = words[start: start + seq_len]
            gt_y = words[start + 1: start + seq_len + 1]

            # The reference slice is the expected value; the decoded loader
            # output is what we actually got.
            assert x == gt_x, _mismatch_message(i, batch_row_idx, gt_x, x)
            assert y == gt_y, _mismatch_message(i, batch_row_idx, gt_y, y)

    print("All Sanity Checks Passed!")
    print ("-"*80)


# Run the dataset sanity check when this file is executed as a script.
if __name__ == "__main__":
    test_penntreebank()