biLSTM/visualize-training-data.py at master · fhgr/biLSTM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python3

import gzip
import numpy as np

from encode import TermTranslator

from glob import glob
from pickle import load
from csv import reader

# Glob patterns for the training corpus shards; the trailing "?" matches
# exactly one character, so each pattern selects files such as "..._x.0".
# Matching files are read with gzip + pickle by l().
TRAINING_CORPUS_X = "./train/html_training_corpus.bin.gz_x.?"
TRAINING_CORPUS_Y = "./train/html_training_corpus.bin.gz_y.?"
# Gzip-compressed CSV of (word, index) rows, read by read_vocabulary_file().
# NOTE(review): the extension is spelled ".cvs" rather than ".csv" -- confirm
# it matches the file name on disk before "fixing" it.
VOCABULARY = "html_vocabulary.cvs.gz"

def l(corpus_pattern):
    ''' Reads every gzip-compressed pickle matching corpus_pattern.

    The matching files are visited in sorted file-name order and the
    records unpickled from each one are concatenated into a single list.
    The number of records read is printed before returning.

    NOTE: unpickling can execute arbitrary code, so only point this at
    corpus files from a trusted source.
    '''
    records = []
    matching_files = glob(corpus_pattern)
    matching_files.sort()
    for path in matching_files:
        with gzip.open(path) as handle:
            records += load(handle)
    print("Read:", len(records), "records.")
    return records


def get_matrix(data, sequence_len, term_translator, size, skip):
    ''' Converts a window of examples from data into a 3-D integer matrix.

    The first `skip` examples of `data` are ignored; the following (at most)
    `size` examples are converted value by value with
    term_translator.int_to_vector and stored as consecutive rows.

    Parameters:
        data: iterable of examples; each example is an iterable of values
            accepted by term_translator.int_to_vector.
        sequence_len: number of vectors per example (second matrix axis).
        term_translator: object providing `vector_len` and `int_to_vector`.
        size: number of examples to place into the matrix (first axis).
        skip: number of leading examples to ignore.

    Returns:
        A numpy array of shape (size, sequence_len, term_translator.vector_len)
        initialised with 0. If fewer than `size` examples follow the skipped
        ones, the remaining rows stay 0. `size == 0` yields an empty matrix
        instead of raising IndexError.

    Each converted example has to be assignable to a
    (sequence_len, vector_len) row; numpy raises otherwise.
    '''
    from itertools import islice

    print("Creating data matrix...")
    result = np.full([size, sequence_len, term_translator.vector_len], 0)
    # islice selects examples skip .. skip+size-1 without a manual
    # continue/break dance and yields nothing when size is 0.
    for row, example in enumerate(islice(data, skip, skip + size)):
        result[row] = [term_translator.int_to_vector(v) for v in example]
    # shape[1:] is the per-example shape (what result[0].shape reported) but
    # does not fail on an empty matrix.
    print("Completed computation of data matrix with shape", result.shape[1:], "...")
    return result

def read_vocabulary_file(fname):
    ''' Reads a gzip-compressed CSV vocabulary into a dictionary.

    Parameters:
        fname: path of the gzip-compressed CSV file; every row must have
            exactly two columns, the word and its integer index.

    Returns:
        A dict mapping each word to its index converted to int.

    Raises:
        ValueError: if a row does not have exactly two columns or the index
            column is not a valid integer.
    '''
    vocabulary = {}
    # Open the file that was asked for (previously the module-level
    # VOCABULARY constant was opened and fname was ignored). newline="" lets
    # the csv reader do its own newline handling, as the csv docs recommend.
    with gzip.open(fname, 'rt', newline="") as f:
        for word, idx in reader(f):
            vocabulary[word] = int(idx)

    return vocabulary

if __name__ == '__main__':
    # Build the term translator from the vocabulary file.
    vocabulary = read_vocabulary_file(VOCABULARY)
    tt = TermTranslator(vocabulary)
    print("Vocabulary size:", len(vocabulary))
    print("Vocabulary vector size:", tt.vector_len)

    sequence_len = 15
    sample_size = 5
    first_example = 500000

    # Load the y corpus first, then the x corpus (same order as the output
    # they print), and flatten each example matrix into a single row.
    flattened = {}
    for side, pattern in (("y", TRAINING_CORPUS_Y), ("x", TRAINING_CORPUS_X)):
        matrix = get_matrix(l(pattern), sequence_len, tt, sample_size, skip=first_example)
        flattened[side] = matrix.reshape(sample_size, sequence_len*tt.vector_len)

    x = tt.matrix_to_term_sequeence(flattened["x"])
    y = tt.matrix_to_term_sequeence(flattened["y"])

    # Print each x sequence above its y sequence, tab-separated.
    for xx, yy in zip(x, y):
        print("\t".join(xx), "\t".join(yy), sep="\n")
        print("----------")