# W-D-Prediction/DataParser.py at master · Backpack-Technologies/W-D-Prediction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import ProductInfo
import numpy as np
import json
from sklearn.model_selection import train_test_split

# TOP is embedded as a suffix in the embeddings and parsed-dataset file names.
TOP = 1000000
# Input file with one embedding row per product (read by get_data).
DATA_FILE = "data/p2v-embeddings{}".format(TOP)
# Input file with one JSON record per line, mapping an asin to its dimensions.
DIMENSION_FILE = "data/asin+dimensions.txt"
# Base name (np.save adds ".npy") of the cached, parsed dataset.
PARSED_DATA = "data/dataset{}".format(TOP)

# asin -> dimensions mapping, filled in by get_data().
dimensions = {}
# Number of leading feature columns in a parsed row; target columns follow.
NUM_OF_COLS = 300


def init_file_name():
    """Recompute the module-level file paths from the current value of ``TOP``.

    Rebinds ``DATA_FILE``, ``DIMENSION_FILE`` and ``PARSED_DATA`` so that a
    change made to ``TOP`` after import is reflected in the paths.
    """
    global DATA_FILE, DIMENSION_FILE, PARSED_DATA

    suffix = str(TOP)
    DIMENSION_FILE = "data/asin+dimensions.txt"
    DATA_FILE = "data/p2v-embeddings{}".format(suffix)
    PARSED_DATA = "data/dataset{}".format(suffix)


def get_data(cat1, cat2, cat3, cat4):
    """Join embedding rows with four dimension values per product.

    ``DIMENSION_FILE`` is read as one JSON object per line, each carrying an
    ``'asin'`` and a ``'dimensions'`` entry. Records whose dimensions lack any
    of the four requested keys are ignored. ``DATA_FILE`` is then read as
    whitespace-separated rows whose first token is the asin; its first line
    is skipped.

    Args:
        cat1: First dimension key to append to every row.
        cat2: Second dimension key to append to every row.
        cat3: Third dimension key to append to every row.
        cat4: Fourth dimension key to append to every row.

    Returns:
        A list of rows. Each row holds the remaining tokens of the data line
        (still strings) followed by the four dimension values in the order
        ``cat1, cat2, cat3, cat4``. Rows whose asin has no usable dimension
        record are left out.

    Side effects:
        Accepted records are also stored in the module-level ``dimensions``
        dict, and progress counters are printed.
    """
    categories = (cat1, cat2, cat3, cat4)

    # Mapping built for THIS call only. Membership is checked against it
    # rather than the module-level dict, which may still hold entries from an
    # earlier call made with other categories (those would raise KeyError).
    selected = {}
    accepted = 0
    with open(DIMENSION_FILE) as infile:
        for line in infile:
            if not line.strip():
                # Blank lines carry no record and would fail JSON decoding.
                continue
            record = json.loads(line)
            dimension = record['dimensions']
            if any(cat not in dimension for cat in categories):
                continue
            selected[record['asin']] = dimension
            # Keep the module-level mapping up to date for existing users.
            dimensions[record['asin']] = dimension

            accepted += 1
            if accepted % 10000 == 0:
                print("created mapping", accepted)

    datas = []
    with open(DATA_FILE, "r") as infile:
        for line_no, line in enumerate(infile, start=1):
            if line_no % 1000 == 0:
                print(line_no)

            if line_no == 1:
                # The first line is not a data row.
                continue

            fields = line.split()
            if not fields:
                # Blank line: there is no asin to look up.
                continue

            asin = fields[0]
            if asin not in selected:
                continue

            row = fields[1:]
            wanted = selected[asin]
            row.extend(wanted[cat] for cat in categories)
            datas.append(row)

    return datas


def get_splitted_data_for_model(load_data):
    """Return a train/test split of the parsed dataset.

    Args:
        load_data: When truthy, the dataset is rebuilt with ``parse_data()``
            and saved under ``PARSED_DATA`` before being loaded. Otherwise
            the previously saved ``.npy`` file is loaded as is.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from ``train_test_split`` with
        ``test_size=0.2`` and ``random_state=42``. ``X`` is the first
        ``NUM_OF_COLS`` columns and ``y`` the four columns after them.
    """
    init_file_name()
    if load_data:
        # np.float_ was removed in NumPy 2.0; an explicit float64 dtype does
        # the same conversion on every NumPy version.
        datas = np.asarray(parse_data(), dtype=np.float64)
        np.save(PARSED_DATA, datas)

    # np.save appends ".npy" to the base name, so load with the suffix.
    datas = np.load(PARSED_DATA + ".npy")
    X = datas[:, 0:NUM_OF_COLS]
    y = datas[:, NUM_OF_COLS:NUM_OF_COLS + 4]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test


def get_data_for_model(load_data):
    """Return the full parsed dataset as features and targets, unsplit.

    Args:
        load_data: When truthy, the dataset is rebuilt with ``parse_data()``
            and saved under ``PARSED_DATA`` before being loaded. Otherwise
            the previously saved ``.npy`` file is loaded as is.

    Returns:
        ``(X, y)`` where ``X`` is the first ``NUM_OF_COLS`` columns and ``y``
        the four columns after them.
    """
    init_file_name()
    if load_data:
        # np.float_ was removed in NumPy 2.0; an explicit float64 dtype does
        # the same conversion on every NumPy version.
        datas = np.asarray(parse_data(), dtype=np.float64)
        np.save(PARSED_DATA, datas)

    # np.save appends ".npy" to the base name, so load with the suffix.
    datas = np.load(PARSED_DATA + ".npy")
    # Column bounds follow NUM_OF_COLS, matching get_splitted_data_for_model,
    # instead of a hard-coded width of 100.
    X = datas[:, 0:NUM_OF_COLS]
    y = datas[:, NUM_OF_COLS:NUM_OF_COLS + 4]
    return X, y


def parse_data():
    """Build the dataset rows with length, width, height and weight appended.

    Returns:
        The result of ``get_data`` called with the keys ``"length"``,
        ``"width"``, ``"height"`` and ``"weight"``, in that order.
    """
    return get_data("length", "width", "height", "weight")


if __name__ == "__main__":
    # Script entry point: build the rows and report how many were kept.
    rows = get_data("length", "width", "height", "weight")
    print(len(rows))