-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
executable file
·229 lines (171 loc) · 8.3 KB
/
utils.py
File metadata and controls
executable file
·229 lines (171 loc) · 8.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
import pandas as pd
import numpy as np
import os
import random
import copy
import json
import glob
def loadItemsProperty(itemsPropertyFile):
    """Load item properties from a JSON-lines file.

    Every non-blank line of the file is parsed as an independent JSON
    document. Blank (or whitespace-only) lines are skipped so that a trailing
    newline or an empty separator line does not abort the whole load.

    Args:
        itemsPropertyFile (str): path to the json file with item properties
            (one JSON document per line).

    Returns:
        list: the parsed JSON documents, in file order (one entry per
            non-blank line).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(itemsPropertyFile, 'r') as handle:
        # Iterate the handle lazily; strip() is only used to detect blank lines,
        # json.loads itself tolerates surrounding whitespace.
        return [json.loads(line) for line in handle if line.strip()]
def loadRatings(ratingsFile):
    """Read a header-less ratings csv file and build its user/item pivot table.

    Args:
        ratingsFile (str): path to the csv file with item ratings. The four
            columns are read as userId, itemId, rating and timestamp.

    Returns:
        pivot_utility (dataframe): pivot table of ratings (users as rows,
            items as columns), missing cells filled with 0, cast to float
        ratings_df (dataframe): the ratings exactly as read from the file
    """
    ratings_df = pd.read_csv(
        ratingsFile,
        sep=",",
        decimal=".",
        header=None,
        names=["userId", "itemId", "rating", "timestamp"],
    )
    pivot_utility = ratings_df.pivot_table(
        values='rating',
        index='userId',
        columns='itemId',
        fill_value=0,
    )
    pivot_utility = pivot_utility.astype(float)
    return pivot_utility, ratings_df
def fill_users_items_dict(ratings_df, items_array):
    """Build a per-user dictionary describing every rating in ``ratings_df``.

    Args:
        ratings_df (dataframe): dataframe with information about users, items
            and ratings (columns ``userId``, ``itemId`` and ``rating`` are read).
        items_array (array): array of all items. The position of an item in
            this array becomes its ``global_id``.

    Returns:
        holy_dict: dictionary whose keys are users and whose values are lists
            with one entry per rating of that user. Each entry is a dict with
            the keys ``itemId``, ``real_rating`` (float), ``predicted_rating``
            (initialised to 0.0), ``folder`` (initialised to 0), ``global_id``
            (int position of the item in ``items_array``) and ``relevance``
            (initialised to -1).

    Raises:
        IndexError: if a rated item does not occur in ``items_array``.
    """
    # Build the item -> position table once instead of scanning items_array
    # for every rating row. setdefault keeps the FIRST position of an item,
    # which is what a first-match search over the array would return.
    item_positions = {}
    for position, item_id in enumerate(items_array):
        item_positions.setdefault(item_id, position)

    holy_dict = {}
    for row in ratings_df.itertuples():
        try:
            global_id = item_positions[row.itemId]
        except KeyError:
            # Keep the exception type callers may already rely on for an
            # item that is missing from items_array.
            raise IndexError(
                "item %r is not present in items_array" % (row.itemId,)
            ) from None
        item = {
            "itemId": row.itemId,
            "real_rating": float(row.rating),
            "predicted_rating": 0.0,
            "folder": 0,
            "global_id": int(global_id),
            "relevance": -1,
        }
        holy_dict.setdefault(row.userId, []).append(item)
    return holy_dict
def set_folders_cv5(users_items_dict):
    """Randomly assign every item of every user to one of 5 cross-validation folders.

    The item list of each user is shuffled in place, then cut into consecutive
    chunks of ``len(items) // 5`` items for folders 1 to 4. Folder 5 receives
    everything that is left, so it also absorbs the remainder of the division
    (and gets all items when a user has fewer than 5 of them).

    Args:
        users_items_dict (dictionary): dictionary which was generated by
            fill_users_items_dict function

    Returns:
        users_items_dict: the same dictionary, with the ``folder`` field of
            every item filled in
    """
    last_fold = 5
    for rated_items in users_items_dict.values():
        random.shuffle(rated_items)
        chunk = len(rated_items) // last_fold
        for position, entry in enumerate(rated_items):
            if chunk == 0:
                # Fewer than 5 items: no full chunk exists, everything is fold 5.
                entry['folder'] = last_fold
            else:
                # Chunks 0..3 map to folds 1..4; anything beyond goes to fold 5.
                entry['folder'] = min(position // chunk, last_fold - 1) + 1
    return users_items_dict
# This function was used only once, to split the data for a competition with strictly predefined folders.
# The final version of EvaS uses our own splitting method, set_folders_cv5, instead.
def set_folders_cv5_competition_splits(users_items_dict, fold_files=None):
    """Assign items to cross-validation folders according to predefined csv split files.

    For every item a line prefix ``"<user>,<itemId>,<real_rating>"`` is built
    and searched (as a substring) in the text of each fold file, in order.
    The item gets the number of the first fold file containing it; items found
    in no file keep their current ``folder`` value.

    Args:
        users_items_dict (dictionary): dictionary which was generated by
            fill_users_items_dict function. User keys and ``itemId`` values
            must be strings because they are concatenated into the search line.
        fold_files (sequence of str, optional): paths of the fold files; the
            file at position ``k`` (0-based) defines folder ``k + 1``.
            Defaults to the five ``ratings_Electronics_50_fold*.csv`` files in
            ``../Dataset``.

    Returns:
        users_items_dict: the same dictionary with filled information about
            splitting folders

    Raises:
        OSError: if one of the fold files cannot be opened.
    """
    if fold_files is None:
        fold_files = (
            "../Dataset/ratings_Electronics_50_fold1.csv",
            "../Dataset/ratings_Electronics_50_fold2.csv",
            "../Dataset/ratings_Electronics_50_fold3.csv",
            "../Dataset/ratings_Electronics_50_fold4.csv",
            "../Dataset/ratings_Electronics_50_fold5.csv",
        )

    # Read every fold file up front; the context manager guarantees the handle
    # is closed even when a later file is missing or reading fails.
    fold_contents = []
    for path in fold_files:
        with open(path, "r") as handle:
            fold_contents.append(" ".join(handle.readlines()))

    for user, items in users_items_dict.items():
        for item in items:
            splitting_line = user + "," + item["itemId"] + "," + str(item["real_rating"])
            # First matching fold wins, mirroring a fold1 -> fold5 priority order.
            for folder, content in enumerate(fold_contents, start=1):
                if splitting_line in content:
                    item['folder'] = folder
                    break
    return users_items_dict
def set_relevance(users_items_dict):
    """Mark every item of every user as relevant (1) or not relevant (0).

    For each folder 1..5 the threshold is the mean ``real_rating`` of the
    user's items that are NOT in that folder; an item in the folder is
    relevant when its own ``real_rating`` is greater than or equal to that
    threshold.

    Args:
        users_items_dict (dictionary): dictionary which was generated by
            fill_users_items_dict function and filled by set_folders_cv5 function

    Returns:
        users_items_dict: the same dictionary with the ``relevance`` field of
            each item filled in
    """
    for rated_items in users_items_dict.values():
        for fold in range(1, 6):
            # Ratings of the user outside the current fold (the training part).
            train_ratings = [
                entry['real_rating'] for entry in rated_items if entry['folder'] != fold
            ]
            threshold = np.mean(np.array(train_ratings))
            for entry in rated_items:
                if entry['folder'] != fold:
                    continue
                entry['relevance'] = int(entry['real_rating'] >= threshold)
    return users_items_dict
def generate_pivot_without_folder_k(original_pivot_utility, users_items_dict, folder):
    """Return a copy of the pivot table with the ratings of one test folder zeroed out.

    Args:
        original_pivot_utility (dataframe): pivot table with ratings (users as
            index, items as columns). It is not modified.
        users_items_dict (dictionary): dictionary which was generated by
            fill_users_items_dict function. Contains information about
            splitting folders.
        folder (int): the number of the cross-validation experiment. Controls
            which test set should be excluded from the training procedure.

    Returns:
        pivot_utility_copy (dataframe): pivot table without ratings from test
            folder (only training info); excluded cells are set to 0.

    Raises:
        KeyError: if a test item or its user is not a label of the pivot table.
    """
    pivot_utility_copy = copy.deepcopy(original_pivot_utility)
    row_labels = pivot_utility_copy.index
    column_labels = pivot_utility_copy.columns
    for user, items in users_items_dict.items():
        for item in items:
            if item['folder'] != folder:
                continue
            # Write through a single indexer. Chained assignment
            # (df[col][row] = 0) updates a temporary object and leaves the
            # frame untouched when pandas Copy-on-Write is active, which would
            # leak test ratings into the training pivot.
            row_pos = row_labels.get_loc(user)
            col_pos = column_labels.get_loc(item['itemId'])
            pivot_utility_copy.iat[row_pos, col_pos] = 0
    return pivot_utility_copy
def generate_ratings_df_without_folder_k(original_ratings_df, users_items_dict, folder):
    """Return a copy of the ratings dataframe without the rows of one test folder.

    Args:
        original_ratings_df (dataframe): original dataframe with ratings
            (columns ``userId`` and ``itemId`` are used for matching). It is
            not modified.
        users_items_dict (dictionary): dictionary which was generated by
            fill_users_items_dict function. Contains information about
            splitting folders.
        folder (int): the number of the cross-validation experiment. Controls
            which test set should be excluded from the training procedure.

    Returns:
        ratings_df_copy (dataframe): dataframe without ratings from test folder
            (only training info). Remaining rows keep their original index
            labels.
    """
    # Collect the (user, item) pairs of the test folder once, so the dataframe
    # is scanned a single time instead of once per test item.
    test_pairs = set()
    for user, items in users_items_dict.items():
        for item in items:
            if item['folder'] == folder:
                test_pairs.add((user, item['itemId']))

    if not test_pairs:
        return copy.deepcopy(original_ratings_df)

    # Positional boolean mask: True for rows that stay in the training data.
    keep_mask = np.fromiter(
        (
            (user_id, item_id) not in test_pairs
            for user_id, item_id in zip(original_ratings_df["userId"],
                                        original_ratings_df["itemId"])
        ),
        dtype=bool,
        count=len(original_ratings_df),
    )
    # .copy() detaches the result from the original frame.
    return original_ratings_df.loc[keep_mask].copy()