"""Evaluation utilities for top-K recommendation.

Computes HR@K, NDCG@K, Precision@K and Recall@K, both for a
matrix-factorization model (evaluate_metrics) and for a model exposing
predict(users, items, batch_size=...) (evaluate_model).
"""
import math
import heapq  # for retrieving the top-K items by score
import numpy as np

# Global variables that are shared across processes
_model = None
_testRatings = None
_testNegatives = None
_K = None
def getHR(ranklist, gtItems):
    """Hit Ratio@K: 1 if any ground-truth item appears in the ranked list, else 0."""
    for item in ranklist:
        if item in gtItems:
            return 1
    return 0
def get_precision(ranklist, gtItems):
    """Precision@K: fraction of the ranked list that is relevant."""
    relevant = 0
    for item in ranklist:
        if item in gtItems:
            relevant += 1
    return relevant / len(ranklist)
def getNDCG(ranklist, gtItems):
    """NDCG@K with the standard 1 / log2(rank + 1) position discount."""
    dcg = 0.0
    for i, item in enumerate(ranklist):
        if item in gtItems:
            dcg += math.log(2) / math.log(i + 2)  # equals 1 / log2(i + 2)
    # Ideal DCG: every top position holds a relevant item, truncated at the
    # list length so NDCG@K can reach 1 even when K < len(gtItems)
    ideal_hits = min(len(gtItems), len(ranklist))
    idcg = sum(math.log(2) / math.log(i + 2) for i in range(ideal_hits))
    return dcg / idcg if idcg > 0 else 0.0
def get_recall(ranklist, gtItems):
    """Recall@K: fraction of the ground-truth items that appear in the ranked list."""
    relevant = 0
    for item in ranklist:
        if item in gtItems:
            relevant += 1
    return relevant / len(gtItems)
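# Worked example of the four helpers on a toy ranking (the item ids below
# are made up for illustration only):
#   ranklist = [3, 1, 7], gtItems = [1, 9]
#   getHR(...)         -> 1    (item 1 was retrieved)
#   get_precision(...) -> 1/3  (1 of the 3 ranked items is relevant)
#   get_recall(...)    -> 1/2  (1 of the 2 ground-truth items retrieved)
#   getNDCG(...)       -> (1/log2(3)) / (1 + 1/log2(3)) ~= 0.387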
#### For the matrix-factorization binary case:
def evaluate_metrics(test_data, P, Q, user_id_map, item_id_map, K=10):
    """
    Evaluate metrics (HR@K, NDCG@K, Precision@K, Recall@K) for a leave-2-out test set.

    Parameters:
    - test_data: DataFrame containing test samples with the following columns:
      'user', 'id1' (positive item 1), 'id2' (positive item 2), ...,
      'negative_1', ..., 'negative_99'
    - P: user latent factor matrix
    - Q: item latent factor matrix
    - user_id_map: dictionary mapping user IDs to row indices of P
    - item_id_map: dictionary mapping item IDs to row indices of Q
    - K: number of top items to consider for the metrics

    Returns:
    - avg_hr: average Hit Ratio at K
    - avg_ndcg: average Normalized Discounted Cumulative Gain at K
    - avg_precision: average Precision at K
    - avg_recall: average Recall at K
    """
    hr_list = []
    ndcg_list = []
    precision_list = []
    recall_list = []
    for _, row in test_data.iterrows():
        user_id = row['user']
        positive_items = [row['id1'], row['id2']]
        # Extract the negative items from the row
        negative_items = row.iloc[7:].values  # assuming the negatives start at the 8th column
        user_idx = user_id_map[user_id]
        # Build the candidate list (two positives + negatives)
        candidate_items = positive_items + list(negative_items)
        candidate_indices = [item_id_map[item] for item in candidate_items]
        # Score every candidate item with the dot product of its latent factors
        scores = [np.dot(P[user_idx], Q[item_idx]) for item_idx in candidate_indices]
        # Rank items by predicted score (descending) and keep the top K
        ranked_indices = np.argsort(scores)[::-1]
        ranked_items = [candidate_items[i] for i in ranked_indices[:K]]
        # Compute the metrics for the current user
        hr_list.append(getHR(ranked_items, positive_items))
        ndcg_list.append(getNDCG(ranked_items, positive_items))
        precision_list.append(get_precision(ranked_items, positive_items))
        recall_list.append(get_recall(ranked_items, positive_items))
    # Compute the average metrics across all users
    avg_hr = np.mean(hr_list)
    avg_ndcg = np.mean(ndcg_list)
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    return avg_hr, avg_ndcg, avg_precision, avg_recall
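# Example usage (a sketch: the DataFrame layout, factor shapes and id maps
# below are assumptions for illustration, not fixed by this module):
#
#   P = np.random.rand(num_users, num_factors)
#   Q = np.random.rand(num_items, num_factors)
#   user_id_map = {uid: idx for idx, uid in enumerate(unique_user_ids)}
#   item_id_map = {iid: idx for idx, iid in enumerate(unique_item_ids)}
#   hr, ndcg, precision, recall = evaluate_metrics(
#       test_df, P, Q, user_id_map, item_id_map, K=10)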
def evaluate_model(model, testRatings, testNegatives, K):
    """
    Evaluate the performance (HR@k, NDCG@k, Precision@k, Recall@k) of top-K recommendation.
    Return: one list per metric, each entry holding that test case's scores for k = 1..K.
    """
    global _model
    global _testRatings
    global _testNegatives
    global _K
    _model = model
    _testRatings = testRatings
    _testNegatives = testNegatives
    _K = K

    hits, ndcgs, precisions, recalls = [], [], [], []
    for idx in range(len(_testRatings)):
        hr, ndcg, p, r = eval_one_rating(idx)
        hits.append(hr)
        ndcgs.append(ndcg)
        precisions.append(p)
        recalls.append(r)
    return hits, ndcgs, precisions, recalls
def eval_one_rating(idx):
    hr, ndcg, precision, recall = [], [], [], []
    rating = _testRatings[idx]
    u = rating[0]
    gtItems = rating[1:]
    # Build the candidate list as a fresh copy so that the shared
    # _testNegatives entry is not mutated between calls
    items = list(_testNegatives[idx]) + list(gtItems)

    # Get prediction scores for every candidate item
    users = np.full(len(items), u, dtype='int32')
    predictions = _model.predict(users, np.array(items), batch_size=100)
    map_item_score = {}
    for item, pred in zip(items, predictions):
        map_item_score[item] = np.max(pred)  # collapse a length-1 prediction vector to a scalar

    # Evaluate the top-ranked list at every cutoff k in 1..K
    for k in range(1, _K + 1):
        ranklist = heapq.nlargest(k, map_item_score, key=map_item_score.get)
        hr.append(getHR(ranklist, gtItems))
        ndcg.append(getNDCG(ranklist, gtItems))
        precision.append(get_precision(ranklist, gtItems))
        recall.append(get_recall(ranklist, gtItems))
    return hr, ndcg, precision, recall
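if __name__ == "__main__":
    # Minimal smoke test with a stand-in model (an assumption for
    # illustration; the real model is any recommender exposing
    # predict(users, items, batch_size=...) returning one score per pair).
    class _RandomModel:
        def predict(self, users, items, batch_size=100):
            rng = np.random.default_rng(seed=0)
            return rng.random((len(items), 1))

    # One leave-2-out test case: user 0 with positives [100, 101] and
    # three sampled negatives (toy ids, for illustration only).
    ratings = [[0, 100, 101]]
    negatives = [[5, 6, 7]]
    hits, ndcgs, precisions, recalls = evaluate_model(
        _RandomModel(), ratings, negatives, K=3)
    print("HR@1..K:       ", hits[0])
    print("NDCG@1..K:     ", ndcgs[0])
    print("Precision@1..K:", precisions[0])
    print("Recall@1..K:   ", recalls[0])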