# test_model.py (forked from pasquini-dario/LLMmap)
import sys
import argparse
import itertools

import numpy as np
import tqdm

from LLMmap.dataset import read_dataset
from LLMmap.inference import load_LLMmap


def get_topk_labels_from_distances(distances, k):
    """
    distances : 1-D np.ndarray of shape (num_classes,)
    k         : int (1 ≤ k ≤ num_classes)

    Returns
    -------
    np.ndarray of length k holding the indices of the k closest classes
    (smaller distance = closer = better).
    """
    return np.argsort(distances)[:k]
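

# A quick sanity check of the top-k selection above, with four illustrative
# classes (the values below are made up):
#
#   distances = np.array([0.9, 0.1, 0.5, 0.3])
#   get_topk_labels_from_distances(distances, k=2)  # -> array([1, 3])
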
def evaluate_topk(model, test_iterable, k_values=(1, 2, 3)):
    """
    model         : an open-set inference model returned by load_LLMmap
    test_iterable : iterable of test-set entries (dicts with 'llm' and 'traces')
    k_values      : tuple of k's to compute accuracies for

    Returns
    -------
    dict {k: accuracy_float}
    """
    num_samples = 0
    topk_correct_counter = {k: 0 for k in k_values}

    # Invert the model's label map (index -> name) to look up ground-truth indices.
    llms_map = {v: k for k, v in model.label_map.items()}

    for entry in tqdm.tqdm(test_iterable):
        llm_name = entry['llm']            # ground-truth LLM name from the dataset
        gt_label = llms_map[llm_name]      # ground-truth class index
        answers = [trace[1] for trace in entry['traces']]

        distances = model(answers)         # forward pass: per-class distances
        num_samples += 1

        for k in k_values:
            preds_k = get_topk_labels_from_distances(distances, k)
            if gt_label in preds_k:
                topk_correct_counter[k] += 1

    return {k: topk_correct_counter[k] / num_samples for k in k_values}
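

# Minimal sketch of how evaluate_topk can be exercised without a trained model.
# The stub below is hypothetical: it only mimics the two things the function
# relies on (a label_map attribute and a __call__ returning per-class distances).
#
#   class _StubModel:
#       label_map = {0: 'llm-a', 1: 'llm-b'}
#       def __call__(self, answers):
#           return np.array([0.2, 0.8])  # always "closest" to class 0
#
#   fake_test = [{'llm': 'llm-a', 'traces': [('q', 'a')]}]
#   evaluate_topk(_StubModel(), fake_test, k_values=(1,))  # -> {1: 1.0}
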
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Test a pre-trained LLMmap model on the test set."
    )
    parser.add_argument(
        "model_home_dir",
        type=str,
        help="Path to the model home directory"
    )
    parser.add_argument(
        "-k", "--topk",
        type=int,
        default=3,
        metavar="K",
        help="Compute top-1 … top-K accuracies (default: 3)"
    )
    parser.add_argument(
        "-m", "--max-entries",
        type=int,
        default=None,
        metavar="N",
        help="Evaluate only the first N samples of the test set (default: all)"
    )
    args = parser.parse_args()

    if args.topk < 1:
        parser.error("--topk must be ≥ 1")

    # ------------------------------------------------------------------
    conf, inf = load_LLMmap(args.model_home_dir, device='cpu')

    if not conf['is_open']:
        print("This script only supports open-set inference models. Aborting...")
        sys.exit(1)

    if not inf.ready:
        print("No templates found for the model. Aborting...")
        sys.exit(1)
    # The training split is not needed for evaluation.
    _, test = read_dataset(conf['dataset_path'])

    # Respect --max-entries (None means "all")
    test_iter = (
        test if args.max_entries is None
        else itertools.islice(test, args.max_entries)
    )

    # Build the tuple (1, 2, ..., K)
    k_values = tuple(range(1, args.topk + 1))

    print("Running test...")
    acc = evaluate_topk(inf, test_iter, k_values=k_values)

    # Print all requested accuracies
    for k in k_values:
        print(f"Top-{k} accuracy: {acc[k]:.3%}")