biomind/test_evaluation_logic.py at master · 269652/biomind · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#!/usr/bin/env python3
"""
Simple test to verify MMLU evaluation logic without loading complex models
"""

# Mock data for exercising the evaluation logic: each question has a different
# correct option, while the "model" always predicts option A (index 0).
correct_answers = [1, 2, 3, 0]  # B, C, D, A
predicted_answers = [0, 0, 0, 0]  # Always predict A

# Letter label -> 0-based option index.
_LETTER_TO_INDEX = {"A": 0, "B": 1, "C": 2, "D": 3}


def answer_to_index(ans):
    """Convert an answer label to a 0-based option index.

    Args:
        ans: Either a single letter ``'A'``-``'D'`` (case-insensitive) or an
            integer in the range 0-3.

    Returns:
        The matching index for a letter, the integer itself when it is
        already a valid index, and ``0`` as a fallback for anything else.
    """
    if isinstance(ans, str):
        # Normalise case *before* the lookup so that 'b' maps to 1 instead of
        # silently falling through to the fallback.
        return _LETTER_TO_INDEX.get(ans.upper(), 0)
    if isinstance(ans, int) and 0 <= ans <= 3:
        return ans
    return 0  # Fallback


print("Testing evaluation logic:")
print("Correct answers (indices):", correct_answers)
print("Predicted answers (indices):", predicted_answers)

# Test the target creation logic from the evaluation function
for i, (correct, predicted) in enumerate(zip(correct_answers, predicted_answers)):
    # This mimics the evaluation logic, which works on batches of answers.
    correct_answers_batch = [correct]  # Batch of 1
    answer_targets = [answer_to_index(ans) for ans in correct_answers_batch]

    target_tensor_val = answer_targets[0]
    predicted_val = predicted

    is_correct = (predicted_val == target_tensor_val)
    print(f"Q{i+1}: Correct={target_tensor_val}, Predicted={predicted_val}, Is_Correct={is_correct}")