-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsemantic_mapping.py
More file actions
147 lines (120 loc) · 5.47 KB
/
semantic_mapping.py
File metadata and controls
147 lines (120 loc) · 5.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import pandas as pd
import numpy as np
import warnings
from sentence_transformers import SentenceTransformer, util
# Suppress FutureWarning messages (e.g. from huggingface/transformers) raised from this point on
warnings.simplefilter(action='ignore', category=FutureWarning)
from anchor_extraction import load_dataset, process_dataset_anchors
class SemanticMapper:
    """Scores a student answer against a list of reference anchor phrases.

    Combines sentence-embedding cosine similarity (computed with a
    SentenceTransformer model) with two lexical measures: word-level
    Jaccard overlap and normalized Levenshtein similarity.
    """

    # Keys of the feature dict returned by evaluate_student_answer.
    FEATURE_KEYS = (
        'avg_semantic_sim',
        'max_semantic_sim',
        'anchors_covered',
        'avg_jaccard',
        'avg_edit_sim',
    )

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the SentenceTransformer model named ``model_name``."""
        print(f"Loading SentenceTransformer model: {model_name}")
        self.model = SentenceTransformer(model_name)

    def compute_jaccard_similarity(self, text1, text2):
        """Return the Jaccard similarity of the two texts' word sets.

        Both texts are lower-cased and split on whitespace. Returns 0.0
        when either text contains no words.
        """
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())
        if not words1 or not words2:
            return 0.0
        return len(words1 & words2) / len(words1 | words2)

    @staticmethod
    def _levenshtein_distance(a, b):
        """Return the Levenshtein edit distance between strings a and b.

        Uses the ``Levenshtein`` package when it can be imported and falls
        back to a pure-Python dynamic-programming implementation otherwise.
        """
        try:
            import Levenshtein
        except ImportError:
            pass
        else:
            return Levenshtein.distance(a, b)

        # Keep the shorter string on the inner axis so rows stay small.
        if len(a) < len(b):
            a, b = b, a
        previous = list(range(len(b) + 1))
        for i, ch_a in enumerate(a, start=1):
            current = [i]
            for j, ch_b in enumerate(b, start=1):
                current.append(min(
                    previous[j] + 1,                  # deletion
                    current[j - 1] + 1,               # insertion
                    previous[j - 1] + (ch_a != ch_b),  # substitution
                ))
            previous = current
        return previous[-1]

    def compute_edit_distance_similarity(self, text1, text2):
        """Return a case-insensitive normalized Levenshtein similarity.

        Returns 1.0 when both texts are empty, 0.0 when exactly one is
        empty, and otherwise ``1 - distance / max_len``: 1.0 for identical
        strings, 0.0 for completely different ones.
        """
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        lowered1 = text1.lower()
        lowered2 = text2.lower()
        distance = self._levenshtein_distance(lowered1, lowered2)
        # Normalize by the lengths of the strings actually compared:
        # lower-casing can change a string's length, and using the original
        # lengths could push the result below 0.
        max_len = max(len(lowered1), len(lowered2))
        return 1.0 - (distance / max_len)

    @classmethod
    def _zero_features(cls):
        """Return a fresh feature dict with every feature set to 0.0."""
        return {key: 0.0 for key in cls.FEATURE_KEYS}

    @staticmethod
    def _normalize_anchors(anchors):
        """Return ``anchors`` as a list; unusable input yields an empty list.

        ``None`` and non-iterable values (such as a NaN float) are treated
        as "no anchors". A single string is treated as one anchor rather
        than being iterated character by character.
        """
        if anchors is None:
            return []
        if isinstance(anchors, str):
            return [anchors] if anchors else []
        try:
            return list(anchors)
        except TypeError:
            return []

    def evaluate_student_answer(self, student_answer, anchors, coverage_threshold=0.35):
        """Evaluate a student's answer against a list of desired anchors.

        Args:
            student_answer: The answer text. Non-string or blank values
                produce an all-zero feature dict.
            anchors: Anchor phrases to compare against. Missing or empty
                anchors produce an all-zero feature dict.
            coverage_threshold: Minimum cosine similarity for an anchor to
                count as "covered". A full sentence is compared against a
                phrase, so similarities are not expected to be near 1.

        Returns:
            A dict with the keys listed in ``FEATURE_KEYS``: mean and max
            cosine similarity, the fraction of anchors at or above the
            threshold, and the mean Jaccard and edit-distance similarities.
        """
        if not isinstance(student_answer, str) or not student_answer.strip():
            return self._zero_features()

        anchor_list = self._normalize_anchors(anchors)
        if not anchor_list:
            return self._zero_features()

        student_embedding = self.model.encode(student_answer, convert_to_tensor=True)
        anchor_embeddings = self.model.encode(anchor_list, convert_to_tensor=True)

        # Cosine similarity of the student answer against every anchor;
        # row 0 of the result holds one score per anchor.
        cosine_scores = util.cos_sim(student_embedding, anchor_embeddings)[0].cpu().numpy()

        # Lexical metrics against each anchor.
        jaccard_scores = [
            self.compute_jaccard_similarity(student_answer, anchor)
            for anchor in anchor_list
        ]
        edit_scores = [
            self.compute_edit_distance_similarity(student_answer, anchor)
            for anchor in anchor_list
        ]

        covered = np.sum(cosine_scores >= coverage_threshold)
        return {
            'avg_semantic_sim': float(np.mean(cosine_scores)),
            'max_semantic_sim': float(np.max(cosine_scores)),
            'anchors_covered': float(covered / len(anchor_list)),
            'avg_jaccard': float(np.mean(jaccard_scores)),
            'avg_edit_sim': float(np.mean(edit_scores)),
        }
def generate_features(df, mapper=None):
    """Generate semantic-mapping feature columns for every row of ``df``.

    Args:
        df: DataFrame with a 'student_answer' column and an 'anchors'
            column. It is modified in place.
        mapper: Optional object providing
            ``evaluate_student_answer(student_answer, anchors)``. When
            omitted, a new ``SemanticMapper`` is created, which loads the
            embedding model; pass one in to reuse a loaded model.

    Returns:
        The same ``df`` with the columns 'feat_avg_semantic',
        'feat_max_semantic', 'feat_anchors_covered', 'feat_avg_jaccard'
        and 'feat_avg_edit' added.
    """
    if mapper is None:
        mapper = SemanticMapper()
    print("Generating semantic mapping features...")

    # Output column name -> key in the per-row feature dict.
    column_to_feature = {
        'feat_avg_semantic': 'avg_semantic_sim',
        'feat_max_semantic': 'max_semantic_sim',
        'feat_anchors_covered': 'anchors_covered',
        'feat_avg_jaccard': 'avg_jaccard',
        'feat_avg_edit': 'avg_edit_sim',
    }
    collected = {column: [] for column in column_to_feature}

    total_rows = len(df)
    # Report progress by row position, not index label: labels may be
    # non-numeric or non-sequential (e.g. after filtering).
    for position, (_, row) in enumerate(df.iterrows()):
        if position % 100 == 0:
            print(f"Processing row {position}/{total_rows}...")
        features = mapper.evaluate_student_answer(row['student_answer'], row['anchors'])
        for column, key in column_to_feature.items():
            collected[column].append(features[key])

    # Add the new feature columns to the dataframe.
    for column, values in collected.items():
        df[column] = values

    print("Feature generation complete.")
    return df
if __name__ == "__main__":
    # Smoke test: extract anchors and generate features for the first rows
    # of the dataset, then print the resulting feature columns.
    import sys
    import traceback

    # The dataset path may be given as the first command-line argument;
    # otherwise the original default location is used.
    default_path = "C:/Users/deii/Desktop/cloud/mohler_dataset_edited.csv"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    try:
        # Load data
        df = load_dataset(file_path)

        # Take a sample for testing
        df_sample = df.head(10).copy()

        # Step 1: Extract anchors from desired answers
        df_sample = process_dataset_anchors(df_sample)

        # Step 2: Generate features mapping student_answer against anchors
        df_featured = generate_features(df_sample)

        # Display results
        features_to_show = [
            'score_avg',
            'feat_avg_semantic',
            'feat_max_semantic',
            'feat_anchors_covered',
            'feat_avg_jaccard',
        ]
        print("\nResults for sample data:")
        print(df_featured[['student_answer'] + features_to_show].head().to_string())
    except Exception:
        # Top-level boundary: show the full traceback and report failure
        # through the exit status instead of exiting successfully.
        traceback.print_exc()
        sys.exit(1)