-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathEuclidean.py
More file actions
33 lines (26 loc) · 1003 Bytes
/
Euclidean.py
File metadata and controls
33 lines (26 loc) · 1003 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from sentence_transformers import SentenceTransformer
import re
import numpy as np
# Sentence-embedding model shared by every call in this module; it is
# constructed once, when the module is imported.
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
# Punctuation and symbols stripped by cleanText(). Written as a raw string so
# the regex escapes reach the `re` module untouched (the earlier non-raw
# literal relied on invalid string escapes, which newer Python versions warn
# about). The set of matched characters is unchanged.
# NOTE(review): the backslash character itself is NOT in this set (it never
# was -- the old pattern's double backslash only escaped the following quote
# mark). Confirm whether backslashes should be stripped too.
_STRIPPED_CHARS_RE = re.compile(
    r"""[-=+,#/?:^$.@*"※~&%ㆍ!』‘|()\[\]<>`'…》]""")


def cleanText(readData):
    """Return ``readData`` with punctuation and special symbols removed.

    Every character matched by ``_STRIPPED_CHARS_RE`` is deleted; all other
    characters (letters, digits, whitespace, ...) are kept in place.

    Args:
        readData: The text to clean.

    Returns:
        The cleaned string.
    """
    return _STRIPPED_CHARS_RE.sub('', readData)
def euclidean(testList, patternIdx):
    """Score every entry of ``testList`` against one reference entry.

    Each entry's ``"title"`` is cleaned with ``cleanText`` and stored under
    ``"processed"``. Every entry is then embedded together with the reference
    entry, and the inverse of the Euclidean distance between the two
    embeddings is stored under ``"similarity"``.

    Args:
        testList: List of dicts, each with a ``"title"`` key. The dicts are
            modified in place (``"processed"`` and ``"similarity"`` are set).
        patternIdx: Position of the reference entry. It is reduced by one
            before indexing, i.e. it is treated as a 1-based position.

    Returns:
        The same ``testList`` object, with the added keys.
    """
    for entry in testList:
        entry["processed"] = cleanText(entry["title"])

    # Convert the position to a list index exactly once. Doing this inside
    # the loop would shift the reference entry on every iteration.
    reference_pos = patternIdx - 1

    for entry in testList:
        # Embed the pair in one call so that identical texts yield
        # bit-identical vectors (and therefore a distance of exactly 0).
        pair = [entry["processed"], testList[reference_pos]["processed"]]
        embeddings = model.encode(pair)

        # Euclidean distance between the two embeddings.
        distance = np.sqrt(np.sum((embeddings[0] - embeddings[1]) ** 2))

        # Inverse distance as the similarity score; a zero distance maps to 1
        # to avoid dividing by zero.
        entry["similarity"] = 1 if distance == 0 else 1 / distance
    return testList