-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgrouping.py
More file actions
95 lines (74 loc) · 3.34 KB
/
grouping.py
File metadata and controls
95 lines (74 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from collections import Counter
import copy
class Grouping:
    """Cluster vocabulary tokens by embedding similarity.

    ``model`` is assumed to expose a gensim-style ``wv`` keyed-vectors
    interface (``wv.similarity`` and ``wv.most_similar_to_given`` are the
    only calls made) — TODO confirm against the caller.
    """

    def __init__(self, data, model):
        # ``data['tokens']`` is assumed to be a pandas Series of token
        # lists (``.tolist()`` is called on it) — TODO confirm.
        token_lists = data['tokens'].tolist()
        # Flatten into one token stream; duplicates are kept on purpose so
        # the Counter frequencies in group_words() remain meaningful.
        self.data_tokens = [tok for toks in token_lists for tok in toks]
        self.model = model
        # Distinct vocabulary, computed once here instead of rebuilding
        # (and needlessly deep-copying) it on every similarity lookup.
        self._vocab = set(self.data_tokens)

    def group_words(self, threshold_ratio=0.25):
        """Build similarity clusters of tokens.

        Each distinct token is paired with its most similar other token.
        Pairs whose similarity clears a dynamic threshold — ``min_sim +
        (max_sim - min_sim) * threshold_ratio`` — seed a cluster, which is
        then extended by following the most-similar chain for up to 4 hops.

        threshold_ratio: fraction of the similarity range used as the
            cut-off (default 0.25, the previously hard-coded value).
        Returns a list of clusters (lists of tokens) with subset/superset
        duplicates merged; returns [] when there are no tokens.
        """
        count_tokens = Counter(self.data_tokens).most_common()
        word_pair_map = self.get_word_pair_map(count_tokens)
        sims = word_pair_map['sim']
        if not sims:
            # No tokens: min()/max() below would raise ValueError.
            return []
        min_sim = min(sims)
        max_sim = max(sims)
        threshold = min_sim + (max_sim - min_sim) * threshold_ratio
        cluster = {}
        for idx, main_word in enumerate(word_pair_map['main']):
            if sims[idx] < threshold:
                continue
            target_word = word_pair_map['target'][idx]
            temp_cluster = [main_word, target_word]
            # Walk the most-similar chain for up to 4 hops. A weak link is
            # skipped but the walk continues from its target (original
            # behaviour, preserved deliberately); a repeat word means the
            # chain has cycled, so stop.
            for _ in range(4):
                hop_idx = word_pair_map['main'].index(target_word)
                target_word = word_pair_map['target'][hop_idx]
                if sims[hop_idx] < threshold:
                    continue
                if target_word in temp_cluster:
                    break
                temp_cluster.append(target_word)
            cluster[idx] = temp_cluster
        return self.remove_duplicate(list(cluster.values()))

    def get_word_pair_map(self, count_tokens):
        """Pair every token with its most similar other token.

        count_tokens: list of (token, count) tuples, most frequent first
            (the output of ``Counter.most_common()``).
        Returns a dict of three parallel lists: 'main' (the token),
        'target' (its nearest neighbour) and 'sim' (their similarity).
        """
        word_pair_map = {"main": [], "target": [], "sim": []}
        for main_word, _count in count_tokens:
            target_word = self.get_most_similar_word(main_word)
            word_pair_map["main"].append(main_word)
            word_pair_map["target"].append(target_word)
            word_pair_map["sim"].append(self.model.wv.similarity(main_word, target_word))
        return word_pair_map

    def get_most_similar_word(self, word):
        """Return the vocabulary token most similar to ``word``, excluding
        ``word`` itself.

        The original deep-copied a freshly built list before mutating it;
        that copy was redundant, so the candidates are now built directly.
        """
        candidates = [w for w in self._vocab if w != word]
        return self.model.wv.most_similar_to_given(word, candidates)

    def filter_by_glossary(self, glossary_name, old_cluster):
        """Keep only cluster words that appear in the named glossary file.

        Bug fix: the original tested ``word in f.read()`` — a substring
        match against the raw file text, so e.g. 'cat' matched a glossary
        containing only 'category'. Words are now matched exactly against
        the whitespace-separated glossary entries.

        glossary_name: basename of ``./glossary/<name>.txt``.
        old_cluster: list of clusters (lists of words).
        Returns the filtered clusters with subset/superset duplicates
        merged; clusters reduced to fewer than two words are dropped.
        """
        with open(f'./glossary/{glossary_name}.txt', 'r') as f:
            glossary_words = set(f.read().split())
        new_cluster = []
        for cluster in old_cluster:
            kept = [word for word in cluster if word in glossary_words]
            if len(kept) > 1:  # a singleton carries no grouping information
                new_cluster.append(kept)
        return self.remove_duplicate(new_cluster)

    @staticmethod
    def remove_duplicate(data):
        """Merge clusters that are subsets/supersets of one another.

        Each cluster is merged into the first existing cluster of size < 6
        that contains it (no-op extension, since the difference is empty)
        or that it contains (the existing cluster is replaced); otherwise
        it is appended. First-appearance order is preserved.
        """
        new_cluster = []
        for cluster in data:
            merged = False
            for i, existing in enumerate(new_cluster):
                if len(existing) >= 6:
                    # Clusters at the size cap are never merge targets.
                    continue
                if set(cluster).issubset(existing):
                    # Subset of an existing cluster: the difference is empty,
                    # so this keeps the existing cluster unchanged (kept for
                    # fidelity with the original logic).
                    new_cluster[i] += list(set(cluster) - set(existing))
                    merged = True
                    break
                if set(existing).issubset(cluster):
                    new_cluster[i] = cluster
                    merged = True
                    break
            if not merged:
                new_cluster.append(cluster)
        return new_cluster