-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgrouping.py
More file actions
95 lines (74 loc) · 3.34 KB
/
grouping.py
File metadata and controls
95 lines (74 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from collections import Counter
import copy
class Grouping:
    """Cluster vocabulary tokens by embedding similarity.

    ``model`` is assumed to expose a gensim-style ``wv`` keyed-vectors
    interface (``wv.similarity`` and ``wv.most_similar_to_given`` are the
    only calls made) — TODO confirm against the caller.
    """

    def __init__(self, data, model):
        # ``data['tokens']`` is assumed to be a pandas Series of token
        # lists (``.tolist()`` is called on it) — TODO confirm.
        token_lists = data['tokens'].tolist()
        # Flatten into one token stream; duplicates are kept on purpose so
        # the Counter frequencies in group_words() remain meaningful.
        self.data_tokens = [tok for toks in token_lists for tok in toks]
        self.model = model
        # Distinct vocabulary, computed once here instead of rebuilding
        # (and needlessly deep-copying) it on every similarity lookup.
        self._vocab = set(self.data_tokens)

    def group_words(self, threshold_ratio=0.25):
        """Build similarity clusters of tokens.

        Each distinct token is paired with its most similar other token.
        Pairs whose similarity clears a dynamic threshold — ``min_sim +
        (max_sim - min_sim) * threshold_ratio`` — seed a cluster, which is
        then extended by following the most-similar chain for up to 4 hops.

        threshold_ratio: fraction of the similarity range used as the
            cut-off (default 0.25, the previously hard-coded value).
        Returns a list of clusters (lists of tokens) with subset/superset
        duplicates merged; returns [] when there are no tokens.
        """
        count_tokens = Counter(self.data_tokens).most_common()
        word_pair_map = self.get_word_pair_map(count_tokens)
        sims = word_pair_map['sim']
        if not sims:
            # No tokens: min()/max() below would raise ValueError.
            return []
        min_sim = min(sims)
        max_sim = max(sims)
        threshold = min_sim + (max_sim - min_sim) * threshold_ratio
        cluster = {}
        for idx, main_word in enumerate(word_pair_map['main']):
            if sims[idx] < threshold:
                continue
            target_word = word_pair_map['target'][idx]
            temp_cluster = [main_word, target_word]
            # Walk the most-similar chain for up to 4 hops. A weak link is
            # skipped but the walk continues from its target (original
            # behaviour, preserved deliberately); a repeat word means the
            # chain has cycled, so stop.
            for _ in range(4):
                hop_idx = word_pair_map['main'].index(target_word)
                target_word = word_pair_map['target'][hop_idx]
                if sims[hop_idx] < threshold:
                    continue
                if target_word in temp_cluster:
                    break
                temp_cluster.append(target_word)
            cluster[idx] = temp_cluster
        return self.remove_duplicate(list(cluster.values()))

    def get_word_pair_map(self, count_tokens):
        """Pair every token with its most similar other token.

        count_tokens: list of (token, count) tuples, most frequent first
            (the output of ``Counter.most_common()``).
        Returns a dict of three parallel lists: 'main' (the token),
        'target' (its nearest neighbour) and 'sim' (their similarity).
        """
        word_pair_map = {"main": [], "target": [], "sim": []}
        for main_word, _count in count_tokens:
            target_word = self.get_most_similar_word(main_word)
            word_pair_map["main"].append(main_word)
            word_pair_map["target"].append(target_word)
            word_pair_map["sim"].append(self.model.wv.similarity(main_word, target_word))
        return word_pair_map

    def get_most_similar_word(self, word):
        """Return the vocabulary token most similar to ``word``, excluding
        ``word`` itself.

        The original deep-copied a freshly built list before mutating it;
        that copy was redundant, so the candidates are now built directly.
        """
        candidates = [w for w in self._vocab if w != word]
        return self.model.wv.most_similar_to_given(word, candidates)

    def filter_by_glossary(self, glossary_name, old_cluster):
        """Keep only cluster words that appear in the named glossary file.

        Bug fix: the original tested ``word in f.read()`` — a substring
        match against the raw file text, so e.g. 'cat' matched a glossary
        containing only 'category'. Words are now matched exactly against
        the whitespace-separated glossary entries.

        glossary_name: basename of ``./glossary/<name>.txt``.
        old_cluster: list of clusters (lists of words).
        Returns the filtered clusters with subset/superset duplicates
        merged; clusters reduced to fewer than two words are dropped.
        """
        with open(f'./glossary/{glossary_name}.txt', 'r') as f:
            glossary_words = set(f.read().split())
        new_cluster = []
        for cluster in old_cluster:
            kept = [word for word in cluster if word in glossary_words]
            if len(kept) > 1:  # a singleton carries no grouping information
                new_cluster.append(kept)
        return self.remove_duplicate(new_cluster)

    @staticmethod
    def remove_duplicate(data):
        """Merge clusters that are subsets/supersets of one another.

        Each cluster is merged into the first existing cluster of size < 6
        that contains it (no-op extension, since the difference is empty)
        or that it contains (the existing cluster is replaced); otherwise
        it is appended. First-appearance order is preserved.
        """
        new_cluster = []
        for cluster in data:
            merged = False
            for i, existing in enumerate(new_cluster):
                if len(existing) >= 6:
                    # Clusters at the size cap are never merge targets.
                    continue
                if set(cluster).issubset(existing):
                    # Subset of an existing cluster: the difference is empty,
                    # so this keeps the existing cluster unchanged (kept for
                    # fidelity with the original logic).
                    new_cluster[i] += list(set(cluster) - set(existing))
                    merged = True
                    break
                if set(existing).issubset(cluster):
                    new_cluster[i] = cluster
                    merged = True
                    break
            if not merged:
                new_cluster.append(cluster)
        return new_cluster