oed_prettifier/synonym_extractor.py at main · Commodore64user/oed_prettifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""Provides the SynonymExtractor class for parsing and cleaning synonym data from dictionary HTML."""

import re

class SynonymExtractor:
    """Handles extraction and cleaning of synonyms from entry HTML."""
    SYNONYM_CLEANUP_MAP = {
        "†": "",   "*": "",   "ˈ": "",   "ˌ": "",   "(": "",   ")": "",   "[": "",   "]": "",   "‖": "",   "¶": "",   "?": "",   "!": "",
        "–": "",   "—": "",   ";": "",   ":": "",   "  ": " ", "- ": "-", " -": "-"
    }

    IGNORED_SYN_WORDS = {'to', 'or', 'and', 'a', 'an', 'the', 'after', 'before', 'in', 'on', 'at', 'for', 'with', 'by', 'of', 'from', 'Derivatives.',
                         'that', 'which', 'who', 'whom', 'whose', 'as', 'than', 'like', 'such', 'so', 'but', 'if', 'when', 'up', 'down', 'Compounds.'}
    PAREN_PATTERN = re.compile(r'\(.*?\)')

    @staticmethod
    def _clean_synonym(text: str) -> str:
        """Removes unwanted characters from a potential synonym string."""
        text = SynonymExtractor.PAREN_PATTERN.sub('', text)
        for char, replacement in SynonymExtractor.SYNONYM_CLEANUP_MAP.items():
            text = text.replace(char, replacement)
        return text.strip()

    @staticmethod
    def _prepare_and_validate_synonym(headword: str, word_initial: str, final_synonym: str) -> str | None:
        if not final_synonym or final_synonym in SynonymExtractor.IGNORED_SYN_WORDS:
            return None
        if (not headword.startswith('-') and final_synonym.startswith('-')) or \
          (not headword.endswith('-') and final_synonym.endswith('-')):
            return None
        if re.search(r'\d{2,}', final_synonym):
            return None
        if re.fullmatch(r'[IVXL]+\.', final_synonym):
            return None
        if re.fullmatch(r'[αβγδ]\.', final_synonym):
            return None
        if re.fullmatch(r'[A-Za-z]\.?', final_synonym):
            return None
        if re.fullmatch(r'[0-9]\.?', final_synonym):
            return None
        if re.fullmatch(r'[IVXL]+\. [0-9]+\.', final_synonym):
            return None
        if re.fullmatch(r'[0-9]+\. [a-z]\.', final_synonym):
            return None
        # Skip overly long multi-word synonyms (likely phrases rather than single synonyms)
        if len(final_synonym.split()) > 4:
            return None
        if final_synonym.endswith('..'):
            final_synonym = final_synonym.rstrip('.')
        if '..' in final_synonym:
            return None
        if '--' in final_synonym:
            final_synonym = final_synonym.replace("--", "-")
        if final_synonym.startswith('―') or final_synonym.startswith(','):
            return None
        if re.match(r'to (?!a |an |be |the )', final_synonym):
            final_synonym = final_synonym[3:]
        # some entries (e.g., plover) when creating compounds, use "p." as shorthands
        final_synonym = final_synonym.replace(word_initial + ".", headword)
        return final_synonym

    @staticmethod
    def extract(headword: str, html: str, debug_words: set[str] | None) -> list[str]:
        """Extracts potential synonyms from <b> tags within the definition HTML."""
        from bs4 import BeautifulSoup, Tag, FeatureNotFound # Fix macOS fork-safety deadlock by lazy-loading BeautifulSoup imports
        try:
            soup = BeautifulSoup(html, 'lxml')
        except FeatureNotFound:
            # If lxml fails, fall back to the more lenient, built-in parser.
            soup = BeautifulSoup(html, 'html.parser')
        cleaned_syns = set()

        clean_headword = SynonymExtractor._clean_synonym(headword.strip())
        if not clean_headword:
            return []
        word_initial = clean_headword[:1]

        # Find and remove all quotation divs from the parse tree.
        for div in soup.find_all('div', class_='quotations'):
            div.decompose()

        # Also remove isolated roman-numeral markers like <b><sup>IV</sup></b>
        for sup in soup.find_all('sup'):
            if sup.parent and sup.parent.name == 'b':
                sup.decompose()

        for span in soup.find_all('span', class_='headword'):
            span.decompose()

        # Categorize all <b> tags into strict or lax processing sets.
        lax_tags = set()
        strict_tags = set()

        pos_blocks = []
        for tag in soup.find_all('span', class_='pos'):
            parent_block = tag.find_parent('blockquote')
            if parent_block:
                pos_blocks.append(parent_block)
        if pos_blocks and 'forms' in pos_blocks[0].get_text(strip=True).lower():
            start_node = pos_blocks[0]
            end_node = pos_blocks[1] if len(pos_blocks) > 1 else None

            current_node = start_node
            while current_node:
                current_node = current_node.find_next_sibling()
                if not current_node or current_node == end_node:
                    break
                if isinstance(current_node, Tag):
                    lax_tags.update(current_node.find_all('b'))

        all_b_tags = set(soup.find_all('b'))
        remaining_tags = all_b_tags - lax_tags

        strict_blocks = set()
        for marker in soup.find_all('span', class_=['senses', 'subsenses', 'major-division']):
            parent_block = marker.find_parent('blockquote')
            if parent_block:
                strict_blocks.add(parent_block)

        strict_blocks.update(soup.find_all('blockquote', class_='usage-note'))

        if lax_tags and pos_blocks:
            strict_blocks.discard(pos_blocks[0])

        all_tags_in_strict_blocks = set()
        for block in strict_blocks:
            if block:
                all_tags_in_strict_blocks.update(block.find_all('b'))

        strict_tags = remaining_tags.intersection(all_tags_in_strict_blocks)
        lax_tags.update(remaining_tags - strict_tags)

        # Process tags that require the strict headword-containment check.
        for tag in strict_tags:
            synonym_text = SynonymExtractor._clean_synonym(tag.get_text())
            if debug_words:
                print(f"\n[STRICT] synonym_text: '{synonym_text}'")
            validated = SynonymExtractor._prepare_and_validate_synonym(clean_headword, word_initial, synonym_text)

            # Apply the special rule: keep only if it contains the headword
            # For prefix headwords (ending in '-'), strip all trailing '-' before checking.
            headword_to_check = clean_headword.rstrip('-') if clean_headword.endswith('-') else clean_headword
            if validated and headword_to_check.lower() in validated.lower():
                cleaned_syns.add(validated)
                if debug_words:
                    print(f"[STRICT] validated:    '{validated}'\n")

        # Process all tags that require lax validation.
        for tag in lax_tags:
            synonym_text = SynonymExtractor._clean_synonym(tag.get_text())
            if debug_words:
                print(f"\n[LAX] synonym_text: '{synonym_text}'")
            validated = SynonymExtractor._prepare_and_validate_synonym(clean_headword, word_initial, synonym_text)
            if validated:
                cleaned_syns.add(validated)
                if debug_words:
                    print(f"[LAX] validated:    '{validated}'\n")

        cleaned_syns.discard(clean_headword)
        return sorted(list(cleaned_syns))