-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsynonym_extractor.py
More file actions
160 lines (137 loc) · 7.2 KB
/
synonym_extractor.py
File metadata and controls
160 lines (137 loc) · 7.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""Provides the SynonymExtractor class for parsing and cleaning synonym data from dictionary HTML."""
import re
class SynonymExtractor:
"""Handles extraction and cleaning of synonyms from entry HTML."""
SYNONYM_CLEANUP_MAP = {
"†": "", "*": "", "ˈ": "", "ˌ": "", "(": "", ")": "", "[": "", "]": "", "‖": "", "¶": "", "?": "", "!": "",
"–": "", "—": "", ";": "", ":": "", " ": " ", "- ": "-", " -": "-"
}
IGNORED_SYN_WORDS = {'to', 'or', 'and', 'a', 'an', 'the', 'after', 'before', 'in', 'on', 'at', 'for', 'with', 'by', 'of', 'from', 'Derivatives.',
'that', 'which', 'who', 'whom', 'whose', 'as', 'than', 'like', 'such', 'so', 'but', 'if', 'when', 'up', 'down', 'Compounds.'}
PAREN_PATTERN = re.compile(r'\(.*?\)')
@staticmethod
def _clean_synonym(text: str) -> str:
"""Removes unwanted characters from a potential synonym string."""
text = SynonymExtractor.PAREN_PATTERN.sub('', text)
for char, replacement in SynonymExtractor.SYNONYM_CLEANUP_MAP.items():
text = text.replace(char, replacement)
return text.strip()
@staticmethod
def _prepare_and_validate_synonym(headword: str, word_initial: str, final_synonym: str) -> str | None:
if not final_synonym or final_synonym in SynonymExtractor.IGNORED_SYN_WORDS:
return None
if (not headword.startswith('-') and final_synonym.startswith('-')) or \
(not headword.endswith('-') and final_synonym.endswith('-')):
return None
if re.search(r'\d{2,}', final_synonym):
return None
if re.fullmatch(r'[IVXL]+\.', final_synonym):
return None
if re.fullmatch(r'[αβγδ]\.', final_synonym):
return None
if re.fullmatch(r'[A-Za-z]\.?', final_synonym):
return None
if re.fullmatch(r'[0-9]\.?', final_synonym):
return None
if re.fullmatch(r'[IVXL]+\. [0-9]+\.', final_synonym):
return None
if re.fullmatch(r'[0-9]+\. [a-z]\.', final_synonym):
return None
# Skip overly long multi-word synonyms (likely phrases rather than single synonyms)
if len(final_synonym.split()) > 4:
return None
if final_synonym.endswith('..'):
final_synonym = final_synonym.rstrip('.')
if '..' in final_synonym:
return None
if '--' in final_synonym:
final_synonym = final_synonym.replace("--", "-")
if final_synonym.startswith('―') or final_synonym.startswith(','):
return None
if re.match(r'to (?!a |an |be |the )', final_synonym):
final_synonym = final_synonym[3:]
# some entries (e.g., plover) when creating compounds, use "p." as shorthands
final_synonym = final_synonym.replace(word_initial + ".", headword)
return final_synonym
@staticmethod
def extract(headword: str, html: str, debug_words: set[str] | None) -> list[str]:
"""Extracts potential synonyms from <b> tags within the definition HTML."""
from bs4 import BeautifulSoup, Tag, FeatureNotFound # Fix macOS fork-safety deadlock by lazy-loading BeautifulSoup imports
try:
soup = BeautifulSoup(html, 'lxml')
except FeatureNotFound:
# If lxml fails, fall back to the more lenient, built-in parser.
soup = BeautifulSoup(html, 'html.parser')
cleaned_syns = set()
clean_headword = SynonymExtractor._clean_synonym(headword.strip())
if not clean_headword:
return []
word_initial = clean_headword[:1]
# Find and remove all quotation divs from the parse tree.
for div in soup.find_all('div', class_='quotations'):
div.decompose()
# Also remove isolated roman-numeral markers like <b><sup>IV</sup></b>
for sup in soup.find_all('sup'):
if sup.parent and sup.parent.name == 'b':
sup.decompose()
for span in soup.find_all('span', class_='headword'):
span.decompose()
# Categorize all <b> tags into strict or lax processing sets.
lax_tags = set()
strict_tags = set()
pos_blocks = []
for tag in soup.find_all('span', class_='pos'):
parent_block = tag.find_parent('blockquote')
if parent_block:
pos_blocks.append(parent_block)
if pos_blocks and 'forms' in pos_blocks[0].get_text(strip=True).lower():
start_node = pos_blocks[0]
end_node = pos_blocks[1] if len(pos_blocks) > 1 else None
current_node = start_node
while current_node:
current_node = current_node.find_next_sibling()
if not current_node or current_node == end_node:
break
if isinstance(current_node, Tag):
lax_tags.update(current_node.find_all('b'))
all_b_tags = set(soup.find_all('b'))
remaining_tags = all_b_tags - lax_tags
strict_blocks = set()
for marker in soup.find_all('span', class_=['senses', 'subsenses', 'major-division']):
parent_block = marker.find_parent('blockquote')
if parent_block:
strict_blocks.add(parent_block)
strict_blocks.update(soup.find_all('blockquote', class_='usage-note'))
if lax_tags and pos_blocks:
strict_blocks.discard(pos_blocks[0])
all_tags_in_strict_blocks = set()
for block in strict_blocks:
if block:
all_tags_in_strict_blocks.update(block.find_all('b'))
strict_tags = remaining_tags.intersection(all_tags_in_strict_blocks)
lax_tags.update(remaining_tags - strict_tags)
# Process tags that require the strict headword-containment check.
for tag in strict_tags:
synonym_text = SynonymExtractor._clean_synonym(tag.get_text())
if debug_words:
print(f"\n[STRICT] synonym_text: '{synonym_text}'")
validated = SynonymExtractor._prepare_and_validate_synonym(clean_headword, word_initial, synonym_text)
# Apply the special rule: keep only if it contains the headword
# For prefix headwords (ending in '-'), strip all trailing '-' before checking.
headword_to_check = clean_headword.rstrip('-') if clean_headword.endswith('-') else clean_headword
if validated and headword_to_check.lower() in validated.lower():
cleaned_syns.add(validated)
if debug_words:
print(f"[STRICT] validated: '{validated}'\n")
# Process all tags that require lax validation.
for tag in lax_tags:
synonym_text = SynonymExtractor._clean_synonym(tag.get_text())
if debug_words:
print(f"\n[LAX] synonym_text: '{synonym_text}'")
validated = SynonymExtractor._prepare_and_validate_synonym(clean_headword, word_initial, synonym_text)
if validated:
cleaned_syns.add(validated)
if debug_words:
print(f"[LAX] validated: '{validated}'\n")
cleaned_syns.discard(clean_headword)
return sorted(list(cleaned_syns))