-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessing_worker.py
More file actions
204 lines (177 loc) · 11.2 KB
/
processing_worker.py
File metadata and controls
204 lines (177 loc) · 11.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import re
from entry_processor import EntryProcessor
from synonym_extractor import SynonymExtractor
# Moved here from the main class so the worker is fully self-contained.
# Matches the decorative homograph marker (a purple "▪ I." / "▪ II." badge).
CORE_HOMOGRAPH_PATTERN = r'(?:<b><span style="color:#8B008B">▪ <span>(?:[IVXL]+\.)</span></span></b>)'
# Zero-width lookahead so re.split() keeps the marker at the head of each part.
HOMOGRAPH_PATTERN = re.compile('(?=' + CORE_HOMOGRAPH_PATTERN + ')')

# Abbreviation headwords whose definitions carry a stray <dtrn> block.
PROBLEMATIC_ABBREVIATIONS = ["Ed.", "Gov.", "mod.", "MS.", "viz.", "prob.", "Pol. Econ.", "Oxon."]

# Words exhibiting the <dtrn> sandwich pattern but no trailing punctuation.
PROBLEMATIC_DTRN_WORDS = ['H', 'John', 'Timon']

# Search keys that lost their 'ç' somewhere upstream, mapped to the
# correctly spelt headword. Lookup-only: iteration order is irrelevant.
CEDILLA_CORRECTIONS = {
    'aai': 'açai',
    'Alenon': 'Alençon',
    'almaour': 'almaçour',
    'aperu': 'aperçu',
    'ariama': 'çariama',
    'beau garon': 'beau garçon',
    'cachaa': 'cachaça',
    'comme a': 'comme ça',
    'curaao': 'curaçao',
    'curaao bird': 'curaçao bird',
    'curaoa': 'curaçoa',
    'douaine': 'douçaine',
    'en garon': 'en garçon',
    'escoinon': 'escoinçon',
    'faadal': 'façadal',
    'faade': 'façade',
    'faonné': 'façonné',
    'faon de parler': 'façon de parler',
    'fianailles': 'fiançailles',
    'Franois Premier': 'François Premier',
    'garonnière': 'garçonnière',
    'garon': 'garçon',
    'glaon': 'glaçon',
    'Juranon': 'Jurançon',
    'limaon': 'limaçon',
    'Niois': 'Niçois',
    'piaaba': 'piaçaba',
    'plus a change': 'plus ça change',
    'Provenal': 'Provençal',
    'remplaant': 'remplaçant',
    'salade nioise': 'salade niçoise',
    'soupon': 'soupçon',
    'tajau': 'tajaçu',
    'tayau': 'tajaçu',
}
def _handle_dotted_word_quirks(word: str, definition: str) -> tuple:
"""Handles special logic for words ending in full stops or symbols."""
# Some of these seem to be legitimate entries, whilst others seem to have been added by a previous "editor"
# I'm choosing to preserve them but we need to handle some quirks.
metrics = {'dotted_words': 1, 'dot_corrected': 0}
if word == "Prov.":
definition = "<br/>proverb, (in the Bible) Proverbs"
metrics['dot_corrected'] = 1
elif word == "Div.":
definition = "<br/>division, divinity"
metrics['dot_corrected'] = 1
elif word == ". s. d.":
word = "l. s. d."
elif word == '‖' or word == '¶':
definition = "<br/>" + definition
elif word in PROBLEMATIC_ABBREVIATIONS:
definition = re.sub(r'<dtrn>.*?</dtrn>(\\n)?', '', definition, flags=re.DOTALL)
metrics['dot_corrected'] = 1
# Detect merged entries that are stuck together (e.g. N. / No.)
# These look like <dtrn>...</dtrn>\n<b...
# We search for this pattern and inject a separator to force a split in the worker.
if '</dtrn>\\n<b' in definition:
separator = '<b><span style="color:#8B008B">▪ <span>II.</span></span></b>'
definition = re.sub(r'(<dtrn>.*?</dtrn>)\\n<dtrn>(.*?)</dtrn>\\n', r'\1\\n', definition)
definition = definition.replace('</dtrn>\\n<b', f'</dtrn>{separator}<b')
metrics['dot_corrected'] = 1
metrics['auto_split_merged'] = True
# For some bizarre and unbeknown reason, these abbreviation entries have their definition duplicated
# so we will have to verify if it is the case (it is!) and clean it up. After that we will add a synonym
# entry for the headword without the leading full stop, so koreader can find it without editing.
test_definition = definition.replace('\\n', '')
def_len = len(test_definition)
# sadly this method fails for some duplicated entries (about 7%, see "adj.") but it works for most of them
if def_len > 0 and def_len % 2 == 0:
midpoint = def_len // 2
if test_definition[:midpoint] == test_definition[midpoint:]:
# If it's a duplicate, the correct definition is the part
# before the original newline separator.
definition = '<br/>' + definition.split('\\n')[0]
metrics['dot_corrected'] = 1
alt_key = word.rstrip('.')
entry_word = [word, alt_key]
return entry_word, definition, metrics
def finalise_entry(base_word: list[str] | str, final_definition: str, add_syns: bool, debug_words: set[str] | None) -> dict:
"""Takes a processed definition, extracts synonyms, and packages the final entry data."""
all_words = list(base_word) if isinstance(base_word, list) else [base_word]
syn_count = 0
if add_syns:
# The headword for synonym extraction is the first word in the base list.
headword_for_syns = all_words[0]
synonyms = SynonymExtractor.extract(headword_for_syns, final_definition, debug_words)
if synonyms:
all_words.extend(synonyms)
syn_count = len(synonyms)
# Although architecturally impure, this pragmatic printing to console here
# will avoid us a whole set of unnecessarily complex rerouting of synonyms.
if debug_words and headword_for_syns in debug_words:
sorted_syns = sorted(synonyms, key=lambda s: (len(s), s))
print(f"\n\n--> Synonyms for '[{headword_for_syns}]': {'; '.join(sorted_syns)}\n")
return {'words': all_words, 'definition': final_definition, 'syn_count': syn_count}
def process_entry_line_worker(line_tuple: tuple[str, bool, set[str] | None]) -> dict:
    """Worker function to process a single TSV line.

    This function is designed to be run in a separate process.
    It returns a dictionary with status, results, and metrics.

    Args:
        line_tuple: (raw "word<TAB>definition" line, whether to extract
            synonyms, optional set of headwords to print debug output for).

    Returns:
        {'status': 'ok', 'results': [...], 'metrics': {...}} on success, or
        {'status': 'error', 'type': ..., 'line': ...} (plus 'error' for
        unexpected exceptions) on failure — the worker never raises.
    """
    line, add_syns, debug_words = line_tuple
    try:
        # Split only on the first tab: definitions may themselves contain tabs.
        parts = line.split('\t', 1)
        if len(parts) != 2:
            return {'status': 'error', 'type': 'malformed_line', 'line': line}
        word, definition = parts
        metrics = {'source_entry': 1, 'split_entry': 0, 'dotted_words': 0, 'dot_corrected': 0, 'synonyms_added': 0}
        entry_word_base = word
        # Dotted/symbol headwords (and the known <dtrn>-quirk words) need
        # special-case cleanup; this may also yield extra lookup keys and
        # can set the 'auto_split_merged' flag consumed below.
        if word.endswith(('.', '‖', '¶', '†')) or word in PROBLEMATIC_DTRN_WORDS:
            entry_word_base, definition, dot_metrics = _handle_dotted_word_quirks(word, definition)
            metrics.update(dot_metrics)
        # These entries have the letter 'ç' missing from their search key.
        if word in CEDILLA_CORRECTIONS:
            entry_word_base = CEDILLA_CORRECTIONS[word]
            word = entry_word_base
        # Zero-width lookahead split: each homograph marker begins a new part,
        # so a definition without markers comes back as a single-element list.
        split_parts = HOMOGRAPH_PATTERN.split(definition)
        processed_results = []
        # This is a particular case we introduced in _handle_dotted_word_quirks():
        # an injected separator marks amalgamated entries that must be re-split.
        if metrics.get('auto_split_merged'):
            metrics['split_entry'] = 1
            for idx, part in enumerate(split_parts):
                if not part.strip():
                    continue
                # Remove the separator pattern we injected so it doesn't
                # appear in the output.
                part = re.sub(CORE_HOMOGRAPH_PATTERN, '', part)
                # NOTE(review): EntryProcessor.process() is opaque from here —
                # presumably it normalizes the entry HTML; confirm in its module.
                processor = EntryProcessor(part, word)
                processed_part = processor.process()
                if idx == 0:
                    # First part: add the headword manually (it was the part
                    # before the merged entry, so it lost its own <b> tag).
                    headword_b_tag = f'<span class="headword"><b>{word}</b></span><br/>'
                    final_definition = headword_b_tag + processed_part
                elif word in PROBLEMATIC_DTRN_WORDS and word != 'Timon':
                    # <dtrn>-quirk words (except 'Timon') get a space-joined
                    # headword instead of a line break.
                    headword_b_tag = f'<span class="headword"><b>{word}</b></span> '
                    final_definition = headword_b_tag + processed_part
                else:
                    # Second part: wrap the existing headword (it starts with
                    # <b>...</b>) rather than prepending a new one.
                    final_definition = re.sub(r'<b>(.*?)</b>', r'<span class="headword"><b>\1</b></span>', processed_part, count=1)
                    # Clean up trailing dtrn tags that can pollute the end of
                    # the second merged entry.
                    final_definition = re.sub(r'\s*<dtrn>.*?</dtrn>$', '', final_definition)
                final_entry = finalise_entry(entry_word_base, final_definition, add_syns, debug_words)
                processed_results.append(final_entry)
                metrics['synonyms_added'] += final_entry['syn_count']
        # This is the primary case: we are splitting genuine homographs.
        elif len(split_parts) > 1:
            metrics['split_entry'] = 1
            for part in split_parts:
                if part.strip():
                    processor = EntryProcessor(part, word)
                    processed_part = processor.process()
                    if re.search(r'<b><sup>[IVXL]+</sup></b>\s*<span class="headword">', processed_part):
                        # A headword is already present, so use the part as-is.
                        final_definition = processed_part
                    else:
                        # The headword is missing, so we prepend it after the
                        # first closing </b> (the roman-numeral marker).
                        headword_b_tag = f' <span class="headword"><b>{word}</b></span>'
                        final_definition = processed_part.replace('</b>', '</b>' + headword_b_tag, 1)
                    # Either way, drop the superscript roman-numeral marker.
                    final_definition = re.sub(r'<b><sup>[IVXL]+</sup></b>\s*', '', final_definition)
                    final_entry = finalise_entry(entry_word_base, final_definition, add_syns, debug_words)
                    processed_results.append(final_entry)
                    metrics['synonyms_added'] += final_entry['syn_count']
        else:
            # Logic for a standard, non-homograph entry.
            processor = EntryProcessor(definition, word)
            processed_definition = processor.process()
            headword_div = f'<span class="headword"><b>{word}</b></span>'
            final_definition = headword_div + processed_definition
            if re.search(
                    r'<span class="headword"><b>(.*?)</b></span>\s*(<blockquote>)?<b>(<span class="abbreviation">[‖¶†]</span>\s)?[\w\u00C0-\u017F\u0180-\u024F\u02C8\' &\-\.\[\(<]',
                    final_definition):  # \u02C8 is ˈ
                # If the headword was already present, we don't need to prepend
                # it, so remove it. Seems backwards to do it this way but it is
                # much safer.
                final_definition = final_definition.replace(headword_div, '', 1)
                # Finally, wrap the headword in a span tag, to match the
                # expected format.
                final_definition = re.sub(r'<b>(.*?)</b>', r'<span class="headword"><b>\1</b></span>', final_definition, count=1)
            elif re.search(r'<span class="headword"><b>(.*?)</b></span>(<i>)?(<span class="abbreviation">\w|[\w\(\?])', final_definition):
                # Some entries (see "gen") need some space after the headword.
                final_definition = final_definition.replace(headword_div, headword_div + ' ', 1)
            # Unwrap a headword that ended up alone inside a <blockquote>.
            final_definition = re.sub(r'<blockquote>(<span class="headword"><b>.*?</b></span>)</blockquote>', r'\1', final_definition)
            final_entry = finalise_entry(entry_word_base, final_definition, add_syns, debug_words)
            processed_results.append(final_entry)
            metrics['synonyms_added'] += final_entry['syn_count']
        return {'status': 'ok', 'results': processed_results, 'metrics': metrics}
    except Exception as e:
        # Broad catch is deliberate: a worker must report, never crash the pool.
        return {'status': 'error', 'type': 'processing_error', 'line': line, 'error': str(e)}