forked from Gene-Weaver/VoucherVisionEditor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
284 lines (239 loc) · 10.4 KB
/
utils.py
File metadata and controls
284 lines (239 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
import streamlit as st
import pandas as pd
from PIL import Image
import json, os, argparse, shutil, re
import requests
from urllib.parse import urlencode
from screeninfo import get_monitors
class ScreenResolution:
    """Query the resolution of the monitors reported by ``get_monitors``."""

    def get_smallest_monitor(self):
        """Return ``(width, height)`` of the monitor with the fewest pixels.

        The pixel count is ``width * height``; when several monitors tie,
        the first one reported wins (``min`` keeps the first minimum).
        """
        def pixel_count(monitor):
            # Area of the monitor in pixels, used as the ranking key.
            return monitor.width * monitor.height

        smallest = min(get_monitors(), key=pixel_count)
        return smallest.width, smallest.height
# def remove_number_lines(text, threshold=6):
# lines = text.split('\n')
# cleaned_lines = []
# for line in lines:
# numbers = re.findall(r'\b\d+(\.\d+)?\b', line)
# if len(numbers) < threshold:
# cleaned_lines.append(line)
# cleaned_lines2 = '\n'.join(cleaned_lines)
# return cleaned_lines2
def remove_number_lines(text, threshold=40):
    """Move number-heavy content to the end of ``text``.

    Multi-line input: every line that contains at least ``threshold``
    numeric tokens (integers or decimals) is moved, in its original order,
    after all the remaining lines.

    Single-line input: every run that starts with a digit and continues
    with at least three digits, whitespace, ``.``, ``,`` or ``-`` characters
    is cut out of the text and appended after a blank line.

    Args:
        text: The text to reorder.
        threshold: Minimum count of numeric tokens for a line to be treated
            as number-heavy. Only used for multi-line input.

    Returns:
        The reordered text. Single-line input that contains no numeric run
        is returned stripped, without a dangling blank-line separator.
    """
    lines = text.split('\n')

    if len(lines) > 1:
        # Non-capturing group: findall yields the matched numbers themselves
        # rather than the optional fractional part.
        number_pattern = re.compile(r'\b\d+(?:\.\d+)?\b')
        kept_lines = []
        number_lines = []
        for line in lines:
            if len(number_pattern.findall(line)) < threshold:
                kept_lines.append(line)
            else:
                number_lines.append(line)
        return '\n'.join(kept_lines + number_lines)

    block_pattern = re.compile(r'\d[\d\s.,-]{3,}')
    number_blocks = block_pattern.findall(text)
    cleaned_text = block_pattern.sub('', text).strip()
    if not number_blocks:
        # Nothing was moved, so there is nothing to separate from the text.
        return cleaned_text
    return f"{cleaned_text}\n\n{' '.join(number_blocks)}"
def dms_to_decimal(dms_string):
    """Convert a latitude/longitude pair in DMS notation to decimal degrees.

    Args:
        dms_string: Either a sequence whose first two items are the latitude
            and longitude strings, or a single string holding both values
            separated by exactly one comma.

    Returns:
        A ``(latitude, longitude)`` tuple of floats. Values in the southern
        or western hemisphere (trailing ``S``/``W``, any case) or written
        with a leading minus sign are negative.

    Raises:
        ValueError: If a string argument does not contain exactly two
            comma-separated values, or a coordinate is blank, has more than
            three numeric components, or has a non-numeric component.
        TypeError: If an individual coordinate is not a string.
    """
    def parse_dms(dms):
        """Parse one degrees/minutes/seconds string to decimal degrees."""
        if not isinstance(dms, str):
            raise TypeError(
                f"DMS coordinate must be a string, got {type(dms).__name__}"
            )
        dms = dms.strip()
        if not dms:
            raise ValueError("Empty DMS coordinate")
        # The hemisphere letter, when present, is the last character.
        direction = dms[-1].upper()
        parts = re.split('[°\'"MinutesSecondsNSEWnsew]+', dms)
        parts = [p.strip() for p in parts if p.strip()]
        if len(parts) > 3:
            raise ValueError(f"Too many components in DMS coordinate: {dms!r}")
        # Missing minutes and/or seconds count as zero.
        parts += ['0'] * (3 - len(parts))
        degrees, minutes, seconds = parts
        # Work on the magnitude so minutes/seconds always move the value
        # away from zero, then apply the sign once.
        magnitude = abs(float(degrees)) + float(minutes) / 60 + float(seconds) / (60 * 60)
        if degrees.startswith('-') or direction in ('S', 'W'):
            return -magnitude
        return magnitude

    if isinstance(dms_string, str):
        # Indexing a string would pick its first two characters, so a string
        # is only accepted as an explicit "lat, lon" pair.
        pieces = dms_string.split(',')
        if len(pieces) != 2:
            raise ValueError(
                f"Expected 'latitude, longitude' but got: {dms_string!r}"
            )
        lat_str, lon_str = pieces
    else:
        lat_str = dms_string[0]
        lon_str = dms_string[1]

    return parse_dms(lat_str), parse_dms(lon_str)
# def parse_dms(dms):
# """Parse degrees, minutes, seconds coordinates to decimal degrees"""
# parts = re.split('[°\'"]+', dms)
# direction = parts[-1]
# parts = parts[:-1]
# while len(parts) < 3: # if no seconds or no minutes and seconds, fill with zeros
# parts.append('0')
# degrees, minutes, seconds = parts
# dd = float(degrees) + float(minutes)/60 + float(seconds)/(60*60);
# if direction in ('S','W'):
# dd *= -1
# return dd
def parse_coordinate(coordinate):
    """Best-effort conversion of a coordinate to decimal degrees.

    The following interpretations are tried in order:

    1. Anything ``float()`` accepts (plain decimal degrees).
    2. A non-string value is handed to ``dms_to_decimal``, which reads its
       first two items as a latitude/longitude pair.
    3. A string of the form ``degrees°minutes'[hemisphere]`` (degrees and
       decimal minutes). Seconds are not parsed by this step.

    Args:
        coordinate: The value to parse.

    Returns:
        A float for steps 1 and 3, whatever ``dms_to_decimal`` returns for
        step 2, or ``(None, None)`` when nothing matches.
    """
    try:
        return float(coordinate)  # plain decimal degrees
    except (TypeError, ValueError, OverflowError):
        pass

    # A string must not reach the pair converter: it indexes its argument,
    # so a string would be read as its first two characters.
    if not isinstance(coordinate, str):
        try:
            return dms_to_decimal(coordinate)  # degrees, minutes, seconds pair
        except Exception:
            # Best effort: any failure here just means "try the next format".
            pass

    try:
        # degrees, decimal minutes
        parts = re.split('[°\']+', coordinate.upper())
        degrees = parts[0].strip()
        magnitude = abs(float(degrees)) + float(parts[1]) / 60
        # The hemisphere letter, when present, is the last character of the
        # trailing piece; a missing piece means no hemisphere was given.
        hemisphere = parts[2].strip()[-1:] if len(parts) > 2 else ''
    except (AttributeError, IndexError, TypeError, ValueError):
        return None, None

    # Apply the sign once so minutes always move the value away from zero.
    if degrees.startswith('-') or hemisphere in ('S', 'W'):
        return -magnitude
    return magnitude
def check_for_sep(verbatim_coordinates):
    """Report whether ``verbatim_coordinates`` is free of separator characters.

    The characters ``,``, ``|`` and ``-`` are counted; the result is ``True``
    when none of them occurs and ``False`` as soon as at least one does.
    """
    separator_hits = sum(map(verbatim_coordinates.count, (',', '|', '-')))
    return separator_hits < 1
# def replace_base_path(old_path, new_base_path, opt):
# # Normalize the old_path to match the OS's current path separator
# old_path = os.path.normpath(old_path)
# # print(f"old = {old_path}")
# # print(f"new = {new_base_path}")
# # Replace the base path of the old_path with the new_base_path.
# # Split the path into parts
# parts = old_path.split(os.path.sep)
# # Find the index of the 'Transcription' part
# if opt == 'crop':
# transcription_index = parts.index('Cropped_Images') if 'Cropped_Images' in parts else None
# elif opt == 'original':
# transcription_index = parts.index('Original_Images') if 'Original_Images' in parts else None
# elif opt == 'json':
# transcription_index = parts.index('Transcription') if 'Transcription' in parts else None
# elif opt == 'jpg':
# transcription_index = parts.index('Transcription') if 'Transcription' in parts else None
# else:
# raise
# if transcription_index is not None:
# # Replace the base path up to 'Transcription' with the new_base_path
# new_path = os.path.join(new_base_path, *parts[transcription_index:])
# return new_path
# else:
# return old_path # Return the old_path unchanged if 'Transcription' is not in the path
def replace_base_path(old_path, new_base_path, opt):
    """Re-root ``old_path`` under ``new_base_path``.

    Everything in ``old_path`` from the last occurrence of the directory
    selected by ``opt`` onwards is kept and joined onto ``new_base_path``.

    Args:
        old_path: The existing path; both ``/`` and ``\\`` are accepted as
            separators regardless of the current OS.
        new_base_path: The directory that replaces everything before the
            target directory.
        opt: One of ``'crop'``, ``'original'``, ``'json'`` or ``'jpg'``.
            ``'crop'`` selects ``Cropped_Images`` unless the path contains
            ``Original_Images``, in which case that directory is used.

    Returns:
        The re-rooted path as a string.

    Raises:
        ValueError: If ``opt`` is not recognised or the target directory
            does not occur in ``old_path``.
    """
    target_dir_map = {
        'crop': 'Cropped_Images',
        'crop_OG': 'Original_Images',
        'original': 'Original_Images',
        'json': 'Transcription',
        'jpg': 'Transcription',
    }

    # Convert Windows separators BEFORE splitting, so the 'crop' check below
    # also sees the components of a Windows-style path on POSIX systems.
    normalized_old_path = os.path.normpath(
        os.fspath(old_path).replace('\\', os.path.sep)
    )
    parts = normalized_old_path.split(os.path.sep)

    # A 'crop' request for a path under Original_Images falls back to it.
    if opt == 'crop' and 'Original_Images' in parts:
        opt = 'crop_OG'

    target_dir = target_dir_map.get(opt)
    if target_dir is None:
        raise ValueError(f"Invalid option: {opt}")

    if target_dir not in parts:
        raise ValueError(f"Target directory {target_dir} not found in path: {old_path}")

    # Index of the LAST occurrence of the target directory.
    target_index = len(parts) - 1 - parts[::-1].index(target_dir)
    return os.path.join(new_base_path, *parts[target_index:])
def get_wfo_url(input_string, check_homonyms=True, check_rank=True, accept_single_candidate=True, timeout=10.0):
    """Build the World Flora Online name-matching URL and check it responds.

    Args:
        input_string: The name to match, sent as ``input_string``.
        check_homonyms: Forwarded as the ``check_homonyms`` query parameter.
        check_rank: Forwarded as the ``check_rank`` query parameter.
        accept_single_candidate: Forwarded as the ``accept_single_candidate``
            query parameter.
        timeout: Seconds to wait for the service before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        ``(full_url, None)`` when the request returns HTTP 200, otherwise
        ``(None, None)``. The second element is reserved for a search URL;
        that lookup is currently disabled, so it is always ``None``.
    """
    base_url = "https://list.worldfloraonline.org/matching_rest.php?"
    params = {
        "input_string": input_string,
        "check_homonyms": check_homonyms,
        "check_rank": check_rank,
        "method": "full",
        "accept_single_candidate": accept_single_candidate,
    }
    full_url = base_url + urlencode(params)

    try:
        response_basic = requests.get(full_url, timeout=timeout)
    except requests.RequestException:
        # Transport failures are reported the same way as a non-200 reply.
        return None, None

    if response_basic.status_code == 200:
        return full_url, None
    return None, None
# simplified_response = {}
# ranked_candidates = None
# exact_match = response.get("match")
# simplified_response["WFO_exact_match"] = bool(exact_match)
# candidates = response.get("candidates", [])
# candidate_names = [candidate["full_name_plain"] for candidate in candidates] if candidates else []
# if not exact_match and candidate_names:
# cleaned_candidates, ranked_candidates = self._rank_candidates_by_similarity(query, candidate_names)
# simplified_response["WFO_candidate_names"] = cleaned_candidates
# simplified_response["WFO_best_match"] = cleaned_candidates[0] if cleaned_candidates else ''
# elif exact_match:
# simplified_response["WFO_candidate_names"] = exact_match.get("full_name_plain")
# simplified_response["WFO_best_match"] = exact_match.get("full_name_plain")
# else:
# simplified_response["WFO_candidate_names"] = ''
# simplified_response["WFO_best_match"] = ''
# # Call WFO again to update placement using WFO_best_match
# try:
# response_placement = self.query_wfo_name_matching(simplified_response["WFO_best_match"])
# placement_exact_match = response_placement.get("match")
# simplified_response["WFO_placement"] = placement_exact_match.get("placement", '')
# except:
# simplified_response["WFO_placement"] = ''
# return simplified_response, ranked_candidates