|
22 | 22 | except ImportError: |
23 | 23 | lp_parse_address = None |
24 | 24 |
|
| 25 | + |
| 26 | +def _is_probably_international(address_string: str) -> bool: |
| 27 | + """Lightweight heuristic to detect likely international addresses.""" |
| 28 | + |
| 29 | + lower = address_string.lower() |
| 30 | + intl_keywords = [ |
| 31 | + "united kingdom", |
| 32 | + "uk", |
| 33 | + "england", |
| 34 | + "scotland", |
| 35 | + "wales", |
| 36 | + "ireland", |
| 37 | + "germany", |
| 38 | + "france", |
| 39 | + "japan", |
| 40 | + "россия", |
| 41 | + "russia", |
| 42 | + "india", |
| 43 | + "australia", |
| 44 | + "brazil", |
| 45 | + "canada", |
| 46 | + "mexico", |
| 47 | + "spain", |
| 48 | + "italy", |
| 49 | + "netherlands", |
| 50 | + "belgium", |
| 51 | + "switzerland", |
| 52 | + "sweden", |
| 53 | + "norway", |
| 54 | + "denmark", |
| 55 | + "finland", |
| 56 | + ] |
| 57 | + |
| 58 | + if any(keyword in lower for keyword in intl_keywords) and ( |
| 59 | + "united states" not in lower and "usa" not in lower |
| 60 | + ): |
| 61 | + return True |
| 62 | + |
| 63 | + return bool(any(ord(ch) > 127 for ch in address_string)) |
| 64 | + |
| 65 | + |
25 | 66 | if TYPE_CHECKING: |
26 | 67 | import pandas as pd |
27 | 68 |
|
@@ -258,7 +299,7 @@ def parse_international(self, address_string: str) -> ParseResult: |
258 | 299 | error=RuntimeError("libpostal not available"), |
259 | 300 | address=None, |
260 | 301 | international_address=None, |
261 | | - validation=None, |
| 302 | + validation=ValidationResult(is_valid=False, errors=[]), |
262 | 303 | source="international", |
263 | 304 | ) |
264 | 305 |
|
@@ -289,13 +330,36 @@ def parse_international(self, address_string: str) -> ParseResult: |
289 | 330 | ) |
290 | 331 |
|
def parse_auto_route(self, address_string: str, *, validate: bool = True) -> ParseResult:
    """Route an address to the US parser or the international parser.

    Order of attempts:

    1. If the text looks international and libpostal is importable, return
       the international parse directly.
    2. Otherwise run the US parse. If it raises, try the international
       parser (when libpostal is importable) and return that result when it
       is valid or produced an international address; failing that, return
       a ``ParseResult`` carrying the exception instead of raising.
    3. If the US parse returns a valid result, or libpostal is not
       importable, return the US result.
    4. Otherwise return the international parse.

    Args:
        address_string: Raw address text to parse.
        validate: Forwarded to the US ``parse`` call.

    Returns:
        The ``ParseResult`` chosen by the routing steps above.
    """
    looks_international = _is_probably_international(address_string)
    libpostal_ready = lp_parse_address is not None

    # Clearly international input skips the US path altogether.
    if looks_international and libpostal_ready:
        return self.parse_international(address_string)

    try:
        domestic = self.parse(address_string, validate=validate)
    except Exception as exc:
        if libpostal_ready:
            fallback = self.parse_international(address_string)
            if fallback.is_valid or fallback.international_address is not None:
                return fallback
        # Auto-routing reports the failure as a result object rather than raising.
        return ParseResult(
            raw_input=address_string,
            address=None,
            international_address=None,
            error=exc,
            validation=ValidationResult(is_valid=False, errors=[]),
            source="us",
        )

    # Keep the US result when it is valid or when no fallback parser exists.
    if domestic.is_valid or not libpostal_ready:
        return domestic

    return self.parse_international(address_string)
299 | 363 |
|
300 | 364 | # ------------------------------------------------------------------------- |
301 | 365 | # Pandas integration methods |
|
0 commit comments