Skip to content

Commit d9251c3

Browse files
author
github-actions
committed
fix: fallback to libpostal on US validation failure
1 parent 1ba3ad7 commit d9251c3

4 files changed

Lines changed: 94 additions & 4 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ make docker-run-api
6060
- Strict rules: international results must include a road plus at least one location element (city/state/postal/country) or parsing fails.
6161
- Returned structure includes `InternationalAddress` fields (`HouseNumber`, `Road`, `City`, `State`, `PostalCode`, `Country`, `CountryCode`) and raw libpostal `Components`.
6262
- Requires libpostal installed; the provided Docker image already bundles it. Outside Docker, install libpostal first.
63+
- Heuristics: if the input clearly names a non-US country or contains non-ASCII characters, the auto route skips US parsing and goes straight to libpostal (when libpostal is available); otherwise, US parsing is attempted first, and any US validation failure or parsing exception triggers a libpostal fallback.
6364

6465
Notes:
6566
- Image name: `ghcr.io/abstract-data/ryandata-addr-utils-libpostal` (configurable via `DOCKER_IMAGE`, `DOCKER_TAG`, `DOCKER_REF`).

src/ryandata_address_utils/api.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,15 @@ def parse_auto(address: str = Query(..., min_length=3), validate: bool = True) -
7070
"address": intl.to_dict() if intl else None,
7171
"components": intl.Components if intl else {},
7272
"errors": [],
73+
"source": result.source,
7374
}
7475
return {
7576
"mode": "us",
7677
"is_valid": True,
7778
"is_parsed": True,
7879
"address": result.to_dict(),
7980
"errors": [],
81+
"source": result.source,
8082
}
8183

8284
if result.error:
@@ -89,13 +91,15 @@ def parse_auto(address: str = Query(..., min_length=3), validate: bool = True) -
8991
"is_valid": False,
9092
"is_parsed": False,
9193
"errors": ["International parse failed"],
94+
"source": result.source,
9295
}
9396

9497
return {
9598
"mode": "us",
9699
"is_valid": False,
97100
"is_parsed": False,
98101
"errors": ["US parse failed"],
102+
"source": result.source,
99103
}
100104

101105

src/ryandata_address_utils/service.py

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,47 @@
2222
except ImportError:
2323
lp_parse_address = None
2424

25+
26+
def _is_probably_international(address_string: str) -> bool:
27+
"""Lightweight heuristic to detect likely international addresses."""
28+
29+
lower = address_string.lower()
30+
intl_keywords = [
31+
"united kingdom",
32+
"uk",
33+
"england",
34+
"scotland",
35+
"wales",
36+
"ireland",
37+
"germany",
38+
"france",
39+
"japan",
40+
"россия",
41+
"russia",
42+
"india",
43+
"australia",
44+
"brazil",
45+
"canada",
46+
"mexico",
47+
"spain",
48+
"italy",
49+
"netherlands",
50+
"belgium",
51+
"switzerland",
52+
"sweden",
53+
"norway",
54+
"denmark",
55+
"finland",
56+
]
57+
58+
if any(keyword in lower for keyword in intl_keywords) and (
59+
"united states" not in lower and "usa" not in lower
60+
):
61+
return True
62+
63+
return bool(any(ord(ch) > 127 for ch in address_string))
64+
65+
2566
if TYPE_CHECKING:
2667
import pandas as pd
2768

@@ -258,7 +299,7 @@ def parse_international(self, address_string: str) -> ParseResult:
258299
error=RuntimeError("libpostal not available"),
259300
address=None,
260301
international_address=None,
261-
validation=None,
302+
validation=ValidationResult(is_valid=False, errors=[]),
262303
source="international",
263304
)
264305

@@ -289,13 +330,36 @@ def parse_international(self, address_string: str) -> ParseResult:
289330
)
290331

291332
def parse_auto_route(self, address_string: str, *, validate: bool = True) -> ParseResult:
    """Parse an address, choosing between the US parser and libpostal.

    Routing order:

    1. Input flagged by ``_is_probably_international`` goes straight to
       ``parse_international`` when libpostal is importable.
    2. Otherwise the US parser (``self.parse``) runs first. A valid US
       result is returned as-is; an invalid one is replaced by the
       libpostal result when libpostal is importable.
    3. If the US parser raises, libpostal is tried and its result is used
       only when it is valid or produced an international address.
       Failing that, the exception is returned inside a ``ParseResult``
       with ``source="us"`` instead of being propagated.

    Args:
        address_string: Raw address text to parse.
        validate: Forwarded to ``self.parse`` for the US attempt.

    Returns:
        The ``ParseResult`` from whichever route was taken.
    """
    have_libpostal = lp_parse_address is not None

    # Clearly international input skips the US path entirely.
    if _is_probably_international(address_string) and have_libpostal:
        return self.parse_international(address_string)

    try:
        us_outcome = self.parse(address_string, validate=validate)
    except Exception as us_error:
        if have_libpostal:
            fallback = self.parse_international(address_string)
            if fallback.is_valid or fallback.international_address is not None:
                return fallback
        # Report the US failure as data so the auto route does not raise.
        return ParseResult(
            raw_input=address_string,
            address=None,
            international_address=None,
            error=us_error,
            validation=ValidationResult(is_valid=False, errors=[]),
            source="us",
        )

    # Keep the US result when it is good, or when there is no fallback parser.
    if us_outcome.is_valid or not have_libpostal:
        return us_outcome

    return self.parse_international(address_string)
299363

300364
# -------------------------------------------------------------------------
301365
# Pandas integration methods

tests/test_address_parser.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -958,3 +958,24 @@ def test_parse_auto_international_missing_components_fails_strict() -> None:
958958
assert result.source == "international"
959959
assert not result.is_valid
960960
assert isinstance(result.error, RyanDataAddressError)
961+
962+
963+
def test_parse_auto_international_skips_us_when_probably_international() -> None:
    """A German address is routed to the international parser and is valid."""
    _require_libpostal()
    german_address = "Potsdamer Straße 3, 10785 Berlin, Germany"

    parsed = AddressService().parse_auto_route(german_address, validate=True)

    assert parsed.source == "international"
    assert parsed.is_valid
    assert parsed.international_address is not None
    assert parsed.international_address.Road is not None
971+
972+
973+
def test_parse_auto_fallback_on_us_validation_error() -> None:
    """A Japanese address ends up parsed by the international route."""
    _require_libpostal()
    # NOTE(review): this input contains "Japan", which is one of the keywords
    # checked by `_is_probably_international`, so the auto route presumably
    # short-circuits to libpostal before the US parser runs. Confirm whether
    # this actually exercises the fallback branch the test name describes.
    japanese_address = "1-1-2 Oshiage, Sumida-ku, Tokyo 131-0045, Japan"

    parsed = AddressService().parse_auto_route(japanese_address, validate=True)

    assert parsed.source == "international"
    assert parsed.is_valid
    assert parsed.international_address is not None

0 commit comments

Comments
 (0)