Skip to content

Commit 51093eb

Browse files
author
github-actions
committed
feat: add libpostal docker image and api entrypoint
1 parent 23e462d commit 51093eb

5 files changed

Lines changed: 216 additions & 0 deletions

File tree

Dockerfile

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Image with libpostal compiled from source and ryandata-address-utils installed from a git ref.
FROM python:3.11-slim

# Install system dependencies for libpostal.
# --no-install-recommends keeps the layer small; ca-certificates is listed
# explicitly because the git/https steps below rely on it.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential ca-certificates curl automake autoconf libtool pkg-config git \
    && rm -rf /var/lib/apt/lists/*

# Build and install libpostal from source.
# A shallow clone is enough: only the current tree is built, then removed.
RUN git clone --depth 1 https://github.com/openvenues/libpostal.git /tmp/libpostal \
    && cd /tmp/libpostal \
    && ./bootstrap.sh \
    && ./configure --datadir=/usr/local/share/libpostal \
    && make \
    && make install \
    && ldconfig \
    && rm -rf /tmp/libpostal

WORKDIR /app

# Allow overriding install ref (defaults to main)
ARG RYANDATA_ADDR_UTILS_REF=main

# Install the package from git ref.
# NOTE(review): the API module imports `postal.parser`, `fastapi` and is served
# with uvicorn; confirm the package's dependencies pull these in, otherwise the
# libpostal build above is unused and the API cannot start.
RUN pip install --no-cache-dir "git+https://github.com/Abstract-Data/RyanData-Address-Utils.git@${RYANDATA_ADDR_UTILS_REF}"

# Copy the build context into /app. This always runs at build time; bind-mount
# a checkout over /app at run time for live local development.
COPY . /app

# Default to a Python shell; pass a different command to `docker run` to start
# the API (see the Make targets).
CMD ["python"]
31+

Makefile

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@ help:
1414
@echo " make typecheck Run mypy type checker"
1515
@echo " make format Format code with ruff"
1616
@echo ""
17+
@echo "Docker:"
18+
@echo " make docker-build Build libpostal-enabled image"
19+
@echo " make docker-shell Shell into the image"
20+
@echo " make docker-test Run a sample parse inside the image"
21+
@echo " make docker-run-api Run the optional API server"
22+
@echo ""
1723
@echo "Cleanup:"
1824
@echo " make clean Remove build artifacts"
1925

@@ -41,6 +47,25 @@ typecheck:
4147
format:
4248
uv run ruff format src/
4349

50+
# Docker targets (libpostal-enabled image)
# Each variable can be overridden on the command line,
# e.g. `make docker-build DOCKER_TAG=dev DOCKER_REF=my-branch`.
DOCKER_IMAGE ?= ghcr.io/abstract-data/ryandata-addr-utils-libpostal
DOCKER_TAG ?= latest
DOCKER_REF ?= main

# These targets never produce files of the same name.
.PHONY: docker-build docker-shell docker-test docker-run-api

# Build the image, installing the package from DOCKER_REF.
docker-build:
	docker build --build-arg RYANDATA_ADDR_UTILS_REF=$(DOCKER_REF) -t $(DOCKER_IMAGE):$(DOCKER_TAG) .

# Open an interactive bash shell inside the image.
docker-shell:
	docker run --rm -it $(DOCKER_IMAGE):$(DOCKER_TAG) bash

# Smoke test: parse one sample address inside the image.
docker-test:
	docker run --rm $(DOCKER_IMAGE):$(DOCKER_TAG) \
		python -c "from ryandata_address_utils import parse; print(parse('123 Main St, Austin TX 78749').to_dict())"

# Serve the FastAPI app on port 8000.
# The api module only defines `app` (it has no __main__ entry point), so it
# must be launched through uvicorn rather than `python -m ...api`.
docker-run-api:
	docker run --rm -it -p 8000:8000 $(DOCKER_IMAGE):$(DOCKER_TAG) \
		uvicorn ryandata_address_utils.api:app --host 0.0.0.0 --port 8000
68+
4469
# Clean up
4570
clean:
4671
rm -rf dist/ build/ *.egg-info src/*.egg-info

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,31 @@ pip install git+https://github.com/Abstract-Data/RyanData-Address-Utils.git
3333
pip install "ryandata-address-utils[pandas] @ git+https://github.com/Abstract-Data/RyanData-Address-Utils.git"
3434
```
3535

36+
## Docker (libpostal-ready, “clone and go”)
37+
38+
Build (with libpostal and this package installed from the chosen ref):
39+
```bash
40+
make docker-build # builds ghcr.io/abstract-data/ryandata-addr-utils-libpostal:latest
41+
make docker-test # quick parse inside container
42+
```
43+
44+
Shell into the image:
45+
```bash
46+
make docker-shell
47+
```
48+
49+
Run the optional API (FastAPI) on port 8000:
50+
```bash
51+
make docker-run-api
52+
# Then call: curl "http://localhost:8000/parse?address=123%20Main%20St,%20Austin%20TX%2078749"
53+
# International (responds with HTTP 501 if libpostal is not available in the image):
54+
# curl "http://localhost:8000/parse_international?address=10%20Downing%20St,%20London"
55+
```
56+
57+
Notes:
58+
- Image name: `ghcr.io/abstract-data/ryandata-addr-utils-libpostal` (configurable via `DOCKER_IMAGE`, `DOCKER_TAG`, `DOCKER_REF`).
59+
- The image bundles libpostal; use it when you need international parsing without host installs.
60+
3661
## Quick start
3762

3863
```python

src/ryandata_address_utils/api.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""Minimal FastAPI service for parsing/validation (US + optional international via libpostal)."""
2+
3+
from __future__ import annotations
4+
5+
from typing import Any
6+
7+
from fastapi import FastAPI, HTTPException, Query
8+
9+
from ryandata_address_utils.service import AddressService, parse
10+
11+
try:
12+
from postal.parser import parse_address as lp_parse_address
13+
except ImportError:
14+
lp_parse_address = None
15+
16+
app = FastAPI(title="RyanData Address Utils API", version="0.3.1")
17+
service = AddressService()
18+
19+
20+
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness endpoint: unconditionally report the service as OK."""
    payload = {"status": "ok"}
    return payload
23+
24+
25+
@app.get("/parse")
def parse_us_address(
    address: str = Query(..., min_length=3),
    validate: bool = True,
) -> dict[str, Any]:
    """Parse a US address using the standard service.

    Args:
        address: Raw address string; required, at least 3 characters.
        validate: Forwarded unchanged to ``parse``.

    Returns:
        A dict with ``is_valid``, ``is_parsed``, ``address`` (the result's
        ``to_dict()`` output, or ``None`` when no address was produced) and
        ``errors`` (validation error messages, empty when there is no
        validation result).
    """
    result = parse(address, validate=validate)
    # Read the validation result once; it is absent when nothing was validated.
    validation = result.validation
    return {
        "is_valid": result.is_valid,
        "is_parsed": result.is_parsed,
        "address": result.to_dict() if result.address else None,
        "errors": [e.message for e in validation.errors] if validation else [],
    }
40+
41+
42+
@app.get("/parse_international")
def parse_international(address: str = Query(..., min_length=3)) -> dict[str, Any]:
    """Parse an address with libpostal and group its tokens by label.

    Raises:
        HTTPException: 501 when the libpostal bindings could not be imported.

    Returns:
        The original ``address`` plus ``components``, mapping each label to
        the list of values that carried it (repeated labels are all kept).
    """
    if lp_parse_address is None:
        raise HTTPException(status_code=501, detail="libpostal not available in this environment")

    # libpostal yields (value, label) pairs; a label may occur more than once.
    grouped: dict[str, list[str]] = {}
    for token, tag in lp_parse_address(address):
        if tag not in grouped:
            grouped[tag] = []
        grouped[tag].append(token)

    return {"address": address, "components": grouped}
55+
56+
57+
@app.get("/parse_auto")
def parse_auto(address: str = Query(..., min_length=3), validate: bool = True) -> dict[str, Any]:
    """Auto route: try the US parser first, then fall back to libpostal.

    Args:
        address: Raw address string; required, at least 3 characters.
        validate: Forwarded unchanged to the US ``parse`` call.

    Returns:
        A dict whose ``mode`` is ``"us"`` or ``"international"``.
        * US success: ``address`` holds the parsed result, ``errors`` is empty.
        * US failure without libpostal: ``is_valid`` is ``False``,
          ``is_parsed`` reflects what the US parser actually achieved, and
          ``errors`` lists the fallback message followed by any validation
          error messages.
        * International fallback: ``components`` maps each libpostal label to
          the list of values that carried it.
    """
    us_result = parse(address, validate=validate)
    if us_result.is_valid:
        return {
            "mode": "us",
            "is_valid": True,
            "is_parsed": True,
            "address": us_result.to_dict(),
            "errors": [],
        }

    if lp_parse_address is None:
        # Surface what the US parser really reported instead of a hard-coded
        # "not parsed" and a single generic message. The generic message stays
        # first so existing clients that read errors[0] are unaffected.
        validation = us_result.validation
        validation_errors = [e.message for e in validation.errors] if validation else []
        return {
            "mode": "us",
            "is_valid": False,
            "is_parsed": us_result.is_parsed,
            "errors": ["US parse failed and libpostal is not available", *validation_errors],
        }

    # libpostal yields (value, label) pairs; keep every value of a repeated label.
    components: dict[str, list[str]] = {}
    for value, label in lp_parse_address(address):
        components.setdefault(label, []).append(value)

    # NOTE(review): no validation is run on libpostal output here, so
    # "is_valid" only signals that a parse was produced - confirm that is intended.
    return {"mode": "international", "is_valid": True, "is_parsed": True, "components": components}
84+
85+
86+
# To run: uvicorn ryandata_address_utils.api:app --host 0.0.0.0 --port 8000

src/ryandata_address_utils/service.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
from ryandata_address_utils.parsers import ParserFactory
1515
from ryandata_address_utils.validation.validators import create_default_validators
1616

17+
# Optional libpostal import for international parsing
18+
try:
19+
from postal.parser import parse_address as lp_parse_address
20+
except ImportError:
21+
lp_parse_address = None
22+
1723
if TYPE_CHECKING:
1824
import pandas as pd
1925

@@ -234,6 +240,49 @@ def normalize_state(self, state: str) -> Optional[str]:
234240
"""
235241
return self._data_source.normalize_state(state)
236242

243+
# -------------------------------------------------------------------------
244+
# International / libpostal parsing
245+
# -------------------------------------------------------------------------
246+
247+
def parse_international(self, address_string: str) -> ParseResult:
    """Parse ``address_string`` with libpostal when the bindings are importable.

    Returns:
        A ``ParseResult`` carrying the raw input and one of:
        * a ``RuntimeError`` in ``error`` when libpostal is unavailable;
        * a label -> value dict in ``address`` (values of a repeated label
          are joined with a single space, in order of appearance);
        * whatever exception was raised while parsing, in ``error``.
    """
    if lp_parse_address is None:
        unavailable = RuntimeError("libpostal not available")
        return ParseResult(
            raw_input=address_string,
            error=unavailable,
            address=None,
            validation=None,
        )

    try:
        # libpostal yields (value, label) pairs; labels may repeat.
        merged: dict[str, str] = {}
        for token, tag in lp_parse_address(address_string):
            merged[tag] = f"{merged[tag]} {token}" if tag in merged else token

        # The merged dict is handed over in the address field for downstream use.
        # NOTE(review): a plain dict is passed where the annotation is ignored;
        # confirm consumers of ParseResult.address can handle it.
        return ParseResult(
            raw_input=address_string,
            address=merged,  # type: ignore
            error=None,
            validation=None,
        )
    except Exception as exc:  # pragma: no cover
        return ParseResult(raw_input=address_string, error=exc)
276+
277+
def parse_auto_route(self, address_string: str, *, validate: bool = True) -> ParseResult:
    """Parse as a US address, falling back to libpostal when that is invalid.

    The US result is returned as-is when it is valid, or when libpostal is
    not importable; otherwise the international parse result is returned.
    """
    domestic = self.parse(address_string, validate=validate)
    # Keep the US result unless it is invalid AND a libpostal fallback exists.
    if domestic.is_valid or lp_parse_address is None:
        return domestic
    return self.parse_international(address_string)
285+
237286
# -------------------------------------------------------------------------
238287
# Pandas integration methods
239288
# -------------------------------------------------------------------------

0 commit comments

Comments
 (0)