From 7c5881fe8dc50ffeb138669ea1e31b4fd793bffe Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Wed, 20 May 2026 23:07:15 +0200 Subject: [PATCH 1/2] Write geoparquet with duckdb, geopandas does not yet support it --- pyproject.toml | 10 +- vecorel_cli/const.py | 4 +- vecorel_cli/encoding/geoparquet.py | 2 +- vecorel_cli/parquet/__init__.py | 55 ++++++ vecorel_cli/parquet/duckdb.py | 260 +++++++++++++++++++++++++++++ vecorel_cli/parquet/geopandas.py | 2 +- 6 files changed, 328 insertions(+), 5 deletions(-) create mode 100644 vecorel_cli/parquet/duckdb.py diff --git a/pyproject.toml b/pyproject.toml index caa9175..266419b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,11 +85,19 @@ build = ">=1.0.0,<2.0.0" s3fs = "==2025.7.0" gcsfs = "==2025.7.0" +# Optional dependency for writing GeoParquet 2.0+ (native Parquet GEOMETRY +# logical type). pyarrow 23 and geopandas 1.1 can't emit GeoParquet 2.0; +# DuckDB's spatial extension can, so we delegate the write when 2.x is +# requested. See vecorel_cli/parquet/duckdb.py. +[tool.pixi.feature.parquet2.dependencies] +duckdb = ">=1.4,<2.0" + [tool.pixi.environments] default = {solve-group = "default"} dev = {features = ["dev"], solve-group = "default"} cloud = {features = ["cloud"], solve-group = "default"} -all = {features = ["dev", "cloud"], solve-group = "default"} +parquet2 = {features = ["parquet2"], solve-group = "default"} +all = {features = ["dev", "cloud", "parquet2"], solve-group = "default"} [tool.pixi.tasks] # Development tasks diff --git a/vecorel_cli/const.py b/vecorel_cli/const.py index a6fde98..b7e9e57 100644 --- a/vecorel_cli/const.py +++ b/vecorel_cli/const.py @@ -2,5 +2,5 @@ COMPRESSION_METHODS = ["brotli", "gzip", "lz4", "snappy", "zstd", "none"] -GEOPARQUET_VERSIONS = ["1.0.0", "1.1.0"] -GEOPARQUET_DEFAULT_VERSION = "1.1.0" +GEOPARQUET_VERSIONS = ["1.0.0", "1.1.0", "2.0.0"] +GEOPARQUET_DEFAULT_VERSION = "2.0.0" diff --git a/vecorel_cli/encoding/geoparquet.py b/vecorel_cli/encoding/geoparquet.py index 79370fc..33a07cf 100644 --- a/vecorel_cli/encoding/geoparquet.py +++ b/vecorel_cli/encoding/geoparquet.py @@ -12,7 +12,7 @@ from ..const import GEOPARQUET_DEFAULT_VERSION, GEOPARQUET_VERSIONS from ..encoding.geojson import VecorelJSONEncoder -from ..parquet.geopandas import to_parquet +from ..parquet import to_parquet from ..parquet.types import get_geopandas_dtype, get_pyarrow_field, get_pyarrow_type_for_geopandas from ..validation.base import Validator from ..vecorel.typing import SchemaMapping diff --git a/vecorel_cli/parquet/__init__.py b/vecorel_cli/parquet/__init__.py index e69de29..275d40d 100644 --- a/vecorel_cli/parquet/__init__.py +++ b/vecorel_cli/parquet/__init__.py @@ -0,0 +1,55 @@ +"""Parquet writers for the GeoParquet specification. + +This package contains version-specific writers and a dispatcher. + +* GeoParquet 1.0.0 and 1.1.0 are written through :mod:`.geopandas`, which uses + the geopandas Arrow path (WKB-encoded geometry + ``geo`` metadata). +* GeoParquet 2.0.0+ is written through :mod:`.duckdb`, which uses the DuckDB + spatial extension to emit native Parquet ``GEOMETRY`` logical types. DuckDB + is an optional dependency (install the ``parquet2`` pixi feature). +""" + +from __future__ import annotations + +from typing import Any, Optional + + +def _major_version(schema_version: Optional[str]) -> int: + """Return the major part of a ``MAJOR.MINOR.PATCH`` version, or 0.""" + if not schema_version: + return 0 + head = schema_version.split(".", 1)[0] + try: + return int(head) + except ValueError: + return 0 + + +def to_parquet( + df, + path, + *, + schema_version: Optional[str] = None, + **kwargs: Any, +): + """Write a (Geo)DataFrame to GeoParquet, dispatching to the right backend. + + For ``schema_version`` < ``2.0.0`` (or unspecified) the geopandas writer + is used. For ``2.0.0+`` the DuckDB writer is used (requires the optional + ``duckdb`` dependency — install the ``parquet2`` pixi feature). + """ + if _major_version(schema_version) >= 2: + try: + from .duckdb import to_parquet as _backend + except ImportError as exc: # pragma: no cover - depends on env + raise ImportError( + f"Writing GeoParquet {schema_version!r} requires the optional " + "'duckdb' dependency. Install the 'parquet2' pixi feature, " + "e.g. `pixi install -e parquet2`." + ) from exc + else: + from .geopandas import to_parquet as _backend + return _backend(df, path, schema_version=schema_version, **kwargs) + + +__all__ = ["to_parquet"] diff --git a/vecorel_cli/parquet/duckdb.py b/vecorel_cli/parquet/duckdb.py new file mode 100644 index 0000000..e6a5cf7 --- /dev/null +++ b/vecorel_cli/parquet/duckdb.py @@ -0,0 +1,260 @@ +"""GeoParquet 2.0+ writer backed by the DuckDB spatial extension. + +GeoParquet 2.0 uses the native Parquet ``GEOMETRY`` / ``GEOGRAPHY`` logical +types instead of WKB-encoded binary columns with side metadata. pyarrow 23 and +geopandas 1.1 don't expose those logical types yet, so we delegate the actual +write to DuckDB, which has had a full GeoParquet 2.0 writer since the spatial +extension shipped that support. + +The function exposes the same signature as +``vecorel_cli.parquet.geopandas.to_parquet`` so it can stand in transparently +behind the dispatcher in ``vecorel_cli.parquet.__init__``. +""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Any, Iterable, Optional + +# Module-level import of the third-party ``duckdb`` (NOT this module — Python +# 3's absolute import rules make ``import duckdb`` resolve to the installed +# package even from inside ``vecorel_cli.parquet.duckdb``). The import is +# eager so that callers see ImportError up-front when the optional dep is +# missing; the dispatcher in ``__init__`` catches it. +import duckdb as _duckdb + + +def _ensure_spatial(con) -> None: + """Load the spatial extension, installing it on first use if needed.""" + try: + con.execute("LOAD spatial") + except _duckdb.Error: + con.execute("INSTALL spatial") + con.execute("LOAD spatial") + + +def _geometry_columns(df) -> list[str]: + return [col for col in df.columns if str(df.dtypes[col]) == "geometry"] + + +def _quote_ident(name: str) -> str: + """Quote an identifier for safe interpolation into SQL.""" + return '"' + name.replace('"', '""') + '"' + + +def _quote_path(path) -> str: + """Quote a path literal for SQL ``COPY ... TO 'path'``.""" + return "'" + str(path).replace("'", "''") + "'" + + +def _duckdb_geoparquet_mode(schema_version: Optional[str]) -> str: + """Translate a GeoParquet *spec* version into DuckDB's ``GEOPARQUET_VERSION`` + *option* value. + + DuckDB's spatial extension exposes the option as an enum, not as the + spec's MAJOR.MINOR.PATCH string. The allowed values are:: + + V1 - WKB-binary column + ``geo`` metadata block (classic GeoParquet 1.x) + NONE - Native Parquet GEOMETRY logical type, *no* ``geo`` metadata + BOTH - Native GEOMETRY column AND a legacy 1.x ``geo`` block (the + column is readable as both native GEOMETRY and WKB) + + Mapping rationale: + + * ``"1.x"`` / ``None`` -> ``V1`` + * ``"2.x"`` and higher -> ``BOTH`` + + Why ``BOTH`` and not ``NONE`` for 2.x: + + geopandas, and therefore the fiboa validator, hard-require a ``geo`` + metadata block (``geopandas.read_parquet`` raises "Missing geo metadata" + without it). ``NONE`` strips the block entirely, so even though the file + is a "pure" GeoParquet 2.0 it can't be read or validated by the existing + tooling. ``BOTH`` keeps a 1.x ``geo`` block for those tools while ALSO + storing the geometry as a native Parquet GEOMETRY logical type — newer + readers see the native type, older readers see the legacy WKB view. + + Tools that read only ``geo.version`` (e.g. ``gpio check``) will report + the file as 1.0.0 — that's a tooling limitation, not a property of the + file. To confirm the native GEOMETRY column is present, inspect the + Parquet schema directly (``pyarrow.parquet.ParquetFile(...).schema_arrow``). + + Override with the ``duckdb_geoparquet_mode`` kwarg to force ``NONE`` if + you need a strict 2.0 file and don't care about fiboa-validator compat. + """ + if not schema_version: + return "V1" + head = schema_version.split(".", 1)[0] + try: + major = int(head) + except ValueError: + return "V1" + return "BOTH" if major >= 2 else "V1" + + +def to_parquet( + df, + path, + index: Optional[bool] = None, + compression: str = "zstd", + schema_version: Optional[str] = None, + write_covering_bbox: bool = False, # ignored for 2.0+ (column stats replace it) + schema=None, + compression_level: Optional[int] = None, + row_group_size: Optional[int] = None, + duckdb_geoparquet_mode: Optional[str] = None, + **kwargs: Any, +): + """Write a GeoDataFrame as GeoParquet 2.0+ via DuckDB. + + Parameters mirror :func:`vecorel_cli.parquet.geopandas.to_parquet` so the + function can be used interchangeably through the dispatcher. ``schema`` + (a pyarrow Schema carrying the desired column metadata, used by the + geopandas writer to attach collection metadata) is interpreted as a + structural hint only: any schema-level metadata it carries is written + into the output as Parquet key/value metadata. ``write_covering_bbox`` is + accepted for signature compatibility and ignored (GeoParquet 2.0 uses + native Parquet column statistics for bounding boxes). + + Requires the optional ``duckdb`` dependency (``parquet2`` pixi feature). + """ + import pyarrow as pa # local import keeps base deps minimal + from geopandas.io.arrow import _validate_dataframe + + _validate_dataframe(df) + geom_cols = _geometry_columns(df) + if not geom_cols: + raise ValueError("Cannot write GeoParquet: dataframe has no geometry columns") + if any(df[c].array.has_z.any() for c in geom_cols): + raise ValueError("Cannot write 3D geometries to GeoParquet 2.0") + + crs_by_col: dict[str, Optional[str]] = {} + for col in geom_cols: + crs = df[col].crs + crs_by_col[col] = crs.to_authority(min_confidence=20) if crs is not None else None + + # Convert geometries to WKB so we can hand a plain Arrow table to DuckDB, + # then have DuckDB parse them back into native GEOMETRY columns via + # ST_GeomFromWKB inside the COPY. + df_wkb = df.to_wkb() + table = pa.Table.from_pandas(df_wkb, schema=schema, preserve_index=index) + + out_dir = os.path.dirname(str(path)) + if out_dir: + Path(out_dir).mkdir(parents=True, exist_ok=True) + + con = _duckdb.connect(":memory:") + try: + _ensure_spatial(con) + con.register("vecorel_input", table) + + select_terms = [] + for col in table.column_names: + ident = _quote_ident(col) + if col in geom_cols: + select_terms.append(f"ST_GeomFromWKB({ident}) AS {ident}") + else: + select_terms.append(ident) + select_sql = ", ".join(select_terms) + + mode = duckdb_geoparquet_mode or _duckdb_geoparquet_mode(schema_version) + copy_opts = [ + "FORMAT PARQUET", + f"COMPRESSION {compression}", + f"GEOPARQUET_VERSION '{mode}'", + ] + if compression_level is not None: + copy_opts.append(f"COMPRESSION_LEVEL {compression_level}") + if row_group_size is not None: + copy_opts.append(f"ROW_GROUP_SIZE {row_group_size}") + copy_opts_sql = ", ".join(copy_opts) + + con.execute( + f"COPY (SELECT {select_sql} FROM vecorel_input) " + f"TO {_quote_path(path)} ({copy_opts_sql})" + ) + finally: + con.close() + + +def copy_to_geoparquet2( + source: Iterable | str, + target, + *, + schema_version: str = "2.0.0", + compression: str = "zstd", + compression_level: Optional[int] = None, + row_group_size: Optional[int] = None, + drop_columns: Iterable[str] = ("bbox",), + duckdb_geoparquet_mode: Optional[str] = None, +) -> None: + """Convert one or more existing 1.x GeoParquet files to GeoParquet 2.0+. + + ``source`` can be a single path or an iterable of paths (or a glob string + that DuckDB's ``read_parquet`` accepts). Geometries stored as WKB in the + sources are decoded with ``ST_GeomFromWKB`` and re-emitted as native + GEOMETRY logical types. By default the per-row ``bbox`` covering column + is dropped from the output (GeoParquet 2.0 stores bbox in column stats). + + The row ordering of the source is preserved (DuckDB's ``COPY`` from a + Parquet scan walks row groups in file order, which keeps any upstream + Hilbert sort intact). + """ + if isinstance(source, (str, Path)): + sources = [str(source)] + else: + sources = [str(s) for s in source] + if not sources: + raise ValueError("copy_to_geoparquet2: no source files provided") + + out_dir = os.path.dirname(str(target)) + if out_dir: + Path(out_dir).mkdir(parents=True, exist_ok=True) + + con = _duckdb.connect(":memory:") + try: + _ensure_spatial(con) + sources_sql = "[" + ", ".join(_quote_path(s) for s in sources) + "]" + + # Discover the input columns so we can know which one is the geometry + # (DuckDB exposes binary WKB columns as BLOB). + df_info = con.execute(f"DESCRIBE SELECT * FROM read_parquet({sources_sql})").fetchall() + cols = [row[0] for row in df_info] + col_types = {row[0]: row[1].upper() for row in df_info} + + drop = {c for c in drop_columns} + out_cols = [c for c in cols if c not in drop] + + select_terms = [] + for col in out_cols: + ident = _quote_ident(col) + t = col_types.get(col, "") + if t == "BLOB": + # Treat the (single) primary WKB column as geometry. If multiple + # BLOB columns exist this heuristic would need refinement. + select_terms.append(f"ST_GeomFromWKB({ident}) AS {ident}") + else: + select_terms.append(ident) + + mode = duckdb_geoparquet_mode or _duckdb_geoparquet_mode(schema_version) + copy_opts = [ + "FORMAT PARQUET", + f"COMPRESSION {compression}", + f"GEOPARQUET_VERSION '{mode}'", + ] + if compression_level is not None: + copy_opts.append(f"COMPRESSION_LEVEL {compression_level}") + if row_group_size is not None: + copy_opts.append(f"ROW_GROUP_SIZE {row_group_size}") + + con.execute( + f"COPY (SELECT {', '.join(select_terms)} " + f"FROM read_parquet({sources_sql})) " + f"TO {_quote_path(target)} ({', '.join(copy_opts)})" + ) + finally: + con.close() + + +__all__ = ["to_parquet", "copy_to_geoparquet2"] diff --git a/vecorel_cli/parquet/geopandas.py b/vecorel_cli/parquet/geopandas.py index 9f61b16..e120b38 100644 --- a/vecorel_cli/parquet/geopandas.py +++ b/vecorel_cli/parquet/geopandas.py @@ -94,7 +94,7 @@ def to_parquet( output except `RangeIndex` which is stored as metadata only. compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. - schema_version : {'0.1.0', '0.4.0', '1.0.0', None} + schema_version : {'0.1.0', '0.4.0', '1.0.0', '1.1.0', '2.0.0', None} GeoParquet specification version; if not provided will default to latest supported version. write_covering_bbox : bool, default False From 915a2b874772a7721a81b4d98f8a4d6162ac6bee Mon Sep 17 00:00:00 2001 From: Ivor Bosloper Date: Thu, 21 May 2026 08:34:52 +0200 Subject: [PATCH 2/2] Update Changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49efad5..0935b1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +- Optionally write geoparquet 2.0 (with duckdb) + ## [v0.2.15] - 2026-02-16 - Add a `get_default_collection` to the Registry