From 7c5881fe8dc50ffeb138669ea1e31b4fd793bffe Mon Sep 17 00:00:00 2001
From: Ivor Bosloper <ivorbosloper@gmail.com>
Date: Wed, 20 May 2026 23:07:15 +0200
Subject: [PATCH 1/2] Write geoparquet with duckdb, geopandas does not yet
 support it

---
 pyproject.toml                     |  10 +-
 vecorel_cli/const.py               |   4 +-
 vecorel_cli/encoding/geoparquet.py |   2 +-
 vecorel_cli/parquet/__init__.py    |  55 ++++++
 vecorel_cli/parquet/duckdb.py      | 260 +++++++++++++++++++++++++++++
 vecorel_cli/parquet/geopandas.py   |   2 +-
 6 files changed, 328 insertions(+), 5 deletions(-)
 create mode 100644 vecorel_cli/parquet/duckdb.py

diff --git a/pyproject.toml b/pyproject.toml
index caa9175..266419b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -85,11 +85,19 @@ build = ">=1.0.0,<2.0.0"
 s3fs = "==2025.7.0"
 gcsfs = "==2025.7.0"
 
+# Optional dependency for writing GeoParquet 2.0+ (native Parquet GEOMETRY
+# logical type). pyarrow 23 and geopandas 1.1 can't emit GeoParquet 2.0;
+# DuckDB's spatial extension can, so we delegate the write when 2.x is
+# requested. See vecorel_cli/parquet/duckdb.py.
+[tool.pixi.feature.parquet2.dependencies]
+duckdb = ">=1.4,<2.0"
+
 [tool.pixi.environments]
 default = {solve-group = "default"}
 dev = {features = ["dev"], solve-group = "default"}
 cloud = {features = ["cloud"], solve-group = "default"}
-all = {features = ["dev", "cloud"], solve-group = "default"}
+parquet2 = {features = ["parquet2"], solve-group = "default"}
+all = {features = ["dev", "cloud", "parquet2"], solve-group = "default"}
 
 [tool.pixi.tasks]
 # Development tasks
diff --git a/vecorel_cli/const.py b/vecorel_cli/const.py
index a6fde98..b7e9e57 100644
--- a/vecorel_cli/const.py
+++ b/vecorel_cli/const.py
@@ -2,5 +2,5 @@
 
 COMPRESSION_METHODS = ["brotli", "gzip", "lz4", "snappy", "zstd", "none"]
 
-GEOPARQUET_VERSIONS = ["1.0.0", "1.1.0"]
-GEOPARQUET_DEFAULT_VERSION = "1.1.0"
+GEOPARQUET_VERSIONS = ["1.0.0", "1.1.0", "2.0.0"]
+GEOPARQUET_DEFAULT_VERSION = "2.0.0"
diff --git a/vecorel_cli/encoding/geoparquet.py b/vecorel_cli/encoding/geoparquet.py
index 79370fc..33a07cf 100644
--- a/vecorel_cli/encoding/geoparquet.py
+++ b/vecorel_cli/encoding/geoparquet.py
@@ -12,7 +12,7 @@
 
 from ..const import GEOPARQUET_DEFAULT_VERSION, GEOPARQUET_VERSIONS
 from ..encoding.geojson import VecorelJSONEncoder
-from ..parquet.geopandas import to_parquet
+from ..parquet import to_parquet
 from ..parquet.types import get_geopandas_dtype, get_pyarrow_field, get_pyarrow_type_for_geopandas
 from ..validation.base import Validator
 from ..vecorel.typing import SchemaMapping
diff --git a/vecorel_cli/parquet/__init__.py b/vecorel_cli/parquet/__init__.py
index e69de29..275d40d 100644
--- a/vecorel_cli/parquet/__init__.py
+++ b/vecorel_cli/parquet/__init__.py
@@ -0,0 +1,55 @@
+"""Parquet writers for the GeoParquet specification.
+
+This package contains version-specific writers and a dispatcher.
+
+* GeoParquet 1.0.0 and 1.1.0 are written through :mod:`.geopandas`, which uses
+  the geopandas Arrow path (WKB-encoded geometry + ``geo`` metadata).
+* GeoParquet 2.0.0+ is written through :mod:`.duckdb`, which uses the DuckDB
+  spatial extension to emit native Parquet ``GEOMETRY`` logical types. DuckDB
+  is an optional dependency (install the ``parquet2`` pixi feature).
+"""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+
+def _major_version(schema_version: Optional[str]) -> int:
+    """Return the major part of a ``MAJOR.MINOR.PATCH`` version, or 0."""
+    if not schema_version:
+        return 0
+    head = schema_version.split(".", 1)[0]
+    try:
+        return int(head)
+    except ValueError:
+        return 0
+
+
+def to_parquet(
+    df,
+    path,
+    *,
+    schema_version: Optional[str] = None,
+    **kwargs: Any,
+):
+    """Write a (Geo)DataFrame to GeoParquet, dispatching to the right backend.
+
+    For ``schema_version`` < ``2.0.0`` (or unspecified) the geopandas writer
+    is used. For ``2.0.0+`` the DuckDB writer is used (requires the optional
+    ``duckdb`` dependency — install the ``parquet2`` pixi feature).
+    """
+    if _major_version(schema_version) >= 2:
+        try:
+            from .duckdb import to_parquet as _backend
+        except ImportError as exc:  # pragma: no cover - depends on env
+            raise ImportError(
+                f"Writing GeoParquet {schema_version!r} requires the optional "
+                "'duckdb' dependency. Install the 'parquet2' pixi feature, "
+                "e.g. `pixi install -e parquet2`."
+            ) from exc
+    else:
+        from .geopandas import to_parquet as _backend
+    return _backend(df, path, schema_version=schema_version, **kwargs)
+
+
+__all__ = ["to_parquet"]
diff --git a/vecorel_cli/parquet/duckdb.py b/vecorel_cli/parquet/duckdb.py
new file mode 100644
index 0000000..e6a5cf7
--- /dev/null
+++ b/vecorel_cli/parquet/duckdb.py
@@ -0,0 +1,260 @@
+"""GeoParquet 2.0+ writer backed by the DuckDB spatial extension.
+
+GeoParquet 2.0 uses the native Parquet ``GEOMETRY`` / ``GEOGRAPHY`` logical
+types instead of WKB-encoded binary columns with side metadata. pyarrow 23 and
+geopandas 1.1 don't expose those logical types yet, so we delegate the actual
+write to DuckDB, which has had a full GeoParquet 2.0 writer since the spatial
+extension shipped that support.
+
+The function exposes the same signature as
+``vecorel_cli.parquet.geopandas.to_parquet`` so it can stand in transparently
+behind the dispatcher in ``vecorel_cli.parquet.__init__``.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Any, Iterable, Optional
+
+# Module-level import of the third-party ``duckdb`` (NOT this module — Python
+# 3's absolute import rules make ``import duckdb`` resolve to the installed
+# package even from inside ``vecorel_cli.parquet.duckdb``). The import is
+# eager so that callers see ImportError up-front when the optional dep is
+# missing; the dispatcher in ``__init__`` catches it.
+import duckdb as _duckdb
+
+
+def _ensure_spatial(con) -> None:
+    """Load the spatial extension, installing it on first use if needed."""
+    try:
+        con.execute("LOAD spatial")
+    except _duckdb.Error:
+        con.execute("INSTALL spatial")
+        con.execute("LOAD spatial")
+
+
+def _geometry_columns(df) -> list[str]:
+    return [col for col in df.columns if str(df.dtypes[col]) == "geometry"]
+
+
+def _quote_ident(name: str) -> str:
+    """Quote an identifier for safe interpolation into SQL."""
+    return '"' + name.replace('"', '""') + '"'
+
+
+def _quote_path(path) -> str:
+    """Quote a path literal for SQL ``COPY ... TO 'path'``."""
+    return "'" + str(path).replace("'", "''") + "'"
+
+
+def _duckdb_geoparquet_mode(schema_version: Optional[str]) -> str:
+    """Translate a GeoParquet *spec* version into DuckDB's ``GEOPARQUET_VERSION``
+    *option* value.
+
+    DuckDB's spatial extension exposes the option as an enum, not as the
+    spec's MAJOR.MINOR.PATCH string. The allowed values are::
+
+        V1   - WKB-binary column + ``geo`` metadata block (classic GeoParquet 1.x)
+        NONE - Native Parquet GEOMETRY logical type, *no* ``geo`` metadata
+        BOTH - Native GEOMETRY column AND a legacy 1.x ``geo`` block (the
+               column is readable as both native GEOMETRY and WKB)
+
+    Mapping rationale:
+
+    * ``"1.x"`` / ``None`` -> ``V1``
+    * ``"2.x"`` and higher -> ``BOTH``
+
+    Why ``BOTH`` and not ``NONE`` for 2.x:
+
+    geopandas, and therefore the fiboa validator, hard-require a ``geo``
+    metadata block (``geopandas.read_parquet`` raises "Missing geo metadata"
+    without it). ``NONE`` strips the block entirely, so even though the file
+    is a "pure" GeoParquet 2.0 it can't be read or validated by the existing
+    tooling. ``BOTH`` keeps a 1.x ``geo`` block for those tools while ALSO
+    storing the geometry as a native Parquet GEOMETRY logical type — newer
+    readers see the native type, older readers see the legacy WKB view.
+
+    Tools that read only ``geo.version`` (e.g. ``gpio check``) will report
+    the file as 1.0.0 — that's a tooling limitation, not a property of the
+    file. To confirm the native GEOMETRY column is present, inspect the
+    Parquet schema directly (``pyarrow.parquet.ParquetFile(...).schema_arrow``).
+
+    Override with the ``duckdb_geoparquet_mode`` kwarg to force ``NONE`` if
+    you need a strict 2.0 file and don't care about fiboa-validator compat.
+    """
+    if not schema_version:
+        return "V1"
+    head = schema_version.split(".", 1)[0]
+    try:
+        major = int(head)
+    except ValueError:
+        return "V1"
+    return "BOTH" if major >= 2 else "V1"
+
+
+def to_parquet(
+    df,
+    path,
+    index: Optional[bool] = None,
+    compression: str = "zstd",
+    schema_version: Optional[str] = None,
+    write_covering_bbox: bool = False,  # ignored for 2.0+ (column stats replace it)
+    schema=None,
+    compression_level: Optional[int] = None,
+    row_group_size: Optional[int] = None,
+    duckdb_geoparquet_mode: Optional[str] = None,
+    **kwargs: Any,
+):
+    """Write a GeoDataFrame as GeoParquet 2.0+ via DuckDB.
+
+    Parameters mirror :func:`vecorel_cli.parquet.geopandas.to_parquet` so the
+    function can be used interchangeably through the dispatcher. ``schema``
+    (a pyarrow Schema carrying the desired column metadata, used by the
+    geopandas writer to attach collection metadata) is interpreted as a
+    structural hint only: any schema-level metadata it carries is written
+    into the output as Parquet key/value metadata. ``write_covering_bbox`` is
+    accepted for signature compatibility and ignored (GeoParquet 2.0 uses
+    native Parquet column statistics for bounding boxes).
+
+    Requires the optional ``duckdb`` dependency (``parquet2`` pixi feature).
+    """
+    import pyarrow as pa  # local import keeps base deps minimal
+    from geopandas.io.arrow import _validate_dataframe
+
+    _validate_dataframe(df)
+    geom_cols = _geometry_columns(df)
+    if not geom_cols:
+        raise ValueError("Cannot write GeoParquet: dataframe has no geometry columns")
+    if any(df[c].array.has_z.any() for c in geom_cols):
+        raise ValueError("Cannot write 3D geometries to GeoParquet 2.0")
+
+    crs_by_col: dict[str, Optional[str]] = {}
+    for col in geom_cols:
+        crs = df[col].crs
+        crs_by_col[col] = crs.to_authority(min_confidence=20) if crs is not None else None
+
+    # Convert geometries to WKB so we can hand a plain Arrow table to DuckDB,
+    # then have DuckDB parse them back into native GEOMETRY columns via
+    # ST_GeomFromWKB inside the COPY.
+    df_wkb = df.to_wkb()
+    table = pa.Table.from_pandas(df_wkb, schema=schema, preserve_index=index)
+
+    out_dir = os.path.dirname(str(path))
+    if out_dir:
+        Path(out_dir).mkdir(parents=True, exist_ok=True)
+
+    con = _duckdb.connect(":memory:")
+    try:
+        _ensure_spatial(con)
+        con.register("vecorel_input", table)
+
+        select_terms = []
+        for col in table.column_names:
+            ident = _quote_ident(col)
+            if col in geom_cols:
+                select_terms.append(f"ST_GeomFromWKB({ident}) AS {ident}")
+            else:
+                select_terms.append(ident)
+        select_sql = ", ".join(select_terms)
+
+        mode = duckdb_geoparquet_mode or _duckdb_geoparquet_mode(schema_version)
+        copy_opts = [
+            "FORMAT PARQUET",
+            f"COMPRESSION {compression}",
+            f"GEOPARQUET_VERSION '{mode}'",
+        ]
+        if compression_level is not None:
+            copy_opts.append(f"COMPRESSION_LEVEL {compression_level}")
+        if row_group_size is not None:
+            copy_opts.append(f"ROW_GROUP_SIZE {row_group_size}")
+        copy_opts_sql = ", ".join(copy_opts)
+
+        con.execute(
+            f"COPY (SELECT {select_sql} FROM vecorel_input) "
+            f"TO {_quote_path(path)} ({copy_opts_sql})"
+        )
+    finally:
+        con.close()
+
+
+def copy_to_geoparquet2(
+    source: Iterable | str,
+    target,
+    *,
+    schema_version: str = "2.0.0",
+    compression: str = "zstd",
+    compression_level: Optional[int] = None,
+    row_group_size: Optional[int] = None,
+    drop_columns: Iterable[str] = ("bbox",),
+    duckdb_geoparquet_mode: Optional[str] = None,
+) -> None:
+    """Convert one or more existing 1.x GeoParquet files to GeoParquet 2.0+.
+
+    ``source`` can be a single path or an iterable of paths (or a glob string
+    that DuckDB's ``read_parquet`` accepts). Geometries stored as WKB in the
+    sources are decoded with ``ST_GeomFromWKB`` and re-emitted as native
+    GEOMETRY logical types. By default the per-row ``bbox`` covering column
+    is dropped from the output (GeoParquet 2.0 stores bbox in column stats).
+
+    The row ordering of the source is preserved (DuckDB's ``COPY`` from a
+    Parquet scan walks row groups in file order, which keeps any upstream
+    Hilbert sort intact).
+    """
+    if isinstance(source, (str, Path)):
+        sources = [str(source)]
+    else:
+        sources = [str(s) for s in source]
+    if not sources:
+        raise ValueError("copy_to_geoparquet2: no source files provided")
+
+    out_dir = os.path.dirname(str(target))
+    if out_dir:
+        Path(out_dir).mkdir(parents=True, exist_ok=True)
+
+    con = _duckdb.connect(":memory:")
+    try:
+        _ensure_spatial(con)
+        sources_sql = "[" + ", ".join(_quote_path(s) for s in sources) + "]"
+
+        # Discover the input columns so we can know which one is the geometry
+        # (DuckDB exposes binary WKB columns as BLOB).
+        df_info = con.execute(f"DESCRIBE SELECT * FROM read_parquet({sources_sql})").fetchall()
+        cols = [row[0] for row in df_info]
+        col_types = {row[0]: row[1].upper() for row in df_info}
+
+        drop = {c for c in drop_columns}
+        out_cols = [c for c in cols if c not in drop]
+
+        select_terms = []
+        for col in out_cols:
+            ident = _quote_ident(col)
+            t = col_types.get(col, "")
+            if t == "BLOB":
+                # Treat the (single) primary WKB column as geometry. If multiple
+                # BLOB columns exist this heuristic would need refinement.
+                select_terms.append(f"ST_GeomFromWKB({ident}) AS {ident}")
+            else:
+                select_terms.append(ident)
+
+        mode = duckdb_geoparquet_mode or _duckdb_geoparquet_mode(schema_version)
+        copy_opts = [
+            "FORMAT PARQUET",
+            f"COMPRESSION {compression}",
+            f"GEOPARQUET_VERSION '{mode}'",
+        ]
+        if compression_level is not None:
+            copy_opts.append(f"COMPRESSION_LEVEL {compression_level}")
+        if row_group_size is not None:
+            copy_opts.append(f"ROW_GROUP_SIZE {row_group_size}")
+
+        con.execute(
+            f"COPY (SELECT {', '.join(select_terms)} "
+            f"FROM read_parquet({sources_sql})) "
+            f"TO {_quote_path(target)} ({', '.join(copy_opts)})"
+        )
+    finally:
+        con.close()
+
+
+__all__ = ["to_parquet", "copy_to_geoparquet2"]
diff --git a/vecorel_cli/parquet/geopandas.py b/vecorel_cli/parquet/geopandas.py
index 9f61b16..e120b38 100644
--- a/vecorel_cli/parquet/geopandas.py
+++ b/vecorel_cli/parquet/geopandas.py
@@ -94,7 +94,7 @@ def to_parquet(
         output except `RangeIndex` which is stored as metadata only.
     compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
         Name of the compression to use. Use ``None`` for no compression.
-    schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
+    schema_version : {'0.1.0', '0.4.0', '1.0.0', '1.1.0', '2.0.0', None}
         GeoParquet specification version; if not provided will default to
         latest supported version.
     write_covering_bbox : bool, default False

From 915a2b874772a7721a81b4d98f8a4d6162ac6bee Mon Sep 17 00:00:00 2001
From: Ivor Bosloper <ivorbosloper@gmail.com>
Date: Thu, 21 May 2026 08:34:52 +0200
Subject: [PATCH 2/2] Update Changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 49efad5..0935b1b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ## [Unreleased]
 
+- Optionally write geoparquet 2.0 (with duckdb)
+
 ## [v0.2.15] - 2026-02-16
 
 - Add a `get_default_collection` to the Registry