From 4cf53fabe9b5b740c86548d7a6cb75c23255ea88 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 22 May 2026 19:47:28 -0700 Subject: [PATCH 1/2] Centralize VRT capability validation (#2329) Add `_vrt_validation.validate_parsed_vrt` as the single entry point for auditing a parsed `VRTDataset` against every capability the read pipeline does not honour. Both `read_vrt` (eager and chunked dispatch) and `open_geotiff('foo.vrt')` call the validator before any source decode, so the two entry points produce equivalent failures for the same bad input. Previously the per-source SrcRect/DstRect, zero pixel-size, and unsupported-resample checks fired mid-decode (and per chunk task under chunked dispatch), so a malformed VRT could build a dask graph successfully and blow up deep in a `compute()` chunk function. Validator rules: band count sanity, dtype kind compatibility, transform orientation (rotated/sheared), pixel-size compatibility, SrcRect/DstRect non-negativity and within-extent, and the supported resampling set. The mixed-band-nodata case keeps its existing `MixedBandMetadataError` typed-error contract by delegating to the already-registered `_check_read_mixed_band_metadata` hook. The rotated-transform and unparseable-CRS cases keep their existing typed subclasses (`RotatedTransformError`, `UnparseableCRSError`) so `except`-by-subclass callers stay green; the validator adds the source-path to the message and lifts the check ahead of any decode. The new capability checks raise `VRTUnsupportedError`, a `GeoTIFFAmbiguousMetadataError` subclass (and therefore `ValueError`). --- xrspatial/geotiff/_backends/vrt.py | 34 ++ xrspatial/geotiff/_errors.py | 19 + xrspatial/geotiff/_vrt_validation.py | 341 ++++++++++++ .../geotiff/tests/test_vrt_validation_2321.py | 488 ++++++++++++++++++ 4 files changed, 882 insertions(+) create mode 100644 xrspatial/geotiff/_vrt_validation.py create mode 100644 xrspatial/geotiff/tests/test_vrt_validation_2321.py diff --git a/xrspatial/geotiff/_backends/vrt.py b/xrspatial/geotiff/_backends/vrt.py index 087c59b9..3e879ef1 100644 --- a/xrspatial/geotiff/_backends/vrt.py +++ b/xrspatial/geotiff/_backends/vrt.py @@ -384,9 +384,27 @@ def read_vrt(source: str, *, from .._validation import validate_read_metadata from .._vrt import _read_vrt_xml from .._vrt import parse_vrt as _parse_vrt + from .._vrt_validation import validate_parsed_vrt as _validate_parsed_vrt _xml_str = _read_vrt_xml(source) _vrt_dir = _os.path.dirname(_os.path.abspath(source)) _parsed_vrt = _parse_vrt(_xml_str, _vrt_dir) + # Centralised VRT capability validator (issue #2329). Runs before + # ``validate_read_metadata`` so capability mismatches (negative + # SrcRect / DstRect, unsupported resampling, zero pixel size, etc.) + # surface with the typed ``VRTUnsupportedError`` and a message that + # names the offending source path and field. The validator overlaps + # ``validate_read_metadata`` on rotated-transform / unparseable-CRS + # / mixed-band-nodata; running the centralised one first keeps the + # ``VRTUnsupportedError`` type at the entry-point boundary for the + # capability checks added in #2329. + _validate_parsed_vrt( + _parsed_vrt, + source=source, + mode='read', + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + band_nodata=band_nodata, + ) _band_nodata_values = ( [b.nodata for b in _parsed_vrt.bands] if _parsed_vrt.bands else None @@ -736,6 +754,22 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, # the ``band`` / window / ``max_pixels`` validators below so the # error ordering matches the eager path. from .._validation import validate_read_metadata + from .._vrt_validation import validate_parsed_vrt as _validate_parsed_vrt + # Centralised VRT capability validator (issue #2329). Run at graph + # build time so capability mismatches surface here, not inside a + # per-chunk decode task. ``read_vrt(..., chunks=)`` previously let + # unsupported features ride through the graph build and raised + # deep in a ``compute()`` chunk function (an opaque user + # experience); the validator moves the rejection back to where the + # caller stands. + _validate_parsed_vrt( + vrt, + source=source, + mode='read', + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + band_nodata=band_nodata, + ) band_nodata_values = ( [b.nodata for b in vrt.bands] if vrt.bands else None ) diff --git a/xrspatial/geotiff/_errors.py b/xrspatial/geotiff/_errors.py index 47f2492e..929e1bca 100644 --- a/xrspatial/geotiff/_errors.py +++ b/xrspatial/geotiff/_errors.py @@ -103,6 +103,24 @@ class ConflictingNodataError(GeoTIFFAmbiguousMetadataError): """ +class VRTUnsupportedError(GeoTIFFAmbiguousMetadataError): + """A parsed VRT declares a feature the read pipeline does not honour (#2329). + + Raised by the centralised VRT validator at graph-build / eager-read + setup time, before any source bytes are decoded. Covers CRS / dtype + / band / nodata / transform / pixel-size / source-window / + destination-window / resampling mismatches that the VRT read path + cannot serve correctly. The message names the offending source path + and field so a caller can locate the bad source without re-parsing + the VRT XML themselves. + + Subclasses ``GeoTIFFAmbiguousMetadataError`` (and therefore + ``ValueError``) so existing ``except ValueError`` callers keep + catching VRT-capability failures alongside the older ambiguous- + metadata family. + """ + + class UnknownCRSModelTypeError(GeoTIFFAmbiguousMetadataError): """Can't classify an EPSG as geographic or projected on write (#2277). @@ -127,5 +145,6 @@ class UnknownCRSModelTypeError(GeoTIFFAmbiguousMetadataError): "MixedBandMetadataError", "ConflictingCRSError", "ConflictingNodataError", + "VRTUnsupportedError", "UnknownCRSModelTypeError", ] diff --git a/xrspatial/geotiff/_vrt_validation.py b/xrspatial/geotiff/_vrt_validation.py new file mode 100644 index 00000000..b1ca770c --- /dev/null +++ b/xrspatial/geotiff/_vrt_validation.py @@ -0,0 +1,341 @@ +"""Centralised VRT capability validation (issue #2329 / parent #2321). + +A single :func:`validate_parsed_vrt` entry point that audits an +already-parsed :class:`xrspatial.geotiff._vrt.VRTDataset` against every +capability the read pipeline does not honour. Both the direct +``read_vrt`` entry point in ``_backends/vrt.py`` and the dispatched +``open_geotiff('foo.vrt')`` branch in ``__init__.py`` call this +validator before any source bytes are decoded, so the two entry points +produce equivalent failures for the same bad input. + +Why centralise: prior to this module the capability checks were spread +across ``_vrt.read_vrt`` (per-source ``SrcRect`` / ``DstRect`` rejects +mid-decode), ``_backends/vrt.read_vrt`` (CRS / rotated-transform / +mixed-nodata checks via ``validate_read_metadata`` at the eager +boundary), and ``_check_resample_alg_supported`` (per-source resample +reject at the placement site). Under chunked dispatch the per-source +checks fired inside dask tasks, so a malformed VRT could build a graph +successfully and then blow up deep in a ``compute()``. The validator +moves all of those checks to graph-build / eager-read setup time. + +The function never re-parses the VRT XML. Callers pass an already- +parsed ``VRTDataset`` plus the source path (used only for error +messages). Raises :class:`VRTUnsupportedError` on the first failure +with the offending source path and field embedded in the message. +""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from ._errors import ( + RotatedTransformError, + UnparseableCRSError, + VRTUnsupportedError, +) + +if TYPE_CHECKING: + from ._vrt import VRTDataset + + +# Numpy dtype kinds the VRT read path can serve correctly. Bool / complex +# / object / void / string fall outside the supported kind set: the +# decode + sentinel-mask pipeline assumes scalar numeric pixels. +_SUPPORTED_DTYPE_KINDS = frozenset({'u', 'i', 'f'}) + + +# GDAL ```` text equivalent to nearest-neighbour. Kept in +# sync with ``xrspatial.geotiff._vrt._NEAREST_RESAMPLE_ALGS`` (mirroring +# rather than importing to keep this module a leaf with no _vrt back- +# reference). +_NEAREST_RESAMPLE_ALGS = frozenset({ + '', 'nearest', 'nearestneighbour', 'nearestneighbor', 'near', +}) + + +def _is_nearest_resample(alg: str | None) -> bool: + if alg is None: + return True + return alg.strip().lower() in _NEAREST_RESAMPLE_ALGS + + +def _looks_like_wkt(crs_wkt: str) -> bool: + """Mirror of ``_crs._looks_like_wkt`` for the VRT validator. + + A WKT-shaped CRS string starts (after lstrip) with one of the + recognised root keywords. Strings that do not match are treated as + unparseable unless the caller passed ``allow_unparseable_crs=True``. + """ + if not isinstance(crs_wkt, str): + return False + head = crs_wkt.lstrip().upper() + return head.startswith(( + 'PROJCS', 'GEOGCS', 'PROJCRS', 'GEOGCRS', 'COMPD_CS', + 'COMPOUNDCRS', 'VERT_CS', 'VERTCRS', 'LOCAL_CS', 'ENGCRS', + 'GEOCCS', 'TIMECRS', + )) + + +def validate_parsed_vrt( + parsed: 'VRTDataset', + *, + source: str, + mode: str = 'read', + allow_rotated: bool = False, + allow_unparseable_crs: bool = False, + band_nodata: str | None = None, +) -> None: + """Audit a parsed VRT for capability mismatches before read execution. + + Parameters + ---------- + parsed : VRTDataset + The already-parsed VRT structure. The validator never re-parses + XML; callers feed in the output of ``parse_vrt`` or the value + threaded through ``read_vrt(..., parsed=...)``. + source : str + Path to the ``.vrt`` file. Used only for error messages so a + caller can locate the offending file without re-parsing. + mode : {'read'} + Reserved for a future write-side validator. Currently only + ``'read'`` is supported; other modes are rejected. + allow_rotated : bool, default False + Caller opt-in for non-axis-aligned ``GeoTransform``. ``False`` + rejects any non-zero rotation / shear term. + allow_unparseable_crs : bool, default False + Caller opt-in for CRS strings that pyproj cannot resolve and + that do not structurally look like WKT. ``False`` rejects the + string with :class:`VRTUnsupportedError`. + band_nodata : {None, 'first'}, optional + Mixed-band-nodata policy. ``None`` (the default) rejects a VRT + whose bands declare disagreeing ```` sentinels; + ``'first'`` keeps the legacy flatten-to-band-0 behaviour and + the validator returns silently for the mixed case. + + Raises + ------ + VRTUnsupportedError + On the first capability mismatch encountered. The message names + the offending source path and field so the caller can locate + the bad source without re-parsing the VRT XML themselves. + + Notes + ----- + The rules audited here are not exhaustive of every VRT failure + mode -- the parse step (``parse_vrt``) already rejects unknown + ``dataType`` tokens, source paths that escape the VRT directory, + and so on. The validator covers the residual capability checks + that previously fired mid-decode (or per chunk task under chunked + dispatch). + """ + if mode != 'read': + raise ValueError( + f"validate_parsed_vrt only supports mode='read', got " + f"{mode!r}. The write-side validator is reserved for a " + f"future change." + ) + + # Rule 1: band count sanity. + # + # A VRT with zero ```` elements would land in + # ``_effective_dtype_for_bands`` (which raises a ValueError with a + # generic message) or, worse, build an empty graph. Reject up front + # so the message names the file. The "no " substring + # keeps backward compatibility with the existing message check in + # test_vrt_multiband_dtype_1696.py. + if not parsed.bands: + raise VRTUnsupportedError( + f"VRT '{source}' declares no elements. " + f"The read pipeline cannot serve a band-less mosaic; the " + f"VRT must declare at least one raster band." + ) + + # Rule 2: dtype compatibility. + # + # ``parse_vrt`` already rejects unknown ``dataType`` tokens (#1783), + # so this branch only fires when a future ``_DTYPE_MAP`` change + # adds a complex / bool / object entry, or when a caller hand-builds + # a ``VRTDataset``. Surface the bad band number and dtype so the + # message is actionable. + for band in parsed.bands: + if band.dtype.kind not in _SUPPORTED_DTYPE_KINDS: + raise VRTUnsupportedError( + f"VRT '{source}' band {band.band_num} declares dtype " + f"{band.dtype!r}, which the read pipeline does not " + f"support. Supported dtype kinds: unsigned integer " + f"('u'), signed integer ('i'), and float ('f'). " + f"Complex / bool / object bands fall outside the " + f"decode + sentinel-mask pipeline." + ) + + # Rule 3: transform orientation. + # + # A non-zero skew_x (gt[2]) or skew_y (gt[4]) means the VRT is + # rotated / sheared. The read pipeline treats those as no-georef + # (#2122) only when the caller opts in via ``allow_rotated=True``; + # otherwise the projected coords would mislabel a pixel grid as a + # georeferenced raster. Reject up front to mirror the read-side + # ``RotatedTransformError`` contract. + gt = parsed.geo_transform + if gt is not None and (gt[2] != 0.0 or gt[4] != 0.0) and not allow_rotated: + # Use the existing typed error so callers that already + # ``except RotatedTransformError`` keep catching this case. The + # message names the source path so the centralised validator + # still adds value over the registered check in + # ``validate_read_metadata``. + raise RotatedTransformError( + f"VRT '{source}' GeoTransform has non-zero rotation/shear " + f"terms (skew_x={gt[2]}, skew_y={gt[4]}). The read pipeline " + f"treats the array as no-georef under allow_rotated=True; " + f"pass allow_rotated=True to opt in, or re-export the VRT " + f"with an axis-aligned GeoTransform." + ) + + # Rule 4: pixel-size compatibility. + # + # A zero ``res_x`` (gt[1]) or ``res_y`` (gt[5]) would produce a + # degenerate coord array (every coordinate equals the origin) and + # divide-by-zero in any pixel-to-world conversion downstream. The + # eager read path would currently surface this as an opaque coord + # generation error; raise here instead with the file name. + if gt is not None: + # Skip the rotated case: it already raised above unless the + # caller opted in via allow_rotated, in which case the pipeline + # treats the array as no-georef and the pixel size is moot. + is_rotated = (gt[2] != 0.0 or gt[4] != 0.0) + if not is_rotated and (gt[1] == 0.0 or gt[5] == 0.0): + raise VRTUnsupportedError( + f"VRT '{source}' GeoTransform declares a zero pixel " + f"size (res_x={gt[1]}, res_y={gt[5]}). The read pipeline " + f"divides by these to compute pixel-to-world coordinates; " + f"a zero term produces a degenerate raster. Re-export " + f"the VRT with non-zero res_x / res_y." + ) + + # Rule 5: nodata policy (mixed per-band sentinels). + # + # Delegate the disagreement check to the existing + # ``_check_read_mixed_band_metadata`` registered hook so the + # ``MixedBandMetadataError`` message and the ``None``-counts-as- + # undeclared semantics stay canonical in one place. The validator's + # job here is to reject typo'd ``band_nodata`` values up front + # (matching the boundary-level value check in ``_backends/vrt.py`` + # and ``__init__.py``) so a typo never silently degrades to strict + # mode at the registered hook below. + if band_nodata not in (None, 'first'): + raise ValueError( + f"band_nodata must be None or 'first', got {band_nodata!r}." + ) + + # Rule 6: CRS compatibility (unparseable / not-WKT-shaped). + # + # An empty ```` (``crs_wkt == ''``) is the no-CRS case and is + # legal; the read path lands ``crs_wkt=None`` and emits the no-georef + # marker. A non-empty CRS string that does not structurally look + # like WKT and that pyproj cannot resolve is rejected unless the + # caller opted in. Defer the pyproj probe to ``_wkt_to_epsg`` so a + # missing-pyproj host still rejects the obviously-malformed case + # (the ``_looks_like_wkt`` cheap check). + crs_wkt = parsed.crs_wkt + if crs_wkt and not allow_unparseable_crs: + if not _looks_like_wkt(crs_wkt): + # Try pyproj before raising so a valid PROJ / EPSG token + # passes when pyproj is installed. + from ._crs import _wkt_to_epsg + epsg = _wkt_to_epsg(crs_wkt) + if epsg is None: + # Use the existing typed error so callers that already + # ``except UnparseableCRSError`` keep catching this + # case. Adds the source path to the message. + raise UnparseableCRSError( + f"VRT '{source}' declares a CRS string that does " + f"not look like WKT and that pyproj could not " + f"resolve: {crs_wkt!r}. Pass " + f"allow_unparseable_crs=True to read the VRT " + f"anyway, or re-export the VRT with a WKT-formatted " + f"." + ) + + # Rule 7 / 8: SrcRect and DstRect sanity, plus the supported + # resampling set. Walk every source on every band so the message + # names the offending source path and field. + for band in parsed.bands: + for src in band.sources: + sr = src.src_rect + dr = src.dst_rect + + # Rule 7a: negative SrcRect size. + if sr.x_size < 0 or sr.y_size < 0: + raise VRTUnsupportedError( + f"VRT '{source}' SimpleSource '{src.filename}' " + f"(band {band.band_num}) has negative SrcRect size " + f"(xSize={sr.x_size}, ySize={sr.y_size}); SrcRect " + f"sizes must be non-negative." + ) + # Rule 7b: negative SrcRect offset. + if sr.x_off < 0 or sr.y_off < 0: + raise VRTUnsupportedError( + f"VRT '{source}' SimpleSource '{src.filename}' " + f"(band {band.band_num}) has negative SrcRect offset " + f"(xOff={sr.x_off}, yOff={sr.y_off}); SrcRect " + f"offsets must be non-negative." + ) + + # Rule 8a: negative DstRect size. + if dr.x_size < 0 or dr.y_size < 0: + raise VRTUnsupportedError( + f"VRT '{source}' SimpleSource '{src.filename}' " + f"(band {band.band_num}) has negative DstRect size " + f"(xSize={dr.x_size}, ySize={dr.y_size}); DstRect " + f"sizes must be non-negative." + ) + + # Rule 8b: DstRect outside the VRT extent. A source whose + # destination rect lies entirely outside the + # ``rasterXSize x rasterYSize`` window contributes no pixels + # to the mosaic; the read path silently skips it via the + # clip-to-window check. Refuse the malformed mosaic up + # front so the message names the bad source. + dst_r0 = dr.y_off + dst_c0 = dr.x_off + dst_r1 = dr.y_off + dr.y_size + dst_c1 = dr.x_off + dr.x_size + if (dst_r0 >= parsed.height or dst_c0 >= parsed.width + or dst_r1 <= 0 or dst_c1 <= 0): + raise VRTUnsupportedError( + f"VRT '{source}' SimpleSource '{src.filename}' " + f"(band {band.band_num}) has a DstRect " + f"(xOff={dr.x_off}, yOff={dr.y_off}, " + f"xSize={dr.x_size}, ySize={dr.y_size}) entirely " + f"outside the VRT extent " + f"({parsed.height}x{parsed.width}); the source " + f"would contribute no pixels." + ) + + # Rule 9: unsupported resampling. Only matters when the + # SrcRect / DstRect sizes differ (the read path falls back + # to a no-op placement when they match, regardless of + # ````). Matches the ``_check_resample_alg_supported`` + # gate in ``_vrt.py`` but lifts it before any pixel decode + # so the chunked path raises at graph build too. + needs_resample = ( + sr.x_size != dr.x_size or sr.y_size != dr.y_size + ) + if needs_resample and not _is_nearest_resample(src.resample_alg): + raise VRTUnsupportedError( + f"VRT '{source}' ComplexSource '{src.filename}' " + f"(band {band.band_num}) requests " + f"{src.resample_alg} " + f"with differing SrcRect " + f"({sr.x_size}x{sr.y_size}) and DstRect " + f"({dr.x_size}x{dr.y_size}) sizes. The read " + f"pipeline only implements nearest-neighbour " + f"resampling; substituting nearest would silently " + f"mislabel the output. Re-export with " + f"Nearest or matching " + f"SrcRect/DstRect sizes." + ) + + +__all__ = ['validate_parsed_vrt'] diff --git a/xrspatial/geotiff/tests/test_vrt_validation_2321.py b/xrspatial/geotiff/tests/test_vrt_validation_2321.py new file mode 100644 index 00000000..ce178556 --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_validation_2321.py @@ -0,0 +1,488 @@ +"""Regression tests for issue #2329 / parent #2321 sub-task 2. + +Centralised VRT capability validation: one negative test per rule the +validator enforces. Every test asserts ``VRTUnsupportedError`` is +raised at validator-time (i.e. before any source decode), and that the +direct ``read_vrt`` entry point and the dispatched +``open_geotiff('foo.vrt')`` entry point produce the same error type +and the same message for the same bad input. + +The validator accepts an already-parsed ``VRTDataset``; the tests +exercise it both directly (with a parsed structure) and through the +two public entry points so the wiring is covered too. +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff import open_geotiff +from xrspatial.geotiff._backends.vrt import read_vrt +from xrspatial.geotiff._errors import ( + GeoTIFFAmbiguousMetadataError, + MixedBandMetadataError, + RotatedTransformError, + UnparseableCRSError, + VRTUnsupportedError, +) +from xrspatial.geotiff._vrt import parse_vrt +from xrspatial.geotiff._vrt_validation import validate_parsed_vrt +from xrspatial.geotiff._writer import write + + +def _write_src(tmp_path, name: str = 'src.tif', + shape=(4, 4), dtype=np.uint16) -> str: + """Write a small source TIFF and return its path.""" + arr = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape) + p = str(tmp_path / name) + write(arr, p, compression='none', tiled=False) + return p + + +def _write_vrt(tmp_path, xml: str, name: str = 'test.vrt') -> str: + p = str(tmp_path / name) + with open(p, 'w') as f: + f.write(xml) + return p + + +def _parse(tmp_path, xml: str, name: str = 'test.vrt'): + """Helper: write + parse a VRT XML, return (path, parsed).""" + import os + path = _write_vrt(tmp_path, xml, name) + parsed = parse_vrt(xml, os.path.dirname(os.path.abspath(path))) + return path, parsed + + +# --------------------------------------------------------------------------- +# Subclassing contract +# --------------------------------------------------------------------------- + + +def test_vrt_unsupported_error_is_geotiff_metadata_error(): + """``VRTUnsupportedError`` must subclass + ``GeoTIFFAmbiguousMetadataError`` (and ``ValueError`` via the base) + so ``except ValueError`` callers keep catching VRT failures.""" + assert issubclass(VRTUnsupportedError, GeoTIFFAmbiguousMetadataError) + assert issubclass(VRTUnsupportedError, ValueError) + + +# --------------------------------------------------------------------------- +# Rule 1: band count sanity (zero bands) +# --------------------------------------------------------------------------- + + +_NO_BANDS_VRT = """ + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 +""" + + +def test_zero_bands_raises_vrt_unsupported(tmp_path): + path, parsed = _parse(tmp_path, _NO_BANDS_VRT, 'nobands.vrt') + with pytest.raises(VRTUnsupportedError, match='band'): + validate_parsed_vrt(parsed, source=path, mode='read') + + +def test_zero_bands_parity_across_entry_points(tmp_path): + path = _write_vrt(tmp_path, _NO_BANDS_VRT, 'nobands_parity.vrt') + with pytest.raises(VRTUnsupportedError) as a: + read_vrt(path) + with pytest.raises(VRTUnsupportedError) as b: + open_geotiff(path) + assert type(a.value) is type(b.value) + assert str(a.value) == str(b.value) + + +# --------------------------------------------------------------------------- +# Rule 2: dtype compatibility / unsupported dataType +# (``parse_vrt`` already rejects unknown dtype tokens; the validator +# rejects ``Complex`` placeholders that slip past via a typo'd map. +# The negative we exercise here is per-band dtype that is not in the +# supported numpy kind set: complex bands cannot ride through.) +# --------------------------------------------------------------------------- + + +def test_complex_dtype_band_rejected_by_validator(tmp_path): + """Even if a complex numpy dtype appears on a parsed band (e.g. + via a future _DTYPE_MAP entry), the validator must reject the + band before any read execution begins.""" + src_path = _write_src(tmp_path) + xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + + {src_path} + 1 + + + + +""" + path, parsed = _parse(tmp_path, xml, 'complex.vrt') + # Patch the parsed band dtype to complex128 to simulate a + # hypothetical map regression. The validator should refuse it. + parsed.bands[0].dtype = np.dtype('complex128') + with pytest.raises(VRTUnsupportedError, match='dtype'): + validate_parsed_vrt(parsed, source=path, mode='read') + + +# --------------------------------------------------------------------------- +# Rule 3: transform orientation (rotation / shear) +# --------------------------------------------------------------------------- + + +def test_rotated_transform_rejected_without_opt_in(tmp_path): + """A VRT with non-zero rotation/shear terms in its GeoTransform must + be rejected by the validator unless ``allow_rotated=True`` is + passed.""" + src_path = _write_src(tmp_path) + # GeoTransform: origin_x, res_x, skew_x, origin_y, skew_y, res_y + # Non-zero skew_x / skew_y => rotated. + xml = f""" + 0.0, 1.0, 0.1, 0.0, 0.1, -1.0 + + + {src_path} + 1 + + + + +""" + path, parsed = _parse(tmp_path, xml, 'rotated.vrt') + # Rotated transforms raise the existing typed + # ``RotatedTransformError`` for backward compatibility with + # callers that ``except RotatedTransformError``; the centralised + # validator's value here is in adding the source path to the + # message and lifting the rejection ahead of any decode. + with pytest.raises(RotatedTransformError, match='rotat'): + validate_parsed_vrt(parsed, source=path, mode='read', + allow_rotated=False) + # Opt-in path returns silently. + validate_parsed_vrt(parsed, source=path, mode='read', + allow_rotated=True) + + +# --------------------------------------------------------------------------- +# Rule 4: SrcRect sanity (negative size / negative offset) +# --------------------------------------------------------------------------- + + +def test_negative_src_rect_size_rejected(tmp_path): + src_path = _write_src(tmp_path) + xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + + {src_path} + 1 + + + + +""" + path, parsed = _parse(tmp_path, xml, 'neg_src.vrt') + with pytest.raises(VRTUnsupportedError, match='SrcRect'): + validate_parsed_vrt(parsed, source=path, mode='read') + + +def test_negative_src_rect_offset_rejected(tmp_path): + src_path = _write_src(tmp_path) + xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + + {src_path} + 1 + + + + +""" + path, parsed = _parse(tmp_path, xml, 'neg_src_off.vrt') + with pytest.raises(VRTUnsupportedError, match='SrcRect'): + validate_parsed_vrt(parsed, source=path, mode='read') + + +# --------------------------------------------------------------------------- +# Rule 5: DstRect sanity (negative size, plus out-of-VRT-extent rect) +# --------------------------------------------------------------------------- + + +def test_negative_dst_rect_size_rejected(tmp_path): + src_path = _write_src(tmp_path) + xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + + {src_path} + 1 + + + + +""" + path, parsed = _parse(tmp_path, xml, 'neg_dst.vrt') + with pytest.raises(VRTUnsupportedError, match='DstRect'): + validate_parsed_vrt(parsed, source=path, mode='read') + + +def test_dst_rect_outside_vrt_extent_rejected(tmp_path): + """A DstRect that lands entirely outside the VRT's + ``rasterXSize x rasterYSize`` extent contributes nothing and is a + malformed mosaic. Reject up front.""" + src_path = _write_src(tmp_path) + xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + + {src_path} + 1 + + + + +""" + path, parsed = _parse(tmp_path, xml, 'out_dst.vrt') + with pytest.raises(VRTUnsupportedError, match='DstRect'): + validate_parsed_vrt(parsed, source=path, mode='read') + + +# --------------------------------------------------------------------------- +# Rule 6: pixel-size compatibility (zero pixel size in transform) +# --------------------------------------------------------------------------- + + +def test_zero_pixel_size_rejected(tmp_path): + """A GeoTransform with zero res_x or res_y produces a degenerate + coord array and divides by zero in coord generation. Reject up + front.""" + src_path = _write_src(tmp_path) + xml = f""" + 0.0, 0.0, 0.0, 0.0, 0.0, -1.0 + + + {src_path} + 1 + + + + +""" + path, parsed = _parse(tmp_path, xml, 'zero_px.vrt') + with pytest.raises(VRTUnsupportedError, match='pixel size'): + validate_parsed_vrt(parsed, source=path, mode='read') + + +# --------------------------------------------------------------------------- +# Rule 7: unsupported resampling algorithm +# --------------------------------------------------------------------------- + + +def test_unsupported_resample_alg_rejected_at_validate(tmp_path): + """A ComplexSource with a non-nearest resampler and differing + SrcRect/DstRect sizes must be rejected by the validator before + any pixels are read.""" + src_path = _write_src(tmp_path) + xml = f""" + 0.0, 2.0, 0.0, 0.0, 0.0, -2.0 + + + {src_path} + 1 + + + Bilinear + + +""" + path, parsed = _parse(tmp_path, xml, 'bilinear.vrt') + with pytest.raises(VRTUnsupportedError, match='[Rr]esampl'): + validate_parsed_vrt(parsed, source=path, mode='read') + + +# --------------------------------------------------------------------------- +# Rule 8: nodata policy (mixed per-band sentinels without opt-in) +# --------------------------------------------------------------------------- + + +def test_mixed_band_nodata_rejected_without_opt_in(tmp_path): + """A two-band VRT with disagreeing per-band ```` must + be rejected unless ``band_nodata='first'`` is passed.""" + src_path = _write_src(tmp_path, name='src_mixed.tif') + xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + 0 + + {src_path} + 1 + + + + + + 9999 + + {src_path} + 1 + + + + +""" + path, _ = _parse(tmp_path, xml, 'mixed_nodata.vrt') + # Mixed per-band sentinels raise the existing typed + # ``MixedBandMetadataError`` (registered via the + # ``validate_read_metadata`` hook). The centralised + # ``validate_parsed_vrt`` delegates the disagreement-detection + # logic to that hook so the ``None``-counts-as-undeclared + # semantics stay canonical in one place. The public entry points + # run both the validator and ``validate_read_metadata``, so the + # rejection still surfaces at boundary time. + with pytest.raises(MixedBandMetadataError): + read_vrt(path) + with pytest.raises(MixedBandMetadataError): + open_geotiff(path) + # Opt-in returns silently. + r = read_vrt(path, band_nodata='first') + assert r.shape == (4, 4, 2) + + +# --------------------------------------------------------------------------- +# Rule 9: CRS compatibility +# (the parsed VRTDataset only carries one ``crs_wkt`` field, so the +# CRS-mismatch case happens when a caller's expected CRS differs from +# the file's. The validator surfaces an unparseable CRS string up +# front unless ``allow_unparseable_crs=True``.) +# --------------------------------------------------------------------------- + + +def test_unparseable_crs_rejected_without_opt_in(tmp_path): + src_path = _write_src(tmp_path) + xml = f""" + GARBAGE_CRS_NOT_A_REAL_WKT + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + + {src_path} + 1 + + + + +""" + path, parsed = _parse(tmp_path, xml, 'badcrs.vrt') + # CRS strings that pyproj cannot resolve raise the existing typed + # ``UnparseableCRSError`` so callers that already + # ``except UnparseableCRSError`` keep working. + with pytest.raises(UnparseableCRSError): + validate_parsed_vrt(parsed, source=path, mode='read', + allow_unparseable_crs=False) + # Opt-in path returns silently. + validate_parsed_vrt(parsed, source=path, mode='read', + allow_unparseable_crs=True) + + +# --------------------------------------------------------------------------- +# Entry-point parity: direct read_vrt and open_geotiff produce the same +# rejection (same error type and same message) for the same bad input. +# --------------------------------------------------------------------------- + + +_BILINEAR_XML_TEMPLATE = """ + 0.0, 2.0, 0.0, 0.0, 0.0, -2.0 + + + {src} + 1 + + + Bilinear + + +""" + + +def test_resample_parity_across_entry_points(tmp_path): + src_path = _write_src(tmp_path) + xml = _BILINEAR_XML_TEMPLATE.format(src=src_path) + path = _write_vrt(tmp_path, xml, 'bilinear_parity.vrt') + + with pytest.raises(VRTUnsupportedError) as a: + read_vrt(path) + with pytest.raises(VRTUnsupportedError) as b: + open_geotiff(path) + assert type(a.value) is type(b.value) + assert str(a.value) == str(b.value) + + +def test_rotated_parity_across_entry_points(tmp_path): + src_path = _write_src(tmp_path) + xml = f""" + 0.0, 1.0, 0.1, 0.0, 0.1, -1.0 + + + {src_path} + 1 + + + + +""" + path = _write_vrt(tmp_path, xml, 'rotated_parity.vrt') + + # Both entry points raise the same typed error + # (``RotatedTransformError`` here, since rotated transforms keep + # their pre-existing subclass) with the same message. + with pytest.raises(RotatedTransformError) as a: + read_vrt(path) + with pytest.raises(RotatedTransformError) as b: + open_geotiff(path) + assert type(a.value) is type(b.value) + assert str(a.value) == str(b.value) + + +# --------------------------------------------------------------------------- +# Chunked (dask) path: rejection happens at graph build, not in a chunk +# function. Same error type and message as the eager path. +# --------------------------------------------------------------------------- + + +def test_unsupported_resample_chunked_raises_at_build(tmp_path): + """The chunked dispatcher must run the validator before building + the dask graph so the failure surfaces at construction time, not + deep in a chunk function during ``compute()``.""" + src_path = _write_src(tmp_path) + xml = _BILINEAR_XML_TEMPLATE.format(src=src_path) + path = _write_vrt(tmp_path, xml, 'bilinear_chunked.vrt') + + with pytest.raises(VRTUnsupportedError): + read_vrt(path, chunks=2) + + +# --------------------------------------------------------------------------- +# Sanity: a well-formed minimal VRT validates with no exception. +# --------------------------------------------------------------------------- + + +def test_well_formed_vrt_validates_silently(tmp_path): + src_path = _write_src(tmp_path) + xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + + {src_path} + 1 + + + + +""" + path, parsed = _parse(tmp_path, xml, 'good.vrt') + # No exception: + validate_parsed_vrt(parsed, source=path, mode='read') + # And the public entry points read it without raising the + # validator error. + result = read_vrt(path) + assert result.shape == (4, 4) From 8be5d84346a356cb903b44c266e42bdd3cadacbb Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 22 May 2026 19:52:54 -0700 Subject: [PATCH 2/2] Address review nits: silent CRS probe and import canonical helpers (#2329) - Replace ``_wkt_to_epsg`` (which emits ``GeoTIFFFallbackWarning`` on parse failure) with a direct ``pyproj.CRS.from_user_input`` probe so the validator's unparseable-CRS check matches the no-warning behaviour of the registered ``_check_read_unparseable_crs`` hook. - Lazy-import ``_NEAREST_RESAMPLE_ALGS`` from ``_vrt`` and ``_looks_like_wkt`` from ``_crs`` instead of hand-copying both constants. Sidesteps the import cycle without the drift risk that comes with maintaining two copies. - Add a Notes section to the ``validate_parsed_vrt`` docstring that flags the overlap with the registered ``validate_read_metadata`` hooks so a future maintainer modifying the rotated-transform or unparseable-CRS check knows the validator preempts them on the VRT path. --- xrspatial/geotiff/_vrt_validation.py | 105 ++++++++++++++++----------- 1 file changed, 63 insertions(+), 42 deletions(-) diff --git a/xrspatial/geotiff/_vrt_validation.py b/xrspatial/geotiff/_vrt_validation.py index b1ca770c..407ffbbb 100644 --- a/xrspatial/geotiff/_vrt_validation.py +++ b/xrspatial/geotiff/_vrt_validation.py @@ -45,38 +45,21 @@ _SUPPORTED_DTYPE_KINDS = frozenset({'u', 'i', 'f'}) -# GDAL ```` text equivalent to nearest-neighbour. Kept in -# sync with ``xrspatial.geotiff._vrt._NEAREST_RESAMPLE_ALGS`` (mirroring -# rather than importing to keep this module a leaf with no _vrt back- -# reference). -_NEAREST_RESAMPLE_ALGS = frozenset({ - '', 'nearest', 'nearestneighbour', 'nearestneighbor', 'near', -}) - - def _is_nearest_resample(alg: str | None) -> bool: + """True iff ``alg`` is a recognised nearest-neighbour spelling. + + Delegates to the canonical ``_NEAREST_RESAMPLE_ALGS`` set in + ``_vrt.py``. Lazy-imported here so the validator stays a leaf + module with no eager back-reference to ``_vrt``, while still + matching whatever ``_vrt`` recognises as nearest. Avoids the + drift risk of a hand-copied set. + """ if alg is None: return True + from ._vrt import _NEAREST_RESAMPLE_ALGS return alg.strip().lower() in _NEAREST_RESAMPLE_ALGS -def _looks_like_wkt(crs_wkt: str) -> bool: - """Mirror of ``_crs._looks_like_wkt`` for the VRT validator. - - A WKT-shaped CRS string starts (after lstrip) with one of the - recognised root keywords. Strings that do not match are treated as - unparseable unless the caller passed ``allow_unparseable_crs=True``. - """ - if not isinstance(crs_wkt, str): - return False - head = crs_wkt.lstrip().upper() - return head.startswith(( - 'PROJCS', 'GEOGCS', 'PROJCRS', 'GEOGCRS', 'COMPD_CS', - 'COMPOUNDCRS', 'VERT_CS', 'VERTCRS', 'LOCAL_CS', 'ENGCRS', - 'GEOCCS', 'TIMECRS', - )) - - def validate_parsed_vrt( parsed: 'VRTDataset', *, @@ -128,6 +111,23 @@ def validate_parsed_vrt( and so on. The validator covers the residual capability checks that previously fired mid-decode (or per chunk task under chunked dispatch). + + Overlap with registered ``validate_read_metadata`` hooks + -------------------------------------------------------- + The rotated-transform and unparseable-CRS branches in this + validator overlap with the registered hooks + ``_check_read_rotated_transform`` and + ``_check_read_unparseable_crs`` in :mod:`._validation`. The VRT + call sites in ``_backends/vrt.py`` call this validator first and + ``validate_read_metadata`` immediately after, so the validator's + branches preempt the registered hooks for the VRT path: the + registered hooks remain reachable only when the validator passes + the case through (e.g. ``allow_rotated=True``, or a non-VRT call + site that does not run this validator at all). The duplication + is intentional -- the validator embeds the source path in the + message and lifts the check ahead of any decode -- but maintainers + editing the registered hooks should keep both copies in sync, or + fold one into the other. """ if mode != 'read': raise ValueError( @@ -239,23 +239,44 @@ def validate_parsed_vrt( # (the ``_looks_like_wkt`` cheap check). crs_wkt = parsed.crs_wkt if crs_wkt and not allow_unparseable_crs: + # Lazy-import the WKT structural check from ``_crs`` so the two + # call sites cannot drift on which root keywords count as WKT. + from ._crs import _looks_like_wkt if not _looks_like_wkt(crs_wkt): - # Try pyproj before raising so a valid PROJ / EPSG token - # passes when pyproj is installed. - from ._crs import _wkt_to_epsg - epsg = _wkt_to_epsg(crs_wkt) - if epsg is None: - # Use the existing typed error so callers that already - # ``except UnparseableCRSError`` keep catching this - # case. Adds the source path to the message. - raise UnparseableCRSError( - f"VRT '{source}' declares a CRS string that does " - f"not look like WKT and that pyproj could not " - f"resolve: {crs_wkt!r}. Pass " - f"allow_unparseable_crs=True to read the VRT " - f"anyway, or re-export the VRT with a WKT-formatted " - f"." - ) + # Probe pyproj directly so a valid PROJ / EPSG token passes + # when pyproj is installed, without going through + # ``_wkt_to_epsg``. The wrapper emits a + # ``GeoTIFFFallbackWarning`` on parse failure; the + # registered ``_check_read_unparseable_crs`` hook in + # ``_validation.py`` does not. Routing through the wrapper + # would emit a warning here that the existing check would + # not have emitted, which is a small but visible regression + # in the unparseable-CRS path. Match the existing hook's + # silent-probe behaviour. + try: + from pyproj import CRS as _PyProjCRS + from pyproj.exceptions import CRSError as _PyProjCRSError + except ImportError: + # No pyproj available: cannot prove the CRS is bad, so + # mirror the registered hook's no-op behaviour and let + # downstream code see the unparseable string. + pass + else: + try: + _PyProjCRS.from_user_input(crs_wkt) + except _PyProjCRSError: + # Use the existing typed error so callers that + # already ``except UnparseableCRSError`` keep + # catching this case. Adds the source path to the + # message. + raise UnparseableCRSError( + f"VRT '{source}' declares a CRS string that " + f"does not look like WKT and that pyproj could " + f"not resolve: {crs_wkt!r}. Pass " + f"allow_unparseable_crs=True to read the VRT " + f"anyway, or re-export the VRT with a " + f"WKT-formatted ." + ) # Rule 7 / 8: SrcRect and DstRect sanity, plus the supported # resampling set. Walk every source on every band so the message