diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index 92ba91f0..23ff38a3 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -373,9 +373,10 @@ def read_geotiff_gpu(source: str, *, # Append sibling `.tif.ovr` sidecar IFDs onto the pyramid list so # ``overview_level`` indexes both internal and external overviews # (issue #2112). When the selected IFD comes from the sidecar, - # we swap ``data`` / ``header`` to the sidecar's buffers below - # and skip the GDS fast path -- GDS reads the source file path, - # which would point at the base file rather than the sidecar. + # tile / strip reads below use ``ifd_data`` / ``ifd_header`` for + # the sidecar's buffers, and we skip the GDS fast path -- GDS + # reads the source file path, which would point at the base + # file rather than the sidecar. from .._sidecar import attach_sidecar_origin, close_sidecar, find_sidecar, load_sidecar sidecar_origin: dict[int, tuple] = {} sidecar_path = find_sidecar(source) @@ -388,21 +389,36 @@ def read_geotiff_gpu(source: str, *, # Skip mask IFDs (NewSubfileType bit 2) ifd = select_overview_ifd(ifds, overview_level) - # Swap base data / header for the sidecar's buffers when the - # requested overview level lives in the sidecar. Subsequent - # tile / strip reads slice the right buffer. + # Keep ``data`` / ``header`` bound to the base file's buffers so + # the georef extractor below resolves base-IFD tag offsets + # against the right bytes. Introduce ``ifd_data`` / ``ifd_header`` + # for tile / strip reads that need the sidecar buffer when the + # selected IFD lives in the sidecar (issue #2324). + ifd_data, ifd_header = data, header origin = sidecar_origin.get(id(ifd)) if origin is not None: - data, header = origin + ifd_data, ifd_header = origin sidecar_owned_ifd = True bps = resolve_bits_per_sample(ifd.bits_per_sample) file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format) # Inherit georef from the level-0 IFD when the overview itself # has no geokeys (issue #1640); pass-through for level 0. + # ``sidecar_origin`` routes tag-offset parsing to the sidecar + # bytes for any IFD that lives in the sidecar AND declares its + # own georef payload, mirroring the CPU eager path + # (``_reader.py``) and dask path (``__init__.py``). Without + # this, a sidecar-owned IFD that lacks geokeys would parse the + # base IFD against the sidecar bytes (issue #2324). + georef_origin = ( + {iid: (od, oh.byte_order) + for iid, (od, oh) in sidecar_origin.items()} + if sidecar_origin else None + ) geo_info = extract_geo_info_with_overview_inheritance( ifd, ifds, data, header.byte_order, - allow_rotated=allow_rotated) + allow_rotated=allow_rotated, + sidecar_origin=georef_origin) # Capture the Orientation tag (274) once so the post-decode flip # below picks it up for both the stripped fallback and the tiled # GPU pipelines. CPU read_to_array applies the array remap + @@ -670,7 +686,7 @@ def _read_once(): source, _read_once, band_offsets, band_byte_counts, tw, th, width, height, compression, predictor, file_dtype, - byte_order=header.byte_order, + byte_order=ifd_header.byte_order, gpu=gpu, ) if band_arr is None: @@ -734,7 +750,7 @@ def _read_once(): source, offsets, byte_counts, tw, th, width, height, compression, predictor, file_dtype, samples, - byte_order=header.byte_order, + byte_order=ifd_header.byte_order, masked_fill=masked_fill, ) except Exception as e: @@ -750,12 +766,13 @@ def _read_once(): if arr_gpu is None: # Fallback: extract tiles via CPU mmap, then GPU decode. For - # sidecar IFDs the tile bytes already live in ``data`` (loaded - # from the .ovr above); re-opening ``source`` would point at the - # base file. Use the in-scope ``data`` directly in that case. + # sidecar IFDs the tile bytes already live in ``ifd_data`` + # (loaded from the .ovr above); re-opening ``source`` would + # point at the base file. Use ``ifd_data`` directly in that + # case. if sidecar_owned_ifd: compressed_tiles = [ - bytes(data[offsets[i]:offsets[i] + byte_counts[i]]) + bytes(ifd_data[offsets[i]:offsets[i] + byte_counts[i]]) for i in range(len(offsets)) ] else: @@ -775,7 +792,7 @@ def _read_once(): compressed_tiles, tw, th, width, height, compression, predictor, file_dtype, samples, - byte_order=header.byte_order, + byte_order=ifd_header.byte_order, masked_fill=masked_fill, ) except Exception as e: @@ -1262,9 +1279,17 @@ def _read_geotiff_gpu_chunked(source, *, dtype, chunks, overview_level, if not ifds: raise ValueError("No IFDs found in TIFF file") ifd = select_overview_ifd(ifds, overview_level) + # The GDS qualification probe parses base-file IFDs only -- + # sidecar files do not qualify for the disk->GPU fast path + # and the chunked path falls through to ``read_geotiff_dask`` + # which carries its own sidecar handling. Pass + # ``sidecar_origin=None`` explicitly so all four call sites + # of this helper share the same call shape (review nit + # on #2324). geo_info = extract_geo_info_with_overview_inheritance( ifd, ifds, raw, header.byte_order, allow_rotated=allow_rotated, + sidecar_origin=None, ) orientation = ifd.orientation has_sparse_tile = ( diff --git a/xrspatial/geotiff/tests/test_gpu_sidecar_georef_parity_2324.py b/xrspatial/geotiff/tests/test_gpu_sidecar_georef_parity_2324.py new file mode 100644 index 00000000..bb0c8f1f --- /dev/null +++ b/xrspatial/geotiff/tests/test_gpu_sidecar_georef_parity_2324.py @@ -0,0 +1,165 @@ +"""GPU + sidecar georef-attr parity across backends (issue #2324). + +The GPU eager path used to overwrite the local ``data`` / ``header`` +references with the sidecar's buffer before calling +``extract_geo_info_with_overview_inheritance``. For the common GDAL +sidecar case where overview IFDs lack their own geokeys, the helper +then parsed the inherited base IFD against the sidecar bytes -- the +returned georef came out silently wrong, while the CPU eager and dask +paths handled it correctly. + +These tests load the same ``.tif`` + ``.tif.ovr`` pair via the three +backends and assert georef attrs match. The GPU portion skips cleanly +when cupy / CUDA is unavailable so the test runs on CI without a GPU +and still guards the metadata code on a real GPU box. + +Test fixture helpers (sidecar writer, NewSubfileType insertion, georef +tag stripping) are shared with ``test_sidecar_own_geokeys_2315.py`` -- +imported rather than duplicated so a fixture-format change there does +not silently desync this file. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest + +from xrspatial.geotiff import open_geotiff, read_geotiff_dask + +from .test_sidecar_own_geokeys_2315 import _write_pair + + +def _gpu_available() -> bool: + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif( + not _HAS_GPU, reason="cupy + CUDA required", +) + + +def _attrs_subset(da): + """Pull the georef attrs we want to compare across backends. + + Coerce transform elements to plain ``float`` so an attr stored as a + numpy array (some backends round-trip through ``np.asarray``) does + not produce numpy scalars in the tuple. The equality check below + would still pass for matching values but the comparison semantics + are clearer with native Python floats. + """ + transform = da.attrs.get("transform", ()) + return { + "transform": tuple(float(x) for x in transform), + "crs": da.attrs.get("crs"), + } + + +def test_sidecar_without_geokeys_attrs_match_cpu_vs_dask_2324(tmp_path): + """Baseline: CPU eager and dask agree on inherited georef. + + Pre-#2324 already passes; included so the parity matrix is complete + and a future regression on either path surfaces here too. The GPU + half lives in ``test_sidecar_without_geokeys_gpu_matches_cpu_2324``. + """ + base_geo = dict(origin_x=100.0, origin_y=200.0, + pixel_w=10.0, pixel_h=-10.0, epsg=4326) + side_geo = dict(origin_x=500.0, origin_y=800.0, + pixel_w=2.5, pixel_h=-2.5, epsg=3857) + + base, _ = _write_pair( + tmp_path, + base_geo=base_geo, sidecar_geo=side_geo, + sidecar_has_geokeys=False, + ) + + cpu = open_geotiff(str(base), overview_level=1) + dsk = read_geotiff_dask(str(base), overview_level=1, chunks=2) + + assert _attrs_subset(cpu) == _attrs_subset(dsk) + # And the inherited transform is what we expect from the base file + # (pixel size rescaled by 8/4 = 2 for the 4x4 overview). + t = cpu.attrs["transform"] + assert t[0] == pytest.approx(20.0) + assert t[2] == pytest.approx(100.0) + assert t[4] == pytest.approx(-20.0) + assert t[5] == pytest.approx(200.0) + assert cpu.attrs.get("crs") == 4326 + + +@_gpu_only +def test_sidecar_without_geokeys_gpu_matches_cpu_2324(tmp_path): + """Regression for #2324: GPU eager georef matches CPU / dask. + + The sidecar is stripped of its georef tags, so the inheritance + helper has to walk back to the base IFD. Pre-fix, the GPU path + handed the helper the sidecar bytes (because ``data, header`` got + swapped before the call) and parsed the base IFD's georef offsets + against the sidecar buffer -- silently corrupt transform / crs. + """ + from xrspatial.geotiff import read_geotiff_gpu + + base_geo = dict(origin_x=100.0, origin_y=200.0, + pixel_w=10.0, pixel_h=-10.0, epsg=4326) + side_geo = dict(origin_x=500.0, origin_y=800.0, + pixel_w=2.5, pixel_h=-2.5, epsg=3857) + + base, _ = _write_pair( + tmp_path, + base_geo=base_geo, sidecar_geo=side_geo, + sidecar_has_geokeys=False, + ) + + cpu = open_geotiff(str(base), overview_level=1) + gpu = read_geotiff_gpu(str(base), overview_level=1) + + # Same georef attrs as the CPU path. + assert _attrs_subset(gpu) == _attrs_subset(cpu) + # And pixel data also matches (sidecar bytes are still consumed for + # the actual decode; this confirms the fix did not break the tile + # read path while moving the variable rename). + np.testing.assert_array_equal(np.asarray(gpu.data.get()), + np.asarray(cpu.data)) + + +@_gpu_only +def test_sidecar_with_own_geokeys_gpu_matches_cpu_2324(tmp_path): + """GPU path routes a sidecar-owned georef payload to sidecar bytes. + + When the sidecar IFD declares its own georef payload the helper's + ``sidecar_origin`` argument routes the parse to the sidecar buffer. + The CPU eager / dask paths already pass the mapping; the GPU eager + path now passes it too. The sidecar's transform must win for both + backends. + """ + from xrspatial.geotiff import read_geotiff_gpu + + base_geo = dict(origin_x=100.0, origin_y=200.0, + pixel_w=10.0, pixel_h=-10.0, epsg=4326) + side_geo = dict(origin_x=500.0, origin_y=800.0, + pixel_w=2.5, pixel_h=-2.5, epsg=3857) + + base, _ = _write_pair( + tmp_path, + base_geo=base_geo, sidecar_geo=side_geo, + sidecar_has_geokeys=True, + ) + + cpu = open_geotiff(str(base), overview_level=1) + gpu = read_geotiff_gpu(str(base), overview_level=1) + + assert _attrs_subset(gpu) == _attrs_subset(cpu) + # Sidecar's transform won on both sides. + t = gpu.attrs["transform"] + assert t[0] == pytest.approx(2.5) + assert t[2] == pytest.approx(500.0) + assert t[4] == pytest.approx(-2.5) + assert t[5] == pytest.approx(800.0) + assert gpu.attrs.get("crs") == 3857