diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 2f692500..aacf380d 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -273,24 +273,38 @@ def _read_geo_info(source, *, overview_level: int | None = None, # so ``overview_level`` indexes both internal and external # overviews (issue #2112). Local file paths only. from ._sidecar import attach_sidecar_origin, find_sidecar, load_sidecar + sidecar_origin: dict[int, tuple] = {} sidecar_path = find_sidecar(source) if sidecar_path is not None: sidecar = load_sidecar(sidecar_path) - # Metadata-only path: drop the origin mapping. The reader - # only needs the merged IFD list to resolve the requested - # ``overview_level`` against; strip/tile bytes are sliced by - # ``read_to_array`` on the actual read. - attach_sidecar_origin( + # The origin mapping is consumed below for georef extraction + # only -- strip/tile bytes are sliced by ``read_to_array`` on + # the actual read. A sidecar IFD that carries its own + # GeoKeyDirectory / ModelPixelScale / ModelTiepoint / + # ModelTransformation needs the sidecar's byte order to + # parse cleanly; without the mapping the helper falls back + # to the base file's bytes (today's default, correct under + # the usual GDAL convention). See issue #2315. + sidecar_origin = attach_sidecar_origin( sidecar.ifds, sidecar.data, sidecar.header) ifds = ifds + sidecar.ifds ifd = select_overview_ifd(ifds, overview_level) # Inherit georef from the level-0 IFD when the overview itself # has no geokeys (issue #1640). Pass-through for level 0. The # sidecar IFDs typically lack geokeys so the inheritance pulls - # from the base file's full-resolution IFD as GDAL does. + # from the base file's full-resolution IFD as GDAL does. When a + # sidecar IFD does declare its own georef payload, ``georef_origin`` + # routes the parse to the sidecar's bytes / byte order so the + # sidecar's georef wins. See issue #2315. + georef_origin = ( + {iid: (od, oh.byte_order) + for iid, (od, oh) in sidecar_origin.items()} + if sidecar_origin else None + ) geo_info = extract_geo_info_with_overview_inheritance( ifd, ifds, data, header.byte_order, - allow_rotated=allow_rotated) + allow_rotated=allow_rotated, + sidecar_origin=georef_origin) bps = resolve_bits_per_sample(ifd.bits_per_sample) file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format) _validate_predictor_sample_format(ifd.predictor, ifd.sample_format) diff --git a/xrspatial/geotiff/_cog_http.py b/xrspatial/geotiff/_cog_http.py index 84e00818..5d52d4ff 100644 --- a/xrspatial/geotiff/_cog_http.py +++ b/xrspatial/geotiff/_cog_http.py @@ -239,6 +239,13 @@ def _parse_cog_http_meta( # IFD lives in the sidecar; that mirrors the eager local reader, # whose sidecar IFDs typically carry no out-of-line geokeys and # inherit from level-0 (which sits in the base buffer). #2239. + # + # The ``sidecar_origin`` kwarg added in #2315 for the eager local / + # fsspec paths is intentionally not threaded here. A separate fix + # is tracked in the HTTP / dask sidecar-byte-order finding (see + # the linked issue / PR for the HTTP side). When that lands, this + # call should pick up the same mapping so an HTTP sidecar with + # its own geokeys is parsed against the sidecar bytes too. geo_info = extract_geo_info_with_overview_inheritance( ifd, ifds, header_bytes, header.byte_order, allow_rotated=allow_rotated) diff --git a/xrspatial/geotiff/_geotags.py b/xrspatial/geotiff/_geotags.py index be0e6956..34e7d75b 100644 --- a/xrspatial/geotiff/_geotags.py +++ b/xrspatial/geotiff/_geotags.py @@ -1085,6 +1085,35 @@ def extract_geo_info(ifd: IFD, data: bytes | memoryview, ) +# Tag IDs that carry georeferencing payload (offsets into either the +# IFD entry's inline value or the file's byte stream). When a sidecar +# IFD declares any of these, the georef extractor must parse against +# the sidecar's bytes / byte order, not the base file's. See issue +# #2315. +_GEOREF_PAYLOAD_TAGS = frozenset({ + 33550, # TAG_MODEL_PIXEL_SCALE + 33922, # TAG_MODEL_TIEPOINT + 34264, # TAG_MODEL_TRANSFORMATION + 34735, # TAG_GEO_KEY_DIRECTORY +}) + + +def _ifd_has_georef_payload(ifd: IFD) -> bool: + """Return True if ``ifd`` declares any georef-bearing tag. + + The GDAL convention is that an external ``.tif.ovr`` sidecar + inherits georef from the base file -- the sidecar IFDs do not + re-declare ModelPixelScale / ModelTiepoint / GeoKeyDirectory. + Hand-rolled sidecars and non-GDAL writers can break that convention, + and when they do the sidecar's georef tags must be parsed against + the sidecar's byte buffer rather than the base file's. This helper + is the conservative gate that flips the data / byte_order swap in + :func:`extract_geo_info_with_overview_inheritance`. See issue #2315. + """ + entries = ifd.entries + return any(tag in entries for tag in _GEOREF_PAYLOAD_TAGS) + + def extract_geo_info_with_overview_inheritance( ifd: IFD, ifds: list, @@ -1092,6 +1121,7 @@ def extract_geo_info_with_overview_inheritance( byte_order: str, *, allow_rotated: bool = False, + sidecar_origin: dict | None = None, ) -> GeoInfo: """Extract geo metadata, inheriting from level 0 when the IFD lacks it. @@ -1141,12 +1171,36 @@ def extract_geo_info_with_overview_inheritance( Full file bytes (forwarded to ``extract_geo_info``). byte_order : str ``'<'`` or ``'>'`` (forwarded to ``extract_geo_info``). + sidecar_origin : dict, optional + Mapping from ``id(ifd)`` to ``(data, byte_order)`` for IFDs + that live in an external ``.tif.ovr`` sidecar. When the + selected IFD is from a sidecar that declares its own georef + payload (ModelPixelScale / ModelTiepoint / + ModelTransformation / GeoKeyDirectory), this mapping tells + the extractor which byte buffer the tag offsets resolve + against. IFDs absent from the mapping fall through to + ``(data, byte_order)`` for the base file. The conservative + default is ``None``, which preserves the legacy behavior + (read sidecar IFDs against the base file's bytes -- correct + only when the sidecar has no geokeys, which is the GDAL + convention). See issue #2315. Returns ------- GeoInfo """ - info = extract_geo_info(ifd, data, byte_order, + # When the selected IFD lives in a sidecar AND it carries its own + # georef payload, resolve tag offsets against the sidecar's bytes / + # byte order. The sidecar-without-geokeys convention (GDAL) keeps + # ``(data, byte_order)`` from the base file and inherits below. + # See issue #2315. + sel_data, sel_byte_order = data, byte_order + if sidecar_origin is not None: + origin = sidecar_origin.get(id(ifd)) + if origin is not None and _ifd_has_georef_payload(ifd): + sel_data, sel_byte_order = origin + + info = extract_geo_info(ifd, sel_data, sel_byte_order, allow_rotated=allow_rotated) # Overview IFDs have NewSubfileType bit 0 set; mask IFDs (bit 2) and @@ -1171,7 +1225,19 @@ def extract_geo_info_with_overview_inheritance( if base_ifd is None: return info - base_info = extract_geo_info(base_ifd, data, byte_order, + # Mirror the sidecar-origin routing for ``base_ifd``. The base IFD + # normally lives in the base file (``data`` / ``byte_order`` are + # correct), but a file with no full-resolution IFD of its own could + # land here with ``base_ifd`` resolved out of a sidecar. The lookup + # is the same shape as the one applied to ``ifd`` above. See + # review nit on #2315. + base_data, base_byte_order = data, byte_order + if sidecar_origin is not None: + base_origin = sidecar_origin.get(id(base_ifd)) + if base_origin is not None: + base_data, base_byte_order = base_origin + + base_info = extract_geo_info(base_ifd, base_data, base_byte_order, allow_rotated=allow_rotated) # Inherit the per-IFD metadata that the COG writer emits only on the diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index 80bd0e92..f9a3003e 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -227,13 +227,24 @@ def _read_to_array(source, *, window=None, overview_level: int | None = None, # Inherit georef from level 0 when an overview IFD lacks its own # geokeys (issue #1640). For overview_level=0 (or None) this is a # no-op: the helper short-circuits when the IFD is not a - # NewSubfileType=overview entry. Sidecar IFDs always lack - # geokeys, so the inheritance pulls from the base file's - # level-0 IFD (kept first in the merged list) which is the - # GDAL convention. + # NewSubfileType=overview entry. Sidecar IFDs typically lack + # geokeys (the GDAL convention), so the inheritance pulls from + # the base file's level-0 IFD (kept first in the merged list). + # A sidecar that does declare its own georef payload is a corner + # case: ``georef_origin`` maps the sidecar IFDs to + # ``(data, byte_order)`` from the sidecar so the helper resolves + # those tags against the right buffer. Sidecar IFDs without + # geokeys still inherit from the base file via the existing + # overview-inheritance path. See issues #1640 and #2315. + georef_origin = ( + {iid: (od, oh.byte_order) + for iid, (od, oh) in sidecar_origin.items()} + if sidecar_origin else None + ) geo_info = extract_geo_info_with_overview_inheritance( ifd, ifds, data, header.byte_order, - allow_rotated=allow_rotated) + allow_rotated=allow_rotated, + sidecar_origin=georef_origin) # Orientation tag (274): values 2-8 mean the stored pixel order # differs from display order. We need to remap the array post diff --git a/xrspatial/geotiff/tests/test_sidecar_own_geokeys_2315.py b/xrspatial/geotiff/tests/test_sidecar_own_geokeys_2315.py new file mode 100644 index 00000000..5c2b96b1 --- /dev/null +++ b/xrspatial/geotiff/tests/test_sidecar_own_geokeys_2315.py @@ -0,0 +1,384 @@ +"""Sidecar IFDs that declare their own georef payload (issue #2315). + +The local eager reader and the metadata-only path both swap pixel bytes +over to the sidecar when the selected IFD lives there, but pre-#2315 +they always read georef tags from the base file. That is fine under +the usual GDAL convention -- a ``.tif.ovr`` sidecar carries no +geokeys -- but a sidecar that does declare its own +GeoKeyDirectory / ModelPixelScale / ModelTiepoint / +ModelTransformation had its tags parsed against the wrong buffer. + +These tests cover the two contracted behaviors: + +* When the sidecar IFD declares its own georef payload, the sidecar's + transform wins over the base file's. +* When the sidecar IFD has no georef tags (the GDAL convention), + today's inheritance from the base file is preserved. +""" +from __future__ import annotations + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import _read_geo_info, open_geotiff +from xrspatial.geotiff._geotags import _ifd_has_georef_payload +from xrspatial.geotiff._header import parse_all_ifds, parse_header +from xrspatial.geotiff._writers.eager import to_geotiff + + +def _make_dataarray(values: np.ndarray, *, origin_x: float, origin_y: float, + pixel_w: float, pixel_h: float, + epsg: int = 4326) -> xr.DataArray: + """Build a 2D DataArray whose attrs survive ``to_geotiff``.""" + h, w = values.shape + xs = origin_x + (np.arange(w) + 0.5) * pixel_w + ys = origin_y + (np.arange(h) + 0.5) * pixel_h + da = xr.DataArray(values, dims=("y", "x"), coords={"y": ys, "x": xs}) + da.attrs["crs"] = epsg + return da + + +def _write_pair(tmp_path, *, base_geo, sidecar_geo, sidecar_has_geokeys): + """Write base.tif and (optionally geokey-bearing) sidecar.tif.ovr. + + ``base_geo`` and ``sidecar_geo`` are kwarg dicts forwarded to + :func:`_make_dataarray`. When ``sidecar_has_geokeys`` is False the + sidecar file is rewritten to strip its GeoKey / Model* tags so the + file represents the "GDAL convention" (no geokeys on the sidecar + IFD). The byte-level strip operates on the parsed IFD entries: we + rewrite the IFD's entry count and entries array to exclude the + four georef tag IDs. + """ + base_arr = np.full((8, 8), 1, dtype=np.uint16) + sidecar_arr = np.full((4, 4), 2, dtype=np.uint16) + + base_path = tmp_path / "base_2315.tif" + side_path = tmp_path / "base_2315.tif.ovr" + + base_da = _make_dataarray(base_arr, **base_geo) + to_geotiff(base_da, str(base_path), tiled=False, compression="none") + + side_da = _make_dataarray(sidecar_arr, **sidecar_geo) + to_geotiff(side_da, str(side_path), tiled=False, compression="none") + + # Real GDAL ``.tif.ovr`` files mark every IFD as a reduced-res + # overview (NewSubfileType bit 0 set). ``to_geotiff`` writes a + # standalone full-res TIFF so the tag is absent; insert one so the + # sidecar IFD is treated as an overview by the inheritance helper. + _mark_first_ifd_as_overview(side_path) + + if not sidecar_has_geokeys: + _strip_georef_tags(side_path) + + return base_path, side_path + + +def _mark_first_ifd_as_overview(path): + """Set NewSubfileType=1 (reduced-resolution overview) on the first IFD. + + The tag is added or rewritten so the sidecar IFD looks like a real + GDAL external overview rather than a freshly written full-res TIFF. + Inline value, type LONG (3 = uint16 not enough -- the spec uses + type 4 / uint32). + """ + import struct + + raw = bytearray(path.read_bytes()) + assert raw[:2] == b"II", "test fixture assumes little-endian classic TIFF" + first_ifd_offset = struct.unpack_from(" 0: + first_tag = struct.unpack_from(" NSF, ( + f"test fixture invariant: first emitted tag must be > {NSF}, " + f"got {first_tag}" + ) + insert_pos = first_ifd_offset + 2 + + next_ifd_off_pos = first_ifd_offset + 2 + n_entries * 12 + block_end = next_ifd_off_pos + 4 + + # Slide every byte from ``insert_pos`` to end-of-file forward by 12 + # bytes. This makes room for the new entry and keeps value blocks + # that lived past the entries array intact at their new offsets. + tail = bytes(raw[insert_pos:]) + raw[insert_pos:insert_pos] = b"\x00" * 12 # carve a 12-byte hole + # raw is now 12 bytes longer; the hole sits at insert_pos..insert_pos+12. + # The original tail bytes are at insert_pos+12..end now. + + struct.pack_into(" 4. For count==1 LONG (typ=4, + # size=4) the value IS the byte offset and total=4 is inline, so + # the helper above skipped it. Fix it explicitly here. + _bump_inline_long_offsets(raw, first_ifd_offset, n_entries + 1, + insert_pos, 12) + + path.write_bytes(bytes(raw)) + + +def _bump_inline_long_offsets(raw, ifd_offset, n_entries, threshold, delta): + """Bump inline LONG offsets stored in StripOffsets / TileOffsets. + + For these tags the *value* itself is a byte offset into the file. + When count==1, the value sits inline in the entry's value field + and ``_bump_offsets`` skips it (because total <= 4). Patch it here. + """ + import struct + + OFFSET_TAGS = {273, 324} # StripOffsets, TileOffsets + for i in range(n_entries): + entry_off = ifd_offset + 2 + i * 12 + tag = struct.unpack_from("= threshold: + struct.pack_into("= threshold: + struct.pack_into("= threshold. + + Type sizes per TIFF 6.0: BYTE/SBYTE/ASCII/UNDEFINED=1, SHORT/SSHORT=2, + LONG/SLONG/FLOAT=4, RATIONAL/SRATIONAL/DOUBLE=8. + """ + import struct + + TYPE_SIZE = {1: 1, 2: 1, 3: 2, 4: 4, 5: 8, 6: 1, 7: 1, + 8: 2, 9: 4, 10: 8, 11: 4, 12: 8} + for i in range(n_entries): + entry_off = ifd_offset + 2 + i * 12 + type_id = struct.unpack_from("= threshold: + struct.pack_into("= threshold and next_off != 0: + struct.pack_into(" tail_start: + raw[tail_start:tail_end] = b"\x00" * (tail_end - tail_start) + + path.write_bytes(bytes(raw)) + + +# --------------------------------------------------------------------------- +# Helper unit test: the georef-payload detector recognises the four tags. +# --------------------------------------------------------------------------- +def test_ifd_has_georef_payload_true_for_pixel_scale(tmp_path): + p = tmp_path / "p_2315.tif" + da = _make_dataarray( + np.zeros((4, 4), dtype=np.uint16), + origin_x=0.0, origin_y=0.0, pixel_w=1.0, pixel_h=-1.0, + ) + to_geotiff(da, str(p), tiled=False, compression="none") + data = p.read_bytes() + hdr = parse_header(data) + ifds = parse_all_ifds(data, hdr) + assert _ifd_has_georef_payload(ifds[0]) + + +def test_ifd_has_georef_payload_false_after_strip(tmp_path): + p = tmp_path / "p_2315_strip.tif" + da = _make_dataarray( + np.zeros((4, 4), dtype=np.uint16), + origin_x=0.0, origin_y=0.0, pixel_w=1.0, pixel_h=-1.0, + ) + to_geotiff(da, str(p), tiled=False, compression="none") + _strip_georef_tags(p) + data = p.read_bytes() + hdr = parse_header(data) + ifds = parse_all_ifds(data, hdr) + assert not _ifd_has_georef_payload(ifds[0]) + + +# --------------------------------------------------------------------------- +# Sidecar carries its own geokeys -> the sidecar's georef wins. +# --------------------------------------------------------------------------- +def test_sidecar_with_own_geokeys_wins_eager(tmp_path): + base_geo = dict(origin_x=100.0, origin_y=200.0, + pixel_w=10.0, pixel_h=-10.0, epsg=4326) + side_geo = dict(origin_x=500.0, origin_y=800.0, + pixel_w=2.5, pixel_h=-2.5, epsg=3857) + + base, side = _write_pair(tmp_path, + base_geo=base_geo, sidecar_geo=side_geo, + sidecar_has_geokeys=True) + + da = open_geotiff(str(base), overview_level=1) + # Sidecar transform wins. Rasterio-style 6-tuple: + # ``(pixel_width, 0, origin_x, 0, pixel_height, origin_y)``. + t = da.attrs["transform"] + assert t[0] == pytest.approx(2.5) + assert t[2] == pytest.approx(500.0) + assert t[4] == pytest.approx(-2.5) + assert t[5] == pytest.approx(800.0) + # Sidecar EPSG wins too (sidecar CRS is 3857, base is 4326). + assert da.attrs.get("crs") == 3857 + + +def test_sidecar_with_own_geokeys_wins_metadata_only(tmp_path): + base_geo = dict(origin_x=100.0, origin_y=200.0, + pixel_w=10.0, pixel_h=-10.0, epsg=4326) + side_geo = dict(origin_x=500.0, origin_y=800.0, + pixel_w=2.5, pixel_h=-2.5, epsg=3857) + + base, _ = _write_pair(tmp_path, + base_geo=base_geo, sidecar_geo=side_geo, + sidecar_has_geokeys=True) + + geo_info, h, w, _, _ = _read_geo_info(str(base), overview_level=1) + assert (h, w) == (4, 4) + assert geo_info.transform.origin_x == pytest.approx(500.0) + assert geo_info.transform.origin_y == pytest.approx(800.0) + assert geo_info.transform.pixel_width == pytest.approx(2.5) + assert geo_info.transform.pixel_height == pytest.approx(-2.5) + assert geo_info.crs_epsg == 3857 + + +# --------------------------------------------------------------------------- +# Sidecar has no geokeys -> today's behavior is preserved (inherit base). +# --------------------------------------------------------------------------- +def test_sidecar_without_geokeys_inherits_from_base_eager(tmp_path): + base_geo = dict(origin_x=100.0, origin_y=200.0, + pixel_w=10.0, pixel_h=-10.0, epsg=4326) + # The sidecar's georef bytes will be stripped after write, so the + # numbers in ``side_geo`` are irrelevant to the assertion. + side_geo = dict(origin_x=500.0, origin_y=800.0, + pixel_w=2.5, pixel_h=-2.5, epsg=3857) + + base, _ = _write_pair(tmp_path, + base_geo=base_geo, sidecar_geo=side_geo, + sidecar_has_geokeys=False) + + da = open_geotiff(str(base), overview_level=1) + # The sidecar IFD has no georef payload; the inheritance helper + # pulls the transform from the base IFD and rescales for the + # overview's reduction factor. Origin stays unchanged + # (PixelIsArea), pixel size scales by 8/4 = 2. + t = da.attrs["transform"] + assert t[0] == pytest.approx(20.0) + assert t[2] == pytest.approx(100.0) + assert t[4] == pytest.approx(-20.0) + assert t[5] == pytest.approx(200.0) + assert da.attrs.get("crs") == 4326 + + +def test_sidecar_without_geokeys_inherits_from_base_metadata_only(tmp_path): + base_geo = dict(origin_x=100.0, origin_y=200.0, + pixel_w=10.0, pixel_h=-10.0, epsg=4326) + side_geo = dict(origin_x=500.0, origin_y=800.0, + pixel_w=2.5, pixel_h=-2.5, epsg=3857) + + base, _ = _write_pair(tmp_path, + base_geo=base_geo, sidecar_geo=side_geo, + sidecar_has_geokeys=False) + + geo_info, h, w, _, _ = _read_geo_info(str(base), overview_level=1) + assert (h, w) == (4, 4) + # Base-inherited transform with rescaled pixel size for the 8->4 + # reduction factor. + assert geo_info.transform.origin_x == pytest.approx(100.0) + assert geo_info.transform.origin_y == pytest.approx(200.0) + assert geo_info.transform.pixel_width == pytest.approx(20.0) + assert geo_info.transform.pixel_height == pytest.approx(-20.0) + assert geo_info.crs_epsg == 4326