From ae36ea621a27bb28eb6ef0dc2d00f5b92767ccc0 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sat, 16 May 2026 06:04:39 -0700 Subject: [PATCH 1/2] geotiff: golden corpus phase 2.9 -- GDAL_METADATA + extra_tags (#1930) Two new fixtures for the geotiff golden corpus from #1930: * gdal_metadata_namespaced_uint16: stripped uint16 with GDAL_METADATA entries in the default domain, IMAGE_STRUCTURE, SUBDATASETS, and a custom user domain. Covers the reader's cross-domain key handling. * extra_tags_uint16: stripped uint16 with ImageDescription, Software, Artist, and a private numeric tag (65000) written as real IFD entries. Generator changes: * Validate gdal_metadata and extra_tags shapes at manifest load so typos fail early. * Route extra_tags through a tifffile post-pass. rasterio's update_tags stores well-known names like Software inside the GDAL_METADATA XML instead of the real TIFF tag code, and it cannot emit private numeric tag codes at all. The post-pass keeps the GeoTIFF tags rasterio wrote and appends the requested extras. Both fixtures stay under 4 KB and pass compare_to_oracle against a rasterio-derived DataArray. --- .../fixtures/extra_tags_uint16.tif | Bin 0 -> 1056 bytes .../gdal_metadata_namespaced_uint16.tif | Bin 0 -> 2219 bytes .../geotiff/tests/golden_corpus/generate.py | 139 ++++++++++- .../geotiff/tests/golden_corpus/manifest.yaml | 68 ++++- .../test_golden_corpus_metadata_tags_1930.py | 233 ++++++++++++++++++ 5 files changed, 433 insertions(+), 7 deletions(-) create mode 100644 xrspatial/geotiff/tests/golden_corpus/fixtures/extra_tags_uint16.tif create mode 100644 xrspatial/geotiff/tests/golden_corpus/fixtures/gdal_metadata_namespaced_uint16.tif create mode 100644 xrspatial/geotiff/tests/test_golden_corpus_metadata_tags_1930.py diff --git a/xrspatial/geotiff/tests/golden_corpus/fixtures/extra_tags_uint16.tif b/xrspatial/geotiff/tests/golden_corpus/fixtures/extra_tags_uint16.tif new file mode 100644 index 0000000000000000000000000000000000000000..458c22713befdf9a3b930eb208ed8c3983927565 GIT binary patch literal 1056 zcmaixbyyT(5QpErQ$$6?#6r(j^z2S-vAb&t@f6`)4h3vb&+hK-0K2=pySux4ZUvtE z&wZZVdB2(8#D3%J>y0ar0hA;tLLTxePHa$&iGE`|J9I8i?5-Fvi7$l&(!}SQ3{ure zlhLWBXFMaCEJ{+nzPIFNjpq`=1)Oq=OZ`kzzk>KH3a1usChQ*LhEdp0IC_X1F2Yg5 z>+D8M!YhSi?QV`XJQt3$8}z^ZF8uVDf%M`xID+y-Xkk{>W>JGpL7~9`fgvXUP|X@1 zX6kB*u!U=Z`izp6XcP73e|qyjJrSy=sgZIDv1t}R__^ZU`G$T?swP;y*?&Ks_Ue*B zwS5w#irzUHByr4OaI8zLOeE`5cYl_ zl%Xu;C{G0{Qi;k`p(;L9qdGOHNiAwqhr0MukNPyAA&qEE6PnVD=Cq(Ct!Paf+R~2p zbf6=h=u8(>{O~7$K)Mn{H!O6g2f_3tgix&XLL&?t;q;~t5kwM2U;5FX0SsgigNY`F zAq-_0c7`*8k&I$AV;IXg#xsG5Oky%qn94M!GlQATVm5P_%RJ_@fQ2k#F-us=GM2M~ zm8@blYgo%VVu@ot8`#JuHnWATY-2k+*vT$-vxmLxV?PHt$RQ4MgrgkeI43yCDNb{S zvz+5R7r4kJE^~#eT;n=7xXCSUbBDX!<30~~$Ri%}gr_{?IWKt0D_--4x4h#$ANa^8 SKJ$gIeB(Pm_{lGR^M}92u*tdr literal 0 HcmV?d00001 diff --git a/xrspatial/geotiff/tests/golden_corpus/fixtures/gdal_metadata_namespaced_uint16.tif b/xrspatial/geotiff/tests/golden_corpus/fixtures/gdal_metadata_namespaced_uint16.tif new file mode 100644 index 0000000000000000000000000000000000000000..b61a3a7261514f8fc385bc8724e19441fc4ac564 GIT binary patch literal 2219 zcmd^0Iu_|BQR-@JF{yYJ0>-*vnD z)6f<=gT;o0IKq;EtqIuHgpItU1l+~aCjPEW(=}l}HR0WE2{}Cza=Zz+)3g+9!tt71 zQ>UkpByp!)YhyMPiK$Y!E|!8-@F>CcF)PV}%Z&}m*t>eqMo-Cz!DeinZRtL)A)Zk) zG{?-e#81->+Q?^-YO-~SQbdYb*xA{VTpyEFlH$*^`m|53iy7-3;}K>Aq6>S~>IPrV zO5Ygg=zOm-Xy9adG}vrH+g8rjPUzaxfsUkb2%R`o{z4ZH<8Y3kE2(tjNV?O5qv%O5 zj^-E?dSj;#$8sFU)0Y$IM}JP_B+@vUbWS0I0i4P}PUCdWU=U}L$yqqaBAdbF;A9B7 zxX5ED`J9cLa~Q^OJQPsK2u5-)UVIcWiqVYWJkI9=E@Uj@DCQzmN+_iajdI3wF@7## z0u=-Z5+ck*bRtBlWD-?Wb19c`Ig`19E15zKS22}oOy_E@A;t`@<%rr1ml;m15p23_vEZ6ot9%t$CfFGN z&MDR=r!?7Cc$#yJBj4p5?$M%ZnHp8IJ1C079n}I#Pz`8V_M!r}&uh;PXi-g%IMR)k zzovw$f|_nu%0dCvAI!4n7I}Szo?=&_$LTJxXIJZyup0HN6=|g*JzUugN4QcChay_D zETwuV5=m<;p~z<9dsQNV&;-rglq3D%bF^Bg`fn*}o-}MzQELR7irOyNR8$XXoTj1% z2sRWoQG}3bG*lU8?>1xOY)kjn>f!%8WghG(Z_yB!)93W&`HC}&U3uQz1EN2$B$kW# rgX1f-v=TK~rcCl{Ri@7Wfvv#l$!mtK++Q87)U`~5F2khyn`QO`JE*9T literal 0 HcmV?d00001 diff --git a/xrspatial/geotiff/tests/golden_corpus/generate.py b/xrspatial/geotiff/tests/golden_corpus/generate.py index 823cdf445..1048bc62b 100644 --- a/xrspatial/geotiff/tests/golden_corpus/generate.py +++ b/xrspatial/geotiff/tests/golden_corpus/generate.py @@ -242,6 +242,38 @@ def _validate_one(entry: dict[str, Any], seen_ids: set[str]) -> None: f"got {entry['external_overview']!r}" ) + gdal_md = entry.get("gdal_metadata") + if gdal_md is not None: + if not isinstance(gdal_md, dict): + raise ManifestError( + f"{fid}: gdal_metadata must be a mapping of " + f"{{domain: {{item: value}}}}, got {gdal_md!r}" + ) + for domain, items in gdal_md.items(): + if not isinstance(domain, str): + raise ManifestError( + f"{fid}: gdal_metadata domain keys must be strings, " + f"got {domain!r}" + ) + if not isinstance(items, dict): + raise ManifestError( + f"{fid}: gdal_metadata[{domain!r}] must be a mapping, " + f"got {items!r}" + ) + + extra_tags = entry.get("extra_tags") + if extra_tags is not None: + if not isinstance(extra_tags, dict): + raise ManifestError( + f"{fid}: extra_tags must be a mapping, got {extra_tags!r}" + ) + for k in extra_tags: + if not isinstance(k, (str, int)): + raise ManifestError( + f"{fid}: extra_tags keys must be strings or ints, " + f"got {k!r}" + ) + def validate(manifest: dict[str, Any]) -> list[dict[str, Any]]: """Validate the parsed manifest and return resolved fixture entries. @@ -403,6 +435,104 @@ def _rasterio_kwargs(entry: dict[str, Any]) -> dict[str, Any]: return kwargs +# Well-known TIFF tag codes for the string keys we accept in extra_tags. +# Keys are stored in lower-case for case-insensitive lookup. The values are +# the TIFF tag numbers from the baseline TIFF 6.0 spec. +_WELL_KNOWN_TIFF_TAGS = { + "imagedescription": 270, + "software": 305, + "artist": 315, + "copyright": 33432, + "datetime": 306, + "documentname": 269, + "hostcomputer": 316, +} + +# GeoTIFF tag codes the post-pass must preserve when rewriting a file with +# tifffile to attach extra TIFF tags. These are the tags rasterio emits to +# encode the CRS / transform; everything else in the IFD gets rebuilt by +# tifffile from the pixel data and writer kwargs. +_GEOTIFF_TAG_CODES = (33550, 33922, 34735, 34736, 34737, 42112) + + +def _normalize_extra_tag_key(key: Any) -> tuple[int, str]: + """Resolve an extra_tags key to a (numeric_code, display_name). + + String keys are matched against `_WELL_KNOWN_TIFF_TAGS` case-insensitively; + int keys pass through as private tag codes. Unknown string keys raise so + typos surface at generation time rather than producing a silently + different file. + """ + if isinstance(key, int): + return key, str(key) + if isinstance(key, str): + code = _WELL_KNOWN_TIFF_TAGS.get(key.lower()) + if code is None: + raise ManifestError( + f"unknown extra_tags name: {key!r}. Use an integer tag code " + f"or one of {sorted(_WELL_KNOWN_TIFF_TAGS)}." + ) + return code, key + raise ManifestError(f"extra_tags key must be str or int, got {key!r}") + + +def _apply_extra_tags_with_tifffile( + path: pathlib.Path, extra_tags: dict[Any, Any] +) -> None: + """Rewrite ``path`` so each entry in ``extra_tags`` lands as a real TIFF tag. + + rasterio cannot emit private numeric TIFF tags via its writer, and its + `update_tags` API stores well-known names like ``Software`` inside the + ``GDAL_METADATA`` XML rather than in the actual TIFF tag code. To get a + real IFD entry, the generator writes the raster with rasterio first + (so the GeoTIFF tags are correct) and then this helper rewrites the + file in place via tifffile, preserving the GeoTIFF tags and adding the + requested extras. + """ + import tifffile + + with tifffile.TiffFile(str(path)) as t: + page = t.pages[0] + pixels = page.asarray() + photometric = page.photometric + planarconfig = page.planarconfig + preserved: list[tuple[int, int, int, Any, bool]] = [] + for tag in page.tags: + if tag.code in _GEOTIFF_TAG_CODES: + dcode = tag.dtype.value if hasattr(tag.dtype, "value") else int(tag.dtype) + preserved.append((tag.code, dcode, tag.count, tag.value, True)) + + # tifffile reserves the `description` and `software` writer kwargs for + # tag codes 270 and 305 respectively; routing those through `extratags` + # produces a warning and a silently-dropped tag. Pull them out. + description: str | None = None + software: str | None = None + extratags = list(preserved) + for key, value in sorted(extra_tags.items(), key=lambda kv: _normalize_extra_tag_key(kv[0])[0]): + code, _ = _normalize_extra_tag_key(key) + sval = str(value) + if code == 270: + description = sval + elif code == 305: + software = sval + else: + # Type 's' = ASCII string. count=0 lets tifffile size it. + extratags.append((code, "s", 0, sval, True)) + + write_kwargs: dict[str, Any] = { + "photometric": photometric, + "planarconfig": planarconfig, + "metadata": None, # do not emit tifffile's JSON header into tag 270 + "extratags": extratags, + } + if description is not None: + write_kwargs["description"] = description + if software is not None: + write_kwargs["software"] = software + + tifffile.imwrite(str(path), pixels, **write_kwargs) + + def write_fixture(entry: dict[str, Any], output_dir: pathlib.Path) -> pathlib.Path: """Materialise one fixture. Returns the written path. @@ -421,9 +551,9 @@ def write_fixture(entry: dict[str, Any], output_dir: pathlib.Path) -> pathlib.Pa dst.write(pixels[b], b + 1) gdal_md = entry.get("gdal_metadata") or {} for domain, items in sorted(gdal_md.items()): - dst.update_tags(ns=domain, **{str(k): str(v) for k, v in sorted(items.items())}) - if extra_tags: - dst.update_tags(**{str(k): str(v) for k, v in sorted(extra_tags.items())}) + dst.update_tags( + ns=domain, **{str(k): str(v) for k, v in sorted(items.items())} + ) overviews = entry.get("overviews") or [] if overviews and not entry.get("external_overview"): from rasterio.enums import Resampling @@ -432,6 +562,9 @@ def write_fixture(entry: dict[str, Any], output_dir: pathlib.Path) -> pathlib.Pa ) dst.build_overviews(sorted(overviews), resamp) + if extra_tags: + _apply_extra_tags_with_tifffile(out_path, extra_tags) + # Normalise mtime so re-runs are byte-stable on filesystems that # encode timestamps in sidecar files. os.utime(out_path, (DETERMINISTIC_EPOCH, DETERMINISTIC_EPOCH)) diff --git a/xrspatial/geotiff/tests/golden_corpus/manifest.yaml b/xrspatial/geotiff/tests/golden_corpus/manifest.yaml index ad345ff09..d452dcb6c 100644 --- a/xrspatial/geotiff/tests/golden_corpus/manifest.yaml +++ b/xrspatial/geotiff/tests/golden_corpus/manifest.yaml @@ -55,10 +55,19 @@ # gdal_metadata: map Optional. Written into the GDAL_METADATA tag as # {: {: }}. Domain "" is the # default. Used to exercise XML escaping and the -# extra_tags safe-filter path. -# extra_tags: map Optional. Raw TIFF tag overrides written via -# rasterio's `extra_tags=` kwarg. Tag name to -# value, e.g. {Software: "xrspatial-corpus"}. +# extra_tags safe-filter path. Recognised GDAL +# domains include "", "IMAGE_STRUCTURE", +# "SUBDATASETS", and any custom user domain. +# extra_tags: map Optional. Real TIFF tag overrides written as +# actual IFD entries (not GDAL_METADATA XML). +# Keys may be string names ("ImageDescription", +# "Software", "Artist") or integers (private +# tag codes like 65000). Values are strings. +# The generator writes the raster with rasterio +# first (for the GeoTIFF tags) and then rewrites +# with tifffile to inject these as concrete TIFF +# tags, since rasterio cannot emit private +# numeric tags directly. # pixel_pattern: str How the generator fills pixel data. One of: # "ramp" deterministic linear ramp # "checker" block checker for compression @@ -141,3 +150,54 @@ fixtures: atol: 0.0 rtol: 0.0 lossy: false + + - id: gdal_metadata_namespaced_uint16 + description: >- + Stripped uint16 with GDAL_METADATA entries across multiple domains + (default, IMAGE_STRUCTURE, SUBDATASETS, and a custom user domain). + Exercises the reader's cross-domain key handling. + width: 16 + height: 16 + dtype: uint16 + blocksize: 16 + gdal_metadata: + "": + UNITS: meters + AREA_OR_POINT: Area + IMAGE_STRUCTURE: + COMPRESSION: NONE + INTERLEAVE: PIXEL + SUBDATASETS: + SUBDATASET_1_NAME: "fixture:band1" + SUBDATASET_1_DESC: "single-band view" + CUSTOM_DOMAIN: + owner: xrspatial-corpus + purpose: cross-domain metadata smoke + pixel_pattern: ramp + tags: [fast, metadata, gdal_metadata] + tolerance: + atol: 0.0 + rtol: 0.0 + lossy: false + + - id: extra_tags_uint16 + description: >- + Stripped uint16 with arbitrary TIFF tags (ImageDescription, Artist, + Software, and a private numeric tag 65000) written as real IFD + entries via a tifffile post-pass. Exercises attribute round-tripping + of pass-through tags across backends. + width: 16 + height: 16 + dtype: uint16 + blocksize: 16 + extra_tags: + ImageDescription: "xrspatial golden corpus fixture" + Software: "xrspatial-golden-corpus" + Artist: "xarray-contrib" + 65000: "private-tag-payload" + pixel_pattern: ramp + tags: [fast, metadata, extra_tags] + tolerance: + atol: 0.0 + rtol: 0.0 + lossy: false diff --git a/xrspatial/geotiff/tests/test_golden_corpus_metadata_tags_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_metadata_tags_1930.py new file mode 100644 index 000000000..ca49c680c --- /dev/null +++ b/xrspatial/geotiff/tests/test_golden_corpus_metadata_tags_1930.py @@ -0,0 +1,233 @@ +"""Smoke tests for the GDAL_METADATA + extra_tags golden-corpus fixtures. + +Phase 2 PR 9 of issue #1930 adds two fixtures: + +* ``gdal_metadata_namespaced_uint16`` -- a stripped uint16 raster with + GDAL_METADATA entries spread across the default domain, the + ``IMAGE_STRUCTURE`` and ``SUBDATASETS`` GDAL domains, and a custom user + domain. This exercises the cross-domain key handling that the xrspatial + reader has to navigate. +* ``extra_tags_uint16`` -- a stripped uint16 raster with arbitrary TIFF + tags (``ImageDescription``, ``Software``, ``Artist``, and a private + numeric tag ``65000``) written as real IFD entries. The generator routes + these through a tifffile post-pass because rasterio cannot emit private + numeric tags via its writer. + +These tests confirm the fixtures land on disk in the expected shape and +that an xarray DataArray built from a plain rasterio read still satisfies +``compare_to_oracle`` -- i.e. the metadata layer does not break the parity +contract. + +The full canonical-attrs contract for GDAL_METADATA / pass-through tags +lands in issue #1984; this PR does not assert it. +""" +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest +import xarray as xr + +pytest.importorskip("yaml") +rasterio = pytest.importorskip("rasterio") +tifffile = pytest.importorskip("tifffile") + +from xrspatial.geotiff.tests.golden_corpus import generate as _generate # noqa: E402 +from xrspatial.geotiff.tests.golden_corpus._oracle import ( # noqa: E402 + compare_to_oracle, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_FIXTURE_DIR = ( + Path(_generate.__file__).resolve().parent / "fixtures" +) + + +def _read_as_dataarray(path: Path) -> xr.DataArray: + """Build an xrspatial-shaped DataArray from a rasterio read. + + Mirrors what a numpy backend would produce: 2-D for single-band rasters, + pixel-centre y/x coords, and an ``attrs`` dict carrying the canonical + keys the oracle inspects (``transform``, ``crs``, ``nodata``). + """ + with rasterio.open(path) as src: + data = src.read() # (bands, H, W) + transform = src.transform + crs = src.crs + nodata = src.nodata + height, width = src.height, src.width + + pw, ph = float(transform.a), float(transform.e) + ox, oy = float(transform.c), float(transform.f) + x = ox + (np.arange(width) + 0.5) * pw + y = oy + (np.arange(height) + 0.5) * ph + + attrs: dict = {"transform": (pw, 0.0, ox, 0.0, ph, oy)} + if crs is not None: + epsg = crs.to_epsg() + if epsg is not None: + attrs["crs"] = epsg + else: + attrs["crs_wkt"] = crs.to_wkt() + if nodata is not None: + attrs["nodata"] = nodata + + if data.shape[0] == 1: + return xr.DataArray( + data[0], + dims=("y", "x"), + coords={"y": y, "x": x}, + attrs=attrs, + ) + return xr.DataArray( + data, + dims=("band", "y", "x"), + coords={"band": np.arange(1, data.shape[0] + 1), "y": y, "x": x}, + attrs=attrs, + ) + + +def _ensure_fixture(fixture_id: str) -> Path: + """Regenerate ``fixture_id`` if its .tif file is missing.""" + path = _FIXTURE_DIR / f"{fixture_id}.tif" + if not path.exists(): + _generate.generate(only=[fixture_id], output_dir=_FIXTURE_DIR) + return path + + +# --------------------------------------------------------------------------- +# gdal_metadata_namespaced_uint16 +# --------------------------------------------------------------------------- + +def test_gdal_metadata_namespaced_default_domain_present() -> None: + """Default-domain GDAL_METADATA entries survive the write.""" + path = _ensure_fixture("gdal_metadata_namespaced_uint16") + with rasterio.open(path) as src: + tags = src.tags() # default domain + assert tags.get("UNITS") == "meters" + assert tags.get("AREA_OR_POINT") == "Area" + + +def test_gdal_metadata_namespaced_image_structure_domain_present() -> None: + """IMAGE_STRUCTURE is a GDAL-managed domain. We do not own the values + GDAL writes there (it overrides INTERLEAVE / COMPRESSION from the + actual codec / planar config), but the domain must be non-empty so the + reader's cross-domain enumeration has a real entry to walk. + """ + path = _ensure_fixture("gdal_metadata_namespaced_uint16") + with rasterio.open(path) as src: + struct = src.tags(ns="IMAGE_STRUCTURE") + assert struct, ( + f"IMAGE_STRUCTURE domain must be non-empty, got {struct!r}" + ) + # INTERLEAVE is always present for raster TIFFs. + assert "INTERLEAVE" in struct + + +def test_gdal_metadata_namespaced_subdatasets_domain_present() -> None: + """SUBDATASETS entries written by the generator round-trip via rasterio.""" + path = _ensure_fixture("gdal_metadata_namespaced_uint16") + with rasterio.open(path) as src: + subs = src.tags(ns="SUBDATASETS") + assert subs.get("SUBDATASET_1_NAME") == "fixture:band1" + assert subs.get("SUBDATASET_1_DESC") == "single-band view" + + +def test_gdal_metadata_namespaced_custom_domain_present() -> None: + """User-defined GDAL domains round-trip key-for-key.""" + path = _ensure_fixture("gdal_metadata_namespaced_uint16") + with rasterio.open(path) as src: + custom = src.tags(ns="CUSTOM_DOMAIN") + assert custom.get("owner") == "xrspatial-corpus" + assert custom.get("purpose") == "cross-domain metadata smoke" + + +def test_gdal_metadata_namespaced_passes_oracle() -> None: + """A plain rasterio-derived DataArray satisfies the oracle. + + The oracle inspects crs / transform / nodata / dtype + pixels; GDAL + metadata is intentionally pass-through, so it must not break parity. + """ + path = _ensure_fixture("gdal_metadata_namespaced_uint16") + cand = _read_as_dataarray(path) + compare_to_oracle(path, cand) + + +# --------------------------------------------------------------------------- +# extra_tags_uint16 +# --------------------------------------------------------------------------- + +_EXPECTED_EXTRA_TAGS = { + 270: "xrspatial golden corpus fixture", # ImageDescription + 305: "xrspatial-golden-corpus", # Software + 315: "xarray-contrib", # Artist + 65000: "private-tag-payload", # private numeric tag +} + + +def test_extra_tags_uint16_real_tiff_tags_present() -> None: + """Every requested extra tag lands as a real IFD entry, not in GDAL_METADATA. + + Walks the IFD via tifffile because rasterio surfaces well-known tags + under TIFFTAG_* but hides private numeric tags. The test checks the + raw tag codes so a regression that silently rerouted entries through + ``GDAL_METADATA`` XML would fail here. + """ + path = _ensure_fixture("extra_tags_uint16") + with tifffile.TiffFile(path) as t: + page_tags = {tag.code: tag.value for tag in t.pages[0].tags} + for code, expected in _EXPECTED_EXTRA_TAGS.items(): + assert code in page_tags, ( + f"expected TIFF tag {code} not present on the primary IFD; " + f"got tag codes {sorted(page_tags)}" + ) + assert page_tags[code] == expected, ( + f"tag {code} value mismatch: got {page_tags[code]!r}, " + f"expected {expected!r}" + ) + + +def test_extra_tags_uint16_rasterio_surfaces_known_names() -> None: + """The three well-known names show up in ``src.tags()`` (default domain). + + rasterio rewrites well-known TIFF tag codes into the ``TIFFTAG_*`` + namespace when it reads the default-domain tag dict; private numeric + codes are not surfaced here and are checked by the tifffile test above. + """ + path = _ensure_fixture("extra_tags_uint16") + with rasterio.open(path) as src: + tags = src.tags() + assert tags.get("TIFFTAG_IMAGEDESCRIPTION") == _EXPECTED_EXTRA_TAGS[270] + assert tags.get("TIFFTAG_SOFTWARE") == _EXPECTED_EXTRA_TAGS[305] + assert tags.get("TIFFTAG_ARTIST") == _EXPECTED_EXTRA_TAGS[315] + + +def test_extra_tags_uint16_passes_oracle() -> None: + """Adding TIFF tags does not perturb the oracle's pixel / georef checks.""" + path = _ensure_fixture("extra_tags_uint16") + cand = _read_as_dataarray(path) + compare_to_oracle(path, cand) + + +def test_extra_tags_uint16_size_under_4kb() -> None: + """The fixture stays under the 4 KB per-file size budget.""" + path = _ensure_fixture("extra_tags_uint16") + size = path.stat().st_size + assert size < 4096, ( + f"extra_tags_uint16.tif grew past the 4 KB budget: {size} bytes" + ) + + +def test_gdal_metadata_namespaced_size_under_4kb() -> None: + """The fixture stays under the 4 KB per-file size budget.""" + path = _ensure_fixture("gdal_metadata_namespaced_uint16") + size = path.stat().st_size + assert size < 4096, ( + f"gdal_metadata_namespaced_uint16.tif grew past the 4 KB budget: " + f"{size} bytes" + ) From 19a62ce4a95956a101fa04e66f83c46d748128fc Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sat, 16 May 2026 06:09:43 -0700 Subject: [PATCH 2/2] geotiff: address review of #1998 * Reject bool keys in extra_tags. bool subclasses int in Python, so the previous isinstance(k, (str, int)) check let YAML true/false keys through. * Preserve tag 42112 GDAL_METADATA across the tifffile post-pass so a fixture can combine extra_tags with gdal_metadata without losing the GDAL XML. The canonical example_tiled_uint16_deflate_pred2 fixture did exactly that and would have started writing a half-formed file with the original change. * Resolve extra_tags keys once per item instead of twice (the sort key and the loop body each called _normalize_extra_tag_key). * Note in the manifest that IMAGE_STRUCTURE is a GDAL-managed domain and the COMPRESSION / INTERLEAVE values are placeholders, not values the writer round-trips. * Swap height, width = ... -> width, height = ... in the test helper to match the order of np.arange(width) / np.arange(height) below. * Add validator tests for the bool guard and for non-string gdal_metadata domain keys. --- .../geotiff/tests/golden_corpus/generate.py | 34 +++++++++++---- .../geotiff/tests/golden_corpus/manifest.yaml | 5 +++ .../test_golden_corpus_metadata_tags_1930.py | 42 ++++++++++++++++++- 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/xrspatial/geotiff/tests/golden_corpus/generate.py b/xrspatial/geotiff/tests/golden_corpus/generate.py index 1048bc62b..2c01fb909 100644 --- a/xrspatial/geotiff/tests/golden_corpus/generate.py +++ b/xrspatial/geotiff/tests/golden_corpus/generate.py @@ -268,7 +268,9 @@ def _validate_one(entry: dict[str, Any], seen_ids: set[str]) -> None: f"{fid}: extra_tags must be a mapping, got {extra_tags!r}" ) for k in extra_tags: - if not isinstance(k, (str, int)): + # bool is a subclass of int in Python, so the (str, int) check + # would otherwise let `True` / `False` through as tag code 1/0. + if isinstance(k, bool) or not isinstance(k, (str, int)): raise ManifestError( f"{fid}: extra_tags keys must be strings or ints, " f"got {k!r}" @@ -448,10 +450,15 @@ def _rasterio_kwargs(entry: dict[str, Any]) -> dict[str, Any]: "hostcomputer": 316, } -# GeoTIFF tag codes the post-pass must preserve when rewriting a file with -# tifffile to attach extra TIFF tags. These are the tags rasterio emits to -# encode the CRS / transform; everything else in the IFD gets rebuilt by -# tifffile from the pixel data and writer kwargs. +# Tag codes the tifffile post-pass must preserve when rewriting a file to +# attach extra TIFF tags. These cover: +# * GeoTIFF georeferencing tags (33550, 33922, 34735, 34736, 34737) -- rasterio +# emits these to encode CRS / transform. +# * 42112 GDAL_METADATA -- holds any cross-domain GDAL metadata rasterio +# wrote via update_tags(ns=...). Preserved so a fixture can carry both +# gdal_metadata and extra_tags without losing the GDAL XML. +# tifffile re-derives SampleFormat / ExtraSamples from the pixel dtype on +# write, so those are intentionally not in this list. _GEOTIFF_TAG_CODES = (33550, 33922, 34735, 34736, 34737, 42112) @@ -488,6 +495,13 @@ def _apply_extra_tags_with_tifffile( (so the GeoTIFF tags are correct) and then this helper rewrites the file in place via tifffile, preserving the GeoTIFF tags and adding the requested extras. + + Only the codes listed in ``_GEOTIFF_TAG_CODES`` survive the rewrite. + That set covers the GeoTIFF georeferencing tags, GDAL_METADATA, and + the SampleFormat / ExtraSamples tags rasterio writes for non-uint8 + dtypes. Other tags rasterio may emit (nodata sentinel, ResolutionUnit, + etc.) are not currently forwarded; extend the list if a future fixture + needs them. """ import tifffile @@ -508,9 +522,13 @@ def _apply_extra_tags_with_tifffile( description: str | None = None software: str | None = None extratags = list(preserved) - for key, value in sorted(extra_tags.items(), key=lambda kv: _normalize_extra_tag_key(kv[0])[0]): - code, _ = _normalize_extra_tag_key(key) - sval = str(value) + # Resolve every key once, sort by numeric code so the on-disk order + # is deterministic across runs. + resolved = sorted( + ((_normalize_extra_tag_key(k)[0], str(v)) for k, v in extra_tags.items()), + key=lambda item: item[0], + ) + for code, sval in resolved: if code == 270: description = sval elif code == 305: diff --git a/xrspatial/geotiff/tests/golden_corpus/manifest.yaml b/xrspatial/geotiff/tests/golden_corpus/manifest.yaml index d452dcb6c..8adb2786b 100644 --- a/xrspatial/geotiff/tests/golden_corpus/manifest.yaml +++ b/xrspatial/geotiff/tests/golden_corpus/manifest.yaml @@ -164,6 +164,11 @@ fixtures: "": UNITS: meters AREA_OR_POINT: Area + # IMAGE_STRUCTURE is a GDAL-managed domain. GDAL overwrites + # COMPRESSION and INTERLEAVE on write based on the real codec / + # planar config, so the values below are placeholders. The point + # of listing them is to make the domain non-empty so the reader + # has to enumerate it. IMAGE_STRUCTURE: COMPRESSION: NONE INTERLEAVE: PIXEL diff --git a/xrspatial/geotiff/tests/test_golden_corpus_metadata_tags_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_metadata_tags_1930.py index ca49c680c..452503d52 100644 --- a/xrspatial/geotiff/tests/test_golden_corpus_metadata_tags_1930.py +++ b/xrspatial/geotiff/tests/test_golden_corpus_metadata_tags_1930.py @@ -60,7 +60,7 @@ def _read_as_dataarray(path: Path) -> xr.DataArray: transform = src.transform crs = src.crs nodata = src.nodata - height, width = src.height, src.width + width, height = src.width, src.height pw, ph = float(transform.a), float(transform.e) ox, oy = float(transform.c), float(transform.f) @@ -231,3 +231,43 @@ def test_gdal_metadata_namespaced_size_under_4kb() -> None: f"gdal_metadata_namespaced_uint16.tif grew past the 4 KB budget: " f"{size} bytes" ) + + +# --------------------------------------------------------------------------- +# Validator guard rails +# --------------------------------------------------------------------------- + +def _minimal_entry(defaults: dict) -> dict: + """Build a manifest entry minus extra_tags / gdal_metadata for guard tests.""" + base = dict(defaults) + base.update( + id="validator_smoke", + description="validator smoke", + width=16, + height=16, + dtype="uint16", + blocksize=16, + ) + return base + + +def test_validator_rejects_bool_extra_tags_key() -> None: + """`bool` subclasses `int` so the isinstance check has to filter it out.""" + manifest = _generate.load_manifest() + defaults = manifest.get("defaults") or {} + entry = _minimal_entry(defaults) + entry["extra_tags"] = {True: "boom"} + bad = {"version": 1, "defaults": {}, "fixtures": [entry]} + with pytest.raises(_generate.ManifestError, match="strings or ints"): + _generate.validate(bad) + + +def test_validator_rejects_non_string_gdal_metadata_domain() -> None: + """Domain keys must be strings; the YAML loader can produce ints.""" + manifest = _generate.load_manifest() + defaults = manifest.get("defaults") or {} + entry = _minimal_entry(defaults) + entry["gdal_metadata"] = {123: {"k": "v"}} + bad = {"version": 1, "defaults": {}, "fixtures": [entry]} + with pytest.raises(_generate.ManifestError, match="domain keys must be strings"): + _generate.validate(bad)