Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions xrspatial/geotiff/tests/golden_corpus/_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,76 @@ def _pixels_equal(ref: np.ndarray, cand: np.ndarray) -> bool:
return np.array_equal(ref, cand)


# ---------------------------------------------------------------------------
# Masked-nodata normalisation (issue #1988)
# ---------------------------------------------------------------------------

def _has_masked_nodata(candidate_da: xr.DataArray) -> bool:
"""True when the candidate reports xrspatial's masked-nodata contract.

The contract (issue #1988): xrspatial reads an integer GeoTIFF whose
nodata tag carries an integer sentinel, masks the sentinel-equal
pixels to NaN, and upcasts the array to float so NaN can live in
it. The reader stamps ``attrs['masked_nodata'] = True`` to record
that the masking happened; ``attrs['nodata']`` still carries the
original integer sentinel so a write round-trip can put the tag
back.
"""
return bool(candidate_da.attrs.get('masked_nodata', False))


def _normalise_for_masked_nodata(
ref_pixels: np.ndarray,
ref_dtype: np.dtype,
ref_nodata,
candidate_da: xr.DataArray,
) -> tuple[np.ndarray, np.dtype]:
"""Apply xrspatial's masked-nodata contract to the rasterio reference.

When the candidate reports ``attrs['masked_nodata']=True``, the
reference is rewritten so it matches what the candidate produced:
cast to the candidate's float dtype, then any pixel equal to the
integer sentinel ``ref_nodata`` becomes ``NaN``. The ``ref_dtype``
returned alongside is the candidate's dtype, so the downstream
``_assert_dtype`` check passes by design.

If the candidate does not report masked nodata, or if ``ref_nodata``
is not a finite integer sentinel, the inputs pass through
unchanged. That keeps every existing fixture's behaviour intact.
"""
if not _has_masked_nodata(candidate_da):
return ref_pixels, ref_dtype
cand_dtype = np.dtype(candidate_da.dtype)
if cand_dtype.kind != 'f':
# The masked_nodata contract requires a float dtype on the
# candidate so NaN can live in the array. If it is anything
# else, fall through to the normal strict comparison and let
# it fail with a clear message.
return ref_pixels, ref_dtype
# The sentinel must be a finite, integer-valued real number that
# fits in the source integer dtype. The upstream xrspatial reader
# at xrspatial/geotiff/__init__.py only sets attrs['masked_nodata']
# under the same three guards; the oracle mirrors them so a
# candidate that drifts (sets the flag with an out-of-range or
# fractional sentinel) cannot trick the oracle into masking the
# wrong pixels via integer truncation or unsigned wraparound.
try:
nd_float = float(ref_nodata)
except (TypeError, ValueError):
return ref_pixels, ref_dtype
if not np.isfinite(nd_float):
return ref_pixels, ref_dtype
if not float(nd_float).is_integer():
return ref_pixels, ref_dtype
if ref_pixels.dtype.kind in ('i', 'u'):
info = np.iinfo(ref_pixels.dtype)
if not (info.min <= int(nd_float) <= info.max):
return ref_pixels, ref_dtype
new_ref = ref_pixels.astype(cand_dtype, copy=True)
new_ref[ref_pixels == ref_pixels.dtype.type(nd_float)] = np.nan
return new_ref, cand_dtype


# ---------------------------------------------------------------------------
# Canonical-attrs hook (issue #1984)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -459,6 +529,14 @@ def compare_to_oracle(
'dtype': ref_dtype,
}

# When the candidate reports the masked-nodata contract (#1988),
# rewrite the rasterio reference to match: cast to the candidate's
# float dtype and replace sentinel-equal pixels with NaN. Then the
# dtype + pixel assertions run on directly comparable arrays.
ref_pixels, ref_dtype = _normalise_for_masked_nodata(
ref_pixels, ref_dtype, ref_nodata, candidate_da
)

_assert_dtype(ref_dtype, candidate_da)
_assert_transform(ref_transform, candidate_da, ref_has_georef=ref_has_georef)
_assert_crs(ref_crs, candidate_da)
Expand Down
211 changes: 211 additions & 0 deletions xrspatial/geotiff/tests/golden_corpus/test_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,3 +502,214 @@ def test_crs_equal_rejects_empty_proj_dict() -> None:
assert b.to_epsg() is None
assert a.to_dict() == {} == b.to_dict()
assert _crs_equal(a, b) is False


# ---------------------------------------------------------------------------
# Masked-nodata contract (issue #1988)
# ---------------------------------------------------------------------------

def _masked_nodata_pair(
tmp_path: Path, sentinel: int = 0
) -> tuple[Path, xr.DataArray, np.ndarray]:
"""Build a uint16 fixture with an integer nodata sentinel and a
float candidate that has masked the sentinel pixels to NaN.

Returns ``(fixture_path, candidate, candidate_float_array)``.
"""
raw = np.array(
[
[1, 2, sentinel, 4],
[5, sentinel, 7, 8],
[9, 10, 11, 12],
[13, 14, 15, 16],
],
dtype=np.uint16,
)
transform = from_origin(0.0, 4.0, 1.0, 1.0)
fixture = _write_tiff(
tmp_path / 'masked_nodata.tif',
raw,
transform=transform,
nodata=sentinel,
)
masked = raw.astype(np.float64)
masked[raw == sentinel] = np.nan
cand = _build_candidate(
masked, transform=transform, nodata=sentinel,
)
cand.attrs['masked_nodata'] = True
return fixture, cand, masked


def test_masked_nodata_success(tmp_path: Path) -> None:
"""Candidate that masks the integer sentinel to NaN passes the oracle.

The oracle rewrites the rasterio reference to match the candidate's
float-plus-NaN view before comparing, so dtype shift + pixel mask
no longer registers as a mismatch.
"""
fixture, cand, _ = _masked_nodata_pair(tmp_path)
compare_to_oracle(fixture, cand)


def test_masked_nodata_off_keeps_strict_dtype_check(tmp_path: Path) -> None:
"""Without ``masked_nodata=True``, the oracle keeps strict dtype parity.

Drop the flag and the candidate's float dtype no longer matches the
rasterio uint16 reference; the dtype assertion fires.
"""
fixture, cand, _ = _masked_nodata_pair(tmp_path)
del cand.attrs['masked_nodata']
with pytest.raises(AssertionError, match='dtype mismatch'):
compare_to_oracle(fixture, cand)


def test_masked_nodata_candidate_left_sentinel_pixel_fails(
tmp_path: Path,
) -> None:
"""If the candidate forgot to mask a sentinel pixel, the pixel
comparison fires.

The reference is rewritten to have NaN at the sentinel positions;
a candidate that still carries the raw integer value (here 0,
cast to float) at one of those positions cannot pass NaN-aware
equality.
"""
fixture, cand, masked = _masked_nodata_pair(tmp_path)
# Restore the first sentinel pixel to the raw value, simulating
# a reader that mis-masked.
bad = masked.copy()
bad[0, 2] = 0.0
cand2 = _build_candidate(
bad, transform=from_origin(0.0, 4.0, 1.0, 1.0), nodata=0,
)
cand2.attrs['masked_nodata'] = True
with pytest.raises(AssertionError, match='pixel arrays differ'):
compare_to_oracle(fixture, cand2)


def test_masked_nodata_wrong_nodata_attr_fails(tmp_path: Path) -> None:
"""``attrs['nodata']`` must still match the fixture even with masking.

The oracle masks the reference using the *reference's* nodata, so
a wrong sentinel in the candidate's attrs is caught by the
independent ``_assert_nodata`` check before pixels are compared.
"""
fixture, cand, _ = _masked_nodata_pair(tmp_path)
cand.attrs['nodata'] = 999 # not the real sentinel
with pytest.raises(AssertionError, match='nodata mismatch'):
compare_to_oracle(fixture, cand)


def test_masked_nodata_does_not_engage_for_float_nan_fixture(
tmp_path: Path,
) -> None:
"""Float-NaN-nodata fixtures take the plain pixel path unchanged.

The masked-nodata normaliser only fires when the candidate reports
``masked_nodata=True``. A float fixture whose nodata is NaN never
needs the dtype shift, and the new path must not muck with it.
"""
data = np.array([[1.0, np.nan], [np.nan, 4.0]], dtype=np.float32)
transform = from_origin(0.0, 2.0, 1.0, 1.0)
fixture = _write_tiff(
tmp_path / 'float_nan.tif', data, transform=transform,
nodata=float('nan'),
)
cand = _build_candidate(
data.copy(), transform=transform, nodata=float('nan'),
)
# No masked_nodata flag -- the candidate is float to begin with.
compare_to_oracle(fixture, cand)


def test_masked_nodata_with_non_integer_sentinel_passes_through(
tmp_path: Path,
) -> None:
"""A NaN sentinel in the rasterio reference does not trip the integer
masking path even when ``masked_nodata`` is set.

The normaliser is gated on a finite sentinel via ``np.isfinite``.
A NaN-nodata fixture should compare like any other float-NaN
fixture: candidate keeps its own NaN pixels, reference keeps
its own NaN pixels, and NaN-aware equality does the rest.
"""
data = np.array([[1.0, np.nan], [np.nan, 4.0]], dtype=np.float32)
transform = from_origin(0.0, 2.0, 1.0, 1.0)
fixture = _write_tiff(
tmp_path / 'masked_nan.tif', data, transform=transform,
nodata=float('nan'),
)
cand = _build_candidate(
data.copy(), transform=transform, nodata=float('nan'),
)
# Set the flag to confirm the normaliser early-exits cleanly on
# NaN sentinels rather than trying to do something silly.
cand.attrs['masked_nodata'] = True
compare_to_oracle(fixture, cand)


def test_masked_nodata_fractional_sentinel_does_not_mask(
tmp_path: Path,
) -> None:
"""A non-integer sentinel cannot match an integer pixel; the
normaliser declines to mask.

Without this guard the oracle would cast ``3.5`` to ``3`` and
silently mask every 3-valued pixel. The upstream xrspatial reader
never sets ``attrs['masked_nodata']`` for a fractional sentinel,
so the oracle's stricter check mirrors that contract. Outcome:
the oracle stays on the raw-pixel path and the dtype mismatch
surfaces normally.
"""
raw = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint16)
transform = from_origin(0.0, 2.0, 1.0, 1.0)
fixture = _write_tiff(
tmp_path / 'frac_sentinel.tif', raw, transform=transform,
nodata=3.5,
)
masked = raw.astype(np.float64)
cand = _build_candidate(
masked, transform=transform, nodata=3.5,
)
cand.attrs['masked_nodata'] = True
with pytest.raises(AssertionError, match='dtype mismatch'):
compare_to_oracle(fixture, cand)


def test_masked_nodata_out_of_range_sentinel_does_not_mask() -> None:
"""A sentinel outside the source integer range cannot match any pixel.

Without the range guard, ``np.uint16(-1.0)`` wraps to ``65535`` and
masks the dtype-max pixel. The reader rejects such sentinels via
``info.min <= nodata_int <= info.max``; the oracle mirrors that
check.

Rasterio refuses to write an out-of-range nodata at the writer
level, so we cannot reach this code path through a real fixture.
The test calls the helper directly with synthesised inputs to
confirm the guard fires.
"""
from xrspatial.geotiff.tests.golden_corpus._oracle import (
_normalise_for_masked_nodata,
)

ref_pixels = np.array(
[[1, 2, 65535], [4, 5, 6]], dtype=np.uint16,
)
ref_dtype = ref_pixels.dtype
cand = _build_candidate(
ref_pixels.astype(np.float64),
transform=from_origin(0.0, 2.0, 1.0, 1.0),
nodata=-1,
)
cand.attrs['masked_nodata'] = True

out_pixels, out_dtype = _normalise_for_masked_nodata(
ref_pixels, ref_dtype, -1, cand,
)
# Out-of-range sentinel must abort the rewrite. The helper returns
# the inputs unchanged; the unsigned wraparound that would otherwise
# mask the dtype-max pixel does not happen.
assert out_dtype == ref_dtype
assert out_pixels is ref_pixels
16 changes: 3 additions & 13 deletions xrspatial/geotiff/tests/test_golden_corpus_dask_numpy_1930.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,11 @@
CHUNK_SIZE = 32


_NODATA_MASKING_REASON = (
"integer nodata masking: xrspatial masks sentinel pixels to NaN and "
"upcasts to float64 per #1988 (attrs['masked_nodata']=True). The oracle "
"compares raw integer pixels; needs an oracle extension that consults "
"attrs['masked_nodata']."
)

# Shared parity gaps. These live in the codec / attrs layer, not in the
# dask plumbing, so the dask backend inherits them verbatim from the
# eager backend.
# eager backend. Integer nodata masking used to live here too; the
# oracle's _normalise_for_masked_nodata helper closes that gap so it
# is no longer xfailed on any backend.
_PARITY_GAPS: dict[str, str] = {
"compression_jpeg_uint8_ycbcr": (
"RGB band axis order divergence: rasterio reads (bands, y, x) while "
Expand All @@ -67,11 +62,6 @@
"attrs['geog_citation'] but does not emit a canonical attrs['crs'] "
"or attrs['crs_wkt']. Real parity gap; needs a fix in _crs.py."
),
"nodata_int_sentinel_uint16": _NODATA_MASKING_REASON,
"stripped_le_uint16": _NODATA_MASKING_REASON,
"stripped_be_uint16": _NODATA_MASKING_REASON,
"tiled_le_uint16": _NODATA_MASKING_REASON,
"tiled_be_uint16": _NODATA_MASKING_REASON,
}

# Dask-only gaps go here. Empty in the first pass; add an entry only when
Expand Down
29 changes: 10 additions & 19 deletions xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,16 @@
deprecated ``attrs['geog_citation']`` but does not emit a canonical
``attrs['crs']`` or ``attrs['crs_wkt']``. Real parity gap; needs a fix
in ``_crs.py`` to round-trip citation WKT.
* ``nodata_int_sentinel_uint16``, ``stripped_le_uint16``,
``stripped_be_uint16``, ``tiled_le_uint16``, ``tiled_be_uint16`` --
integer nodata masking. xrspatial masks sentinel pixels to NaN and
upcasts to float64 per #1988 (``attrs['masked_nodata']=True``); the
oracle compares the raw integer pixel array. Needs a small oracle
extension that consults ``attrs['masked_nodata']`` and applies the
equivalent mask to the rasterio reference before comparing.

Resolved gaps (no longer xfail):

* Integer nodata masking. The oracle now consults
``attrs['masked_nodata']`` and rewrites the rasterio reference to
match the candidate's float-plus-NaN view (see
``_normalise_for_masked_nodata`` in ``_oracle.py``), so the five
fixtures ``nodata_int_sentinel_uint16``, ``stripped_le_uint16``,
``stripped_be_uint16``, ``tiled_le_uint16``, ``tiled_be_uint16``
pass directly.

Intentional skip (``skip``):

Expand Down Expand Up @@ -71,13 +74,6 @@
)


_NODATA_MASKING_REASON = (
"integer nodata masking: xrspatial masks sentinel pixels to NaN and "
"upcasts to float64 per #1988 (attrs['masked_nodata']=True). The oracle "
"compares raw integer pixels; needs an oracle extension that consults "
"attrs['masked_nodata']."
)

_PARITY_GAPS: dict[str, str] = {
"compression_jpeg_uint8_ycbcr": (
"RGB band axis order divergence: rasterio reads (bands, y, x) while "
Expand All @@ -89,11 +85,6 @@
"attrs['geog_citation'] but does not emit a canonical attrs['crs'] "
"or attrs['crs_wkt']. Real parity gap; needs a fix in _crs.py."
),
"nodata_int_sentinel_uint16": _NODATA_MASKING_REASON,
"stripped_le_uint16": _NODATA_MASKING_REASON,
"stripped_be_uint16": _NODATA_MASKING_REASON,
"tiled_le_uint16": _NODATA_MASKING_REASON,
"tiled_be_uint16": _NODATA_MASKING_REASON,
}

_INTENTIONAL_SKIPS: dict[str, str] = {
Expand Down
Loading
Loading