Skip to content

Commit b30600b

Browse files
committed
Address review nits: overview pixels, lossy mean check, drift tests (#2299)
- Lossless cells now also compare every overview level's pixel array, closing the gap where the COG fixture's pyramid could drift unnoticed.
- Lossy (JPEG) cells gain a coarse per-band mean tolerance so a content regression in the JPEG branch is still detected.
- Add direct negative-path tests that doctor a fixture and assert `_assert_semantic_equal` rejects it.
- Rename the `regenerated_dir` tmp prefix to reflect the test's scope.
1 parent a2b38b7 commit b30600b

1 file changed

Lines changed: 155 additions & 14 deletions

File tree

xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py

Lines changed: 155 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,15 @@ def _nodata_equal(a, b) -> bool:
102102
return af == bf
103103

104104

105+
# Lossy cells (today: JPEG-YCbCr) can't compare pixels bit-exactly across
106+
# libjpeg versions, but a per-band mean drift much beyond a few intensity
107+
# units points at a real generator regression (wrong input array, swapped
108+
# band order before encode) rather than codec noise. 4.0 on a 0-255 scale
109+
# leaves several stops of headroom for libjpeg/YCbCr churn while still
110+
# catching the kind of bug the rest of the test is meant to flag.
111+
_LOSSY_PIXEL_MEAN_TOL = 4.0
112+
113+
105114
def _assert_semantic_equal(
106115
committed: pathlib.Path,
107116
regenerated: pathlib.Path,
@@ -111,11 +120,14 @@ def _assert_semantic_equal(
111120
112121
Used for ``cog`` and ``jpeg`` fixtures where the on-disk encoding
113122
is toolchain-coupled but the readable content is stable.
114-
Lossy cells (``tolerance.lossy: true`` in the manifest, today the
115-
JPEG-YCbCr entry) skip pixel equality and check only shape, dtype,
116-
georeferencing, and nodata.
123+
Lossless cells assert bit-exact pixels at the base IFD and at
124+
every overview level the file declares. Lossy cells
125+
(``tolerance.lossy: true`` in the manifest, today the JPEG-YCbCr
126+
entry) drop to a coarse per-band mean tolerance instead of a
127+
bit-exact compare.
117128
"""
118129
lossy = bool(entry.get("tolerance", {}).get("lossy", False))
130+
fid = entry["id"]
119131
with rasterio.open(committed) as ref, rasterio.open(regenerated) as cand:
120132
assert ref.count == cand.count, (
121133
f"band count differs: committed={ref.count}, "
@@ -140,19 +152,85 @@ def _assert_semantic_equal(
140152
f"nodata differs: committed={ref.nodata!r}, "
141153
f"regenerated={cand.nodata!r}"
142154
)
143-
assert ref.overviews(1) == cand.overviews(1), (
155+
ref_overviews = ref.overviews(1)
156+
assert ref_overviews == cand.overviews(1), (
144157
f"overview decimation factors differ: "
145-
f"committed={ref.overviews(1)}, regenerated={cand.overviews(1)}"
146-
)
147-
if lossy:
148-
return
149-
ref_pixels = ref.read()
150-
cand_pixels = cand.read()
151-
assert np.array_equal(ref_pixels, cand_pixels, equal_nan=True), (
152-
f"pixel arrays differ for {entry['id']!r}; the generator output "
153-
f"no longer round-trips to the committed fixture's pixels"
158+
f"committed={ref_overviews}, regenerated={cand.overviews(1)}"
154159
)
155160

161+
if lossy:
162+
_assert_pixels_close_lossy(committed, regenerated, fid)
163+
return
164+
_assert_pixels_exact(committed, regenerated, fid)
165+
# Overview pixels are part of the determinism contract for fixtures
166+
# that ship them (the COG cell today). rasterio's OVERVIEW_LEVEL
167+
# is 0-indexed against the overview chain, hence range(len(...)).
168+
for level in range(len(ref_overviews)):
169+
_assert_overview_pixels_exact(committed, regenerated, level, fid)
170+
171+
172+
def _read_all(path: pathlib.Path, *, overview_level: int | None = None) -> np.ndarray:
    """Return ``src.read()`` for one IFD of ``path``, opened with rasterio.

    When ``overview_level`` is ``None`` the base IFD is read; otherwise
    the value is forwarded to ``rasterio.open`` as the
    ``OVERVIEW_LEVEL`` open option.
    """
    # Only pass OVERVIEW_LEVEL when a level was actually requested, so
    # the base-IFD read opens the file with no extra options.
    open_options = {} if overview_level is None else {"OVERVIEW_LEVEL": overview_level}
    with rasterio.open(path, **open_options) as dataset:
        return dataset.read()
181+
182+
183+
def _assert_pixels_exact(
    committed: pathlib.Path, regenerated: pathlib.Path, fid: str,
) -> None:
    """Assert that the base-IFD pixels of both files are identical.

    NaNs at matching positions count as equal. ``fid`` is only used to
    label the failure message.
    """
    expected = _read_all(committed)
    actual = _read_all(regenerated)
    identical = np.array_equal(expected, actual, equal_nan=True)
    assert identical, (
        f"pixel arrays differ for {fid!r}; the generator output "
        f"no longer round-trips to the committed fixture's pixels"
    )
192+
193+
194+
def _assert_overview_pixels_exact(
    committed: pathlib.Path,
    regenerated: pathlib.Path,
    overview_level: int,
    fid: str,
) -> None:
    """Assert that one overview level holds identical pixels in both files.

    ``overview_level`` is passed straight through to ``_read_all``;
    NaNs at matching positions count as equal. ``fid`` is only used to
    label the failure message.
    """
    # Committed fixture is read first, then the regenerated candidate.
    expected, actual = (
        _read_all(path, overview_level=overview_level)
        for path in (committed, regenerated)
    )
    identical = np.array_equal(expected, actual, equal_nan=True)
    assert identical, (
        f"overview level {overview_level} pixels differ for {fid!r}; "
        f"the generator's overview pyramid no longer matches the "
        f"committed fixture"
    )
207+
208+
209+
def _assert_pixels_close_lossy(
    committed: pathlib.Path, regenerated: pathlib.Path, fid: str,
) -> None:
    """Check lossy (JPEG) cells by comparing per-band pixel means.

    A bit-exact compare would tie the test to a specific libjpeg
    build. The per-band mean is coarse enough to tolerate ordinary
    codec drift, yet still moves when the content itself regresses
    (a swapped input array, a band-permutation bug). Fails when any
    band's absolute mean difference exceeds ``_LOSSY_PIXEL_MEAN_TOL``.
    """

    def band_means(path: pathlib.Path) -> np.ndarray:
        # rasterio hands back (bands, H, W); averaging over the two
        # spatial axes leaves a single mean per band.
        return _read_all(path).astype(np.float64).mean(axis=(1, 2))

    ref_means = band_means(committed)
    cand_means = band_means(regenerated)
    diff = np.abs(ref_means - cand_means)
    within_tolerance = diff <= _LOSSY_PIXEL_MEAN_TOL
    assert np.all(within_tolerance), (
        f"per-band mean drift exceeds {_LOSSY_PIXEL_MEAN_TOL} for {fid!r}: "
        f"committed_means={ref_means.tolist()}, "
        f"regenerated_means={cand_means.tolist()}, "
        f"abs_diff={diff.tolist()}"
    )
233+
156234

157235
def _load_entries() -> list[dict]:
158236
"""Return validated manifest entries (defaults merged), sorted by id."""
@@ -175,7 +253,7 @@ def regenerated_dir(tmp_path_factory: pytest.TempPathFactory) -> pathlib.Path:
175253
Module-scoped so the (few-second) write cost is paid once per
176254
test session rather than per parametrised case.
177255
"""
178-
out = tmp_path_factory.mktemp("regen_corpus_1930")
256+
out = tmp_path_factory.mktemp("regen_corpus_determinism")
179257
generate.generate(output_dir=out)
180258
return out
181259

@@ -257,6 +335,69 @@ def test_external_overview_sidecar_is_deterministic(
257335
)
258336

259337

338+
def _write_doctored_copy(
    src: pathlib.Path, dst: pathlib.Path, *, delta: int = 1
) -> None:
    """Write ``dst`` as a copy of ``src`` with one pixel changed.

    The first pixel of the first band has ``delta`` added and is then
    masked with the dtype's maximum value. The source profile and
    overview factors are reused, so the output keeps the source's
    georeferencing and overview chain and differs only in pixel
    content; the negative-path tests rely on a working semantic check
    rejecting it. The mask uses ``np.iinfo``, so the source is
    expected to have an integer dtype.
    """
    with rasterio.open(src) as source:
        profile = source.profile
        pixels = source.read().copy()
        factors = source.overviews(1)
    dtype_max = np.iinfo(pixels.dtype).max
    pixels[0, 0, 0] = (int(pixels[0, 0, 0]) + delta) & dtype_max
    with rasterio.open(dst, "w", **profile) as sink:
        sink.write(pixels)
        if factors:
            # Rebuild the same overview chain as the source so the
            # decimation-factor check passes and the comparison goes
            # on to read pixels.
            sink.build_overviews(factors)
359+
360+
361+
def test_semantic_equal_rejects_lossless_pixel_drift(tmp_path) -> None:
    """``_assert_semantic_equal`` must reject a lossless fixture whose
    copy has a single altered pixel. Pins the drift-detection path for
    lossless cells.
    """
    original = FIXTURES_DIR / "cog_internal_overview_uint16.tif"
    if not original.exists():
        pytest.skip("cog fixture not committed; cannot exercise drift path")

    tampered = tmp_path / "cog_doctored_2299.tif"
    _write_doctored_copy(original, tampered)

    manifest_entry = _ENTRY_BY_ID["cog_internal_overview_uint16"]
    with pytest.raises(AssertionError, match=r"pixels? .* differ"):
        _assert_semantic_equal(original, tampered, manifest_entry)
374+
375+
376+
def test_semantic_equal_rejects_lossy_mean_drift(tmp_path) -> None:
    """``_assert_semantic_equal`` must reject a lossy fixture whose copy
    carries a large constant offset. Without the per-band mean check
    the JPEG path would accept any content, since pixel equality is
    skipped for lossy cells.
    """
    original = FIXTURES_DIR / "compression_jpeg_uint8_ycbcr.tif"
    if not original.exists():
        pytest.skip("jpeg fixture not committed; cannot exercise drift path")

    with rasterio.open(original) as source:
        profile = source.profile
        pixels = source.read()

    tampered = tmp_path / "jpeg_doctored_2299.tif"
    # Pick an offset well beyond the mean tolerance, widen to int32 so
    # the addition cannot wrap, then clamp back into the 0-255 range.
    offset = int(_LOSSY_PIXEL_MEAN_TOL * 4) + 1
    widened = pixels.astype(np.int32) + offset
    shifted = np.clip(widened, 0, 255).astype(pixels.dtype)
    # Re-encode through the source profile so the doctored file keeps
    # the same on-disk configuration as the fixture.
    with rasterio.open(tampered, "w", **profile) as sink:
        sink.write(shifted)

    manifest_entry = _ENTRY_BY_ID["compression_jpeg_uint8_ycbcr"]
    with pytest.raises(AssertionError, match="per-band mean drift"):
        _assert_semantic_equal(original, tampered, manifest_entry)
399+
400+
260401
def test_no_orphan_fixtures_on_disk() -> None:
261402
"""Every committed ``.tif`` (and ``.tif.ovr`` sidecar) corresponds
262403
to a manifest entry. Catches stale fixtures left behind after a

0 commit comments

Comments
 (0)