@@ -102,6 +102,15 @@ def _nodata_equal(a, b) -> bool:
102102 return af == bf
103103
104104
# Lossy cells (today: JPEG-YCbCr) cannot be compared bit-exactly across
# libjpeg versions. A per-band mean that drifts by more than a few
# intensity units, however, points at a real generator regression (wrong
# input array, band order swapped before encode -- as long as the bands'
# means actually differ) rather than codec noise. 4.0 on a 0-255 scale
# leaves generous headroom for libjpeg/YCbCr churn while still catching
# the kind of bug the rest of the test is meant to flag.
_LOSSY_PIXEL_MEAN_TOL = 4.0
112+
113+
105114def _assert_semantic_equal (
106115 committed : pathlib .Path ,
107116 regenerated : pathlib .Path ,
@@ -111,11 +120,14 @@ def _assert_semantic_equal(
111120
112121 Used for ``cog`` and ``jpeg`` fixtures where the on-disk encoding
113122 is toolchain-coupled but the readable content is stable.
114- Lossy cells (``tolerance.lossy: true`` in the manifest, today the
115- JPEG-YCbCr entry) skip pixel equality and check only shape, dtype,
116- georeferencing, and nodata.
123+ Lossless cells assert bit-exact pixels at the base IFD and at
124+ every overview level the file declares. Lossy cells
125+ (``tolerance.lossy: true`` in the manifest, today the JPEG-YCbCr
126+ entry) drop to a coarse per-band mean tolerance instead of a
127+ bit-exact compare.
117128 """
118129 lossy = bool (entry .get ("tolerance" , {}).get ("lossy" , False ))
130+ fid = entry ["id" ]
119131 with rasterio .open (committed ) as ref , rasterio .open (regenerated ) as cand :
120132 assert ref .count == cand .count , (
121133 f"band count differs: committed={ ref .count } , "
@@ -140,19 +152,85 @@ def _assert_semantic_equal(
140152 f"nodata differs: committed={ ref .nodata !r} , "
141153 f"regenerated={ cand .nodata !r} "
142154 )
143- assert ref .overviews (1 ) == cand .overviews (1 ), (
155+ ref_overviews = ref .overviews (1 )
156+ assert ref_overviews == cand .overviews (1 ), (
144157 f"overview decimation factors differ: "
145- f"committed={ ref .overviews (1 )} , regenerated={ cand .overviews (1 )} "
146- )
147- if lossy :
148- return
149- ref_pixels = ref .read ()
150- cand_pixels = cand .read ()
151- assert np .array_equal (ref_pixels , cand_pixels , equal_nan = True ), (
152- f"pixel arrays differ for { entry ['id' ]!r} ; the generator output "
153- f"no longer round-trips to the committed fixture's pixels"
158+ f"committed={ ref_overviews } , regenerated={ cand .overviews (1 )} "
154159 )
155160
161+ if lossy :
162+ _assert_pixels_close_lossy (committed , regenerated , fid )
163+ return
164+ _assert_pixels_exact (committed , regenerated , fid )
165+ # Overview pixels are part of the determinism contract for fixtures
166+ # that ship them (the COG cell today). rasterio's OVERVIEW_LEVEL
167+ # is 0-indexed against the overview chain, hence range(len(...)).
168+ for level in range (len (ref_overviews )):
169+ _assert_overview_pixels_exact (committed , regenerated , level , fid )
170+
171+
def _read_all(path: pathlib.Path, *, overview_level: int | None = None) -> np.ndarray:
    """Return ``src.read()`` for one IFD of the raster at ``path``.

    With ``overview_level=None`` the file is opened without extra open
    options, which reads the base IFD. Any other value is forwarded to
    ``rasterio.open`` as the ``OVERVIEW_LEVEL`` open option.
    """
    # An empty options dict makes the call identical to a plain
    # ``rasterio.open(path)``.
    open_options = {} if overview_level is None else {"OVERVIEW_LEVEL": overview_level}
    with rasterio.open(path, **open_options) as dataset:
        return dataset.read()
181+
182+
def _assert_pixels_exact(
    committed: pathlib.Path, regenerated: pathlib.Path, fid: str,
) -> None:
    """Assert that the base-IFD pixels of both files are bit-identical.

    Both rasters are read in full through ``_read_all``. NaNs in matching
    positions compare equal. ``fid`` only labels the failure message.
    """
    expected, actual = (_read_all(p) for p in (committed, regenerated))
    pixels_match = np.array_equal(expected, actual, equal_nan=True)
    assert pixels_match, (
        f"pixel arrays differ for {fid!r}; the generator output "
        f"no longer round-trips to the committed fixture's pixels"
    )
192+
193+
def _assert_overview_pixels_exact(
    committed: pathlib.Path,
    regenerated: pathlib.Path,
    overview_level: int,
    fid: str,
) -> None:
    """Assert that one overview level is bit-identical in both files.

    ``overview_level`` is handed unchanged to ``_read_all`` for each file.
    NaNs in matching positions compare equal. ``fid`` only labels the
    failure message.
    """
    expected, actual = (
        _read_all(p, overview_level=overview_level)
        for p in (committed, regenerated)
    )
    pixels_match = np.array_equal(expected, actual, equal_nan=True)
    assert pixels_match, (
        f"overview level {overview_level} pixels differ for {fid!r}; "
        f"the generator's overview pyramid no longer matches the "
        f"committed fixture"
    )
207+
208+
def _assert_pixels_close_lossy(
    committed: pathlib.Path, regenerated: pathlib.Path, fid: str,
) -> None:
    """Check that per-band pixel means of two lossy (JPEG) files agree.

    A bit-exact comparison would tie the test to one libjpeg build, so
    the check only requires each band's mean to stay within
    ``_LOSSY_PIXEL_MEAN_TOL`` of the committed fixture's mean. That is
    loose enough for ordinary codec drift and tight enough to flag a
    gross content regression such as a swapped input array.
    """
    # ``_read_all`` yields a 3-D array with bands on axis 0 (rasterio's
    # read() layout), so reducing over axes 1 and 2 leaves one mean per
    # band. Promote to float64 first so the reduction runs in double
    # precision.
    ref_means, cand_means = (
        _read_all(p).astype(np.float64).mean(axis=(1, 2))
        for p in (committed, regenerated)
    )
    diff = np.abs(ref_means - cand_means)
    within_tolerance = (diff <= _LOSSY_PIXEL_MEAN_TOL).all()
    assert within_tolerance, (
        f"per-band mean drift exceeds {_LOSSY_PIXEL_MEAN_TOL} for {fid!r}: "
        f"committed_means={ref_means.tolist()}, "
        f"regenerated_means={cand_means.tolist()}, "
        f"abs_diff={diff.tolist()}"
    )
233+
156234
157235def _load_entries () -> list [dict ]:
158236 """Return validated manifest entries (defaults merged), sorted by id."""
@@ -175,7 +253,7 @@ def regenerated_dir(tmp_path_factory: pytest.TempPathFactory) -> pathlib.Path:
175253 Module-scoped so the (few-second) write cost is paid once per
176254 test session rather than per parametrised case.
177255 """
178- out = tmp_path_factory .mktemp ("regen_corpus_1930 " )
256+ out = tmp_path_factory .mktemp ("regen_corpus_determinism " )
179257 generate .generate (output_dir = out )
180258 return out
181259
@@ -257,6 +335,69 @@ def test_external_overview_sidecar_is_deterministic(
257335 )
258336
259337
def _write_doctored_copy(
    src: pathlib.Path, dst: pathlib.Path, *, delta: int = 1
) -> None:
    """Copy ``src`` to ``dst`` with pixel ``[0, 0, 0]`` shifted by ``delta``.

    Used by the negative-path tests: the resulting file reuses the
    source's rasterio profile and overview decimation factors but
    differs in pixel content, so a working semantic check must reject it.

    Args:
        src: Raster to copy.
        dst: Destination path for the doctored raster.
        delta: Amount added to the first pixel of the first band. Integer
            rasters wrap around inside the dtype's range; floating-point
            rasters add ``delta`` directly (a non-finite pixel is
            replaced by ``delta``).

    Raises:
        ValueError: If the shift leaves the pixel unchanged (for example
            ``delta=0``), since the copy would then not be doctored.
    """
    with rasterio.open(src) as reader:
        profile = reader.profile
        data = reader.read()  # freshly read array; safe to mutate in place
        overview_factors = reader.overviews(1)

    original = data[0, 0, 0]
    if np.issubdtype(data.dtype, np.integer):
        # Wrap modulo the dtype's full range so the assignment below can
        # never overflow. For unsigned dtypes this equals masking with
        # ``iinfo.max``; for signed dtypes it stays within [min, max].
        info = np.iinfo(data.dtype)
        low = int(info.min)
        span = int(info.max) - low + 1
        doctored = (int(original) - low + delta) % span + low
    elif np.isfinite(original):
        # ``np.iinfo`` rejects floating dtypes, so shift the value directly.
        doctored = original + delta
    else:
        # NaN/inf absorb any addition; substitute a finite value instead.
        doctored = float(delta)
    data[0, 0, 0] = doctored

    if data[0, 0, 0] == original:
        raise ValueError(
            f"delta={delta!r} leaves pixel [0, 0, 0] of {src} unchanged; "
            f"the copy would not be doctored"
        )

    with rasterio.open(dst, "w", **profile) as writer:
        writer.write(data)
        if overview_factors:
            # Match the source's overview chain so the decimation check
            # passes and the comparison falls through to pixel reads.
            writer.build_overviews(overview_factors)
359+
360+
def test_semantic_equal_rejects_lossless_pixel_drift(tmp_path) -> None:
    """``_assert_semantic_equal`` must reject a lossless fixture whose
    copy differs by a single pixel.

    Guards the drift-detection path: if the exact-pixel comparison ever
    stops running, this test fails.
    """
    fixture_id = "cog_internal_overview_uint16"
    src = FIXTURES_DIR / f"{fixture_id}.tif"
    if not src.exists():
        pytest.skip("cog fixture not committed; cannot exercise drift path")

    doctored = tmp_path / "cog_doctored_2299.tif"
    _write_doctored_copy(src, doctored)

    entry = _ENTRY_BY_ID[fixture_id]
    with pytest.raises(AssertionError, match=r"pixels? .* differ"):
        _assert_semantic_equal(src, doctored, entry)
374+
375+
def test_semantic_equal_rejects_lossy_mean_drift(tmp_path) -> None:
    """``_assert_semantic_equal`` must reject a lossy fixture whose copy
    carries a large constant brightness offset.

    Without the per-band mean check the JPEG path would accept any pixel
    content, because exact pixel equality is not asserted for lossy cells.
    """
    fixture_id = "compression_jpeg_uint8_ycbcr"
    src = FIXTURES_DIR / f"{fixture_id}.tif"
    if not src.exists():
        pytest.skip("jpeg fixture not committed; cannot exercise drift path")

    with rasterio.open(src) as reader:
        write_profile = reader.profile
        pixels = reader.read()

    # Shift every sample well past the mean tolerance. Widen to int32 so
    # the addition cannot wrap, then clamp back into the 0-255 range.
    offset = int(_LOSSY_PIXEL_MEAN_TOL * 4) + 1
    widened = pixels.astype(np.int32) + offset
    shifted = np.clip(widened, 0, 255).astype(pixels.dtype)

    # Re-encode with the source's own profile so the doctored file keeps
    # the same encoding settings as the fixture.
    doctored = tmp_path / "jpeg_doctored_2299.tif"
    with rasterio.open(doctored, "w", **write_profile) as writer:
        writer.write(shifted)

    entry = _ENTRY_BY_ID[fixture_id]
    with pytest.raises(AssertionError, match="per-band mean drift"):
        _assert_semantic_equal(src, doctored, entry)
399+
400+
260401def test_no_orphan_fixtures_on_disk () -> None :
261402 """Every committed ``.tif`` (and ``.tif.ovr`` sidecar) corresponds
262403 to a manifest entry. Catches stale fixtures left behind after a
0 commit comments