diff --git a/docs/source/reference/geotiff.rst b/docs/source/reference/geotiff.rst index d6511587a..2a3593814 100644 --- a/docs/source/reference/geotiff.rst +++ b/docs/source/reference/geotiff.rst @@ -169,12 +169,12 @@ this section is the brief. raw sentinel. The signal is part of the canonical attrs contract; ``xrspatial/geotiff/tests/test_masked_nodata_attr_2092.py`` pins the canonical form and - ``xrspatial/geotiff/tests/test_vrt_masked_nodata_attr_2159.py`` + ``xrspatial/geotiff/tests/vrt/test_metadata.py`` covers the VRT mosaic case. * Mixed-band nodata. A VRT whose sources declare disagreeing per-band nodata sentinels raises ``MixedBandMetadataError`` by default. Pass ``band_nodata='first'`` to opt back into the legacy flatten-to-band-0 - behaviour; see ``xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py``. + behaviour; see ``xrspatial/geotiff/tests/vrt/test_metadata.py``. The lifecycle is locked end-to-end by ``xrspatial/geotiff/tests/test_nodata_lifecycle_attrs_2135.py`` and @@ -533,7 +533,7 @@ regression test that locks the behaviour. ``xrspatial/geotiff/tests/test_vrt_capability_validator_2371.py`` * - Mixed per-band nodata across VRT sources (default ``band_nodata=None``) - - ``xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py``, + - ``xrspatial/geotiff/tests/vrt/test_metadata.py``, ``xrspatial/geotiff/tests/test_unsupported_features_2349.py`` (``test_mixed_per_source_nodata_rejected``) * - Rotated read without ``allow_rotated=True`` diff --git a/docs/source/reference/release_gate_geotiff.rst b/docs/source/reference/release_gate_geotiff.rst index b616af390..fc5fe2a0b 100644 --- a/docs/source/reference/release_gate_geotiff.rst +++ b/docs/source/reference/release_gate_geotiff.rst @@ -389,7 +389,7 @@ Nodata lifecycle - ``masked_nodata`` records whether the read produced NaN-masked output distinct from the on-disk sentinel; mixed-band VRT inputs honour the split. - - ``xrspatial/geotiff/tests/test_vrt_masked_nodata_attr_2159.py``, + - ``xrspatial/geotiff/tests/vrt/test_metadata.py``, ``xrspatial/geotiff/tests/test_mask_nodata_gpu_vrt_2052.py`` - `#2341`_ * - Mixed-band metadata reject @@ -403,9 +403,8 @@ Nodata lifecycle - stable - VRT sources with conflicting per-band nodata raise rather than silently flatten. - - ``xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py``, - ``xrspatial/geotiff/tests/test_vrt_int_nodata_1564.py``, - ``xrspatial/geotiff/tests/test_vrt_multiband_int_nodata_1611.py`` + - ``xrspatial/geotiff/tests/vrt/test_metadata.py``, + ``xrspatial/geotiff/tests/vrt/test_dtype_conversion.py`` - `#2342`_ attrs contract @@ -506,7 +505,7 @@ VRT supported subset - advanced - Holes surface as the band sentinel, ``attrs['vrt_holes']`` is set, and a :class:`GeoTIFFFallbackWarning` is emitted. - - ``xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py``, + - ``xrspatial/geotiff/tests/vrt/test_metadata.py``, ``xrspatial/geotiff/tests/vrt/test_missing_sources.py``, ``xrspatial/geotiff/tests/test_vrt_chunked_missing_sources_1799.py`` - `#2342`_ @@ -514,8 +513,7 @@ VRT supported subset - stable - Out-of-bounds source or destination rectangles raise at construction. - ``xrspatial/geotiff/tests/test_geotiff_vrt_srcrect_validation_1784.py``, - ``xrspatial/geotiff/tests/test_vrt_scaled_rects_1694.py``, - ``xrspatial/geotiff/tests/test_vrt_dstrect_resample_cap_1737.py`` + ``xrspatial/geotiff/tests/vrt/test_window.py`` - `#2342`_ * - VRT path containment - stable @@ -528,30 +526,27 @@ VRT supported subset - Unsupported resampling identifiers are rejected; supported ones (``nearest``, ``bilinear``, ``cubic``) round-trip pixels through eager and dask. - - ``xrspatial/geotiff/tests/test_vrt_resample_alg_1751.py``, - ``xrspatial/geotiff/tests/test_vrt_resample_window_inverse_1704.py`` + - ``xrspatial/geotiff/tests/vrt/test_dtype_conversion.py``, + ``xrspatial/geotiff/tests/vrt/test_window.py`` - `#2342`_ * - VRT dtype / band layout consistency - stable - Mixed dtype, mixed band count, or mismatched 12-bit-vs-16-bit sources raise rather than coerce. - - ``xrspatial/geotiff/tests/test_vrt_dtype_1783.py``, - ``xrspatial/geotiff/tests/test_vrt_dtype_12bit_1914.py``, - ``xrspatial/geotiff/tests/test_vrt_multiband_dtype_1696.py`` + - ``xrspatial/geotiff/tests/vrt/test_dtype_conversion.py`` - `#2342`_ * - VRT lazy / chunked read parity - advanced - Chunked VRT reads return the same shape, coords, attrs, and values as eager reads on the supported subset. - - ``xrspatial/geotiff/tests/test_vrt_lazy_chunks_1814.py``, - ``xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py``, - ``xrspatial/geotiff/tests/test_vrt_chunked_shared_dataset_1923.py`` + - ``xrspatial/geotiff/tests/vrt/test_window.py``, + ``xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py`` - `#2342`_ * - VRT single-parse contract - stable - VRT XML is parsed once per read; chunked callers do not re-parse per-chunk. - - ``xrspatial/geotiff/tests/test_vrt_single_parse_1825.py`` + - ``xrspatial/geotiff/tests/vrt/test_metadata.py`` - `#2321`_ * - VRT narrow exception surface - stable diff --git a/xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py b/xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py deleted file mode 100644 index 29a743f10..000000000 --- a/xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py +++ /dev/null @@ -1,147 +0,0 @@ -"""Regression tests for issue #1598. - -``read_vrt(path, band=N)`` used to always source the nodata sentinel -from ``vrt.bands[0]`` rather than the requested band, so a multi-band -VRT with per-band ```` would mis-mask reads of any band -other than band 0: - -* ``attrs['nodata']`` advertised band 0's sentinel (wrong). -* The integer-to-float64 promotion ran against band 0's sentinel, so - band N's actual nodata pixels stayed as literal integers. -* The returned dtype was integer when it should have been float64. - -The fix uses ``vrt.bands[band].nodata`` when a band is selected. -""" -from __future__ import annotations - -import numpy as np -import pytest - -from xrspatial.geotiff import read_vrt -from xrspatial.geotiff._writer import write - - -def _write_two_band_per_band_nodata_vrt(tmp_path): - """Two single-band uint16 sources, each with a distinct nodata - sentinel, exposed as bands 1 and 2 of a hand-rolled VRT. - """ - band0 = np.array([[1, 2], [3, 65535]], dtype=np.uint16) - band1 = np.array([[7, 8], [9, 65000]], dtype=np.uint16) - p0 = str(tmp_path / 'vrt_band0_1598.tif') - p1 = str(tmp_path / 'vrt_band1_1598.tif') - write(band0, p0, nodata=65535, compression='none', tiled=False) - write(band1, p1, nodata=65000, compression='none', tiled=False) - - vrt_path = str(tmp_path / 'two_band_per_band_nodata_1598.vrt') - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - 65535 - - {p0} - 1 - - - - - - 65000 - - {p1} - 1 - - - - -""" - with open(vrt_path, 'w') as f: - f.write(vrt_xml) - return vrt_path - - -def test_read_vrt_band0_uses_band0_nodata(tmp_path): - """Sanity check the band-0 selection still works after the fix. - - Confirms the refactor did not flip the index. - - The fixture mosaics two bands with distinct per-band sentinels, so - after #1987 PR 5 the default read raises ``MixedBandMetadataError``. - The pre-#1987 flatten-to-first-band semantics this regression tests - are still reachable via ``band_nodata='first'``; the opt-in surfaces - at the call site that the test is exercising the legacy behaviour. - """ - vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r = read_vrt(vrt_path, band=0, band_nodata='first') - assert r.dtype == np.float64 - assert r.attrs.get('nodata') == 65535.0 - assert np.isnan(r.values[1, 1]) - assert r.values[0, 0] == 1 - - -def test_read_vrt_band1_uses_band1_nodata(tmp_path): - """The previously-broken case: band=1 must use band 1's sentinel. - - Before the fix this returned dtype=uint16 with values=[[7,8], - [9,65000]] and attrs['nodata']=65535. - """ - vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r = read_vrt(vrt_path, band=1, band_nodata='first') - assert r.dtype == np.float64, ( - "band=1 read kept uint16 dtype; per-band nodata regression." - ) - assert r.attrs.get('nodata') == 65000.0, ( - f"attrs['nodata'] was {r.attrs.get('nodata')}, " - f"expected 65000 from band 1's ." - ) - assert np.isnan(r.values[1, 1]), ( - "band 1's sentinel pixel was not NaN-masked; " - "promotion ran against the wrong sentinel." - ) - assert r.values[0, 0] == 7 - assert r.values[1, 0] == 9 - - -def test_read_vrt_no_band_keeps_band0_nodata_attr(tmp_path): - """Unselected reads still surface band 0's sentinel. - - Multi-band VRTs with mixed sentinels return all bands stacked, and - the canonical attr cannot encode per-band values; advertising - band 0's sentinel matches the prior behavior and the documented - "first band wins" contract for multi-band reads. - """ - vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r = read_vrt(vrt_path, band_nodata='first') - assert r.attrs.get('nodata') == 65535.0 - - -def test_read_vrt_negative_band_raises(tmp_path): - """Negative band indices used to be silently accepted via Python - list indexing (``vrt.bands[-1]`` returned the last band) while the - public reader's nodata lookup rejected them, producing band-N data - with no nodata sentinel. They are now a clear ValueError up front. - """ - vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - with pytest.raises(ValueError, match="band"): - read_vrt(vrt_path, band=-1) - - -def test_read_vrt_out_of_range_band_raises(tmp_path): - """Out-of-range band indices used to raise IndexError from deep in - the read path. They are now a ValueError that names the available - band count. - """ - vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - with pytest.raises(ValueError, match="out of range"): - read_vrt(vrt_path, band=5, band_nodata='first') - - -def test_read_vrt_non_integer_band_raises(tmp_path): - """A non-int ``band`` would previously have raised TypeError on the - list index. ValueError here matches the rest of the input - validation surface. - """ - vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - with pytest.raises(ValueError, match="band"): - read_vrt(vrt_path, band="1") - with pytest.raises(ValueError, match="band"): - read_vrt(vrt_path, band=True) diff --git a/xrspatial/geotiff/tests/test_vrt_chunked_shared_dataset_1923.py b/xrspatial/geotiff/tests/test_vrt_chunked_shared_dataset_1923.py deleted file mode 100644 index 37427014d..000000000 --- a/xrspatial/geotiff/tests/test_vrt_chunked_shared_dataset_1923.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Regression test for issue #1923. - -``_read_vrt_chunked`` previously passed the parsed ``VRTDataset`` as a -plain kwarg to each per-chunk ``dask.delayed`` call, so dask embedded -the full source list (filenames, src/dst rects, per-source nodata) into -every task graph entry. With N sources and M chunks the graph held -``N * M`` copies of the same metadata; a 1000-source VRT split into -1000 chunks built a ~57 MB driver graph and reserialised that payload -1000 times under distributed/process schedulers. - -The fix wraps the dataset in ``dask.delayed(vrt, pure=True)`` once before -the per-chunk loop and passes that single shared reference into every -task, mirroring the ``http_meta_key`` pattern in ``_backends/dask.py`` -and the ``meta_key`` pattern in ``_backends/gpu.py``. - -These tests assert the shared-input behaviour structurally so a -regression would either flip the structural check or pump the graph -size back over the cap. -""" -from __future__ import annotations - -import os - -import numpy as np -import pytest -import xarray as xr - -from xrspatial.geotiff import read_vrt, to_geotiff - - -def _make_tile_vrt(tmp_path, n_tiles_per_side=4): - """Build a small multi-source VRT for testing the chunked path. - - Each source is a 64x64 tile written to a temp directory; the VRT - stitches them into a ``(64*N, 64*N)`` mosaic. ``N=4`` keeps the - fixture cheap while still producing a multi-source VRT whose - embedded metadata is measurable. - """ - tile_dir = os.path.join(tmp_path, "tiles") - os.makedirs(tile_dir, exist_ok=True) - tile_size = 64 - sources = [] - for r in range(n_tiles_per_side): - for c in range(n_tiles_per_side): - arr = np.full( - (tile_size, tile_size), - fill_value=r * n_tiles_per_side + c, - dtype=np.float32, - ) - ox = c * tile_size - oy = -(r * tile_size) - # GeoTransform tuple ordering matches rasterio: pixel_width - # first, then column rotation (forced to zero), then origin_x, - # then row rotation (forced to zero), then pixel_height, - # then origin_y. - da = xr.DataArray( - arr, - dims=["y", "x"], - attrs={"transform": (1.0, 0.0, ox, 0.0, -1.0, oy)}, - ) - path = os.path.join(tile_dir, f"tile_{r}_{c}.tif") - to_geotiff(da, path, compression="deflate", tiled=True, - tile_size=64) - sources.append((path, r, c, tile_size)) - - vrt_path = os.path.join(tmp_path, "mosaic.vrt") - width = n_tiles_per_side * tile_size - height = n_tiles_per_side * tile_size - lines = [ - f'', - '', - '0.0, 1.0, 0.0, 0.0, 0.0, -1.0', - '', - ] - for (path, r, c, ts) in sources: - lines.extend([ - "", - f'{path}', - "1", - f'', - f'', - "", - ]) - lines.extend(["", ""]) - with open(vrt_path, "w") as f: - f.write("\n".join(lines)) - return vrt_path, len(sources) - - -def test_vrt_chunked_dataset_is_shared_graph_input(tmp_path): - """Issue #1923: parsed VRTDataset is wrapped as a single Delayed. - - Walks each ``_vrt_chunk_read`` task's kwargs dict in the dask graph - and verifies that ``parsed_vrt`` is NOT an inline ``VRTDataset`` - instance (the pre-fix shape). With the fix, ``parsed_vrt`` is - routed through ``dask.delayed(vrt, pure=True)`` so each task's - ``kwargs['parsed_vrt']`` is a graph reference (an ``Alias`` / - ``TaskRef``-style placeholder pointing into a single shared - ``from-value`` layer) rather than a literal embedded dataset. - - Under the synchronous / threaded scheduler tasks are not pickled - at all, so an embedded copy is harmless in that path. The bug - surfaces under the distributed / multi-process scheduler where - each task is serialised independently and the full dataset is - shipped once per task -- so the structural shape of the graph, - not in-process behaviour, is what matters. - """ - from xrspatial.geotiff._vrt import VRTDataset - - vrt_path, n_sources = _make_tile_vrt(str(tmp_path), n_tiles_per_side=4) - - # Use small chunks so the grid produces multiple tasks. 4 tiles per - # side at 64 px each is 256x256 image; chunks=32 gives 64 chunks. - result = read_vrt(vrt_path, chunks=32) - graph = result.__dask_graph__() - - assert n_sources == 16, "fixture build sanity check" - - chunk_task_count = 0 - embedded_vrt_count = 0 - for layer_name, layer in graph.layers.items(): - if "_vrt_chunk_read" not in layer_name: - continue - for _key, task in layer.items(): - # New-style dask Task: kwargs is a dict attribute. - kwargs = getattr(task, "kwargs", None) - if kwargs is None: - continue - parsed_vrt = kwargs.get("parsed_vrt") - if parsed_vrt is None: - continue - chunk_task_count += 1 - if isinstance(parsed_vrt, VRTDataset): - embedded_vrt_count += 1 - - assert chunk_task_count > 1, ( - f"fixture sanity: expected multiple chunk tasks, got " - f"{chunk_task_count}" - ) - assert embedded_vrt_count == 0, ( - f"#1923 regression: {embedded_vrt_count} of {chunk_task_count} " - f"_vrt_chunk_read tasks still embed an inline VRTDataset in " - f"kwargs['parsed_vrt']. The fix wraps the dataset in " - f"dask.delayed(vrt, pure=True) so kwargs['parsed_vrt'] should " - f"be a TaskRef-style graph reference, not a VRTDataset." - ) - - -def test_vrt_chunked_decode_unchanged_after_shared_wrap(tmp_path): - """The shared-Delayed wrap must not change decoded pixel values.""" - vrt_path, _ = _make_tile_vrt(str(tmp_path), n_tiles_per_side=3) - - # Eager read produces the baseline. - eager = read_vrt(vrt_path) - chunked = read_vrt(vrt_path, chunks=32).compute() - - np.testing.assert_array_equal(np.asarray(eager), np.asarray(chunked)) - - -def test_vrt_chunked_band_kwarg_still_validates(tmp_path): - """Wrapping the dataset must not change band validation behaviour.""" - vrt_path, _ = _make_tile_vrt(str(tmp_path), n_tiles_per_side=2) - - with pytest.raises(ValueError): - # The fixture is single-band; band=5 should still raise the - # same ValueError the chunked path raised pre-fix. - read_vrt(vrt_path, chunks=32, band=5) diff --git a/xrspatial/geotiff/tests/test_vrt_dstrect_resample_cap_1737.py b/xrspatial/geotiff/tests/test_vrt_dstrect_resample_cap_1737.py deleted file mode 100644 index 6bf396c6d..000000000 --- a/xrspatial/geotiff/tests/test_vrt_dstrect_resample_cap_1737.py +++ /dev/null @@ -1,157 +0,0 @@ -"""VRT ``DstRect`` xSize/ySize must not drive unbounded resample intermediates. - -A crafted VRT can declare a ```` whose ``xSize`` and -``ySize`` are orders of magnitude larger than the VRT's own -``rasterXSize`` / ``rasterYSize``. Originally (issue #1737) ``read_vrt`` -called ``_resample_nearest(src_arr, dr.y_size, dr.x_size)`` *before* clipping, -allocating the full DstRect-sized intermediate before discarding most of it, -so the read was refused with a ``ValueError`` naming the offending size. - -After issue #1704 the resample path reads only the source subset that feeds -the clipped destination sub-window, so the intermediate is bounded by the -caller's window (and by the VRT extent) rather than the raw DstRect. The -huge-DstRect attack vector is therefore neutralised by the read path itself, -not by the per-source pixel-budget guard. The per-source guard still applies -to the clipped sub-window, which is now what the cap is measured against. -""" -from __future__ import annotations - -import os -import tempfile - -import numpy as np -import pytest - -from xrspatial.geotiff import to_geotiff -from xrspatial.geotiff._vrt import read_vrt - - -def _write_source(td: str) -> str: - """Write a 10x10 uint8 source GeoTIFF and return its path. - - Stripped (non-tiled) so the source read does not allocate a 256x256 - tile that trips ``_check_dimensions`` under the small ``max_pixels`` - values these tests pass. - """ - src_path = os.path.join(td, 'src.tif') - to_geotiff(np.zeros((10, 10), dtype=np.uint8), src_path, - compression='none', tiled=False) - return src_path - - -def _write_vrt(td: str, *, dst_x_size: int, dst_y_size: int, - raster_x: int = 100, raster_y: int = 100) -> str: - """Write a VRT with a single SimpleSource using the given DstRect size.""" - vrt_path = os.path.join(td, 'mosaic.vrt') - vrt_xml = ( - f'\n' - f' \n' - f' \n' - f' src.tif\n' - f' 1\n' - f' \n' - f' \n' - f' \n' - f' \n' - f'\n' - ) - with open(vrt_path, 'w') as f: - f.write(vrt_xml) - return vrt_path - - -def test_huge_dstrect_no_longer_allocates_full_intermediate(): - """After #1704 the windowed read clips a 50000x50000 DstRect down to - the 100x100 VRT extent, so the resample intermediate is 100x100 and - no longer hits the pixel-budget cap. The earlier behaviour rejected - the read up front; the new behaviour just returns the assembled - 100x100 mosaic. - """ - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - _write_source(td) - vrt_path = _write_vrt(td, dst_x_size=50000, dst_y_size=50000) - arr, _ = read_vrt(vrt_path) - assert arr.shape == (100, 100) - - -def test_huge_dstrect_y_axis_clipped_to_extent(): - """Asymmetric blow-up: ``ySize`` declared as 10 billion but the VRT - extent caps the clipped sub-window at 100 rows. Read succeeds with - the bounded intermediate.""" - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - _write_source(td) - vrt_path = _write_vrt( - td, dst_x_size=10, dst_y_size=10_000_000_000) - arr, _ = read_vrt(vrt_path) - assert arr.shape == (100, 100) - - -def test_legitimate_upsample_still_works(): - """A legitimate upsample stays under the cap and must succeed.""" - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - _write_source(td) - # 100 x 100 destination, matches the VRT extent. - vrt_path = _write_vrt(td, dst_x_size=100, dst_y_size=100) - arr, _ = read_vrt(vrt_path) - assert arr.shape == (100, 100) - - -def test_per_source_cap_bites_when_sub_window_exceeds_budget(): - """The per-source pixel-budget guard applies to the clipped - sub-window, not the raw DstRect. Pick a VRT and ``max_pixels`` where - the sub-window itself exceeds the cap so the per-source check fires - even after the windowed-read change. - - The output buffer dimension check (``_check_dimensions``) is also - bounded by ``max_pixels``, so to isolate the per-source branch we - request a window whose sub-window product crosses the cap. Both - guards use the same threshold; the per-source one provides defence - in depth. - """ - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - _write_source(td) - vrt_path = _write_vrt(td, dst_x_size=2000, dst_y_size=2000, - raster_x=2000, raster_y=2000) - # Sub-window is 2000x2000 = 4e6 pixels. Cap of 1e6 rejects. - with pytest.raises(ValueError, match="resample intermediate|safety limit"): - read_vrt(vrt_path, max_pixels=1_000_000) - # Bump the cap above 4e6: accepted. - arr, _ = read_vrt(vrt_path, max_pixels=4_000_000) - assert arr.shape == (2000, 2000) - - -def test_per_source_cap_inclusive_boundary(): - """The per-source cap is inclusive: exactly ``max_pixels`` succeeds, - one below rejects. Mirrors the boundary the original #1737 test - pinned down, on the new sub-window semantics.""" - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - _write_source(td) - vrt_path = _write_vrt(td, dst_x_size=100, dst_y_size=100, - raster_x=100, raster_y=100) - # Sub-window is 100x100 = 10_000 pixels. - with pytest.raises(ValueError, match="resample intermediate|safety limit"): - read_vrt(vrt_path, max_pixels=9_999) - arr, _ = read_vrt(vrt_path, max_pixels=10000) - assert arr.shape == (100, 100) - - -def test_negative_dstrect_rejected(): - """Negative ``xSize`` / ``ySize`` must surface as ``ValueError`` - rather than be silently skipped by the overlap check. The error - message must call out the malformed negative size, not the pixel - budget.""" - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - _write_source(td) - vrt_path = _write_vrt(td, dst_x_size=-5, dst_y_size=100) - with pytest.raises(ValueError, match="negative size"): - read_vrt(vrt_path) - - -def test_negative_dstrect_y_size_rejected(): - """Negative ``ySize`` is also rejected with the same tailored error.""" - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - _write_source(td) - vrt_path = _write_vrt(td, dst_x_size=100, dst_y_size=-5) - with pytest.raises(ValueError, match="negative size"): - read_vrt(vrt_path) diff --git a/xrspatial/geotiff/tests/test_vrt_dtype_12bit_1914.py b/xrspatial/geotiff/tests/test_vrt_dtype_12bit_1914.py deleted file mode 100644 index 982c79c74..000000000 --- a/xrspatial/geotiff/tests/test_vrt_dtype_12bit_1914.py +++ /dev/null @@ -1,172 +0,0 @@ -"""Regression tests for issue #1914. - -``write_vrt`` used to build its GDAL ``dataType`` attribute from a local -if/elif/else ladder keyed on ``sample_format`` and ``bps`` rather than -going through the central ``tiff_dtype_to_numpy`` resolver. For -``bps=12, sample_format=1`` that ladder fell through to ``'Byte'``, -even though the reader promotes the same sample to ``uint16``. A VRT -over a valid 12-bit unsigned source would then advertise a narrower -type and could be truncated by downstream GDAL/VRT consumers. - -These tests pin the VRT dtype name for every TIFF (bps, sample_format) -the resolver supports, including the 12-bit regression case. The -helper ``_vrt_dtype_name_for`` is called directly because the on-disk -writers in ``to_geotiff`` only emit standard 8/16/32/64-bit samples, -so a real 12-bit TIFF can't be round-tripped through ``write_vrt`` in -the test environment. - -A second block exercises ``write_vrt`` end-to-end with normal 16-bit -unsigned tiles and asserts the generated XML carries ``UInt16`` (not -``Byte``) so the actual writer path is covered too. -""" -from __future__ import annotations - -import os -import uuid - -import numpy as np -import pytest -import xarray as xr - -from xrspatial.geotiff import to_geotiff -from xrspatial.geotiff._vrt import _NP_TO_VRT_DTYPE, _vrt_dtype_name_for, write_vrt - -# --------------------------------------------------------------------------- -# Direct helper tests: every supported (bps, sample_format) pair -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize( - "bps,sf,expected", - [ - # The regression case. ``_dtypes.tiff_dtype_to_numpy`` promotes - # ``bps=12, sf=1`` to ``uint16``; the VRT dtype must follow. - (12, 1, "UInt16"), - (12, 4, "UInt16"), # SAMPLE_FORMAT_UNDEFINED -> uint - # Sub-byte unsigned bit depths the reader promotes to uint8. - (1, 1, "Byte"), - (2, 1, "Byte"), - (4, 1, "Byte"), - # Standard unsigned bit depths. - (8, 1, "Byte"), - (16, 1, "UInt16"), - (32, 1, "UInt32"), - (64, 1, "UInt64"), - # Signed integers. - (8, 2, "Int8"), - (16, 2, "Int16"), - (32, 2, "Int32"), - (64, 2, "Int64"), - # Floats. The previous ladder defaulted ``sf=3`` to ``Float32`` - # for any non-64 bps, which silently downcast ``bps=16`` halfs. - # The resolver rejects bps=16 floats outright (no IEEE half - # support), so we only pin the supported widths here. - (32, 3, "Float32"), - (64, 3, "Float64"), - ], -) -def test_vrt_dtype_name_for_supported(bps, sf, expected): - assert _vrt_dtype_name_for(bps, sf) == expected - - -def test_vrt_dtype_name_for_sample_format_sequence_resolves(): - # ``ifd.sample_format`` is sometimes a tuple of per-band values; the - # helper must funnel it through ``resolve_sample_format`` rather - # than treating the raw tuple as an int. - assert _vrt_dtype_name_for(8, [1, 1]) == "Byte" - assert _vrt_dtype_name_for(16, (2, 2)) == "Int16" - - -def test_vrt_dtype_name_for_unsupported_raises(): - # bps=24 / sf=2 isn't in ``tiff_dtype_to_numpy``; should surface as - # a ValueError from the resolver rather than silently mapping to - # 'Byte' or 'Int32' the way the old ladder did. - with pytest.raises(ValueError): - _vrt_dtype_name_for(24, 2) - - -def test_np_to_vrt_dtype_table_covers_all_resolver_outputs(): - # Defensive: every numpy dtype that ``tiff_dtype_to_numpy`` can - # produce must have an entry in ``_NP_TO_VRT_DTYPE`` or else - # ``_vrt_dtype_name_for`` will raise the catch-all ValueError in - # normal use. This catches future additions to the resolver that - # forget to wire up a GDAL name. - from xrspatial.geotiff._dtypes import tiff_dtype_to_numpy - - pairs = [ - (8, 1), (8, 2), - (16, 1), (16, 2), - (32, 1), (32, 2), (32, 3), - (64, 1), (64, 2), (64, 3), - (1, 1), (2, 1), (4, 1), (12, 1), - ] - for bps, sf in pairs: - np_dtype = tiff_dtype_to_numpy(bps, sf) - assert np_dtype.type in _NP_TO_VRT_DTYPE, ( - f"resolver yields {np_dtype} for bps={bps}, sf={sf} but " - f"_NP_TO_VRT_DTYPE has no entry for it" - ) - - -# --------------------------------------------------------------------------- -# End-to-end write_vrt sanity: uint16 source produces UInt16 in the VRT -# --------------------------------------------------------------------------- - - -def _unique_dir(tmp_path, label: str) -> str: - d = tmp_path / f"vrt_1914_{label}_{uuid.uuid4().hex[:8]}" - d.mkdir() - return str(d) - - -def _write_uint16_tif(path: str, *, h: int = 4, w: int = 4, - origin_x: float = 0.0) -> None: - arr = np.arange(h * w, dtype=np.uint16).reshape(h, w) - y = 100.0 + (np.arange(h) + 0.5) * -1.0 - x = origin_x + (np.arange(w) + 0.5) * 1.0 - da = xr.DataArray( - arr, dims=['y', 'x'], - coords={'y': y, 'x': x}, - attrs={'crs': 4326}, - ) - to_geotiff(da, path, compression='none') - - -def test_uint16_source_writes_uint16_vrt_datatype(tmp_path): - # Pre-fix this would have produced ``dataType="UInt16"`` too, since - # bps=16/sf=1 happened to be in the old ladder. The point of this - # test is to assert the XML still says UInt16 after refactoring the - # dtype path through the central resolver -- i.e. we didn't - # accidentally regress the easy case while fixing the 12-bit one. - d = _unique_dir(tmp_path, "u16") - a = os.path.join(d, "a.tif") - b = os.path.join(d, "b.tif") - _write_uint16_tif(a) - _write_uint16_tif(b, origin_x=4.0) - vrt = os.path.join(d, "out.vrt") - write_vrt(vrt, [a, b]) - with open(vrt) as f: - xml = f.read() - assert 'dataType="UInt16"' in xml - assert 'dataType="Byte"' not in xml - - -def test_int16_source_writes_int16_vrt_datatype(tmp_path): - # The old ladder mapped sf=2, bps=16 to 'Int16' correctly; pin that - # behaviour so the new resolver path doesn't drift. - d = _unique_dir(tmp_path, "i16") - a = os.path.join(d, "a.tif") - arr = np.arange(16, dtype=np.int16).reshape(4, 4) - y = 100.0 + (np.arange(4) + 0.5) * -1.0 - x = (np.arange(4) + 0.5) * 1.0 - da = xr.DataArray( - arr, dims=['y', 'x'], - coords={'y': y, 'x': x}, - attrs={'crs': 4326}, - ) - to_geotiff(da, a, compression='none') - vrt = os.path.join(d, "out.vrt") - write_vrt(vrt, [a]) - with open(vrt) as f: - xml = f.read() - assert 'dataType="Int16"' in xml diff --git a/xrspatial/geotiff/tests/test_vrt_dtype_1783.py b/xrspatial/geotiff/tests/test_vrt_dtype_1783.py deleted file mode 100644 index 32285f626..000000000 --- a/xrspatial/geotiff/tests/test_vrt_dtype_1783.py +++ /dev/null @@ -1,517 +0,0 @@ -"""Regression tests for issue #1783. - -Before this fix, :func:`xrspatial.geotiff._vrt.parse_vrt` resolved a -```` attribute with -``_DTYPE_MAP.get(dtype_name, np.float32)``. Any GDAL dataType not -present in ``_DTYPE_MAP`` -- the four complex types (``CInt16``, -``CInt32``, ``CFloat32``, ``CFloat64``), the 64-bit integer types that -the map did not yet list (``UInt64``, ``Int64``), or a typo -- silently -collapsed to ``Float32``. Complex sources lost their imaginary -component, 64-bit integer sources lost precision, and typos produced -wrong values with no diagnostic. - -The fix: - -* Adds ``UInt64`` / ``Int64`` to ``_DTYPE_MAP``. -* Splits the resolution into "attribute missing" (still defaults to - Float32 per the GDAL spec) and "attribute present but unsupported" - (now raises ``ValueError`` naming the band, the offending dataType, - and the supported types). -""" -from __future__ import annotations - -import numpy as np -import pytest - -from xrspatial.geotiff import read_vrt -from xrspatial.geotiff._vrt import _parse_band_nodata, parse_vrt -from xrspatial.geotiff._writer import write - - -def _write(arr, path, **kw): - """Write a 2D array to ``path`` with sensible defaults for tests.""" - write(arr, str(path), compression='none', tiled=False, **kw) - - -def _build_single_band_vrt(tmp_path, *, dtype_attr, src_path, - filename='b.vrt', size=2, nodata=None): - """Hand-roll a single-band VRT with an arbitrary ``dataType`` attribute. - - ``dtype_attr`` is rendered verbatim into the ```` - element. Pass an empty string to omit the attribute entirely (the - "GDAL default" case). - - ``nodata`` (when not ``None``) is rendered verbatim into a - ```` child so callers can exercise sentinel-parsing - edge cases (scientific notation, ``nan``, full-range 64-bit - integers). - """ - if dtype_attr: - attr = f' dataType="{dtype_attr}"' - else: - attr = '' - nodata_elem = (f'{nodata}' - if nodata is not None else '') - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - {nodata_elem} - - {src_path} - 1 - - - - -""" - p = tmp_path / filename - p.write_text(vrt_xml) - return str(p) - - -# --------------------------------------------------------------------------- -# 1. Complex dataType is rejected (no silent imaginary-component loss) -# --------------------------------------------------------------------------- - -@pytest.mark.parametrize('cdtype', ['CInt16', 'CInt32', 'CFloat32', 'CFloat64']) -def test_complex_dtype_raises_value_error(tmp_path, cdtype): - """A VRT declaring any complex ``dataType`` must raise ``ValueError`` - rather than silently substituting ``Float32``. The error message - must name both the band number and the offending dataType so the - operator can fix the VRT, and must mention that complex types are - explicitly unsupported. - """ - b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr=cdtype, src_path=str(src), - ) - with pytest.raises(ValueError) as ei: - read_vrt(vrt) - msg = str(ei.value) - assert cdtype in msg, f"error message must name {cdtype!r}: {msg!r}" - assert 'band=1' in msg or 'band 1' in msg, ( - f"error message must name the band: {msg!r}" - ) - assert 'complex' in msg.lower(), ( - f"error message must mention complex types: {msg!r}" - ) - - -# --------------------------------------------------------------------------- -# 2. Typo / arbitrary garbage dataType is rejected -# --------------------------------------------------------------------------- - -def test_garbage_dtype_raises_value_error(tmp_path): - """An unrecognised non-complex ``dataType`` (e.g. a typo) must also - raise ``ValueError`` rather than collapsing silently to Float32. - """ - b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='Garbage', src_path=str(src), - ) - with pytest.raises(ValueError, match=r'Garbage'): - read_vrt(vrt) - - -def test_typo_for_supported_dtype_is_still_rejected(tmp_path): - """``Flaot32`` (typo of ``Float32``) is distinct from the empty / - missing case and must surface as ``ValueError`` instead of silently - falling back. - """ - b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='Flaot32', src_path=str(src), - ) - with pytest.raises(ValueError, match=r'Flaot32'): - read_vrt(vrt) - - -# --------------------------------------------------------------------------- -# 3. UInt64 / Int64 are now supported and round-trip losslessly -# --------------------------------------------------------------------------- - -def test_uint64_round_trip(tmp_path): - """A VRT declaring ``dataType="UInt64"`` whose source GeoTIFF is - written as uint64 must read back as uint64 with the exact values - preserved, including values past the float32 / int53 boundary. - """ - big = np.iinfo(np.uint64).max # 2**64 - 1 - near_big = big - 7 - b = np.array([[1, 2], [near_big, big]], dtype=np.uint64) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='UInt64', src_path=str(src), - ) - r = read_vrt(vrt) - assert r.dtype == np.uint64, ( - f"UInt64 VRT must read as uint64; got {r.dtype}" - ) - np.testing.assert_array_equal(r.values, b) - # Largest values must survive bit-for-bit, not collapse to float - assert int(r.values[1, 1]) == big - assert int(r.values[1, 0]) == near_big - - -def test_int64_round_trip(tmp_path): - """A VRT declaring ``dataType="Int64"`` must read back as int64 - with the full int64 range preserved (positive and negative - extremes). - """ - info = np.iinfo(np.int64) - b = np.array([[info.min, -1], [0, info.max]], dtype=np.int64) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='Int64', src_path=str(src), - ) - r = read_vrt(vrt) - assert r.dtype == np.int64, ( - f"Int64 VRT must read as int64; got {r.dtype}" - ) - np.testing.assert_array_equal(r.values, b) - - -# --------------------------------------------------------------------------- -# 4. Missing dataType attribute still defaults to Float32 (GDAL default) -# --------------------------------------------------------------------------- - -def test_missing_dtype_attribute_defaults_to_float32(tmp_path): - """```` with no ``dataType`` attribute must - still default to ``Float32``. This is GDAL's documented default - and the previous fallback handled it correctly; the new - "unknown-attribute raises" path must not regress the - "missing-attribute defaults" path. - """ - b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='', src_path=str(src), - ) - r = read_vrt(vrt) - assert r.dtype == np.float32, ( - f"missing dataType must default to Float32; got {r.dtype}" - ) - np.testing.assert_allclose(r.values, b) - - -# --------------------------------------------------------------------------- -# 5. Pre-existing supported dtypes still read correctly (smoke regression) -# --------------------------------------------------------------------------- - -def test_byte_dtype_still_works(tmp_path): - """``Byte`` reads back as uint8 with values preserved. Smoke check - to confirm the rewritten dtype resolution did not break the - common-case integer path. - """ - b = np.array([[10, 11], [12, 13]], dtype=np.uint8) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='Byte', src_path=str(src), - ) - r = read_vrt(vrt) - assert r.dtype == np.uint8 - np.testing.assert_array_equal(r.values, b) - - -def test_float64_dtype_still_works(tmp_path): - """``Float64`` reads back as float64 with values preserved. Smoke - check for the wider floating-point path. - """ - b = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float64) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='Float64', src_path=str(src), - ) - r = read_vrt(vrt) - assert r.dtype == np.float64 - np.testing.assert_allclose(r.values, b) - - -# --------------------------------------------------------------------------- -# 6. Integer ```` is parsed as ``int`` for integer bands -# -# Regression for the Copilot review note on the original #1783 PR: now -# that UInt64 / Int64 are supported, parsing the sentinel as ``float`` -# silently drops precision near the 64-bit extremes. ``2**64 - 1`` -# rounds up to ``2**64`` in float64, ``INT64_MIN`` survives but only -# barely, and downstream exact-equality masks break. -# --------------------------------------------------------------------------- - -def test_parse_band_nodata_uint64_max_exact(): - """``_parse_band_nodata`` must return the exact ``int`` for - ``2**64 - 1`` (UInt64 max), not a float64 that rounds up to - ``2**64``. - """ - big = 2**64 - 1 - nd = _parse_band_nodata(str(big), np.dtype(np.uint64)) - assert isinstance(nd, int), ( - f"UInt64 nodata must parse as int, got {type(nd).__name__}" - ) - assert nd == big - # float64 round-trip would equal 2**64, off by one - assert nd != int(float(big)) - - -def test_parse_band_nodata_int64_min_exact(): - """``INT64_MIN`` (``-2**63``) must survive parsing as an int.""" - info = np.iinfo(np.int64) - nd = _parse_band_nodata(str(info.min), np.dtype(np.int64)) - assert isinstance(nd, int) - assert nd == info.min - - -def test_parse_band_nodata_int32_negative(): - """Common GDAL sentinel ``-9999`` for an Int32 band parses as int.""" - nd = _parse_band_nodata('-9999', np.dtype(np.int32)) - assert isinstance(nd, int) - assert nd == -9999 - - -def test_parse_band_nodata_int_scientific_notation(): - """GDAL occasionally emits integer nodata in scientific or - ``-9999.0`` form. Parsing should still land an int when the - value is integer-valued and in-range. - """ - nd = _parse_band_nodata('-9999.0', np.dtype(np.int32)) - assert isinstance(nd, int) and nd == -9999 - nd = _parse_band_nodata('1e3', np.dtype(np.int32)) - assert isinstance(nd, int) and nd == 1000 - - -def test_parse_band_nodata_int_out_of_range_falls_back(): - """An out-of-range sentinel for the band dtype is returned as the - parsed float so it surfaces via ``attrs['nodata']`` for round-trip - but can never match an integer pixel (mirroring - ``_resolve_masked_fill``'s tolerant behaviour). - """ - # -9999 cannot be represented as uint16; should not raise - nd = _parse_band_nodata('-9999', np.dtype(np.uint16)) - # ``int('-9999')`` succeeds and out-of-range so we *do* return the - # int in the cheap path -- _sentinel_for_dtype downstream is then - # responsible for refusing to use it as a mask sentinel. - assert nd == -9999 - - -def test_parse_band_nodata_float_nan(): - """Float bands keep NaN sentinels working (no integer-parse - regression for the floating path). - """ - nd = _parse_band_nodata('nan', np.dtype(np.float32)) - assert isinstance(nd, float) - assert np.isnan(nd) - - -def test_parse_band_nodata_float_scientific(): - """Float bands preserve scientific-notation sentinels.""" - nd = _parse_band_nodata('-1.5e10', np.dtype(np.float64)) - assert isinstance(nd, float) - assert nd == -1.5e10 - - -def test_parse_band_nodata_empty_or_none(): - """Empty / whitespace / ``None`` input returns ``None`` regardless - of dtype. - """ - assert _parse_band_nodata(None, np.dtype(np.int32)) is None - assert _parse_band_nodata('', np.dtype(np.int32)) is None - assert _parse_band_nodata(' ', np.dtype(np.float32)) is None - - -# --------------------------------------------------------------------------- -# 7. End-to-end VRT parse: ``vrt.bands[i].nodata`` is an int for integer -# bands, a float for float bands. -# --------------------------------------------------------------------------- - -def _make_minimal_vrt_xml(dtype_attr, nodata_text): - """Tiny VRT XML string suitable for direct ``parse_vrt`` calls. - - The SourceFilename here is intentionally minimal -- ``parse_vrt`` - only does the containment check after canonicalising the path, so - we pass a path inside the temp dir at the call site. - """ - return ( - '' - '0.0, 1.0, 0.0, 0.0, 0.0, -1.0' - f'' - f'{nodata_text}' - '' - '' - ) - - -def test_parse_vrt_uint64_nodata_is_int(tmp_path): - """The dataclass stored on ``_VRTBand.nodata`` is a Python ``int`` - for an integer-dtype band, with the exact 64-bit value. - """ - big = 2**64 - 1 - xml = _make_minimal_vrt_xml('UInt64', str(big)) - vrt = parse_vrt(xml, vrt_dir=str(tmp_path)) - assert len(vrt.bands) == 1 - nd = vrt.bands[0].nodata - assert isinstance(nd, int) - assert nd == big - - -def test_parse_vrt_int64_min_nodata_is_int(tmp_path): - info = np.iinfo(np.int64) - xml = _make_minimal_vrt_xml('Int64', str(info.min)) - vrt = parse_vrt(xml, vrt_dir=str(tmp_path)) - nd = vrt.bands[0].nodata - assert isinstance(nd, int) - assert nd == info.min - - -def test_parse_vrt_float32_nan_nodata_is_float(tmp_path): - xml = _make_minimal_vrt_xml('Float32', 'nan') - vrt = parse_vrt(xml, vrt_dir=str(tmp_path)) - nd = vrt.bands[0].nodata - assert isinstance(nd, float) - assert np.isnan(nd) - - -# --------------------------------------------------------------------------- -# 8. Full read_vrt round-trip preserves precision and masks correctly. -# --------------------------------------------------------------------------- - -def test_uint64_nodata_round_trip_preserves_max_sentinel(tmp_path): - """A VRT declaring UInt64 + ``2**64 - 1`` - must surface ``attrs['nodata']`` as the exact integer value, not a - float that has rounded past the dtype's range. Downstream - consumers rely on exact equality. - """ - big = 2**64 - 1 - # Fill the source with non-sentinel values so the read keeps the - # uint64 dtype (a sentinel hit promotes to float64 + NaN, which - # would defeat the precision check in this test). - b = np.array([[1, 2], [3, 4]], dtype=np.uint64) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='UInt64', src_path=str(src), nodata=big, - ) - r = read_vrt(vrt) - # Either the array stays uint64 (no sentinel hit) or it promotes to - # float64 (sentinel hit). In the no-hit case attrs['nodata'] must - # carry the exact int. - assert 'nodata' in r.attrs - assert int(r.attrs['nodata']) == big - # Critically, the stored attr must not be a float64 that has rounded - # the sentinel up to 2**64. ``isinstance`` allows int or np.integer - # but rejects float / np.floating. - assert isinstance(r.attrs['nodata'], (int, np.integer)) - - -def test_uint64_nodata_masks_max_sentinel_in_data(tmp_path): - """When the source pixel actually contains ``2**64 - 1``, the - masking pipeline must catch it: the result is promoted to float64 - with NaN at the sentinel position. This is the precision- - preservation acid test -- if the nodata was rounded to a float - that doesn't equal the source pixel, the mask never fires and the - sentinel survives as a 1.8e19 float. - """ - big = 2**64 - 1 - b = np.array([[1, 2], [3, big]], dtype=np.uint64) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='UInt64', src_path=str(src), nodata=big, - ) - r = read_vrt(vrt) - # Sentinel hit -> promote to float64 with NaN - assert r.dtype == np.float64, ( - f"sentinel hit must promote to float64, got {r.dtype}" - ) - assert np.isnan(r.values[1, 1]), ( - f"the 2**64-1 cell must be masked to NaN; got {r.values[1, 1]!r}" - ) - # Non-sentinel cells survive as float64 values - assert r.values[0, 0] == 1.0 - assert r.values[0, 1] == 2.0 - assert r.values[1, 0] == 3.0 - - -def test_int64_min_nodata_masks_correctly(tmp_path): - """``INT64_MIN`` as both the nodata sentinel and a real pixel value - masks correctly without int64 -> float64 rounding aliasing the - sentinel onto adjacent values. - """ - info = np.iinfo(np.int64) - b = np.array([[info.min, -1], [0, info.max]], dtype=np.int64) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='Int64', src_path=str(src), nodata=info.min, - ) - r = read_vrt(vrt) - assert r.dtype == np.float64 - assert np.isnan(r.values[0, 0]) - assert r.values[0, 1] == -1.0 - assert r.values[1, 0] == 0.0 - assert r.values[1, 1] == float(info.max) - - -def test_int32_negative_nodata_still_masks(tmp_path): - """Smoke regression for the common Int32 + ``-9999`` case. The - integer parsing path must not break this when there is no precision - pressure -- ``-9999`` survives ``float()`` fine but we still want - the new int-typed parse to mask the same way the old float-typed - parse did. - """ - b = np.array([[10, -9999], [-9999, 20]], dtype=np.int32) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='Int32', src_path=str(src), nodata=-9999, - ) - r = read_vrt(vrt) - assert r.dtype == np.float64 - assert np.isnan(r.values[0, 1]) - assert np.isnan(r.values[1, 0]) - assert r.values[0, 0] == 10.0 - assert r.values[1, 1] == 20.0 - - -def test_float32_nan_nodata_still_works(tmp_path): - """``Float32`` + ``nan`` still parses and - surfaces NaN via ``attrs['nodata']`` (no regression on the float - path). - """ - b = np.array([[1.0, np.nan], [3.0, 4.0]], dtype=np.float32) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='Float32', src_path=str(src), nodata='nan', - ) - r = read_vrt(vrt) - assert r.dtype == np.float32 - assert np.isnan(r.attrs['nodata']) - assert np.isnan(r.values[0, 1]) - - -def test_float64_scientific_nodata_still_works(tmp_path): - """``Float64`` + scientific-notation ```` survives as - float (no integer-parse regression for the float path). - """ - b = np.array([[1.0, -1.5e10], [3.0, 4.0]], dtype=np.float64) - src = tmp_path / 'src.tif' - _write(b, src) - vrt = _build_single_band_vrt( - tmp_path, dtype_attr='Float64', src_path=str(src), nodata='-1.5e10', - ) - r = read_vrt(vrt) - assert r.dtype == np.float64 - assert r.attrs['nodata'] == -1.5e10 - # The matching pixel stays as-is for float -- there's no NaN - # promotion (it's already float64), so the sentinel surfaces as a - # literal value unless the float-source nodata-masking branch fires. - # Either behaviour is acceptable; just confirm nodata attr is set. diff --git a/xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py b/xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py deleted file mode 100644 index 84d1315cc..000000000 --- a/xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py +++ /dev/null @@ -1,185 +0,0 @@ -"""Regression test for issue #1734. - -Under the lenient default (``XRSPATIAL_GEOTIFF_STRICT`` unset), -``read_vrt`` warns once per unreadable source and continues, producing -a mosaic with zero-filled holes for integer VRTs that downstream code -cannot distinguish from real data. The warning is easy to miss in a -pipeline that ignores ``UserWarning``s. - -This module pins the new behaviour: the returned DataArray now carries -an ``attrs['vrt_holes']`` list describing each skipped source so -callers can detect a partial mosaic with a single attribute lookup. -Strict mode is unchanged and still raises. -""" -from __future__ import annotations - -import warnings - -import pytest - -from xrspatial.geotiff import GeoTIFFFallbackWarning, read_vrt - - -@pytest.fixture -def clear_strict_env(monkeypatch): - monkeypatch.delenv('XRSPATIAL_GEOTIFF_STRICT', raising=False) - - -@pytest.fixture -def set_strict_env(monkeypatch): - monkeypatch.setenv('XRSPATIAL_GEOTIFF_STRICT', '1') - - -def _write_vrt_with_missing_source(vrt_path, missing_src) -> None: - """Write a VRT with an Int32 band whose only source is missing. - - Integer ``dataType`` is the failure mode issue #1734 was about: the - pre-fix lenient path zero-fills the output buffer (``fill = 0`` for - integer dtypes) and the user cannot distinguish that hole from real - zero-valued data. ``NoDataValue`` is omitted on purpose -- having - one would let downstream code mask the hole and side-step the - regression. See the module docstring. - """ - vrt_path.write_text( - '\n' - ' \n' - ' 0, 1, 0, 0, 0, -1\n' - ' \n' - ' \n' - f' {missing_src}' - '\n' - ' 1\n' - ' \n' - ' \n' - ' \n' - ' \n' - '\n' - ) - - -def test_skipped_source_records_vrt_holes_attr( - clear_strict_env, tmp_path, -): - """A VRT with a missing source returns a DataArray whose attrs - carry a ``vrt_holes`` entry naming the source, band, dst_rect, - and underlying error. - - Uses an Int32 VRT so the hole is zero-filled (the exact failure - mode #1734 was about): without the attr there is no way to tell - the all-zeros tile from real data. - """ - import numpy as np - - vrt_path = tmp_path / 'mosaic_1734_missing.vrt' - missing_src = f'{tmp_path}/does_not_exist_1734.tif' - _write_vrt_with_missing_source(vrt_path, missing_src) - - with warnings.catch_warnings(): - warnings.simplefilter('ignore', GeoTIFFFallbackWarning) - # Public ``read_vrt`` defaults to ``missing_sources='raise'`` - # since #1860; the lenient path that populates ``vrt_holes`` is - # now an explicit opt-in. - da = read_vrt(str(vrt_path), missing_sources='warn') - - # Confirm the integer-specific failure mode is in play: the hole is - # filled with zeros (not NaN), indistinguishable from real data - # without the attr. - assert np.issubdtype(da.dtype, np.integer) - assert (da.values == 0).all() - - assert 'vrt_holes' in da.attrs - holes = da.attrs['vrt_holes'] - assert isinstance(holes, list) - assert len(holes) == 1 - h = holes[0] - assert h['source'].endswith('does_not_exist_1734.tif') - assert h['band'] == 1 - assert h['dst_rect'] == (0, 0, 4, 4) - assert 'error' in h - assert h['error'] # non-empty - - -def test_no_holes_attr_when_all_sources_read(clear_strict_env, tmp_path): - """A successful VRT read does not advertise an empty ``vrt_holes`` - attr; the key is omitted entirely so ``"vrt_holes" in attrs`` is a - cheap completeness check.""" - import numpy as np - import xarray as xr - - from xrspatial.geotiff import to_geotiff - - # Write a real source the VRT can reference. - src_path = tmp_path / 'src_1734.tif' - arr = np.arange(16, dtype=np.float32).reshape(4, 4) - da_src = xr.DataArray( - arr, dims=['y', 'x'], - coords={'y': np.linspace(3.5, 0.5, 4), - 'x': np.linspace(0.5, 3.5, 4)}, - attrs={'crs': 4326}, - ) - to_geotiff(da_src, str(src_path), compression='none') - - vrt_path = tmp_path / 'mosaic_1734_ok.vrt' - vrt_path.write_text( - '\n' - ' \n' - ' 0, 1, 0, 0, 0, -1\n' - ' \n' - ' \n' - f' {src_path}' - '\n' - ' 1\n' - ' \n' - ' \n' - ' \n' - ' \n' - '\n' - ) - - with warnings.catch_warnings(): - warnings.simplefilter('error', GeoTIFFFallbackWarning) - da = read_vrt(str(vrt_path)) - - assert 'vrt_holes' not in da.attrs - - -def test_strict_mode_still_raises(set_strict_env, tmp_path): - """Strict mode is unchanged: the missing source surfaces the - underlying ``FileNotFoundError`` (an ``OSError`` subclass) from - ``read_to_array`` instead of warning-and-skipping. - - Asserting the concrete exception class -- not a bare ``Exception`` - -- keeps the regression test honest: an unrelated bug somewhere in - the read path that happens to raise a different exception will - fail this test instead of silently satisfying it. - """ - vrt_path = tmp_path / 'mosaic_1734_strict.vrt' - missing_src = f'{tmp_path}/does_not_exist_1734_strict.tif' - _write_vrt_with_missing_source(vrt_path, missing_src) - - with pytest.raises(FileNotFoundError, - match='does_not_exist_1734_strict.tif'): - read_vrt(str(vrt_path)) - - -def test_warning_mentions_how_to_detect_holes(clear_strict_env, tmp_path): - """The fallback warning now points callers at the attr or the - strict env var so the recovery path is discoverable from a single - captured warning.""" - vrt_path = tmp_path / 'mosaic_1734_msg.vrt' - missing_src = f'{tmp_path}/does_not_exist_1734_msg.tif' - _write_vrt_with_missing_source(vrt_path, missing_src) - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - # The lenient path is now an explicit opt-in (#1860); the - # warning content this test pins is still emitted under - # ``missing_sources='warn'``. - read_vrt(str(vrt_path), missing_sources='warn') - - fallback = [ - x for x in w if issubclass(x.category, GeoTIFFFallbackWarning) - ] - assert fallback, "expected at least one GeoTIFFFallbackWarning" - msg = ' '.join(str(x.message) for x in fallback) - assert 'vrt_holes' in msg or 'XRSPATIAL_GEOTIFF_STRICT' in msg diff --git a/xrspatial/geotiff/tests/test_vrt_int_nodata_1564.py b/xrspatial/geotiff/tests/test_vrt_int_nodata_1564.py deleted file mode 100644 index a2a25b19a..000000000 --- a/xrspatial/geotiff/tests/test_vrt_int_nodata_1564.py +++ /dev/null @@ -1,187 +0,0 @@ -"""VRT integer-with-nodata parity tests for issue #1564. - -Before the fix, ``read_vrt`` set ``attrs['nodata']`` on the returned -DataArray but left the integer sentinel value (e.g. 65535) intact in the -pixel array. The eager ``open_geotiff`` path, the dask path, and both -GPU paths all promote integer-with-nodata rasters to float64 and replace -the sentinel with NaN -- the VRT route was the lone divergence. - -These tests pin the contract that ``read_vrt`` mirrors that promotion. -""" -from __future__ import annotations - -import numpy as np -import xarray as xr - -from xrspatial.geotiff import open_geotiff, read_vrt, to_geotiff, write_vrt - - -def _write_uint16_with_nodata_tif(path, sentinel): - """Write a small uint16 GeoTIFF with a nodata sentinel.""" - arr = np.array([[1, 2, 3], [sentinel, 5, 6]], dtype=np.uint16) - da = xr.DataArray( - arr, - dims=['y', 'x'], - coords={'y': np.arange(2), 'x': np.arange(3)}, - attrs={'crs': 4326, 'nodata': sentinel}, - ) - to_geotiff(da, path, compression='none', nodata=sentinel) - return arr - - -def test_vrt_uint16_nodata_promotes_to_float64(tmp_path): - """VRT route NaN-masks integer-with-nodata, matching open_geotiff.""" - tif = str(tmp_path / 'src_1564.tif') - _write_uint16_with_nodata_tif(tif, sentinel=65535) - - eager = open_geotiff(tif) - assert eager.dtype == np.float64 - assert np.isnan(eager.values[1, 0]) - - vrt_path = str(tmp_path / 'src_1564.vrt') - write_vrt(vrt_path, [tif]) - via_vrt = read_vrt(vrt_path) - - assert via_vrt.dtype == np.float64, ( - f"VRT integer-with-nodata should promote to float64; " - f"got {via_vrt.dtype}" - ) - assert np.isnan(via_vrt.values[1, 0]), ( - f"VRT sentinel pixel should be NaN; got " - f"{via_vrt.values[1, 0]} (literal sentinel survived)" - ) - assert via_vrt.attrs.get('nodata') == 65535.0 - - -def test_vrt_uint16_no_nodata_keeps_dtype(tmp_path): - """Without a nodata sentinel, the dtype stays integer.""" - arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint16) - da = xr.DataArray( - arr, - dims=['y', 'x'], - coords={'y': np.arange(2), 'x': np.arange(3)}, - attrs={'crs': 4326}, - ) - tif = str(tmp_path / 'src_no_nodata_1564.tif') - to_geotiff(da, tif, compression='none') - - vrt_path = str(tmp_path / 'src_no_nodata_1564.vrt') - write_vrt(vrt_path, [tif]) - via_vrt = read_vrt(vrt_path) - - assert via_vrt.dtype == np.uint16 - np.testing.assert_array_equal(via_vrt.values, arr) - - -def test_vrt_float_nodata_still_masks(tmp_path): - """Regression guard: the existing float-with-nodata branch still - works after the integer-branch addition.""" - arr = np.array([[1.0, 2.0, -9999.0], [4.0, -9999.0, 6.0]], - dtype=np.float32) - da = xr.DataArray( - arr, - dims=['y', 'x'], - coords={'y': np.arange(2), 'x': np.arange(3)}, - attrs={'crs': 4326, 'nodata': -9999.0}, - ) - tif = str(tmp_path / 'srcf_1564.tif') - to_geotiff(da, tif, compression='none', nodata=-9999.0) - - vrt_path = str(tmp_path / 'srcf_1564.vrt') - write_vrt(vrt_path, [tif]) - via_vrt = read_vrt(vrt_path) - - assert via_vrt.dtype == np.float32 - assert np.isnan(via_vrt.values[0, 2]) - assert np.isnan(via_vrt.values[1, 1]) - - -def _rewrite_vrt_nodata(vrt_path, new_nodata_text): - """Rewrite the element of an existing VRT to a literal - string so we can exercise fractional / out-of-range cases without - going through ``write_vrt`` (which only accepts numeric values).""" - with open(vrt_path, 'r') as f: - xml = f.read() - import re - new_xml, n = re.subn( - r'[^<]*', - f'{new_nodata_text}', - xml, - ) - assert n == 1, f'expected 1 NoDataValue element, found {n}' - with open(vrt_path, 'w') as f: - f.write(new_xml) - - -def test_vrt_fractional_nodata_is_not_masked(tmp_path): - """Fractional VRT NoDataValue against an integer band must NOT mask: - truncating to int would alias a real pixel value as nodata.""" - arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint16) - da = xr.DataArray( - arr, dims=['y', 'x'], - coords={'y': np.arange(2), 'x': np.arange(3)}, - attrs={'crs': 4326, 'nodata': 1}, - ) - tif = str(tmp_path / 'frac_1564.tif') - to_geotiff(da, tif, compression='none', nodata=1) - - vrt_path = str(tmp_path / 'frac_1564.vrt') - write_vrt(vrt_path, [tif]) - _rewrite_vrt_nodata(vrt_path, '1.9') - - via_vrt = read_vrt(vrt_path) - assert via_vrt.dtype == np.uint16, ( - 'Fractional NoDataValue must not trigger integer masking ' - f'(got dtype {via_vrt.dtype}, pixel @[0,0]={via_vrt.values[0, 0]})' - ) - np.testing.assert_array_equal(via_vrt.values, arr) - - -def test_vrt_out_of_range_nodata_is_not_masked(tmp_path): - """NoDataValue outside the dtype range must NOT mask: casting would - wrap and alias an in-range pixel.""" - arr = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint16) - da = xr.DataArray( - arr, dims=['y', 'x'], - coords={'y': np.arange(2), 'x': np.arange(3)}, - attrs={'crs': 4326, 'nodata': 0}, - ) - tif = str(tmp_path / 'oor_1564.tif') - to_geotiff(da, tif, compression='none', nodata=0) - - vrt_path = str(tmp_path / 'oor_1564.vrt') - write_vrt(vrt_path, [tif]) - # uint16 wraps -1 to 65535, which is in-range but a different sentinel. - # Reject it rather than silently masking the wrong value. - _rewrite_vrt_nodata(vrt_path, '-1') - - via_vrt = read_vrt(vrt_path) - assert via_vrt.dtype == np.uint16, ( - 'Out-of-range NoDataValue must not trigger integer masking ' - f'(got dtype {via_vrt.dtype})' - ) - np.testing.assert_array_equal(via_vrt.values, arr) - - -def test_vrt_open_geotiff_parity_uint16_nodata(tmp_path): - """open_geotiff routing a .vrt path should produce the same dtype - and masked positions as a direct GeoTIFF read.""" - tif = str(tmp_path / 'parity_1564.tif') - _write_uint16_with_nodata_tif(tif, sentinel=65535) - - direct = open_geotiff(tif) - - vrt_path = str(tmp_path / 'parity_1564.vrt') - write_vrt(vrt_path, [tif]) - via_vrt = open_geotiff(vrt_path) - - assert direct.dtype == via_vrt.dtype - np.testing.assert_array_equal( - np.isnan(direct.values), np.isnan(via_vrt.values), - err_msg='VRT route should NaN-mask the same pixels as direct read', - ) - # Non-sentinel pixels equal - mask = ~np.isnan(direct.values) - np.testing.assert_array_equal( - direct.values[mask], via_vrt.values[mask], - ) diff --git a/xrspatial/geotiff/tests/test_vrt_int_source_float_dtype_1616.py b/xrspatial/geotiff/tests/test_vrt_int_source_float_dtype_1616.py deleted file mode 100644 index 026fb00bd..000000000 --- a/xrspatial/geotiff/tests/test_vrt_int_source_float_dtype_1616.py +++ /dev/null @@ -1,199 +0,0 @@ -"""Regression tests for issue #1616. - -A VRT whose ```` (or Float64) is fed -by an integer source GeoTIFF with an in-range ``GDAL_NODATA`` sentinel -used to leak the sentinel value through as a literal float in the -returned array. ``attrs['nodata']`` advertised the sentinel but the -pixels at the sentinel positions still held the integer value cast to -float (e.g. ``65535.0`` rather than ``np.nan``). NaN-aware downstream -code therefore saw the sentinel as valid data. - -The fix masks integer source arrays before they're placed into a float -``result`` buffer in ``_vrt.read_vrt`` so a float-dtype VRT result lands -with NaN at the sentinel pixels, matching what ``open_geotiff`` returns -for a single-file integer raster with the same sentinel. -""" -from __future__ import annotations - -import numpy as np - -from xrspatial.geotiff import read_vrt -from xrspatial.geotiff._writer import write - - -def _write_uint16_with_sentinel(tmp_path, sentinel=65535, filename='b0.tif'): - band = np.array([[1, 2], [3, sentinel]], dtype=np.uint16) - p = str(tmp_path / filename) - write(band, p, nodata=sentinel, compression='none', tiled=False) - return p - - -def _write_int16_with_sentinel(tmp_path, sentinel=-1, filename='b0.tif'): - band = np.array([[1, 2], [3, sentinel]], dtype=np.int16) - p = str(tmp_path / filename) - write(band, p, nodata=sentinel, compression='none', tiled=False) - return p - - -def _build_vrt(tmp_path, source_path, vrt_dtype, nodata_value, - filename='mismatch.vrt'): - """Hand-roll a VRT with the requested dataType / NoDataValue pair.""" - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - {nodata_value} - - {source_path} - 1 - - - - -""" - p = str(tmp_path / filename) - with open(p, 'w') as f: - f.write(vrt_xml) - return p - - -def test_float32_vrt_uint16_source_masks_in_range_sentinel(tmp_path): - """Float32 VRT, uint16 source with in-range sentinel: pixel becomes NaN. - - Before the fix this returned dtype=float32 with values[1, 1] == 65535.0 - while ``attrs['nodata']`` advertised the sentinel. - """ - src = _write_uint16_with_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float32', 65535) - r = read_vrt(vrt) - assert r.dtype == np.float32, ( - f"Float32-declared VRT should return float32, got {r.dtype}") - assert np.isnan(r.values[1, 1]), ( - "Sentinel pixel (uint16 65535 -> float32) should be NaN-masked; " - f"got values[1, 1]={r.values[1, 1]}") - assert r.attrs.get('nodata') == 65535.0 - assert r.values[0, 0] == 1.0 - - -def test_float64_vrt_int16_source_masks_negative_sentinel(tmp_path): - """Float64 VRT, int16 source with negative sentinel: pixel becomes NaN.""" - src = _write_int16_with_sentinel(tmp_path, sentinel=-1) - vrt = _build_vrt(tmp_path, src, 'Float64', -1) - r = read_vrt(vrt) - assert r.dtype == np.float64 - assert np.isnan(r.values[1, 1]), ( - f"Sentinel pixel (-1) should be NaN-masked; " - f"got values[1, 1]={r.values[1, 1]}") - assert r.attrs.get('nodata') == -1.0 - - -def test_float32_vrt_out_of_range_sentinel_is_noop(tmp_path): - """An out-of-range sentinel (e.g. uint16 source + NoDataValue=-9999) - stays unmasked rather than raising ``OverflowError`` from the - ``uint16(-9999)`` cast. The pixel data is returned as-is and - ``attrs['nodata']`` still surfaces the declared sentinel so callers - can mask in user code or write through. - """ - arr = np.array([[1, 2], [3, 4]], dtype=np.uint16) - p = str(tmp_path / 'b0_no_nodata.tif') - write(arr, p, compression='none', tiled=False) # no GDAL_NODATA on file - vrt = _build_vrt(tmp_path, p, 'Float32', -9999) - r = read_vrt(vrt) - assert r.dtype == np.float32 - # No pixels match the (out-of-range) sentinel, so nothing was masked. - assert not np.isnan(r.values).any() - assert r.attrs.get('nodata') == -9999.0 - - -def test_float32_vrt_uint16_source_no_sentinel_pixels(tmp_path): - """Float32 VRT, uint16 source whose pixels do not match the sentinel: - the result is a clean float array with no NaNs introduced. - - This exercises the early-out path inside the new mask branch -- a - declared sentinel that matches no pixels must not perturb the data - or cause an extra copy that would surface as a different dtype. - """ - arr = np.array([[1, 2], [3, 4]], dtype=np.uint16) - p = str(tmp_path / 'b0_clean.tif') - write(arr, p, compression='none', tiled=False) - vrt = _build_vrt(tmp_path, p, 'Float32', 65535) - r = read_vrt(vrt) - assert r.dtype == np.float32 - assert not np.isnan(r.values).any() - np.testing.assert_array_equal(r.values, arr.astype(np.float32)) - - -def test_float_vrt_int_source_dask_path_masks_sentinel(tmp_path): - """The dask wrapper path (``chunks=...``) also returns NaN at the - sentinel pixel. The dask reader chunks the eager result after decode, - so the bug propagates if the eager path leaks the sentinel. - """ - src = _write_uint16_with_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float32', 65535) - r = read_vrt(vrt, chunks=2) - # Dask path keeps the float32 dtype declared by the VRT. - assert r.dtype == np.float32 - val = r.values - assert np.isnan(val[1, 1]) - - -def test_float_vrt_int_source_round_trip_nodata_attr(tmp_path): - """Even though the masking promotes pixels to NaN, the - ``attrs['nodata']`` value still carries the original sentinel so a - downstream write can restore the literal sentinel byte pattern. - """ - src = _write_uint16_with_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float32', 65535) - r = read_vrt(vrt) - assert r.attrs.get('nodata') == 65535.0 - - -def test_float_vrt_int_source_with_band_select(tmp_path): - """The band=N selection path also masks integer sentinels for a - float-declared VRT. The per-band ``NoDataValue`` from the VRT XML - must reach the source-side masking step, not just ``attrs['nodata']``. - """ - src_a = _write_uint16_with_sentinel(tmp_path, sentinel=65535, - filename='ba.tif') - src_b = _write_uint16_with_sentinel(tmp_path, sentinel=65000, - filename='bb.tif') - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - 65535 - - {src_a} - 1 - - - - - - 65000 - - {src_b} - 1 - - - - -""" - vrt_path = str(tmp_path / 'mb.vrt') - with open(vrt_path, 'w') as f: - f.write(vrt_xml) - - # Mixed per-band sentinels in the VRT XML: after #1987 PR 5 this - # raises by default. ``band_nodata='first'`` opts back into the - # legacy flatten-to-first-band behaviour, which is what the - # band-selection regression in this test is asserting. - # band 0 -> 65535 sentinel masked - r0 = read_vrt(vrt_path, band=0, band_nodata='first') - assert r0.dtype == np.float32 - assert np.isnan(r0.values[1, 1]) - assert r0.attrs.get('nodata') == 65535.0 - - # band 1 -> 65000 sentinel masked, not 65535 - r1 = read_vrt(vrt_path, band=1, band_nodata='first') - assert r1.dtype == np.float32 - # band b had its sentinel at the same [1, 1] cell - assert np.isnan(r1.values[1, 1]) - assert r1.attrs.get('nodata') == 65000.0 diff --git a/xrspatial/geotiff/tests/test_vrt_lazy_chunks_1814.py b/xrspatial/geotiff/tests/test_vrt_lazy_chunks_1814.py deleted file mode 100644 index abf41d09a..000000000 --- a/xrspatial/geotiff/tests/test_vrt_lazy_chunks_1814.py +++ /dev/null @@ -1,371 +0,0 @@ -"""Lazy chunked read_vrt builds a real dask graph (issue #1814). - -The pre-fix ``read_vrt(chunks=...)`` materialised the full VRT mosaic -on host RAM, then wrapped the resulting numpy array via ``.chunk()``. -That defeated the purpose of ``chunks=`` for memory protection and -made ``gpu=True`` + ``chunks=`` even worse: the entire mosaic was -moved to the device before chunking. - -These tests cover the new lazy path: - -* construction does not decode any pixels; -* per-chunk decode happens at ``.compute()`` time; -* the resulting array is byte-identical to the eager read; -* the chunk task count is bounded so a typo in ``chunks=`` cannot - build a graph the scheduler refuses to dispatch. -""" -from __future__ import annotations - -import os -import tempfile - -import dask.array as da -import numpy as np -import pytest -import xarray as xr - -from xrspatial.geotiff import read_vrt, to_geotiff -from xrspatial.geotiff._vrt import write_vrt as _write_vrt_internal - - -def _gpu_available() -> bool: - try: - import cupy - except ImportError: - return False - try: - return bool(cupy.cuda.is_available()) - except Exception: - return False - - -_HAS_GPU = _gpu_available() - - -@pytest.fixture -def single_tile_vrt(): - """One 128x128 float32 tile wrapped in a VRT.""" - arr = np.arange(128 * 128, dtype=np.float32).reshape(128, 128) - y = np.linspace(41.0, 40.0, 128) - x = np.linspace(-106.0, -105.0, 128) - raster = xr.DataArray(arr, dims=['y', 'x'], - coords={'y': y, 'x': x}, - attrs={'crs': 4326}) - td = tempfile.mkdtemp(prefix='tmp_1814_single_') - tile_path = os.path.join(td, 'tile.tif') - to_geotiff(raster, tile_path) - vrt_path = os.path.join(td, 'mosaic.vrt') - _write_vrt_internal(vrt_path, [tile_path]) - yield vrt_path, arr - - -@pytest.fixture -def two_by_two_vrt(): - """4-tile mosaic via the to_geotiff(.vrt, ...) dask path.""" - arr = np.arange(256 * 256, dtype=np.float32).reshape(256, 256) - y = np.linspace(41.0, 40.0, 256) - x = np.linspace(-106.0, -105.0, 256) - raster = xr.DataArray(arr, dims=['y', 'x'], - coords={'y': y, 'x': x}, - attrs={'crs': 4326}) - td = tempfile.mkdtemp(prefix='tmp_1814_2x2_') - vrt_path = os.path.join(td, 'mosaic.vrt') - # ``tile_size=128`` produces a 2x2 mosaic of 128x128 tiles. - to_geotiff(raster, vrt_path, tile_size=128) - yield vrt_path, arr - - -@pytest.fixture -def multiband_vrt(): - """3-band single-tile VRT.""" - rng = np.random.default_rng(1814) - arr = rng.random((64, 64, 3), dtype=np.float32) - y = np.linspace(41.0, 40.0, 64) - x = np.linspace(-106.0, -105.0, 64) - raster = xr.DataArray( - arr, - dims=['y', 'x', 'band'], - coords={'y': y, 'x': x, 'band': np.arange(3)}, - attrs={'crs': 4326}, - ) - td = tempfile.mkdtemp(prefix='tmp_1814_mb_') - tile_path = os.path.join(td, 'tile.tif') - to_geotiff(raster, tile_path) - vrt_path = os.path.join(td, 'mosaic.vrt') - _write_vrt_internal(vrt_path, [tile_path]) - yield vrt_path, arr - - -# --------------------------------------------------------------------------- -# 1. Construction is lazy: no pixels are decoded before .compute(). -# --------------------------------------------------------------------------- - -def test_chunks_builds_dask_array_with_multiple_blocks(two_by_two_vrt): - """``read_vrt(chunks=(N,N))`` returns a dask-backed DataArray - whose underlying array has more than one chunk along each spatial - axis. Before the fix the array was numpy-backed under - ``result.chunk()``, so this asserts the new lazy graph is in - play. - """ - vrt_path, _ = two_by_two_vrt - result = read_vrt(vrt_path, chunks=(64, 64)) - assert isinstance(result.data, da.Array), ( - f"expected dask Array, got {type(result.data).__name__}" - ) - # 256 / 64 = 4 blocks per axis. - assert result.data.numblocks == (4, 4), ( - f"expected 4x4 blocks, got {result.data.numblocks}" - ) - - -def test_chunks_is_lazy_does_not_call_internal_reader(monkeypatch, - two_by_two_vrt): - """Construction-time call count of the internal VRT reader is zero; - after ``.compute()`` it equals the chunk count. - """ - vrt_path, _ = two_by_two_vrt - - from xrspatial.geotiff import _vrt as vrt_module - - counter = {'calls': 0} - real_read = vrt_module.read_vrt - - def counting_read(*args, **kwargs): - counter['calls'] += 1 - return real_read(*args, **kwargs) - - monkeypatch.setattr(vrt_module, 'read_vrt', counting_read) - - result = read_vrt(vrt_path, chunks=(64, 64)) - - assert counter['calls'] == 0, ( - f"_read_vrt_internal called {counter['calls']} times before " - f".compute(); the chunked path leaked an eager decode" - ) - - computed = result.compute() - # 4 row blocks * 4 col blocks = 16 expected decodes. - assert counter['calls'] == 16, ( - f"expected 16 per-chunk decodes after compute, got {counter['calls']}" - ) - assert computed.shape == (256, 256) - - -# --------------------------------------------------------------------------- -# 2. Byte-identical to the eager path. -# --------------------------------------------------------------------------- - -def test_chunked_compute_matches_eager(two_by_two_vrt): - vrt_path, _ = two_by_two_vrt - eager = read_vrt(vrt_path) - chunked = read_vrt(vrt_path, chunks=(64, 64)).compute() - assert eager.shape == chunked.shape - assert np.array_equal(eager.values, chunked.values), ( - "chunked compute diverged from eager read" - ) - # Coords and key attrs must match too. - np.testing.assert_array_equal(eager['x'].values, chunked['x'].values) - np.testing.assert_array_equal(eager['y'].values, chunked['y'].values) - assert eager.attrs.get('transform') == chunked.attrs.get('transform') - assert eager.attrs.get('crs') == chunked.attrs.get('crs') - - -def test_chunked_single_tile_matches_eager(single_tile_vrt): - """Single-tile VRT (one source) should still match eager when - chunked. Exercises the path where many chunk windows hit the - same single source. - """ - vrt_path, _ = single_tile_vrt - eager = read_vrt(vrt_path) - chunked = read_vrt(vrt_path, chunks=(32, 32)).compute() - assert np.array_equal(eager.values, chunked.values) - - -# --------------------------------------------------------------------------- -# 3. Task-count cap. -# --------------------------------------------------------------------------- - -def test_chunks_task_cap_raises(two_by_two_vrt): - """``chunks=(1, 1)`` on a 256x256 VRT would build 65,536 tasks, - blowing past the 50,000-task cap. The reader should refuse with - a ValueError that names ``chunks=`` and suggests a larger size. - """ - vrt_path, _ = two_by_two_vrt - with pytest.raises(ValueError, match=r"chunks=.*task"): - read_vrt(vrt_path, chunks=(1, 1)) - - -# --------------------------------------------------------------------------- -# 4. Window + chunks: chunks tile the window, not the full extent. -# --------------------------------------------------------------------------- - -def test_window_plus_chunks_matches_eager(two_by_two_vrt): - """When both ``window=`` and ``chunks=`` are passed, the dask - graph must tile the window (not the full VRT extent). The output - shape and pixel values match an eager windowed read. - """ - vrt_path, _ = two_by_two_vrt - window = (32, 48, 160, 192) # 128 high, 144 wide - - eager = read_vrt(vrt_path, window=window) - chunked = read_vrt(vrt_path, window=window, chunks=(64, 64)) - - assert isinstance(chunked.data, da.Array) - # The chunk grid is sized off the window extent (128, 144) with - # chunks=64 => (2, 3) numblocks. - assert chunked.data.numblocks == (2, 3), ( - f"expected (2, 3) numblocks over the window, got " - f"{chunked.data.numblocks}" - ) - - computed = chunked.compute() - assert computed.shape == eager.shape == (128, 144) - assert np.array_equal(eager.values, computed.values) - - -# --------------------------------------------------------------------------- -# 5. GPU + chunks: each block is a cupy array. -# --------------------------------------------------------------------------- - -@pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") -def test_gpu_plus_chunks_returns_dask_on_cupy(two_by_two_vrt): - """``read_vrt(gpu=True, chunks=...)`` must build a dask graph whose - blocks are cupy-backed (not numpy that gets cupy-wrapped at - compute time on the host). - """ - import cupy - - vrt_path, _ = two_by_two_vrt - result = read_vrt(vrt_path, gpu=True, chunks=(64, 64)) - - assert isinstance(result.data, da.Array) - assert isinstance(result.data._meta, cupy.ndarray), ( - f"expected cupy _meta, got " - f"{type(result.data._meta).__module__}." - f"{type(result.data._meta).__name__}" - ) - computed = result.compute() - assert isinstance(computed.data, cupy.ndarray) - - -# --------------------------------------------------------------------------- -# 6. Multi-band VRT + chunks. -# --------------------------------------------------------------------------- - -def test_multiband_plus_chunks_preserves_band_dim(multiband_vrt): - """3-band VRT read with ``chunks=`` keeps the band dimension on - every block and the assembled DataArray. - """ - vrt_path, src = multiband_vrt - result = read_vrt(vrt_path, chunks=(32, 32)) - - assert isinstance(result.data, da.Array) - assert result.dims == ('y', 'x', 'band') - assert result.shape == (64, 64, 3) - # Per-block shape on the band axis is 3 (whole band axis in one - # chunk because we did not pass a band-chunk size). - assert result.data.chunks[2] == (3,) - - computed = result.compute() - np.testing.assert_allclose(computed.values, src, rtol=0, atol=0) - - -# --------------------------------------------------------------------------- -# 7. Copilot review: ``attrs['vrt_holes']`` must propagate to the chunked -# path so users switching from eager to chunked keep the #1734 contract. -# --------------------------------------------------------------------------- - -def test_chunked_propagates_vrt_holes_when_source_missing(two_by_two_vrt): - """When a source referenced by the VRT does not exist on disk and - the caller opts into the lenient ``missing_sources='warn'`` path, - the chunked reader must populate ``attrs['vrt_holes']`` with the - same schema the eager reader uses, so callers can branch on - ``"vrt_holes" in da.attrs`` regardless of which code path produced - the DataArray. - - Note: the default ``missing_sources='raise'`` raises at build time - under #2265, so this test exercises the explicit ``'warn'`` opt-in. - """ - import warnings - - from xrspatial.geotiff import GeoTIFFFallbackWarning - from xrspatial.geotiff._reader import _mmap_cache - - vrt_path, _ = two_by_two_vrt - vrt_dir = os.path.dirname(vrt_path) - # Remove one of the four source tiles. ``to_geotiff(.vrt, tile_size=128)`` - # writes tile files into a ``_tiles/`` subdirectory next to the - # .vrt; walk the tree for any .tif and unlink the first one. - tile_files = [] - for root, _dirs, files in os.walk(vrt_dir): - for f in files: - if f.endswith('.tif'): - tile_files.append(os.path.join(root, f)) - assert len(tile_files) >= 1 - # write_vrt() opens each tile via _FileSource to read its header; - # _FileSource.close() decrements the refcount but the mmap stays - # cached. On Windows an active mmap blocks os.unlink (WinError 32). - _mmap_cache.clear() - os.unlink(tile_files[0]) - - with warnings.catch_warnings(): - warnings.simplefilter('ignore', GeoTIFFFallbackWarning) - result = read_vrt( - vrt_path, chunks=(64, 64), missing_sources='warn', - ) - - assert 'vrt_holes' in result.attrs, ( - "chunked path dropped vrt_holes contract from #1734" - ) - holes = result.attrs['vrt_holes'] - assert isinstance(holes, list) and len(holes) >= 1 - entry = holes[0] - # Schema parity with the eager path (see read_vrt at ~line 3963). - assert set(entry.keys()) >= {'source', 'band', 'dst_rect', 'error'} - assert isinstance(entry['dst_rect'], tuple) - assert len(entry['dst_rect']) == 4 - - -def test_chunked_no_vrt_holes_attr_when_complete(two_by_two_vrt): - """When every source is on disk the chunked reader must not set - ``attrs['vrt_holes']`` (eager parity: empty hole list is omitted). - """ - vrt_path, _ = two_by_two_vrt - result = read_vrt(vrt_path, chunks=(64, 64)) - assert 'vrt_holes' not in result.attrs - - -# --------------------------------------------------------------------------- -# 8. Copilot review: integer source with no declared nodata must keep its -# integer dtype through the chunked path (no spurious float64 promotion). -# --------------------------------------------------------------------------- - -def test_chunked_integer_no_nodata_keeps_source_dtype(): - """A uint16 source with no declared must produce a - uint16 chunked DataArray, not float64. The eager path stays integer - in this case because its runtime ``mask.any()`` is False; the - chunked path approximates with a static "any band declares nodata?" - check, which yields the same answer here. - """ - arr = np.arange(128 * 128, dtype=np.uint16).reshape(128, 128) - y = np.linspace(41.0, 40.0, 128) - x = np.linspace(-106.0, -105.0, 128) - raster = xr.DataArray(arr, dims=['y', 'x'], - coords={'y': y, 'x': x}, - attrs={'crs': 4326}) - td = tempfile.mkdtemp(prefix='tmp_1814_uint16_nonodata_') - tile_path = os.path.join(td, 'tile.tif') - to_geotiff(raster, tile_path) - vrt_path = os.path.join(td, 'mosaic.vrt') - # No ``nodata=`` passed: the VRT will not declare for - # this band, exercising the no-promotion branch. - _write_vrt_internal(vrt_path, [tile_path]) - - result = read_vrt(vrt_path, chunks=(32, 32)) - assert result.dtype == np.uint16, ( - f"expected uint16 (source dtype), got {result.dtype}; " - f"chunked path promoted to float64 despite no declared nodata" - ) - computed = result.compute() - assert computed.dtype == np.uint16 - np.testing.assert_array_equal(computed.values, arr) diff --git a/xrspatial/geotiff/tests/test_vrt_mask_nodata_float_source_2158.py b/xrspatial/geotiff/tests/test_vrt_mask_nodata_float_source_2158.py deleted file mode 100644 index d85431f1c..000000000 --- a/xrspatial/geotiff/tests/test_vrt_mask_nodata_float_source_2158.py +++ /dev/null @@ -1,293 +0,0 @@ -"""Regression tests for issue #2158. - -Before the fix, ``read_vrt(vrt, mask_nodata=False)`` silently rewrote -float source sentinels (e.g. ``-9999.0``) to NaN inside the internal -VRT reader (``_vrt._read_data``), so the documented opt-out only -worked for integer sources. The public backend's ``mask_nodata=False`` -branch skipped the integer post-decode helper but never reached the -inline float masking, which had already destroyed the literal -sentinel pixels by the time the buffer surfaced to the public layer. - -The fix threads ``mask_nodata`` into the internal reader and gates -both inline masking branches (float source NaN substitution and -integer-source-feeding-float-VRT promotion) on the kwarg. The -``attrs['masked_nodata']`` stamp on the eager and chunked VRT paths -is also AND-ed with ``mask_nodata`` so the attr does not lie when -the inline masking is skipped. -""" -from __future__ import annotations - -import numpy as np - -from xrspatial.geotiff import read_vrt -from xrspatial.geotiff._writer import write - - -def _write_float32_with_sentinel(tmp_path, sentinel=-9999.0, - filename='float_2158.tif'): - """float32 GeoTIFF with a non-NaN sentinel and matching pixels. - - The middle row has a literal ``-9999.0`` so the inline masking - actually has something to rewrite. - """ - band = np.array([[1.0, 2.0, 3.0], - [4.0, sentinel, 6.0], - [7.0, sentinel, 9.0]], dtype=np.float32) - p = str(tmp_path / filename) - write(band, p, nodata=sentinel, compression='none', tiled=False) - return p, band - - -def _write_float64_with_fractional_sentinel(tmp_path, sentinel=-9999.25, - filename='float64_2158.tif'): - """float64 GeoTIFF with a fractional sentinel. - - Float32's exact-cast rounding would clobber a fractional value - like ``-9999.25``; the float64 path is the only one where the - sentinel survives lossless. - """ - band = np.array([[1.0, 2.0], - [sentinel, 4.0]], dtype=np.float64) - p = str(tmp_path / filename) - write(band, p, nodata=sentinel, compression='none', tiled=False) - return p, band - - -def _build_vrt(tmp_path, source_path, vrt_dtype, nodata_value, - filename='float_2158.vrt', shape=(3, 3)): - """Hand-roll a single-source VRT pointing at the float source.""" - h, w = shape - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - {nodata_value} - - {source_path} - 1 - - - - -""" - p = str(tmp_path / filename) - with open(p, 'w') as f: - f.write(vrt_xml) - return p - - -# --------------------------------------------------------------------------- -# Baseline: ``mask_nodata=True`` still rewrites the float sentinel. -# --------------------------------------------------------------------------- - - -def test_default_mask_nodata_true_rewrites_float_sentinel(tmp_path): - """The default behaviour (mask_nodata=True) still substitutes NaN. - - Pins the existing contract so the fix below does not regress the - masking happy path. - """ - src, _ = _write_float32_with_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float32', -9999.0) - - r = read_vrt(vrt) - - assert r.dtype == np.float32 - # The two sentinel positions in the source array are at (1, 1) - # and (2, 1). Both should be NaN after masking. - assert np.isnan(r.values[1, 1]) - assert np.isnan(r.values[2, 1]) - # Non-sentinel pixels untouched. - assert r.values[0, 0] == 1.0 - assert r.values[1, 0] == 4.0 - assert r.attrs.get('nodata') == -9999.0 - assert r.attrs.get('masked_nodata') is True - - -# --------------------------------------------------------------------------- -# The fix: ``mask_nodata=False`` preserves the literal float sentinel. -# --------------------------------------------------------------------------- - - -def test_eager_mask_nodata_false_preserves_float_sentinel(tmp_path): - """Eager VRT path: ``mask_nodata=False`` keeps the literal sentinel. - - Before #2158 this assertion failed -- the sentinel pixels were - silently rewritten to NaN inside ``_vrt._read_data`` regardless - of the kwarg. - """ - src, original = _write_float32_with_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float32', -9999.0) - - r = read_vrt(vrt, mask_nodata=False) - - assert r.dtype == np.float32 - # No NaNs anywhere; sentinel survives as the literal float value. - assert not np.isnan(r.values).any() - assert r.values[1, 1] == np.float32(-9999.0) - assert r.values[2, 1] == np.float32(-9999.0) - # The full array round-trips exactly when the opt-out is honored. - np.testing.assert_array_equal(r.values, original) - # The declared sentinel is still surfaced for downstream maskers. - assert r.attrs.get('nodata') == -9999.0 - # The buffer is not NaN-aware: ``masked_nodata`` should reflect - # that, not lie. - assert r.attrs.get('masked_nodata') is False - - -def test_chunked_mask_nodata_false_preserves_float_sentinel(tmp_path): - """Chunked VRT path: ``mask_nodata=False`` keeps the literal sentinel. - - The chunked path used to call ``_read_vrt_internal`` from - ``_vrt_chunk_read`` without forwarding the kwarg, so per-chunk - decodes silently rewrote float sentinels too. With #2158 the - kwarg is forwarded into the internal reader and both paths agree. - """ - src, original = _write_float32_with_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float32', -9999.0) - - r = read_vrt(vrt, chunks=2, mask_nodata=False) - - assert r.dtype == np.float32 - computed = r.compute() - assert not np.isnan(computed.values).any() - assert computed.values[1, 1] == np.float32(-9999.0) - assert computed.values[2, 1] == np.float32(-9999.0) - np.testing.assert_array_equal(computed.values, original) - assert computed.attrs.get('nodata') == -9999.0 - assert computed.attrs.get('masked_nodata') is False - - -def test_eager_and_chunked_agree_under_mask_nodata_false(tmp_path): - """Cross-path parity: eager and chunked produce the same buffer. - - Before #2158 the two paths could disagree because both rewrote - the sentinel inline but at slightly different points in the - pipeline. With the opt-out honored, both paths land on the - untouched source array. - """ - src, _ = _write_float32_with_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float32', -9999.0) - - eager = read_vrt(vrt, mask_nodata=False) - chunked = read_vrt(vrt, chunks=2, mask_nodata=False).compute() - - np.testing.assert_array_equal(eager.values, chunked.values) - assert eager.attrs.get('masked_nodata') == chunked.attrs.get( - 'masked_nodata') - - -# --------------------------------------------------------------------------- -# Fractional sentinel: float64 source with a non-integer nodata. -# --------------------------------------------------------------------------- - - -def test_mask_nodata_false_float64_fractional_sentinel(tmp_path): - """A fractional sentinel survives the float64 opt-out path. - - Float32 would round ``-9999.25`` to the nearest representable - value, so this corner is float64-only. With the opt-out honored - the pixel keeps its exact bit pattern. - """ - src, original = _write_float64_with_fractional_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float64', -9999.25, - filename='float64_2158.vrt', shape=(2, 2)) - - r = read_vrt(vrt, mask_nodata=False) - - assert r.dtype == np.float64 - assert r.values[1, 0] == -9999.25 - np.testing.assert_array_equal(r.values, original) - - -# --------------------------------------------------------------------------- -# Bit-exact equivalence: masked NaN positions match unmasked sentinel positions. -# --------------------------------------------------------------------------- - - -def test_masked_vs_unmasked_differ_only_at_sentinels(tmp_path): - """``mask_nodata=True`` and ``=False`` differ only where the sentinel hits. - - Every pixel that is NaN in the masked output equals the declared - sentinel in the unmasked output, and every non-sentinel pixel is - bit-identical between the two reads. This pins the contract that - the opt-out is a pure passthrough on the non-sentinel positions. - """ - src, _ = _write_float32_with_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float32', -9999.0) - - masked = read_vrt(vrt).values - unmasked = read_vrt(vrt, mask_nodata=False).values - - nan_positions = np.isnan(masked) - sentinel_positions = unmasked == np.float32(-9999.0) - np.testing.assert_array_equal(nan_positions, sentinel_positions) - # Non-sentinel pixels bit-identical between the two reads. - np.testing.assert_array_equal(masked[~nan_positions], - unmasked[~sentinel_positions]) - - -# --------------------------------------------------------------------------- -# Integer source feeding a float-dataType VRT: the other branch the fix gated. -# --------------------------------------------------------------------------- - - -def _write_uint16_with_sentinel(tmp_path, sentinel=65535, - filename='uint16_2158.tif'): - """uint16 GeoTIFF with a matching sentinel. - - Used to exercise the integer-source-feeding-float-VRT promotion at - ``_vrt.py:1351-1390``. With ``mask_nodata=True`` the sentinel pixel - surfaces as NaN in the float buffer; with ``mask_nodata=False`` the - literal integer value flows through the int->float cast and lands - as ``65535.0``. - """ - band = np.array([[1, 2], [3, sentinel]], dtype=np.uint16) - p = str(tmp_path / filename) - write(band, p, nodata=sentinel, compression='none', tiled=False) - return p, band - - -def test_int_source_float_vrt_mask_nodata_false_keeps_literal(tmp_path): - """Integer source feeding a Float32 VRT preserves the literal sentinel. - - Pins the second branch of the inline masking that #2158 gated. - Before the fix, ``_vrt._read_data`` ran the int->float-with-NaN - promotion unconditionally, so even ``mask_nodata=False`` lost the - sentinel. After the fix the integer source pixel survives the - int->float cast as ``65535.0`` and ``masked_nodata`` reflects - that no masking ran. - """ - src, _ = _write_uint16_with_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float32', 65535, - filename='int_float_2158.vrt', shape=(2, 2)) - - r = read_vrt(vrt, mask_nodata=False) - - assert r.dtype == np.float32 - # No NaN substitution -- the sentinel survives as a literal float. - assert not np.isnan(r.values).any() - assert r.values[1, 1] == np.float32(65535.0) - assert r.values[0, 0] == 1.0 - assert r.attrs.get('nodata') == 65535.0 - # Buffer is not NaN-aware even though dtype is float. - assert r.attrs.get('masked_nodata') is False - - -def test_int_source_float_vrt_default_still_promotes(tmp_path): - """Default ``mask_nodata=True`` still NaN-masks the int->float promotion. - - Baseline that documents the pre-#2158 contract for the integer - source path: the existing #1616 behavior is unchanged when the - opt-out is not requested. - """ - src, _ = _write_uint16_with_sentinel(tmp_path) - vrt = _build_vrt(tmp_path, src, 'Float32', 65535, - filename='int_float_default_2158.vrt', shape=(2, 2)) - - r = read_vrt(vrt) - - assert r.dtype == np.float32 - assert np.isnan(r.values[1, 1]) - assert r.values[0, 0] == 1.0 - assert r.attrs.get('nodata') == 65535.0 - assert r.attrs.get('masked_nodata') is True diff --git a/xrspatial/geotiff/tests/test_vrt_masked_nodata_attr_2159.py b/xrspatial/geotiff/tests/test_vrt_masked_nodata_attr_2159.py deleted file mode 100644 index 111ec43d8..000000000 --- a/xrspatial/geotiff/tests/test_vrt_masked_nodata_attr_2159.py +++ /dev/null @@ -1,247 +0,0 @@ -"""``masked_nodata`` on VRT paths must honour ``mask_nodata=`` (#2159). - -Both VRT call sites (eager at ``vrt.py:462`` and chunked at -``vrt.py:883``) used to compute ``attrs['masked_nodata']`` from the -buffer dtype alone: - - masked=(pre_cast_dtype.kind == 'f') # eager - masked=(declared_dtype.kind == 'f') # chunked - -That ignored the caller's ``mask_nodata`` opt-out. The dask, GPU, and -dask+GPU backends already gate on both signals -(``mask_nodata and dtype.kind == 'f'``); the contract at -``_attrs._set_nodata_attrs`` says the same. The VRT paths now match. - -These tests pin the attr-side fix only. Sibling issue #2158 covers the -behavioural mismatch where the VRT internal reader still NaN-masks -float sources under ``mask_nodata=False``; this test file does not -assert on whether the buffer contains literal sentinels or NaN, only -on what the attr reports. - -Coverage: -* Eager VRT: float source + ``mask_nodata=False`` reports ``False``. -* Chunked VRT: float source + ``mask_nodata=False`` reports ``False``. -* Eager VRT: float source + ``mask_nodata=True`` still reports ``True`` - (canonical direction, regression guard against an over-eager fix). -* Chunked VRT: float source + ``mask_nodata=True`` still reports - ``True``. -* Both VRT paths: int source + ``mask_nodata=False`` reports ``False`` - (the pre-fix rule already got this right; keep it green). -""" -from __future__ import annotations - -import numpy as np -import pytest - -from xrspatial.geotiff import open_geotiff, read_geotiff_dask - - -def _write_float_vrt(tmp_path, src_basename, vrt_basename, sentinel=-9999.0): - """Build a single-band float32 VRT with a declared sentinel. - - Layout mirrors the working pattern from - ``test_masked_nodata_attr_2092.py``: ``GeoTransform`` plus explicit - ``SrcRect`` / ``DstRect`` are required by the in-repo VRT reader. - """ - tifffile = pytest.importorskip("tifffile") - src = str(tmp_path / src_basename) - tifffile.imwrite(src, np.array( - [[1.0, 2.0, sentinel], - [4.0, sentinel, 6.0]], - dtype=np.float32, - ), metadata=None) - vrt = str(tmp_path / vrt_basename) - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - {sentinel} - - {src} - 1 - - - - - -""" - with open(vrt, 'w') as fh: - fh.write(vrt_xml) - return vrt - - -def _write_int_vrt(tmp_path, src_basename, vrt_basename, sentinel=30): - """Single-band int16 VRT with a declared sentinel.""" - tifffile = pytest.importorskip("tifffile") - src = str(tmp_path / src_basename) - tifffile.imwrite(src, np.array( - [[10, 20, 30], [40, 50, 60]], dtype=np.int16, - ), metadata=None) - vrt = str(tmp_path / vrt_basename) - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - {sentinel} - - {src} - 1 - - - - - -""" - with open(vrt, 'w') as fh: - fh.write(vrt_xml) - return vrt - - -# --- Eager VRT ---------------------------------------------------------- - - -def test_vrt_eager_float_source_mask_off_reports_false(tmp_path): - """Eager VRT + float source + ``mask_nodata=False`` must report - ``masked_nodata=False``. Pre-fix rule (dtype alone) said ``True``.""" - vrt = _write_float_vrt( - tmp_path, - "tmp_2159_eager_float_src.tif", - "tmp_2159_eager_unmasked.vrt", - ) - out = open_geotiff(vrt, mask_nodata=False) - assert out.attrs.get('nodata') == -9999.0 - assert out.attrs.get('masked_nodata') is False, ( - f"caller opted out of masking but attrs say " - f"masked_nodata={out.attrs.get('masked_nodata')!r}") - - -def test_vrt_eager_float_source_mask_on_reports_true(tmp_path): - """Canonical direction: float source + masking on. The masking - step runs, attr says True. Regression guard.""" - vrt = _write_float_vrt( - tmp_path, - "tmp_2159_eager_float_src_masked.tif", - "tmp_2159_eager_masked.vrt", - ) - out = open_geotiff(vrt) # mask_nodata defaults to True - assert out.attrs.get('nodata') == -9999.0 - assert out.attrs.get('masked_nodata') is True - - -def test_vrt_eager_int_source_mask_off_reports_false(tmp_path): - """Eager VRT + int source + ``mask_nodata=False``: integer helper - skipped, dtype stays int, attr says False. Pre-fix rule already - got this right (int dtype -> False); keep it green under the - new ``mask_nodata and dtype.kind == 'f'`` rule.""" - vrt = _write_int_vrt( - tmp_path, - "tmp_2159_eager_int_src.tif", - "tmp_2159_eager_int_unmasked.vrt", - ) - out = open_geotiff(vrt, mask_nodata=False) - assert out.dtype.kind == 'i' - assert out.attrs.get('masked_nodata') is False - - -def test_vrt_eager_float_source_mask_off_with_cast_reports_false(tmp_path): - """Eager VRT + float source + ``mask_nodata=False`` + ``dtype=float64`` - cast. Pre-fix used ``pre_cast_dtype.kind == 'f'`` so pre-cast is - float anyway and the rule said True. New rule short-circuits on - ``mask_nodata=False`` and says False. The caller-supplied cast is - still recorded via ``nodata_dtype_cast``.""" - vrt = _write_float_vrt( - tmp_path, - "tmp_2159_eager_float_src_cast.tif", - "tmp_2159_eager_unmasked_cast.vrt", - ) - out = open_geotiff(vrt, mask_nodata=False, dtype=np.float64) - assert out.dtype == np.float64 - assert out.attrs.get('masked_nodata') is False - assert out.attrs.get('nodata_dtype_cast') == 'float64' - - -# --- Chunked VRT -------------------------------------------------------- - - -def test_vrt_chunked_float_source_mask_off_reports_false(tmp_path): - """Chunked VRT path (``chunks=`` triggers ``_read_vrt_chunked``) - + float source + ``mask_nodata=False`` must report False.""" - vrt = _write_float_vrt( - tmp_path, - "tmp_2159_chunked_float_src.tif", - "tmp_2159_chunked_unmasked.vrt", - ) - out = read_geotiff_dask(vrt, chunks=2, mask_nodata=False) - assert out.attrs.get('nodata') == -9999.0 - assert out.attrs.get('masked_nodata') is False, ( - f"chunked VRT path: caller opted out of masking but attrs say " - f"masked_nodata={out.attrs.get('masked_nodata')!r}") - - -def test_vrt_chunked_float_source_mask_on_reports_true(tmp_path): - """Canonical direction on the chunked path: masking on, attr True.""" - vrt = _write_float_vrt( - tmp_path, - "tmp_2159_chunked_float_src_masked.tif", - "tmp_2159_chunked_masked.vrt", - ) - out = read_geotiff_dask(vrt, chunks=2) - assert out.attrs.get('nodata') == -9999.0 - assert out.attrs.get('masked_nodata') is True - - -def test_vrt_chunked_int_source_mask_off_reports_false(tmp_path): - """Chunked VRT + int source + ``mask_nodata=False``. ``declared_dtype`` - stays integer because the masking-driven float-promotion gate - earlier in the function is itself gated on ``mask_nodata``. - The attr says False under both the old and the new rule.""" - vrt = _write_int_vrt( - tmp_path, - "tmp_2159_chunked_int_src.tif", - "tmp_2159_chunked_int_unmasked.vrt", - ) - out = read_geotiff_dask(vrt, chunks=2, mask_nodata=False) - assert out.dtype.kind == 'i' - assert out.attrs.get('masked_nodata') is False - - -def test_vrt_chunked_float_source_mask_off_with_cast_reports_false(tmp_path): - """Chunked VRT + float source + ``mask_nodata=False`` + ``dtype=float64`` - cast. Same logic as the eager equivalent: caller opted out of - masking, attr is False even though the lazy graph dtype is float.""" - vrt = _write_float_vrt( - tmp_path, - "tmp_2159_chunked_float_src_cast.tif", - "tmp_2159_chunked_unmasked_cast.vrt", - ) - out = read_geotiff_dask( - vrt, chunks=2, mask_nodata=False, dtype=np.float64, - ) - assert out.dtype == np.float64 - assert out.attrs.get('masked_nodata') is False - assert out.attrs.get('nodata_dtype_cast') == 'float64' - - -# --- Cross-backend invariant ------------------------------------------- - - -def test_vrt_attr_matches_dask_backend_under_mask_off(tmp_path): - """Both VRT backends should report the same ``masked_nodata`` as - the regular dask backend does for an equivalent input. Pins the - cross-backend invariant the contract at - ``_attrs._set_nodata_attrs`` calls out.""" - # Float source, mask off, with an explicit float64 cast so the - # buffer dtype is float on every backend. - vrt = _write_float_vrt( - tmp_path, - "tmp_2159_xbackend_src.tif", - "tmp_2159_xbackend.vrt", - ) - - eager = open_geotiff(vrt, mask_nodata=False, dtype=np.float64) - chunked = read_geotiff_dask( - vrt, chunks=2, mask_nodata=False, dtype=np.float64, - ) - - assert eager.attrs.get('masked_nodata') is False - assert chunked.attrs.get('masked_nodata') is False - assert (eager.attrs.get('masked_nodata') - == chunked.attrs.get('masked_nodata')) diff --git a/xrspatial/geotiff/tests/test_vrt_metadata_parity_2321.py b/xrspatial/geotiff/tests/test_vrt_metadata_parity_2321.py deleted file mode 100644 index f51feca2f..000000000 --- a/xrspatial/geotiff/tests/test_vrt_metadata_parity_2321.py +++ /dev/null @@ -1,967 +0,0 @@ -"""VRT metadata parity tests across backends (issue #2321, sub-PR 3). - -Most VRT regression coverage today asserts pixel values. A VRT read -can return the right pixels with the wrong georeferencing attrs and -nothing in the current suite catches it -- the attrs sweep gets -single-source coverage from ``test_vrt_finalization_parity_2162`` but -no cross-backend pin on the metadata the contract promises. - -This module locks the cross-backend metadata contract for VRT reads: - -* eager (numpy) vs dask via ``open_geotiff(..., chunks=...)`` and - ``read_vrt(..., chunks=...)`` -- the public dispatcher path -* GPU eager via ``read_vrt(gpu=True)`` guarded by - ``pytest.importorskip`` - -Scope of coverage for this file. The following attrs get cross-backend -parity asserts here: - -* ``transform`` -* ``crs`` -* ``nodata`` -* ``masked_nodata`` -* ``georef_status`` -* ``raster_type`` (when the source is AREA_OR_POINT=Point; the area - default leaves the attr unset, so it is not in the required-key list) - -The following keys are intentionally OUT of scope for this file -- -the VRT path is documented to omit them, and the non-VRT backend -parity suite owns their cross-backend pin: - -* ``crs_wkt`` (compared via the ``crs`` EPSG integer instead, because - WKT text can re-emit under pyproj normalisation) -* ``gdal_metadata_xml`` -* ``extra_tags`` - -The negative tests pin the fail-closed posture for ambiguous VRT input: -mixed CRS, mixed per-band nodata, unsupported resampling, malformed -SrcRect / DstRect, and missing sources under both ``'raise'`` and -``'warn'`` policies. The VRT path must refuse to silently flatten -ambiguous metadata to one value -- a pixel-only check would miss this. - -PR 2 of the parent epic (``VRTUnsupportedError``) is not landed yet. -The negative tests assert against the current error type and carry a -``TODO(#2321)`` so the upgrade is mechanical when PR 2 lands. - -Temp file names include ``_2321_`` per ``CLAUDE.md`` to avoid -collisions in parallel runs. -""" -from __future__ import annotations - -import os -import pathlib - -import numpy as np -import pytest - -# Two writer imports because the fixture builders below have two -# shapes of input: -# - ``to_geotiff`` (public surface, takes an ``xr.DataArray``) for the -# full-coords / CRS-on-DataArray fixtures -# - ``write`` (``xrspatial.geotiff._writer``, takes a raw numpy array -# plus a ``nodata=`` kwarg) for the per-band integer fixtures where -# constructing a DataArray just to round-trip via to_geotiff would -# add nothing -from xrspatial.geotiff import (GeoTIFFFallbackWarning, MixedBandMetadataError, - open_geotiff, read_vrt, to_geotiff) -from xrspatial.geotiff._attrs import (GEOREF_STATUS_FULL, - GEOREF_STATUS_TRANSFORM_ONLY) -from xrspatial.geotiff._errors import VRTUnsupportedError -from xrspatial.geotiff._writer import write -from xrspatial.geotiff.tests.conftest import requires_gpu - - -# WKT for EPSG:4326. Same constant as the finalization parity module so -# the WKT-vs-EPSG comparison surface matches. -_WGS84_WKT = ( - 'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,' - 'AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,' - 'AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,' - 'AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]' -) - - -# Attrs the VRT path is documented to omit even when the underlying -# TIFF carries them. ``test_vrt_finalization_parity_2162`` already pins -# the precise omission list; the parity asserts below intersect rather -# than mirror it because the backend-vs-backend comparison only needs -# both sides to agree, not match an absolute set. -_VRT_OMITTED_ATTR_KEYS = frozenset({ - 'extra_tags', - 'image_description', - 'extra_samples', - 'gdal_metadata', - 'gdal_metadata_xml', - 'x_resolution', - 'y_resolution', - 'resolution_unit', - 'colormap', -}) - -# Attrs whose textual representation can differ between two readers -# even when the logical value matches. ``crs_wkt`` may be re-emitted -# by pyproj on one path and pass-through verbatim on another; the -# integer ``crs`` (EPSG) carries the same information for parity. -# ``transform`` shifts by a half-pixel between AREA_OR_POINT writers -# but the windowed reads in this module all share one writer, so the -# 6-tuple is comparable element-wise across backends. -_REPRESENTATION_KEYS = frozenset({'crs_wkt'}) - -# Attrs the eager path stamps but the dask path is documented to omit -# under the lazy-attrs contract (issue #2135). The dask backend cannot -# compute the presence flag without forcing a materialise, so the lazy -# build legitimately ships without it. Drop these from the cross-backend -# comparison so a dask read can match a numpy read on the rest of the -# contract. -_BACKEND_LIFECYCLE_KEYS = frozenset({'nodata_pixels_present'}) - - -# --------------------------------------------------------------------------- -# VRT fixture builders. Single-source, well-formed, contract-positive. -# --------------------------------------------------------------------------- - - -def _write_single_source_vrt( - tiff_path: str, - vrt_path: str, - *, - width: int, - height: int, - dtype_xml: str = "Float32", - nodata: float | int | None = None, - geo_transform: str | None = '0.0, 1.0, 0.0, 0.0, 0.0, -1.0', - srs: str | None = None, -) -> None: - """Write a 1-band VRT pointing at ``tiff_path``. - - Same writer style as ``test_vrt_finalization_parity_2162`` so the - two test modules share fixture geometry conventions. - """ - nodata_xml = ( - f" {nodata}\n" - if nodata is not None else '' - ) - srs_xml = f' {srs}\n' if srs is not None else '' - gt_xml = ( - f' {geo_transform}\n' - if geo_transform is not None else '' - ) - vrt_xml = ( - f'\n' - f'{gt_xml}' - f'{srs_xml}' - f' \n' - f'{nodata_xml}' - f' \n' - f' {tiff_path}' - f'\n' - f' 1\n' - f' \n' - f' \n' - f' \n' - f' \n' - f'\n' - ) - with open(vrt_path, 'w') as f: - f.write(vrt_xml) - - -def _build_full_georef_vrt(tmp_path: pathlib.Path) -> str: - """4x4 float32 single-source VRT with full georef + nodata.""" - import xarray as xr - - tiff = str(tmp_path / 'tmp_2321_full_src.tif') - vrt = str(tmp_path / 'tmp_2321_full.vrt') - data = np.arange(16, dtype=np.float32).reshape(4, 4) - da = xr.DataArray( - data, - coords={ - 'y': np.array([200.0, 199.0, 198.0, 197.0]), - 'x': np.array([100.0, 101.0, 102.0, 103.0]), - }, - dims=('y', 'x'), - attrs={'crs': 4326}, - ) - to_geotiff(da, tiff) - _write_single_source_vrt( - tiff, vrt, width=4, height=4, dtype_xml='Float32', - nodata=-9999.0, - geo_transform='100.0, 1.0, 0.0, 200.0, 0.0, -1.0', - srs=_WGS84_WKT, - ) - return vrt - - -def _build_transform_only_vrt(tmp_path: pathlib.Path) -> str: - """4x4 single-source VRT with transform but no SRS (CRS absent).""" - import xarray as xr - - tiff = str(tmp_path / 'tmp_2321_tonly_src.tif') - vrt = str(tmp_path / 'tmp_2321_tonly.vrt') - data = np.arange(16, dtype=np.float32).reshape(4, 4) - da = xr.DataArray( - data, - coords={ - 'y': np.array([200.0, 199.0, 198.0, 197.0]), - 'x': np.array([100.0, 101.0, 102.0, 103.0]), - }, - dims=('y', 'x'), - ) - to_geotiff(da, tiff) - _write_single_source_vrt( - tiff, vrt, width=4, height=4, dtype_xml='Float32', - geo_transform='100.0, 1.0, 0.0, 200.0, 0.0, -1.0', - srs=None, - ) - return vrt - - -def _build_integer_with_nodata_vrt(tmp_path: pathlib.Path) -> str: - """4x4 uint16 single-source VRT with declared nodata sentinel. - - Used for ``masked_nodata`` parity: the integer-with-sentinel source - must promote to float64 with NaN-masked sentinel pixels in every - backend and stamp ``attrs['masked_nodata']=True``. - """ - src_arr = np.array( - [[1, 2, 3, 4], - [5, 6, 7, 65535], - [9, 10, 11, 12], - [13, 14, 15, 16]], - dtype=np.uint16, - ) - tiff = str(tmp_path / 'tmp_2321_int_src.tif') - vrt = str(tmp_path / 'tmp_2321_int.vrt') - write(src_arr, tiff, nodata=65535, compression='none', tiled=False) - _write_single_source_vrt( - tiff, vrt, width=4, height=4, dtype_xml='UInt16', - nodata=65535, - geo_transform='0.0, 1.0, 0.0, 0.0, 0.0, -1.0', - srs=_WGS84_WKT, - ) - return vrt - - -# --------------------------------------------------------------------------- -# Cross-backend reader helpers. -# --------------------------------------------------------------------------- - - -def _read_eager_numpy(vrt_path: str): - """Eager numpy via the dispatcher (mirrors public surface).""" - return open_geotiff(vrt_path) - - -def _read_dask(vrt_path: str): - """Dask via the dispatcher, then ``compute()`` for value parity.""" - lazy = open_geotiff(vrt_path, chunks=2) - return lazy.compute() - - -def _read_dask_chunks_2(vrt_path: str): - """Dask via the dispatcher, lazy (no compute). - - Used for negative-tests that pin the build-time raise contract - (e.g., ``test_mixed_nodata_vrt_fails_closed_by_default``). Named - at module scope so pytest test ids render as - ``[dask_chunks_2-_read_dask_chunks_2]`` rather than the cryptic - ``[dask_chunks_2-]`` an inline lambda would produce. - """ - return open_geotiff(vrt_path, chunks=2) - - -def _read_gpu_eager(vrt_path: str): - """GPU eager via ``read_vrt(gpu=True)``. - - ``open_geotiff(..., gpu=True)`` rejects ``.vrt`` sources up front - (the dispatcher routes ``.vrt`` to ``read_vrt`` and ``read_vrt`` - owns the ``gpu`` kwarg, see ``_backends/vrt.py``). Use the direct - entry point here so the GPU eager path is exercised. - """ - return read_vrt(vrt_path, gpu=True) - - -# Backends used by the cross-backend parity sweep. The GPU entry is -# parametrized in but skipped without cupy + a working CUDA device. -# Reuses the project-wide ``requires_gpu`` skip marker from -# ``xrspatial.geotiff.tests.conftest`` so the import-time CUDA probe -# stays canonical -- a local re-implementation would risk drift from -# the shared ``gpu_available()`` helper. -_BACKENDS = [ - pytest.param('numpy', _read_eager_numpy, id='numpy'), - pytest.param('dask', _read_dask, id='dask'), - pytest.param('gpu', _read_gpu_eager, id='gpu', marks=requires_gpu), -] - - -def _comparable_attrs(attrs: dict) -> dict: - """Filter attrs down to the cross-backend comparable subset. - - Drops the documented VRT-omitted keys (which may differ if one - backend stamps a TIFF-specific key while another does not) and the - representation-only keys (``crs_wkt``). - """ - return { - k: v for k, v in attrs.items() - if k not in _VRT_OMITTED_ATTR_KEYS - and k not in _REPRESENTATION_KEYS - and k not in _BACKEND_LIFECYCLE_KEYS - } - - -def _to_numpy(arr) -> np.ndarray: - """Return a host-side numpy view of ``arr.values`` regardless of - backend. - - CuPy DataArrays have a ``.values`` accessor that triggers an - implicit host transfer in some xarray versions but not others; use - the explicit ``.data.get()`` path for cupy buffers per CLAUDE.md. - """ - data = arr.data - if hasattr(data, 'get'): # cupy ndarray - return data.get() - return np.asarray(data) - - -# --------------------------------------------------------------------------- -# Positive parity tests: same VRT, every backend, identical metadata. -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize('_label, reader', _BACKENDS) -def test_full_georef_vrt_attrs_match_eager_numpy( - tmp_path, _label, reader, -): - """Each non-numpy backend's attrs must match the eager numpy baseline. - - The full-georef VRT carries CRS, transform, nodata, and an - integer-source-promotes-to-float lifecycle. Every attr the contract - promises (``transform``, ``crs``, ``nodata``, ``masked_nodata``, - ``georef_status``, ``raster_type``) must compare equal across - backends. ``crs_wkt`` is compared via the ``crs`` integer instead - because the WKT text can re-emit under pyproj normalisation. - - Without this assertion a backend regression that drops one of - these attrs but still returns correct pixels would slip through - every existing pixel-only test. - """ - vrt = _build_full_georef_vrt(tmp_path) - - baseline = _read_eager_numpy(vrt) - candidate = reader(vrt) - - base_attrs = _comparable_attrs(dict(baseline.attrs)) - cand_attrs = _comparable_attrs(dict(candidate.attrs)) - - # Hard equality on the helper-stamped attr block. - base_keys = set(base_attrs) - cand_keys = set(cand_attrs) - assert base_keys == cand_keys, ( - f"Attr-key drift between numpy and {_label}: " - f"numpy-only={base_keys - cand_keys}, " - f"{_label}-only={cand_keys - base_keys}" - ) - differing = [ - k for k in base_keys - if base_attrs[k] != cand_attrs[k] - ] - assert not differing, ( - f"Attr value drift between numpy and {_label}: " - f"{[(k, base_attrs[k], cand_attrs[k]) for k in differing]}" - ) - - # The promises of the contract: each key is present and well-formed. - # ``raster_type`` is only stamped for AREA_OR_POINT=Point sources; - # the area default leaves the attr unset, so it is not part of the - # required-key list. Pin the keys that the contract guarantees on - # every full-georef read here. - for key in ('transform', 'crs', 'georef_status'): - assert key in cand_attrs, ( - f"{_label} backend missing required attr {key!r}" - ) - assert cand_attrs['georef_status'] == GEOREF_STATUS_FULL - assert cand_attrs['crs'] == 4326 - assert len(cand_attrs['transform']) == 6 - - -@pytest.mark.parametrize('_label, reader', _BACKENDS) -def test_full_georef_vrt_pixels_match_eager_numpy( - tmp_path, _label, reader, -): - """Pixel-value parity for the full-georef VRT. - - Twin of the attrs test above: a regression that fixed attrs but - broke pixels (or vice versa) must surface on at least one of the - two. Asserting both side-by-side keeps the surface explicit. - """ - vrt = _build_full_georef_vrt(tmp_path) - - base = _to_numpy(_read_eager_numpy(vrt)) - cand = _to_numpy(reader(vrt)) - - assert base.shape == cand.shape, ( - f"shape drift numpy vs {_label}: {base.shape} vs {cand.shape}" - ) - np.testing.assert_array_equal(base, cand) - - -@pytest.mark.parametrize('_label, reader', _BACKENDS) -def test_full_georef_vrt_coords_match_eager_numpy( - tmp_path, _label, reader, -): - """Coord-array parity for the full-georef VRT. - - The transform attr alone does not guarantee correct coords: the - half-pixel AREA_OR_POINT shift can drift between backends. Compare - the actual coord arrays so a coord regression surfaces directly. - """ - vrt = _build_full_georef_vrt(tmp_path) - - base = _read_eager_numpy(vrt) - cand = reader(vrt) - - assert list(cand.dims) == list(base.dims), ( - f"dim drift numpy vs {_label}: {base.dims} vs {cand.dims}" - ) - for axis in ('y', 'x'): - np.testing.assert_array_equal( - np.asarray(cand[axis].values), - np.asarray(base[axis].values), - ) - - -@pytest.mark.parametrize('_label, reader', _BACKENDS) -def test_transform_only_vrt_attrs_match_eager_numpy( - tmp_path, _label, reader, -): - """Same parity sweep on a transform-only VRT (no CRS). - - ``georef_status`` must be ``transform_only`` on every backend and - ``attrs['crs']`` must be absent on every backend. A regression - that emits a stale CRS from a TIFF-tag fallback would show up here - as a key-set diff. - """ - vrt = _build_transform_only_vrt(tmp_path) - - baseline = _read_eager_numpy(vrt) - candidate = reader(vrt) - - base_attrs = _comparable_attrs(dict(baseline.attrs)) - cand_attrs = _comparable_attrs(dict(candidate.attrs)) - - assert set(base_attrs) == set(cand_attrs) - assert base_attrs == cand_attrs - assert cand_attrs['georef_status'] == GEOREF_STATUS_TRANSFORM_ONLY - assert 'crs' not in cand_attrs - - -@pytest.mark.parametrize('_label, reader', _BACKENDS) -def test_integer_nodata_vrt_attrs_match_eager_numpy( - tmp_path, _label, reader, -): - """``masked_nodata`` and ``nodata`` lifecycle parity on integer VRT. - - The integer-with-sentinel source must promote to float on every - backend and stamp ``attrs['masked_nodata']=True`` plus - ``attrs['nodata']=65535`` (the original sentinel). A backend that - forgets to stamp ``masked_nodata`` would silently mislead callers - who branch on the attr to decide whether NaN is real or a mask. - """ - vrt = _build_integer_with_nodata_vrt(tmp_path) - - baseline = _read_eager_numpy(vrt) - candidate = reader(vrt) - - base_attrs = _comparable_attrs(dict(baseline.attrs)) - cand_attrs = _comparable_attrs(dict(candidate.attrs)) - - assert set(base_attrs) == set(cand_attrs) - assert base_attrs == cand_attrs - # Lifecycle invariants regardless of backend: - assert cand_attrs.get('masked_nodata') is True - assert cand_attrs.get('nodata') == 65535 - - -@pytest.mark.parametrize('_label, reader', _BACKENDS) -def test_integer_nodata_vrt_pixels_match_eager_numpy( - tmp_path, _label, reader, -): - """Pixel parity for the integer-VRT case. - - Twin of the attrs test so a backend regression that masks but - forgets the attr (or stamps the attr but masks the wrong cell) - fails one assertion or the other, never both silently. - """ - vrt = _build_integer_with_nodata_vrt(tmp_path) - - base = _to_numpy(_read_eager_numpy(vrt)) - cand = _to_numpy(reader(vrt)) - - assert base.shape == cand.shape - # Promoted-to-float64 array with NaN at the sentinel cell. - np.testing.assert_array_equal(np.isnan(base), np.isnan(cand)) - base_finite = base[~np.isnan(base)] - cand_finite = cand[~np.isnan(cand)] - np.testing.assert_array_equal(base_finite, cand_finite) - - -# --------------------------------------------------------------------------- -# Negative tests: ambiguous metadata must fail closed. -# --------------------------------------------------------------------------- - - -def _write_mixed_crs_vrt(tmp_path: pathlib.Path) -> str: - """Two single-band sources with disagreeing CRS at the VRT. - - The VRT XML carries one SRS (WGS84) but the second underlying TIFF - carries a UTM CRS. The fail-closed contract calls for the read to - reject this up front, but today the per-source CRS check does NOT - surface the conflict: the read succeeds and silently flattens to - the VRT-declared SRS. See the xfail on - ``test_mixed_crs_vrt_does_not_silently_flatten`` for the - consumer-side pin and the gap PR 2 must close. - - TODO(#2321): when sub-PR 2 (`VRTUnsupportedError`) lands, the - centralised validator must reject the mixed-CRS VRT up front with - a typed error; switch the ``pytest.raises`` on the consumer test - to that type and drop the broad ``Exception`` fallback. - """ - import xarray as xr - - # Same shape, same transform, deliberately different CRS per source. - src0 = tmp_path / 'tmp_2321_mix_crs_src0.tif' - src1 = tmp_path / 'tmp_2321_mix_crs_src1.tif' - - data = np.arange(16, dtype=np.float32).reshape(4, 4) - da0 = xr.DataArray( - data, - coords={ - 'y': np.array([200.0, 199.0, 198.0, 197.0]), - 'x': np.array([100.0, 101.0, 102.0, 103.0]), - }, - dims=('y', 'x'), - attrs={'crs': 4326}, - ) - da1 = xr.DataArray( - data, - coords={ - # Adjacent on x so the mosaic could in principle assemble. - 'y': np.array([200.0, 199.0, 198.0, 197.0]), - 'x': np.array([104.0, 105.0, 106.0, 107.0]), - }, - dims=('y', 'x'), - attrs={'crs': 32633}, - ) - to_geotiff(da0, str(src0)) - to_geotiff(da1, str(src1)) - - vrt_path = tmp_path / 'tmp_2321_mixed_crs.vrt' - # VRT XML declares WGS84 but the underlying second source is UTM. - vrt_xml = f""" - 100.0, 1.0, 0.0, 200.0, 0.0, -1.0 - {_WGS84_WKT} - - - {src0} - 1 - - - - - {src1} - 1 - - - - - -""" - vrt_path.write_text(vrt_xml) - return str(vrt_path) - - -@pytest.mark.xfail( - reason=( - "Mixed-CRS VRT currently silently flattens to the VRT-declared " - "SRS (#2321 gap). The validator from sub-PR 2 must reject this " - "with a typed error at graph build / eager-read setup; once that " - "lands, drop the xfail and tighten the assertion to " - "VRTUnsupportedError. Today the read produces a mosaic whose " - "attrs['crs'] reports only the VRT-declared CRS while the " - "second source's UTM data has been silently incorporated." - ), - strict=True, -) -def test_mixed_crs_vrt_does_not_silently_flatten(tmp_path): - """A mixed-CRS VRT must not return a mosaic that silently inherits - one source's CRS while pixels came from a CRS-incompatible source. - - This is the gap that motivates sub-PR 2 of the parent epic: the - VRT XML declares one SRS, the underlying sources disagree, and - the reader hands back a single ``attrs['crs']`` as if everything - were homogeneous. The pixel content is no longer geospatially - meaningful once the underlying CRSs disagree, but no error fires. - - TODO(#2321): drop the xfail once sub-PR 2's validator rejects the - mixed-CRS input with ``VRTUnsupportedError`` at graph build time - (or eager-read setup). The expected assertion will be - ``pytest.raises(VRTUnsupportedError)`` around ``read_vrt(vrt)``. - - ``strict=True`` so the test flips to XPASS the moment the gap is - fixed -- CI will fail loudly, prompting the upgrade to a proper - raise assertion. That is the desired posture: a finding pinned in - test form, not silently passing. - """ - vrt = _write_mixed_crs_vrt(tmp_path) - # Today this read succeeds and produces an attrs blob that names - # only the VRT-declared CRS, ignoring the second source's UTM CRS. - # The xfail above documents the gap; this assertion is what the - # contract requires after sub-PR 2 lands. Catching Exception is - # intentional until PR 2 lands and a typed error class exists; - # narrow this to ``VRTUnsupportedError`` once that imports cleanly. - with pytest.raises(Exception): - read_vrt(vrt) - - -def _write_mixed_nodata_vrt(tmp_path: pathlib.Path) -> str: - """Two-band uint16 VRT with disagreeing per-band ````. - - Mirrors the fixture in ``test_vrt_multiband_int_nodata_1611``: the - fail-closed default (band_nodata=None) must raise - ``MixedBandMetadataError``. The opt-out - ``band_nodata='first'`` is the explicit escape hatch. - """ - b0_arr = np.array( - [[1, 2], [3, 65535]], dtype=np.uint16 - ) - b1_arr = np.array( - [[7, 8], [9, 65000]], dtype=np.uint16 - ) - p0 = tmp_path / 'tmp_2321_mix_nodata_b0.tif' - p1 = tmp_path / 'tmp_2321_mix_nodata_b1.tif' - write(b0_arr, str(p0), nodata=65535, compression='none', tiled=False) - write(b1_arr, str(p1), nodata=65000, compression='none', tiled=False) - - vrt_path = tmp_path / 'tmp_2321_mix_nodata.vrt' - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - 65535 - - {p0} - 1 - - - - - - 65000 - - {p1} - 1 - - - - - -""" - vrt_path.write_text(vrt_xml) - return str(vrt_path) - - -@pytest.mark.parametrize( - 'reader_label, reader', - [ - ('eager_numpy', _read_eager_numpy), - ('dask_chunks_2', _read_dask_chunks_2), - ], -) -def test_mixed_nodata_vrt_fails_closed_by_default( - tmp_path, reader_label, reader, -): - """Per-band disagreeing nodata raises ``MixedBandMetadataError`` - by default on every backend route. - - The dask path's check fires at graph-build time (the metadata - sweep runs before dask materialises any chunk). The eager path - raises during the dispatcher's metadata validation. Both must - refuse rather than flattening to band 0's sentinel. - - TODO(#2321): if sub-PR 2 reroutes this through - ``VRTUnsupportedError``, accept either type here (subclassing or - composition). - """ - vrt = _write_mixed_nodata_vrt(tmp_path) - - with pytest.raises(MixedBandMetadataError): - result = reader(vrt) - # Dask path can defer to ``.compute()`` for value reads, but - # the parity contract requires the build-time raise per #2265. - # If the build call returned without raising, force a compute - # so a regression that defers the check still trips the - # assertion. ``result`` is unreachable on the eager path. - if hasattr(result, 'compute'): - result.compute() - - -def test_mixed_nodata_vrt_opt_in_first_succeeds(tmp_path): - """``band_nodata='first'`` is the documented opt-out for the - mixed-nodata fail-closed check. - - Positive pin so a future change that breaks the escape hatch - surfaces here. The opt-out flattens to band 0's sentinel, which - is the legacy behaviour callers may explicitly want. - """ - vrt = _write_mixed_nodata_vrt(tmp_path) - result = read_vrt(vrt, band_nodata='first') - # Returned at all -> opt-out is still wired. - assert result.shape == (2, 2, 2) - - -def _write_unsupported_resample_vrt(tmp_path: pathlib.Path) -> str: - """VRT with ``Bilinear`` and a size-changing DstRect. - - A 4x4 source projected into a 2x2 destination with Bilinear must - raise because the implementation only honours nearest-neighbour - resampling at the placement site. See #1751. - """ - src_arr = np.arange(16, dtype=np.uint16).reshape(4, 4) - src_path = tmp_path / 'tmp_2321_resample_src.tif' - write(src_arr, str(src_path), compression='none', tiled=False) - - vrt_path = tmp_path / 'tmp_2321_unsupported_resample.vrt' - vrt_xml = f""" - 0.0, 2.0, 0.0, 0.0, 0.0, -2.0 - - - {src_path} - 1 - - - Bilinear - - - -""" - vrt_path.write_text(vrt_xml) - return str(vrt_path) - - -def test_unsupported_resample_alg_raises(tmp_path): - """A non-nearest resampling algorithm with a size-changing DstRect - must raise ``NotImplementedError`` rather than return - silently-nearest-sampled pixels mislabelled as Bilinear. - - The ``match=`` clause pins the algorithm name and the issue number - so an unrelated ``NotImplementedError`` from some other VRT code - path cannot keep the test green. See ``_vrt.py`` for the existing - raise that names both fields. - - TODO(#2321): when sub-PR 2 lands the typed ``VRTUnsupportedError`` - should be raised here instead; accept either today. - """ - vrt = _write_unsupported_resample_vrt(tmp_path) - # Sub-PR 2 (#2329) landed the centralised VRT validator, which - # now raises ``VRTUnsupportedError`` for this case. - with pytest.raises( - (NotImplementedError, VRTUnsupportedError), match=r"Bilinear|1751", - ): - read_vrt(vrt) - - -def _write_bad_srcrect_vrt( - tmp_path: pathlib.Path, *, x_size: int = -50, -) -> str: - """VRT with a negative-size ````. - - See #1784: the validator must reject this up front rather than - swallow it in the missing-source ``try/except``. - """ - src_arr = np.zeros((10, 10), dtype=np.uint8) - src_path = tmp_path / 'tmp_2321_bad_srcrect_src.tif' - to_geotiff(src_arr, str(src_path), compression='none') - - vrt_path = tmp_path / 'tmp_2321_bad_srcrect.vrt' - vrt_xml = ( - f'\n' - f' \n' - f' \n' - f' {src_path}' - f'\n' - f' 1\n' - f' \n' - f' \n' - f' \n' - f' \n' - f'\n' - ) - vrt_path.write_text(vrt_xml) - return str(vrt_path) - - -def test_negative_srcrect_size_rejected(tmp_path): - """Malformed ``SrcRect`` rejected with a ``ValueError`` that names - the offending field. - - TODO(#2321): centralise this rejection in PR 2's validator and - upgrade to ``VRTUnsupportedError``. - """ - vrt = _write_bad_srcrect_vrt(tmp_path, x_size=-50) - # Sub-PR 2 (#2329) centralised this rejection in the validator and - # upgraded it to ``VRTUnsupportedError``; accept either today. - with pytest.raises( - (ValueError, VRTUnsupportedError), - match=r"SrcRect.*negative", - ): - read_vrt(vrt) - - -def _write_bad_dstrect_vrt( - tmp_path: pathlib.Path, *, x_size: int = -10, -) -> str: - """VRT with a negative-size ```` for the negative test. - - Mirrors the DstRect rejection added for #1737; the regression - coverage today targets oversized DstRects, this test pins the - sister case for negative dimensions. - """ - src_arr = np.zeros((10, 10), dtype=np.uint8) - src_path = tmp_path / 'tmp_2321_bad_dstrect_src.tif' - to_geotiff(src_arr, str(src_path), compression='none') - - vrt_path = tmp_path / 'tmp_2321_bad_dstrect.vrt' - vrt_xml = ( - f'\n' - f' \n' - f' \n' - f' {src_path}' - f'\n' - f' 1\n' - f' \n' - f' \n' - f' \n' - f' \n' - f'\n' - ) - vrt_path.write_text(vrt_xml) - return str(vrt_path) - - -def test_negative_dstrect_size_rejected(tmp_path): - """Malformed ``DstRect`` must not survive into the read path. - - Accept ``ValueError`` (today's posture; the SimpleSource DstRect - validator raises ``VRT SimpleSource DstRect has negative size - (...)`` before any pixel work begins). The ``match=`` clause pins - the field name and the rejection reason so an unrelated - ``ValueError`` from some other VRT code path cannot silently keep - the test green. - - TODO(#2321): tighten to ``VRTUnsupportedError`` when PR 2 ships. - """ - vrt = _write_bad_dstrect_vrt(tmp_path, x_size=-10) - # Sub-PR 2 (#2329) centralised this rejection in the validator and - # upgraded it to ``VRTUnsupportedError``; accept either today. - with pytest.raises( - (ValueError, VRTUnsupportedError), - match=r"DstRect.*negative", - ): - read_vrt(vrt) - - -def _write_missing_source_vrt( - tmp_path: pathlib.Path, *, name: str = 'tmp_2321_missing.vrt', -) -> str: - """VRT pointing at a single source path that does not exist. - - The dispatcher's static missing-source sweep (#2265) raises at - construction time for both eager and dask routes when - ``missing_sources='raise'`` is in effect. - """ - vrt_path = tmp_path / name - # Reference a path inside the tmp dir that we never create. - missing = tmp_path / 'tmp_2321_missing_src.tif' - vrt_xml = ( - f'\n' - f' 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n' - f' \n' - f' \n' - f' {missing}' - f'\n' - f' 1\n' - f' \n' - f' \n' - f' \n' - f' \n' - f'\n' - ) - vrt_path.write_text(vrt_xml) - assert not os.path.exists(str(missing)), ( - "fixture leak: missing-source path exists on disk" - ) - return str(vrt_path) - - -def test_missing_sources_raise_eager(tmp_path): - """``missing_sources='raise'`` (the public default since #1860) - must abort the read up front on the eager path.""" - vrt = _write_missing_source_vrt(tmp_path, name='tmp_2321_miss_eager.vrt') - with pytest.raises((OSError, ValueError, FileNotFoundError)): - read_vrt(vrt) - - -def test_missing_sources_raise_dask(tmp_path): - """``missing_sources='raise'`` (default) on the dask path raises - at graph-build time per #2265, not at ``.compute()``. - - Pin both the build-time raise and the value path so a regression - that defers the check to compute surfaces here. - """ - vrt = _write_missing_source_vrt(tmp_path, name='tmp_2321_miss_dask.vrt') - with pytest.raises((OSError, ValueError, FileNotFoundError)): - # Build-time raise required by the contract; if the implementation - # ever defers to compute, the test still fails on the materialise - # call because the exception type is the same. - lazy = open_geotiff(vrt, chunks=2) - lazy.compute() - - -def test_missing_sources_warn_records_holes(tmp_path): - """``missing_sources='warn'`` is the documented escape hatch. - - The lenient path must emit ``GeoTIFFFallbackWarning`` and populate - ``attrs['vrt_holes']`` so callers branching on the attr can detect - a partial mosaic. This is the contract documented in #1734 / #1843; - the test pins it via the public ``read_vrt`` entry point so a - regression in the warn-policy attr emission surfaces. - - The plan calls for parity tests against ``missing_sources='skip'``; - the public API exposes ``'warn'`` as the lenient option (skip is - used internally inside ``_vrt.read_vrt``). Use the documented public - value here so the test pins the user-facing contract. - """ - vrt = _write_missing_source_vrt(tmp_path, name='tmp_2321_miss_warn.vrt') - - with pytest.warns(GeoTIFFFallbackWarning, match="could not be read"): - result = read_vrt(vrt, missing_sources='warn') - - # The attrs contract for the lenient path requires both keys: - # ``vrt_holes`` lists the skipped sources, and the array exists. - assert 'vrt_holes' in result.attrs, ( - "missing_sources='warn' did not stamp attrs['vrt_holes']" - ) - holes = result.attrs['vrt_holes'] - assert len(holes) == 1 - # The hole entry should name the skipped source so downstream - # consumers can audit what was dropped. The shape pinned in #1734 - # is a dict with a ``source`` key; pin it as a hard assertion so a - # future refactor that changes the entry type (e.g., dataclass) - # surfaces here instead of silently weakening the path check. - assert isinstance(holes[0], dict), ( - f"vrt_holes entry type drifted: {type(holes[0]).__name__}; " - f"#1734 documents a dict shape" - ) - hole_source = holes[0]['source'] - assert 'tmp_2321_missing_src.tif' in hole_source, ( - f"hole source path drifted: {hole_source!r}" - ) diff --git a/xrspatial/geotiff/tests/test_vrt_multiband_dtype_1696.py b/xrspatial/geotiff/tests/test_vrt_multiband_dtype_1696.py deleted file mode 100644 index 10bf1b23a..000000000 --- a/xrspatial/geotiff/tests/test_vrt_multiband_dtype_1696.py +++ /dev/null @@ -1,435 +0,0 @@ -"""Regression tests for issue #1696. - -``read_vrt`` on a multi-band VRT used to allocate the output buffer from -``selected_bands[0].dtype`` only. Each band's source array was then -placed into the buffer via ``result[..., band_idx] = src_arr[...]``, -which silently casts to the LHS dtype. So a ``Byte`` band 0 followed by -a ``Float32`` band 1 returned uint8 output, with band 1's fractional -values truncated. - -The ``ComplexSource`` ``ScaleRatio`` / ``ScaleOffset`` path made this -worse. The decoded source is explicitly promoted to ``float64`` in the -``# Apply ComplexSource scaling`` block of ``read_vrt`` in ``_vrt.py`` -(``src_arr.astype(np.float64) * src.scale``), but the destination -buffer stays uint8 if all VRT bands declare ``Byte``, so the -post-scale fractional values are lost on assignment. - -The fix computes the common result dtype across all selected bands, -applying the same float64 promotion rule used for the scaled source -arrays, before allocating ``result``. The single-band branch uses the -same logic. -""" -from __future__ import annotations - -import numpy as np - -from xrspatial.geotiff import read_vrt -from xrspatial.geotiff._writer import write - - -def _write(arr, path, **kw): - """Write a 2D array to ``path`` with sensible defaults for tests.""" - write(arr, str(path), compression='none', tiled=False, **kw) - - -def _build_two_band_vrt(tmp_path, *, b0_dtype_str, b0_path, b1_dtype_str, - b1_path, b1_extra="", b0_extra="", - filename='mb.vrt', size=2): - """Hand-roll a two-band VRT with arbitrary dataType strings.""" - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - -{b0_extra} - {b0_path} - 1 - - - - - -{b1_extra} - {b1_path} - 1 - - - - -""" - p = tmp_path / filename - p.write_text(vrt_xml) - return str(p) - - -def _build_complex_source_vrt(tmp_path, *, dtype_str, src_path, - scale_ratio=None, scale_offset=None, - filename='cs.vrt', size=2, - band_num=2, other_band_dtype="Byte", - other_band_path=None, - extra_band=True): - """Hand-roll a VRT where band 2 (or the only band) uses ComplexSource. - - ``extra_band=False`` writes a single-band VRT. - """ - cs_lines = [] - if scale_ratio is not None: - cs_lines.append(f" {scale_ratio}") - if scale_offset is not None: - cs_lines.append(f" {scale_offset}") - cs_inner = "\n".join(cs_lines) - - complex_block = f""" - {src_path} - 1 -{cs_inner} - - - -""" - - if extra_band and other_band_path is not None: - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - - {other_band_path} - 1 - - - - - -{complex_block} -""" - else: - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - -{complex_block} -""" - p = tmp_path / filename - p.write_text(vrt_xml) - return str(p) - - -# --------------------------------------------------------------------------- -# 1. Mixed-dtype bands: Byte + Float32 must not truncate band 1 -# --------------------------------------------------------------------------- - -def test_mixed_byte_and_float32_bands_preserve_fractional(tmp_path): - """``Byte`` band 0 + ``Float32`` band 1: band 1's fractional values - must survive the read. Before the fix the buffer was allocated as - uint8 and ``1.5, 2.5, 3.5, 4.5`` truncated to ``1, 2, 3, 4``. - """ - b0 = np.array([[10, 11], [12, 13]], dtype=np.uint8) - b1 = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float32) - p0 = tmp_path / 'b0.tif' - p1 = tmp_path / 'b1.tif' - _write(b0, p0) - _write(b1, p1) - - vrt_path = _build_two_band_vrt( - tmp_path, - b0_dtype_str='Byte', b0_path=str(p0), - b1_dtype_str='Float32', b1_path=str(p1), - ) - r = read_vrt(vrt_path) - # numpy 2.x: result_type(uint8, float32) == float32; numpy 1.x: float64. - # Either is acceptable; both express the float values losslessly. - assert r.dtype.kind == 'f', ( - f"Mixed Byte+Float32 must widen to float; got {r.dtype}" - ) - np.testing.assert_allclose(r.values[..., 1], b1.astype(r.dtype)) - np.testing.assert_array_equal(r.values[..., 0], b0.astype(r.dtype)) - - -# --------------------------------------------------------------------------- -# 2. ComplexSource ScaleRatio forces float promotion of the buffer -# --------------------------------------------------------------------------- - -def test_complex_source_scale_promotes_buffer_to_float(tmp_path): - """Both bands declare ``Byte`` but band 1 has ``0.5``. - The scaled source values include fractional results (11 * 0.5 = 5.5) - which must survive. Before the fix the buffer stayed uint8 and the - fractional values rounded down to 5. - """ - b = np.array([[10, 11], [12, 13]], dtype=np.uint8) - p0 = tmp_path / 'b0.tif' - p1 = tmp_path / 'b1.tif' - _write(b, p0) - _write(b, p1) - - vrt_path = _build_complex_source_vrt( - tmp_path, - dtype_str='Byte', src_path=str(p1), - scale_ratio=0.5, - other_band_dtype='Byte', other_band_path=str(p0), - ) - r = read_vrt(vrt_path) - assert r.dtype.kind == 'f', ( - f"ScaleRatio on a Byte band must widen the buffer to float; " - f"got {r.dtype}" - ) - expected = b.astype(np.float64) * 0.5 - np.testing.assert_allclose(r.values[..., 1], expected) - # Band 0 (unscaled Byte) survives losslessly through the wider dtype. - np.testing.assert_array_equal( - r.values[..., 0].astype(np.uint8), b - ) - - -# --------------------------------------------------------------------------- -# 3. All-Byte bands with no scaling stay uint8 (no needless widening) -# --------------------------------------------------------------------------- - -def test_all_byte_no_scaling_stays_uint8(tmp_path): - """Two ``Byte`` bands with no ``ComplexSource`` scaling: the result - must stay uint8 (memory regression guard). The fix must not widen - unconditionally to float64. - """ - b0 = np.array([[10, 20], [30, 40]], dtype=np.uint8) - b1 = np.array([[50, 60], [70, 80]], dtype=np.uint8) - p0 = tmp_path / 'b0.tif' - p1 = tmp_path / 'b1.tif' - _write(b0, p0) - _write(b1, p1) - - vrt_path = _build_two_band_vrt( - tmp_path, - b0_dtype_str='Byte', b0_path=str(p0), - b1_dtype_str='Byte', b1_path=str(p1), - ) - r = read_vrt(vrt_path) - assert r.dtype == np.uint8, ( - f"All-Byte VRT with no scaling must stay uint8; got {r.dtype}" - ) - np.testing.assert_array_equal(r.values[..., 0], b0) - np.testing.assert_array_equal(r.values[..., 1], b1) - - -# --------------------------------------------------------------------------- -# 4. Per-band ScaleRatio + ScaleOffset combinations preserve precision -# --------------------------------------------------------------------------- - -def test_complex_source_scale_and_offset_preserve_precision(tmp_path): - """``ScaleRatio=0.25`` plus ``ScaleOffset=1.5`` on a uint8 band: - the scaled-and-offset values (e.g. ``10 * 0.25 + 1.5 = 4.0``, - ``11 * 0.25 + 1.5 = 4.25``) must survive without truncation. - - Note: the ``ComplexSource`` branch of ``parse_vrt`` in ``_vrt.py`` - maps the XML ```` to the dataclass ``scale`` attribute - and ```` to the ``offset`` attribute, then the - ``# Apply ComplexSource scaling`` block in ``read_vrt`` applies - ``src_arr = src_arr * scale + offset``. - """ - b = np.array([[10, 11], [12, 13]], dtype=np.uint8) - p0 = tmp_path / 'b0.tif' - p1 = tmp_path / 'b1.tif' - _write(b, p0) - _write(b, p1) - - vrt_path = _build_complex_source_vrt( - tmp_path, - dtype_str='Byte', src_path=str(p1), - scale_ratio=0.25, scale_offset=1.5, - other_band_dtype='Byte', other_band_path=str(p0), - ) - r = read_vrt(vrt_path) - assert r.dtype.kind == 'f' - expected = b.astype(np.float64) * 0.25 + 1.5 - np.testing.assert_allclose(r.values[..., 1], expected) - - -# --------------------------------------------------------------------------- -# 5. NoData round-trip through widened dtype -# --------------------------------------------------------------------------- - -def test_nodata_round_trip_through_widened_int_dtype(tmp_path): - """Band 0 = uint8 with NoData=255; band 1 = int16 with NoData=-9999. - ``np.result_type(uint8, int16) = int16``. Band 0's value 255 is - representable as int16 so the nodata fast-path still fires; the - surviving values must be preserved through the wider buffer. - """ - b0 = np.array([[1, 2], [3, 255]], dtype=np.uint8) - b1 = np.array([[100, 200], [300, -9999]], dtype=np.int16) - p0 = tmp_path / 'b0.tif' - p1 = tmp_path / 'b1.tif' - _write(b0, p0, nodata=255) - _write(b1, p1, nodata=-9999) - - vrt_path = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - 255 - - {p0} - 1 - - - - - - -9999 - - {p1} - 1 - - - - -""" - out = tmp_path / 'mixed.vrt' - out.write_text(vrt_path) - # ``255`` (band 0) and ``-9999`` (band 1) are distinct sentinels, so - # after #1987 PR 5 the default read raises. The widened-dtype - # behaviour this regression covers is still reachable through - # ``band_nodata='first'``. - r = read_vrt(str(out), band_nodata='first') - # Per-band nodata masking in __init__.py promotes uint/int VRT - # buffers to float64 when at least one band's sentinel hits a pixel. - # Either we land on float64 (NaN-masked) or stay int16 (sentinel - # literal). Both branches must preserve the non-sentinel values. - if r.dtype.kind == 'f': - # Non-sentinel pixels survive - assert r.values[0, 0, 0] == 1 - assert r.values[0, 1, 0] == 2 - assert r.values[1, 0, 0] == 3 - # Sentinel pixels masked to NaN - assert np.isnan(r.values[1, 1, 0]) - assert np.isnan(r.values[1, 1, 1]) - # Band 1 non-sentinels survive - assert r.values[0, 0, 1] == 100 - else: - # Integer dtype kept; sentinels remain as literal values - assert r.dtype == np.int16 - assert r.values[0, 0, 0] == 1 - assert r.values[0, 0, 1] == 100 - - -# --------------------------------------------------------------------------- -# 6. Single-band scaled VRT also widens -# --------------------------------------------------------------------------- - -def test_single_band_complex_source_scale_widens_buffer(tmp_path): - """Single-band ``Byte`` VRT with ``0.5``. - The single-band branch in ``read_vrt`` must mirror the multi-band - widening logic; previously it used ``selected_bands[0].dtype`` - directly, so the scaled source values truncated back to uint8. - """ - b = np.array([[10, 11], [12, 13]], dtype=np.uint8) - p = tmp_path / 'b.tif' - _write(b, p) - - vrt_path = _build_complex_source_vrt( - tmp_path, - dtype_str='Byte', src_path=str(p), - scale_ratio=0.5, - extra_band=False, - ) - r = read_vrt(vrt_path) - assert r.ndim == 2, ( - f"Single-band VRT must return a 2D array; got shape {r.shape}" - ) - assert r.dtype.kind == 'f', ( - f"Single-band scaled VRT must widen to float; got {r.dtype}" - ) - expected = b.astype(np.float64) * 0.5 - np.testing.assert_allclose(r.values, expected) - - -# --------------------------------------------------------------------------- -# 7. band=N selection from a mixed-dtype VRT picks the right per-band dtype -# --------------------------------------------------------------------------- - -def test_band_select_uint8_first_then_float_returns_float_for_band_1(tmp_path): - """When the caller selects ``band=1`` from a ``Byte`` + ``Float32`` VRT, - the result dtype must be float (the selected band's declared dtype), - not uint8 carried over from band 0. The previous code allocated based - on ``selected_bands[0].dtype`` -- which is correct after band selection - -- so this is the non-regression check that the new code still does - the right thing when only one band is selected. - """ - b0 = np.array([[10, 11], [12, 13]], dtype=np.uint8) - b1 = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float32) - p0 = tmp_path / 'b0.tif' - p1 = tmp_path / 'b1.tif' - _write(b0, p0) - _write(b1, p1) - - vrt_path = _build_two_band_vrt( - tmp_path, - b0_dtype_str='Byte', b0_path=str(p0), - b1_dtype_str='Float32', b1_path=str(p1), - ) - r = read_vrt(vrt_path, band=1) - assert r.dtype == np.float32 - np.testing.assert_allclose(r.values, b1) - - -def test_band_select_uint8_first_then_float_returns_uint8_for_band_0(tmp_path): - """Selecting ``band=0`` from a ``Byte`` + ``Float32`` VRT must return - uint8 (band 0's declared dtype) without widening. - """ - b0 = np.array([[10, 11], [12, 13]], dtype=np.uint8) - b1 = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float32) - p0 = tmp_path / 'b0.tif' - p1 = tmp_path / 'b1.tif' - _write(b0, p0) - _write(b1, p1) - - vrt_path = _build_two_band_vrt( - tmp_path, - b0_dtype_str='Byte', b0_path=str(p0), - b1_dtype_str='Float32', b1_path=str(p1), - ) - r = read_vrt(vrt_path, band=0) - assert r.dtype == np.uint8 - np.testing.assert_array_equal(r.values, b0) - - -# --------------------------------------------------------------------------- -# 8. All-Float32 multi-band stays float32 (do not over-widen) -# --------------------------------------------------------------------------- - -def test_all_float32_multiband_stays_float32(tmp_path): - """Two ``Float32`` bands with no scaling: the buffer must stay - float32 rather than widening to float64. ``np.result_type`` of two - identical dtypes returns the same dtype. - """ - b0 = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float32) - b1 = np.array([[5.5, 6.5], [7.5, 8.5]], dtype=np.float32) - p0 = tmp_path / 'b0.tif' - p1 = tmp_path / 'b1.tif' - _write(b0, p0) - _write(b1, p1) - - vrt_path = _build_two_band_vrt( - tmp_path, - b0_dtype_str='Float32', b0_path=str(p0), - b1_dtype_str='Float32', b1_path=str(p1), - ) - r = read_vrt(vrt_path) - assert r.dtype == np.float32 - np.testing.assert_allclose(r.values[..., 0], b0) - np.testing.assert_allclose(r.values[..., 1], b1) - - -# --------------------------------------------------------------------------- -# 9. VRT with zero elements raises a clear ValueError -# --------------------------------------------------------------------------- - -def test_zero_band_vrt_raises_value_error(tmp_path): - """A malformed VRT with zero ```` children must - surface a clear ``ValueError`` from ``read_vrt`` rather than the - generic ``"at least one array or dtype is required"`` message - raised by ``np.result_type`` when called with no arguments. - """ - import pytest - - vrt_xml = """ - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 -""" - p = tmp_path / 'empty.vrt' - p.write_text(vrt_xml) - - with pytest.raises(ValueError, match=r"no "): - read_vrt(str(p)) diff --git a/xrspatial/geotiff/tests/test_vrt_multiband_int_nodata_1611.py b/xrspatial/geotiff/tests/test_vrt_multiband_int_nodata_1611.py deleted file mode 100644 index a9557524a..000000000 --- a/xrspatial/geotiff/tests/test_vrt_multiband_int_nodata_1611.py +++ /dev/null @@ -1,236 +0,0 @@ -"""Regression tests for issue #1611. - -``read_vrt(path)`` on a multi-band integer VRT with per-band -```` tags used to only mask band 0's sentinel. Bands 1 -and up retained their integer sentinels as literal finite values in -the returned float64 array, breaking the convention that -``attrs['nodata']`` is present iff sentinel pixels are already NaN. - -The float-VRT path masks per-band correctly in ``_vrt._read_data`` -(lines 296-297, 347-351). For integer rasters the masking moved to -``__init__.py:read_vrt`` and used ``vrt.bands[0].nodata`` for every -band. The fix walks ``vrt.bands`` when ``band is None`` and masks -each ``arr[..., i]`` slice against its own ````. - -This file mirrors test_vrt_band_nodata_1598.py for the multi-band -``band=None`` path. PR #1602 fixed ``band=N`` single-band selection. - -After #1987 PR 5 (mixed-band-metadata fail-closed), the fixtures here -- -which mosaic bands with deliberately distinct per-band sentinels -- -raise ``MixedBandMetadataError`` by default. Each call passes -``band_nodata='first'`` to assert the legacy flatten-to-first-band -behaviour is still reachable via the documented opt-out, which is the -exact semantics the regression covers. -""" -from __future__ import annotations - -import numpy as np - -from xrspatial.geotiff import read_vrt -from xrspatial.geotiff._writer import write - - -def _write_two_band_per_band_nodata_vrt(tmp_path, *, dtype_str="UInt16", - np_dtype=np.uint16, - band0_sentinel=65535, - band1_sentinel=65000, - band0_other=(1, 2, 3), - band1_other=(7, 8, 9)): - """Two single-band integer sources, each with a distinct nodata - sentinel, exposed as bands 1 and 2 of a hand-rolled VRT. - - Used to be band 0's sentinel was the only one masked. Now every - band gets its own sentinel. - """ - b0_arr = np.array([[band0_other[0], band0_other[1]], - [band0_other[2], band0_sentinel]], dtype=np_dtype) - b1_arr = np.array([[band1_other[0], band1_other[1]], - [band1_other[2], band1_sentinel]], dtype=np_dtype) - p0 = str(tmp_path / 'vrt_b0_1611.tif') - p1 = str(tmp_path / 'vrt_b1_1611.tif') - write(b0_arr, p0, nodata=band0_sentinel, compression='none', - tiled=False) - write(b1_arr, p1, nodata=band1_sentinel, compression='none', - tiled=False) - - vrt_path = str(tmp_path / 'two_band_per_band_nodata_1611.vrt') - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - {band0_sentinel} - - {p0} - 1 - - - - - - {band1_sentinel} - - {p1} - 1 - - - - -""" - with open(vrt_path, 'w') as f: - f.write(vrt_xml) - return vrt_path - - -def test_multiband_uint16_per_band_sentinel_each_masked(tmp_path): - """The previously-broken case: every band's sentinel must be NaN. - - Before the fix this returned dtype=float64 with band 0's (1,1) cell - as NaN but band 1's (1,1) cell as the literal 65000.0. - """ - vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r = read_vrt(vrt_path, band_nodata="first") - assert r.shape == (2, 2, 2) - assert r.dtype == np.float64, ( - f"expected float64 promotion, got {r.dtype}" - ) - assert np.isnan(r.values[1, 1, 0]), ( - "band 0's sentinel pixel was not NaN-masked." - ) - assert np.isnan(r.values[1, 1, 1]), ( - "band 1's sentinel pixel was not NaN-masked; " - "the regression from issue #1611 has returned." - ) - # Non-sentinel pixels survive unchanged - assert r.values[0, 0, 0] == 1 - assert r.values[0, 0, 1] == 7 - assert r.values[1, 0, 0] == 3 - assert r.values[1, 0, 1] == 9 - - -def test_multiband_int32_negative_per_band_sentinel(tmp_path): - """Negative sentinels in a signed integer VRT also mask per-band. - - The original bug was dtype-independent: any integer dtype with - per-band would have hit it. Cover int32 + negative - sentinels to make sure the helper handles signed types and the - range guard accepts negatives. - """ - vrt_path = _write_two_band_per_band_nodata_vrt( - tmp_path, dtype_str="Int32", np_dtype=np.int32, - band0_sentinel=-9999, band1_sentinel=-7777, - band0_other=(10, 20, 30), band1_other=(40, 50, 60)) - r = read_vrt(vrt_path, band_nodata="first") - assert r.dtype == np.float64 - assert np.isnan(r.values[1, 1, 0]) - assert np.isnan(r.values[1, 1, 1]) - assert r.values[0, 0, 0] == 10 - assert r.values[0, 0, 1] == 40 - - -def test_multiband_only_one_band_has_sentinel_present(tmp_path): - """If only one band's sentinel actually appears in the data, only - that band should change. The non-hitting band stays the same float64 - value (no spurious NaN introduced). - - Force band 1's sentinel never to appear by writing 99 instead. - """ - vrt_path = _write_two_band_per_band_nodata_vrt( - tmp_path, - band0_sentinel=65535, band1_sentinel=65000, - band1_other=(7, 8, 9)) - # Overwrite the band 1 source so the sentinel value 65000 is NOT - # present in band 1's actual data. - b1_no_sentinel = np.array([[7, 8], [9, 99]], dtype=np.uint16) - import os - p1 = os.path.join(os.path.dirname(vrt_path), 'vrt_b1_1611.tif') - write(b1_no_sentinel, p1, nodata=65000, compression='none', - tiled=False) - - r = read_vrt(vrt_path, band_nodata="first") - assert r.dtype == np.float64, ( - "Even when only band 0 has a present sentinel, the array still " - "needs promotion so band 0's NaN can be expressed." - ) - assert np.isnan(r.values[1, 1, 0]) - assert r.values[1, 1, 1] == 99.0 # band 1 sentinel absent, no NaN - - -def test_multiband_no_sentinel_present_anywhere_keeps_int_dtype(tmp_path): - """When no band actually contains its declared sentinel, skip - promotion entirely. Avoids a needless float64 cast on integer data. - """ - vrt_path = _write_two_band_per_band_nodata_vrt( - tmp_path, - band0_sentinel=65535, band1_sentinel=65000, - band0_other=(1, 2, 3), band1_other=(7, 8, 9)) - # Replace BOTH source files so neither contains its sentinel - import os - b0 = np.array([[1, 2], [3, 4]], dtype=np.uint16) - b1 = np.array([[7, 8], [9, 10]], dtype=np.uint16) - p0 = os.path.join(os.path.dirname(vrt_path), 'vrt_b0_1611.tif') - p1 = os.path.join(os.path.dirname(vrt_path), 'vrt_b1_1611.tif') - write(b0, p0, nodata=65535, compression='none', tiled=False) - write(b1, p1, nodata=65000, compression='none', tiled=False) - - r = read_vrt(vrt_path, band_nodata="first") - # Sentinels not present -> integer dtype preserved (matches the - # eager open_geotiff fast-path which also skips promotion when the - # mask is empty). - assert r.dtype == np.uint16 - assert r.values[1, 1, 0] == 4 - assert r.values[1, 1, 1] == 10 - - -def test_multiband_per_band_out_of_range_sentinel_is_no_op(tmp_path): - """A sentinel out of the integer dtype's range should be a no-op - for that band rather than raising. Mirrors PR #1583's behaviour - (#1581): the helper ``_int_nodata_in_range`` gates the cast. - """ - # uint16 cannot represent -9999; the helper should skip band 1. - vrt_path = _write_two_band_per_band_nodata_vrt( - tmp_path, - dtype_str="UInt16", np_dtype=np.uint16, - band0_sentinel=65535, band1_sentinel=10, # placeholder; rewrite below - band0_other=(1, 2, 3), band1_other=(7, 8, 9)) - - # Hand-rewrite the VRT so band 1's NoDataValue is the out-of-range -9999. - with open(vrt_path) as f: - xml = f.read() - xml = xml.replace("10", - "-9999") - with open(vrt_path, 'w') as f: - f.write(xml) - - # Should not raise and should still mask band 0. - r = read_vrt(vrt_path, band_nodata="first") - assert np.isnan(r.values[1, 1, 0]) # band 0 sentinel still masked - # Band 1's sentinel (-9999) is out of uint16 range; the value 10 - # in band1[1,1] survives unchanged. - assert r.values[1, 1, 1] == 10.0 or r.values[1, 1, 1] == 10 - - -def test_multiband_band_kwarg_still_per_band_post_pr1602(tmp_path): - """Non-regression check that PR #1602's band=N path still works. - - The fix here only changes the ``band is None`` branch; ``band=N`` - must still route through the single-band masking with its own - sentinel. - """ - vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r0 = read_vrt(vrt_path, band=0, band_nodata="first") - r1 = read_vrt(vrt_path, band=1, band_nodata="first") - assert r0.dtype == np.float64 - assert r1.dtype == np.float64 - assert r0.attrs.get('nodata') == 65535.0 - assert r1.attrs.get('nodata') == 65000.0 - assert np.isnan(r0.values[1, 1]) - assert np.isnan(r1.values[1, 1]) - - -def test_multiband_attrs_nodata_still_band0(tmp_path): - """``attrs['nodata']`` for band=None reads is documented as band - 0's sentinel (the canonical attr cannot encode per-band values). - The pixel-level fix must not change that contract. - """ - vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r = read_vrt(vrt_path, band_nodata="first") - assert r.attrs.get('nodata') == 65535.0 diff --git a/xrspatial/geotiff/tests/test_vrt_resample_alg_1751.py b/xrspatial/geotiff/tests/test_vrt_resample_alg_1751.py deleted file mode 100644 index 944042835..000000000 --- a/xrspatial/geotiff/tests/test_vrt_resample_alg_1751.py +++ /dev/null @@ -1,140 +0,0 @@ -"""Regression tests for issue #1751. - -``read_vrt`` parses ```` from a ``ComplexSource`` but always -calls ``_resample_nearest`` during placement, regardless of the parsed -algorithm. A VRT declaring ``Bilinear`` / ``Cubic`` / ``Average`` / -``Mode`` therefore receives nearest-sampled pixels mislabelled as the -requested algorithm -- silently wrong analytics. - -The fix raises ``NotImplementedError`` at the resample call site when -the source declares an unsupported algorithm and the SrcRect/DstRect -sizes actually differ (a 1:1 placement is nearest-equivalent and stays -permissive). Nearest, NearestNeighbour, NEAR, empty, and an absent -```` element are all accepted. -""" -from __future__ import annotations - -import numpy as np -import pytest - -from xrspatial.geotiff._errors import VRTUnsupportedError -from xrspatial.geotiff._vrt import read_vrt -from xrspatial.geotiff._writer import write - -# Accept either the historical ``NotImplementedError`` raised by the -# placement-site ``_check_resample_alg_supported`` gate or the newer -# ``VRTUnsupportedError`` raised by the centralised -# ``validate_parsed_vrt`` once it preempts the placement check (see -# issue #2371). Both encode the same contract: nearest must not be -# silently substituted for an unsupported alg. -_UNSUPPORTED_RESAMPLE_EXC = (NotImplementedError, VRTUnsupportedError) - - -def _write_src(tmp_path) -> str: - """Write a 4x4 uint16 source TIFF and return its path.""" - src = np.arange(16, dtype=np.uint16).reshape(4, 4) - src_path = str(tmp_path / 'src.tif') - write(src, src_path, compression='none', tiled=False) - return src_path - - -def _write_vrt(tmp_path, xml: str, name: str = 'test.vrt') -> str: - p = str(tmp_path / name) - with open(p, 'w') as f: - f.write(xml) - return p - - -def _vrt_xml(src_path: str, *, alg_elem: str, - dst_x: int = 2, dst_y: int = 2) -> str: - """Render a VRT XML with a 4x4 SrcRect and configurable DstRect+Alg. - - ``alg_elem`` is the raw ``...`` element - to splice into the ````, or the empty string to - omit it entirely. - """ - return f""" - 0.0, 2.0, 0.0, 0.0, 0.0, -2.0 - - - {src_path} - 1 - - - {alg_elem} - - -""" - - -@pytest.mark.parametrize('alg', ['Bilinear', 'Cubic', 'CubicSpline', - 'Lanczos', 'Average', 'Mode']) -def test_unsupported_resample_alg_raises(tmp_path, alg): - """A ComplexSource declaring any non-nearest algorithm with a size - change must raise ``NotImplementedError`` rather than return - silently nearest-sampled pixels.""" - src_path = _write_src(tmp_path) - xml = _vrt_xml(src_path, - alg_elem=f'{alg}') - vrt_path = _write_vrt(tmp_path, xml, f'{alg.lower()}.vrt') - - with pytest.raises(_UNSUPPORTED_RESAMPLE_EXC) as excinfo: - read_vrt(vrt_path) - msg = str(excinfo.value) - assert alg in msg - assert '1751' in msg - - -def test_unsupported_resample_alg_case_insensitive(tmp_path): - """Algorithm names are matched case-insensitively: ``bilinear`` - (lowercase) is the same unsupported request as ``Bilinear``.""" - src_path = _write_src(tmp_path) - xml = _vrt_xml(src_path, - alg_elem='bilinear') - vrt_path = _write_vrt(tmp_path, xml, 'lower.vrt') - - with pytest.raises(_UNSUPPORTED_RESAMPLE_EXC, match='bilinear'): - read_vrt(vrt_path) - - -@pytest.mark.parametrize('alg', ['Nearest', 'NearestNeighbour', - 'NearestNeighbor', 'NEAR', 'nearest', - 'NEAREST', '']) -def test_nearest_variants_accepted(tmp_path, alg): - """Nearest (and its case / spelling variants, plus empty text) is - the implemented algorithm and must round-trip without raising.""" - src_path = _write_src(tmp_path) - xml = _vrt_xml(src_path, - alg_elem=f'{alg}') - vrt_path = _write_vrt(tmp_path, xml, f'near_{alg or "empty"}.vrt') - - arr, _ = read_vrt(vrt_path) - assert arr.shape == (2, 2) - - -def test_missing_resample_alg_accepted(tmp_path): - """Absent ```` (GDAL's nearest default) must still - round-trip without raising.""" - src_path = _write_src(tmp_path) - xml = _vrt_xml(src_path, alg_elem='') - vrt_path = _write_vrt(tmp_path, xml, 'absent.vrt') - - arr, _ = read_vrt(vrt_path) - assert arr.shape == (2, 2) - - -def test_bilinear_at_same_size_does_not_raise(tmp_path): - """A ``Bilinear`` declaration with matching SrcRect/DstRect sizes - is nearest-equivalent (no resample step runs) so the read is - accepted. This pins down the resample-site placement of the - check -- a parse-time check would have rejected this case too.""" - src_path = _write_src(tmp_path) - # SrcRect 4x4, DstRect 4x4: needs_resample is False, no kernel - # call, no silent wrongness. - xml = _vrt_xml(src_path, - alg_elem='Bilinear', - dst_x=4, dst_y=4) - vrt_path = _write_vrt(tmp_path, xml, 'bilinear_1to1.vrt') - - arr, _ = read_vrt(vrt_path) - assert arr.shape == (4, 4) diff --git a/xrspatial/geotiff/tests/test_vrt_resample_window_inverse_1704.py b/xrspatial/geotiff/tests/test_vrt_resample_window_inverse_1704.py deleted file mode 100644 index 4f8cac4bb..000000000 --- a/xrspatial/geotiff/tests/test_vrt_resample_window_inverse_1704.py +++ /dev/null @@ -1,299 +0,0 @@ -"""Regression tests for issue #1704. - -When a VRT ```` has a ```` size that differs from -its ```` size (i.e. the source feeds an up- or downsample into -its destination cell), ``read_vrt`` used to decode the full SrcRect from -disk even when the caller passed a tiny ``window=`` clipping into the -middle. For very large source rects this caused multi-GB decodes and -resample intermediates. - -The fix inverts the nearest-neighbour mapping: for the clipped -destination sub-window, compute the smallest SrcRect-relative range of -rows / cols that ``_resample_nearest`` would gather from, read only -that, and resample directly into the sub-window output. The result is -byte-identical to the old "read full, resample full, then slice" path. -""" -from __future__ import annotations - -from unittest import mock - -import numpy as np -import pytest - -from xrspatial.geotiff._vrt import read_vrt -from xrspatial.geotiff._writer import write - - -def _write_vrt_xml(tmp_path, xml: str, name: str) -> str: - p = str(tmp_path / name) - with open(p, 'w') as f: - f.write(xml) - return p - - -def _write_src(tmp_path, arr: np.ndarray, name: str = 'tmp_1704_src.tif') -> str: - src_path = str(tmp_path / name) - write(arr, src_path, compression='none', tiled=False) - return src_path - - -def _single_source_vrt(src_path: str, *, - raster_x: int, raster_y: int, - src_x: int, src_y: int, - src_xsize: int, src_ysize: int, - dst_x: int, dst_y: int, - dst_xsize: int, dst_ysize: int, - dtype: str = "UInt16", - nodata: str | None = None) -> str: - nodata_xml = f" {nodata}\n" if nodata is not None else "" - return ( - f'\n' - f' 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n' - f' \n' - f' \n' - f' {src_path}\n' - f' 1\n' - f' \n' - f' \n' - f'{nodata_xml}' - f' \n' - f' \n' - f'\n' - ) - - -# --------------------------------------------------------------------------- -# Parity: byte-identical to full-then-slice for upsample and downsample -# --------------------------------------------------------------------------- - -def test_upsample_window_matches_full_then_slice(tmp_path): - """4x upsample, then read a small window from the middle. The - windowed read must equal the full read sliced at the same offsets.""" - src = (np.arange(10 * 10, dtype=np.uint16).reshape(10, 10) + 1) - src_path = _write_src(tmp_path, src) - vrt_xml = _single_source_vrt( - src_path, - raster_x=40, raster_y=40, - src_x=0, src_y=0, src_xsize=10, src_ysize=10, - dst_x=0, dst_y=0, dst_xsize=40, dst_ysize=40, - ) - vrt_path = _write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_up.vrt') - - full, _ = read_vrt(vrt_path) - assert full.shape == (40, 40) - # Pick a middle clip; covers a region with non-trivial neighbour mapping. - r0, c0, r1, c1 = 7, 11, 33, 29 - windowed, _ = read_vrt(vrt_path, window=(r0, c0, r1, c1)) - np.testing.assert_array_equal(windowed, full[r0:r1, c0:c1]) - - -def test_downsample_window_matches_full_then_slice(tmp_path): - """4x downsample, windowed read parity with full-then-slice.""" - src = (np.arange(40 * 40, dtype=np.uint16).reshape(40, 40) + 1) - src_path = _write_src(tmp_path, src) - vrt_xml = _single_source_vrt( - src_path, - raster_x=10, raster_y=10, - src_x=0, src_y=0, src_xsize=40, src_ysize=40, - dst_x=0, dst_y=0, dst_xsize=10, dst_ysize=10, - ) - vrt_path = _write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_down.vrt') - - full, _ = read_vrt(vrt_path) - assert full.shape == (10, 10) - r0, c0, r1, c1 = 2, 3, 9, 8 - windowed, _ = read_vrt(vrt_path, window=(r0, c0, r1, c1)) - np.testing.assert_array_equal(windowed, full[r0:r1, c0:c1]) - - -# --------------------------------------------------------------------------- -# Non-integer ratio: floor-rounding edge cases -# --------------------------------------------------------------------------- - -@pytest.mark.parametrize("r0,c0,r1,c1", [ - (0, 0, 11, 11), # full extent - (1, 1, 10, 10), # inner shrink - (3, 2, 7, 9), # off-centre - (0, 0, 1, 1), # single pixel at origin - (10, 10, 11, 11), # single pixel at corner - (5, 0, 6, 11), # one-row strip - (0, 5, 11, 6), # one-col strip -]) -def test_non_integer_ratio_7_to_11_window_parity(tmp_path, r0, c0, r1, c1): - """SrcRect 7x7, DstRect 11x11 (irrational ratio 7/11). The - nearest-neighbour mapping has uneven step sizes so the inverse - mapping has to handle each output index individually; this is the - case that breaks Option-2 "resample sub-shape into sub-shape" - implementations. - """ - src = (np.arange(7 * 7, dtype=np.uint16).reshape(7, 7) + 100) - src_path = _write_src(tmp_path, src) - vrt_xml = _single_source_vrt( - src_path, - raster_x=11, raster_y=11, - src_x=0, src_y=0, src_xsize=7, src_ysize=7, - dst_x=0, dst_y=0, dst_xsize=11, dst_ysize=11, - ) - vrt_path = _write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_7_11.vrt') - - full, _ = read_vrt(vrt_path) - windowed, _ = read_vrt(vrt_path, window=(r0, c0, r1, c1)) - np.testing.assert_array_equal(windowed, full[r0:r1, c0:c1]) - - -# --------------------------------------------------------------------------- -# Edge alignment -# --------------------------------------------------------------------------- - -def test_window_starting_at_origin(tmp_path): - src = (np.arange(8 * 8, dtype=np.uint16).reshape(8, 8) + 1) - src_path = _write_src(tmp_path, src) - vrt_xml = _single_source_vrt( - src_path, - raster_x=20, raster_y=20, - src_x=0, src_y=0, src_xsize=8, src_ysize=8, - dst_x=0, dst_y=0, dst_xsize=20, dst_ysize=20, - ) - vrt_path = _write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_origin.vrt') - full, _ = read_vrt(vrt_path) - windowed, _ = read_vrt(vrt_path, window=(0, 0, 5, 5)) - np.testing.assert_array_equal(windowed, full[0:5, 0:5]) - - -def test_window_ending_at_last_pixel(tmp_path): - src = (np.arange(8 * 8, dtype=np.uint16).reshape(8, 8) + 1) - src_path = _write_src(tmp_path, src) - vrt_xml = _single_source_vrt( - src_path, - raster_x=20, raster_y=20, - src_x=0, src_y=0, src_xsize=8, src_ysize=8, - dst_x=0, dst_y=0, dst_xsize=20, dst_ysize=20, - ) - vrt_path = _write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_last.vrt') - full, _ = read_vrt(vrt_path) - windowed, _ = read_vrt(vrt_path, window=(15, 15, 20, 20)) - np.testing.assert_array_equal(windowed, full[15:20, 15:20]) - - -def test_window_crossing_multiple_sources(tmp_path): - """Two SimpleSources tiled side by side, each with non-1:1 SrcRect / - DstRect. A window that spans both sources must equal the full read - sliced over the same range. Both sources go through the new windowed - resample path. - """ - left = (np.arange(5 * 5, dtype=np.uint16).reshape(5, 5) + 1) - right = (np.arange(5 * 5, dtype=np.uint16).reshape(5, 5) + 1000) - left_path = _write_src(tmp_path, left, 'tmp_1704_left.tif') - right_path = _write_src(tmp_path, right, 'tmp_1704_right.tif') - vrt_xml = ( - '\n' - ' 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n' - ' \n' - ' \n' - f' {left_path}\n' - ' 1\n' - ' \n' - ' \n' - ' \n' - ' \n' - f' {right_path}\n' - ' 1\n' - ' \n' - ' \n' - ' \n' - ' \n' - '\n' - ) - vrt_path = _write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_multi.vrt') - - full, _ = read_vrt(vrt_path) - assert full.shape == (10, 20) - # Window crosses x=10 boundary so both sources are clipped, not - # one-or-the-other. - windowed, _ = read_vrt(vrt_path, window=(2, 7, 8, 14)) - np.testing.assert_array_equal(windowed, full[2:8, 7:14]) - - -# --------------------------------------------------------------------------- -# Nodata: masking happens on the read buffer; sub-window read still masks. -# --------------------------------------------------------------------------- - -def test_nodata_round_trip_through_window(tmp_path): - """SimpleSource with ````; the sentinel inside the windowed - region must surface as NaN in a float-typed VRT. Both the full read - and the windowed read must agree on which pixels are NaN. - """ - src = (np.arange(8 * 8, dtype=np.uint16).reshape(8, 8) + 1).astype(np.uint16) - # Sprinkle the sentinel through the source so the sub-window catches - # at least one masked pixel under the 2x upsample. - src[3, 4] = 65535 - src[5, 2] = 65535 - src_path = _write_src(tmp_path, src) - vrt_xml = _single_source_vrt( - src_path, - raster_x=16, raster_y=16, - src_x=0, src_y=0, src_xsize=8, src_ysize=8, - dst_x=0, dst_y=0, dst_xsize=16, dst_ysize=16, - dtype="Float32", - nodata="65535", - ) - vrt_path = _write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_nodata.vrt') - - full, _ = read_vrt(vrt_path) - windowed, _ = read_vrt(vrt_path, window=(4, 4, 12, 12)) - # NaN equality needs assert_array_equal with equal_nan=True. - np.testing.assert_array_equal(windowed, full[4:12, 4:12]) - # Sanity: the sub-window contains at least one NaN, otherwise the - # test is vacuous. - assert np.isnan(windowed).any() - - -# --------------------------------------------------------------------------- -# Read-bound assertion: only the minimal source sub-rect is decoded. -# --------------------------------------------------------------------------- - -def test_only_minimal_source_rect_is_read(tmp_path): - """Patch ``read_to_array`` to record the windows requested. Under - the new path the source window must be much smaller than the full - SrcRect when the caller asks for a small sub-window. - """ - src = (np.arange(40 * 40, dtype=np.uint16).reshape(40, 40) + 1) - src_path = _write_src(tmp_path, src) - vrt_xml = _single_source_vrt( - src_path, - raster_x=160, raster_y=160, - src_x=0, src_y=0, src_xsize=40, src_ysize=40, - dst_x=0, dst_y=0, dst_xsize=160, dst_ysize=160, - ) - vrt_path = _write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_bound.vrt') - - seen_windows: list[tuple[int, int, int, int]] = [] - # ``read_vrt`` does ``from ._reader import read_to_array`` at call - # time, so the spy must live on ``_reader`` (the module that owns - # the name), not on ``_vrt``. - from xrspatial.geotiff import _reader as _reader_mod - real_read = _reader_mod.read_to_array - - def spy(filename, *, window, **kw): - seen_windows.append(tuple(window)) - return real_read(filename, window=window, **kw) - - with mock.patch.object(_reader_mod, 'read_to_array', spy): - # Window is 8x8 pixels in destination coords starting at (80, 80). - # Mapping back through floor((d+0.5)*40/160) = floor((d+0.5)/4) - # gives source rows / cols 20..21 (inclusive) so the read should - # be 2x2 source pixels, not 40x40. - arr, _ = read_vrt(vrt_path, window=(80, 80, 88, 88)) - - assert arr.shape == (8, 8) - assert len(seen_windows) == 1 - r0, c0, r1, c1 = seen_windows[0] - read_h = r1 - r0 - read_w = c1 - c0 - assert read_h < 10, ( - f"expected a small source row range, got {read_h} rows; " - f"the full SrcRect is 40 rows so the fix is not reducing the read." - ) - assert read_w < 10 diff --git a/xrspatial/geotiff/tests/test_vrt_scaled_rects_1694.py b/xrspatial/geotiff/tests/test_vrt_scaled_rects_1694.py deleted file mode 100644 index d607539bd..000000000 --- a/xrspatial/geotiff/tests/test_vrt_scaled_rects_1694.py +++ /dev/null @@ -1,332 +0,0 @@ -"""Regression tests for issue #1694. - -``read_vrt`` did not resample source pixel data when a source band's -```` size differed from its ```` size. Downsampling -raised ``ValueError: could not broadcast input array from shape (S,S) -into shape (D,D)`` and upsampling silently left holes -- only the -top-left ``sr.x_size``/``sr.y_size`` pixels of each destination cell -were written. - -The fix: - -* when ``sr.size != dr.size``, read the full source rect, apply nodata - masking, resample to ``(dr.y_size, dr.x_size)`` with nearest-neighbour - (matching GDAL's SimpleSource semantics), and then clip; -* the same-size case still uses windowed reads as before. -""" -from __future__ import annotations - -import numpy as np -import pytest - -from xrspatial.geotiff._vrt import _resample_nearest, read_vrt -from xrspatial.geotiff._writer import write - - -def _write_vrt(tmp_path, xml: str, name: str = 'test.vrt') -> str: - p = str(tmp_path / name) - with open(p, 'w') as f: - f.write(xml) - return p - - -def test_downsample_4x4_to_2x2_does_not_raise_and_uses_nearest(tmp_path): - """SrcRect 4x4 -> DstRect 2x2: result is (2,2), nearest-neighbour. - - Before the fix the source (4,4) array was assigned directly into the - (2,2) destination slice, raising the broadcast error documented in - issue #1694. - """ - src = np.arange(16, dtype=np.uint16).reshape(4, 4) - src_path = str(tmp_path / 'src.tif') - write(src, src_path, compression='none', tiled=False) - - vrt_xml = f""" - 0.0, 2.0, 0.0, 0.0, 0.0, -2.0 - - - {src_path} - 1 - - - - -""" - vrt_path = _write_vrt(tmp_path, vrt_xml, 'down.vrt') - - result, _ = read_vrt(vrt_path) - - assert result.shape == (2, 2), ( - f"expected (2,2), got {result.shape}; resample step missing." - ) - # Nearest-neighbour with the centre-of-output-pixel rule samples - # source indices floor((i+0.5)*4/2) = floor((i+0.5)*2) -> 1, 3 for - # i=0, 1. So we expect src[1, 1], src[1, 3], src[3, 1], src[3, 3]. - expected = np.array([[src[1, 1], src[1, 3]], - [src[3, 1], src[3, 3]]], dtype=np.uint16) - np.testing.assert_array_equal(result, expected) - - -def test_upsample_2x2_to_4x4_repeats_each_source_pixel(tmp_path): - """SrcRect 2x2 -> DstRect 4x4: each source pixel repeated 2x2. - - Before the fix only the top-left 2x2 of the destination was written - and the rest stayed at the fill value (0 for integer, NaN for - float). - """ - src = np.array([[1, 2], [3, 4]], dtype=np.uint16) - src_path = str(tmp_path / 'src.tif') - write(src, src_path, compression='none', tiled=False) - - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - - {src_path} - 1 - - - - -""" - vrt_path = _write_vrt(tmp_path, vrt_xml, 'up.vrt') - - result, _ = read_vrt(vrt_path) - - assert result.shape == (4, 4) - expected = np.array([ - [1, 1, 2, 2], - [1, 1, 2, 2], - [3, 3, 4, 4], - [3, 3, 4, 4], - ], dtype=np.uint16) - np.testing.assert_array_equal(result, expected) - # No holes -- every cell was written. - assert not (result == 0).any(), ( - "upsample left zero-filled cells; resample not propagated." - ) - - -def test_non_integer_scale_3x3_to_2x2_no_holes(tmp_path): - """Non-integer source / destination ratio: covers index-mapping path. - - With src=(3,3) -> dst=(2,2), neither integer-ratio fast path applies. - Confirms the general nearest-neighbour gather produces the correct - shape, no holes, no out-of-bounds writes. - """ - src = np.arange(9, dtype=np.uint16).reshape(3, 3) - src_path = str(tmp_path / 'src.tif') - write(src, src_path, compression='none', tiled=False) - - vrt_xml = f""" - 0.0, 1.5, 0.0, 0.0, 0.0, -1.5 - - - {src_path} - 1 - - - - -""" - vrt_path = _write_vrt(tmp_path, vrt_xml, 'nonint.vrt') - - result, _ = read_vrt(vrt_path) - - assert result.shape == (2, 2) - # Nearest mapping: floor((i+0.5) * 3/2) = floor((i+0.5)*1.5) - # i=0 -> floor(0.75) = 0 - # i=1 -> floor(2.25) = 2 - # So output samples src[0,0], src[0,2], src[2,0], src[2,2]. - expected = np.array([[src[0, 0], src[0, 2]], - [src[2, 0], src[2, 2]]], dtype=np.uint16) - np.testing.assert_array_equal(result, expected) - - -def test_per_band_scale_mix(tmp_path): - """Mixed: band 1 downsampled, band 2 at native resolution. - - Both bands must land in the right places without a broadcast error - and without bleeding band 1's resampled values into band 2. - """ - # Band 1 source: 4x4 -- will be downsampled to 2x2 destination. - band1_src = (np.arange(16, dtype=np.uint16) * 10).reshape(4, 4) - # Band 2 source: 2x2 -- native resolution. - band2_src = np.array([[100, 200], [300, 400]], dtype=np.uint16) - - p1 = str(tmp_path / 'b1.tif') - p2 = str(tmp_path / 'b2.tif') - write(band1_src, p1, compression='none', tiled=False) - write(band2_src, p2, compression='none', tiled=False) - - vrt_xml = f""" - 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 - - - {p1} - 1 - - - - - - - {p2} - 1 - - - - -""" - vrt_path = _write_vrt(tmp_path, vrt_xml, 'mix.vrt') - - result, _ = read_vrt(vrt_path) - - assert result.shape == (2, 2, 2) - # Band 1 nearest-neighbour from 4x4 -> 2x2: src[1,1], src[1,3], src[3,1], src[3,3] - expected_b1 = np.array([[band1_src[1, 1], band1_src[1, 3]], - [band1_src[3, 1], band1_src[3, 3]]], - dtype=np.uint16) - np.testing.assert_array_equal(result[..., 0], expected_b1) - # Band 2 native: untouched. - np.testing.assert_array_equal(result[..., 1], band2_src) - - -def test_window_on_downsampled_source_returns_correct_subwindow(tmp_path): - """``window=(0,0,1,1)`` on a 4x4 -> 2x2 source returns the (0,0) cell. - - The destination cell maps to the source pixel that the resample - routine would sample for that location. Confirms the clip-after- - resample ordering: clipping in source coordinates first (as the old - code effectively did) would feed the wrong source slice into the - resampler. - """ - src = np.arange(16, dtype=np.uint16).reshape(4, 4) - src_path = str(tmp_path / 'src.tif') - write(src, src_path, compression='none', tiled=False) - - vrt_xml = f""" - 0.0, 2.0, 0.0, 0.0, 0.0, -2.0 - - - {src_path} - 1 - - - - -""" - vrt_path = _write_vrt(tmp_path, vrt_xml, 'win.vrt') - - result, _ = read_vrt(vrt_path, window=(0, 0, 1, 1)) - - assert result.shape == (1, 1) - # Full-resample value at destination (0,0) is src[1,1] == 5. - assert result[0, 0] == src[1, 1] - - -def test_nodata_preserved_across_downsample(tmp_path): - """Source sentinel pixels survive the resample as NaN in the result. - - Source is uint16 with sentinel=65535. Pixels at the sampled-from - positions whose values are 65535 must appear as NaN in the float64 - VRT output. - """ - sentinel = np.uint16(65535) - src = np.array([ - [10, 20, 30, 40], - [50, sentinel, 70, sentinel], - [90, 100, 110, 120], - [130, sentinel, 150, sentinel], - ], dtype=np.uint16) - src_path = str(tmp_path / 'src_nd.tif') - write(src, src_path, nodata=int(sentinel), compression='none', - tiled=False) - - # Float64 VRT so the integer-into-float promotion path runs and - # leaves NaN at the sentinel pixels. Use ```` on the source - # so the masking branch fires regardless of band-level . - vrt_xml = f""" - 0.0, 2.0, 0.0, 0.0, 0.0, -2.0 - - -9999 - - {src_path} - 1 - - - 65535 - - -""" - vrt_path = _write_vrt(tmp_path, vrt_xml, 'nd.vrt') - - result, _ = read_vrt(vrt_path) - - assert result.shape == (2, 2) - assert result.dtype == np.float64 - # Nearest sampler picks src[1,1], src[1,3], src[3,1], src[3,3]. - # Of those, src[1,1]=65535, src[1,3]=65535, src[3,1]=65535, - # src[3,3]=65535 -- so every output pixel is the sentinel and must - # be NaN after masking. - assert np.isnan(result).all(), ( - f"sentinel did not survive resample as NaN; got {result!r}" - ) - - -def test_nodata_with_mixed_sentinel_and_valid_pixels(tmp_path): - """Mixed sentinel / valid source -> mixed NaN / valid destination. - - Confirms the mask resamples *with* the data, not against the - pre-resampled source. - """ - sentinel = np.uint16(65535) - # Build a 4x4 source where sample sites (1,1), (1,3), (3,1), (3,3) - # are valid, sentinel, valid, sentinel respectively. - src = np.zeros((4, 4), dtype=np.uint16) - src[1, 1] = 11 - src[1, 3] = sentinel - src[3, 1] = 31 - src[3, 3] = sentinel - src_path = str(tmp_path / 'src_mixed.tif') - write(src, src_path, nodata=int(sentinel), compression='none', - tiled=False) - - vrt_xml = f""" - 0.0, 2.0, 0.0, 0.0, 0.0, -2.0 - - -9999 - - {src_path} - 1 - - - 65535 - - -""" - vrt_path = _write_vrt(tmp_path, vrt_xml, 'nd_mixed.vrt') - - result, _ = read_vrt(vrt_path) - - assert result.shape == (2, 2) - assert result[0, 0] == 11.0 - assert np.isnan(result[0, 1]) - assert result[1, 0] == 31.0 - assert np.isnan(result[1, 1]) - - -@pytest.mark.parametrize('shape', [(0, 5), (5, 0), (0, 0)]) -def test_resample_nearest_rejects_empty_source(shape): - """``_resample_nearest`` raises ValueError on an empty source array. - - A SimpleSource with ``SrcRect xSize=0`` or ``ySize=0`` -- or a - windowed read that clamps to an empty slice -- would otherwise feed - a zero-dim array to the integer-ratio fast paths, which compute - ``out_h % src_h`` and divide by ``src_h``/``src_w`` and so would - raise an opaque ``ZeroDivisionError``. Surface the bad input with - a clear ``ValueError`` instead. - """ - src_arr = np.zeros(shape, dtype=np.float64) - with pytest.raises(ValueError, match='empty source array'): - _resample_nearest(src_arr, 2, 2) diff --git a/xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py b/xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py deleted file mode 100644 index b0b8f6f87..000000000 --- a/xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py +++ /dev/null @@ -1,518 +0,0 @@ -"""Positive coverage for simple VRT mosaics over GeoTIFF sources (#2369). - -Part of epic #2342. The release contract lists "simple GDAL VRT mosaics -backed by GeoTIFF sources" as a supported subset on both the eager -numpy and dask read paths. Other VRT tests in this directory hit that -subset indirectly while exercising codecs, dtypes, or chunk policies, -but nothing pins the contract down on its own. - -This module covers the supported mosaic shape explicitly: - -* 2x1 horizontal mosaic of two compatible tiles -* 2x2 mosaic of four compatible tiles -* Windowed read where the request window lines up with source pixels -* Dask read of both mosaics with multiple chunks -* One multi-band 2x1 mosaic - -Each test asserts values, coords, and key attrs (``crs``, ``transform``, -``nodata``). Shape-only assertions are not enough -- the point of these -tests is that the contract holds at the pixel level. - -Implementation uses ``to_geotiff`` to build the source tiles and the -internal ``write_vrt`` to wire up the mosaic XML, mirroring the pattern -already used in ``test_vrt_lazy_chunks_1814.py``. -""" -from __future__ import annotations - -import os - -import dask.array as da -import numpy as np -import pytest -import xarray as xr - -from xrspatial.geotiff import read_vrt, to_geotiff -from xrspatial.geotiff._vrt import write_vrt as _write_vrt_internal - - -# --------------------------------------------------------------------------- -# Tile builders -# --------------------------------------------------------------------------- - -# Per-tile pixel size and CRS shared by every mosaic in this module. -# Picking constants once keeps the assertion blocks below short and makes -# transform comparisons unambiguous. -_PIXEL_W = 0.001 -_PIXEL_H = -0.001 -_CRS = 4326 -_NODATA = -9999.0 - - -def _make_tile( - tmp_dir, - name: str, - data: np.ndarray, - origin_x: float, - origin_y: float, - *, - nodata: float | None = _NODATA, -) -> str: - """Write ``data`` as a single-band GeoTIFF anchored at the given origin. - - Returns the on-disk path. ``data`` shape is ``(H, W)``. - """ - height, width = data.shape - y = np.array([origin_y + _PIXEL_H * (i + 0.5) for i in range(height)]) - x = np.array([origin_x + _PIXEL_W * (j + 0.5) for j in range(width)]) - attrs = {'crs': _CRS} - if nodata is not None: - attrs['nodata'] = nodata - raster = xr.DataArray( - data, dims=['y', 'x'], - coords={'y': y, 'x': x}, - attrs=attrs, - ) - path = os.path.join(tmp_dir, name) - to_geotiff(raster, path, nodata=nodata) - return path - - -def _make_multiband_tile( - tmp_dir, - name: str, - data: np.ndarray, - origin_x: float, - origin_y: float, -) -> str: - """Write a multi-band GeoTIFF anchored at the given origin. - - ``data`` shape is ``(H, W, B)``. - """ - height, width, nbands = data.shape - y = np.array([origin_y + _PIXEL_H * (i + 0.5) for i in range(height)]) - x = np.array([origin_x + _PIXEL_W * (j + 0.5) for j in range(width)]) - raster = xr.DataArray( - data, - dims=['y', 'x', 'band'], - coords={'y': y, 'x': x, 'band': np.arange(nbands)}, - attrs={'crs': _CRS}, - ) - path = os.path.join(tmp_dir, name) - to_geotiff(raster, path) - return path - - -# --------------------------------------------------------------------------- -# Fixtures -# --------------------------------------------------------------------------- - -@pytest.fixture -def mosaic_2x1(tmp_path): - """Two 32x32 float32 tiles side-by-side, west and east. - - Yields ``(vrt_path, expected_array, origin_x, origin_y)``. The - expected array is the horizontal concatenation of the two source - arrays. - """ - td = tmp_path / 'tmp_2369_2x1' - td.mkdir() - td = str(td) - height, width = 32, 32 - left_data = np.arange(height * width, dtype=np.float32).reshape(height, width) - right_data = ( - left_data + (height * width) - ).astype(np.float32) - - origin_x, origin_y = -120.0, 45.0 - left_path = _make_tile(td, 'left.tif', left_data, origin_x, origin_y) - right_path = _make_tile( - td, 'right.tif', right_data, - origin_x + _PIXEL_W * width, origin_y, - ) - - vrt_path = os.path.join(td, 'mosaic_2x1.vrt') - _write_vrt_internal(vrt_path, [left_path, right_path]) - - expected = np.concatenate([left_data, right_data], axis=1) - yield vrt_path, expected, origin_x, origin_y - - -@pytest.fixture -def mosaic_2x2(tmp_path): - """Four 32x32 float32 tiles arranged 2 rows by 2 cols. - - Yields ``(vrt_path, expected_array, origin_x, origin_y)`` with the - expected array stitched in (row, col) order. - """ - td = tmp_path / 'tmp_2369_2x2' - td.mkdir() - td = str(td) - h, w = 32, 32 - # Use distinct constant values per tile so any swap shows up loudly - # in the assertion diff. - tile_nw = np.full((h, w), 1.0, dtype=np.float32) - tile_ne = np.full((h, w), 2.0, dtype=np.float32) - tile_sw = np.full((h, w), 3.0, dtype=np.float32) - tile_se = np.full((h, w), 4.0, dtype=np.float32) - - origin_x, origin_y = -120.0, 45.0 - nw_path = _make_tile(td, 'nw.tif', tile_nw, origin_x, origin_y) - ne_path = _make_tile( - td, 'ne.tif', tile_ne, - origin_x + _PIXEL_W * w, origin_y, - ) - sw_path = _make_tile( - td, 'sw.tif', tile_sw, - origin_x, origin_y + _PIXEL_H * h, - ) - se_path = _make_tile( - td, 'se.tif', tile_se, - origin_x + _PIXEL_W * w, origin_y + _PIXEL_H * h, - ) - - vrt_path = os.path.join(td, 'mosaic_2x2.vrt') - _write_vrt_internal( - vrt_path, [nw_path, ne_path, sw_path, se_path], - ) - - top = np.concatenate([tile_nw, tile_ne], axis=1) - bottom = np.concatenate([tile_sw, tile_se], axis=1) - expected = np.concatenate([top, bottom], axis=0) - yield vrt_path, expected, origin_x, origin_y - - -@pytest.fixture -def mosaic_multiband_2x1(tmp_path): - """Two 3-band 32x32 float32 tiles side-by-side.""" - td = tmp_path / 'tmp_2369_mb_2x1' - td.mkdir() - td = str(td) - h, w, b = 32, 32, 3 - rng = np.random.default_rng(2369) - left_data = rng.random((h, w, b), dtype=np.float32) - right_data = rng.random((h, w, b), dtype=np.float32) - - origin_x, origin_y = -120.0, 45.0 - left_path = _make_multiband_tile( - td, 'left_mb.tif', left_data, origin_x, origin_y, - ) - right_path = _make_multiband_tile( - td, 'right_mb.tif', right_data, - origin_x + _PIXEL_W * w, origin_y, - ) - - vrt_path = os.path.join(td, 'mosaic_mb.vrt') - _write_vrt_internal(vrt_path, [left_path, right_path]) - - # read_vrt mirrors the on-disk layout: per-band plane stacked on - # the trailing axis to match what ``to_geotiff`` wrote, so the - # expected mosaic is (H, W, B). - expected = np.stack( - [ - np.concatenate([left_data[..., k], right_data[..., k]], axis=1) - for k in range(b) - ], - axis=-1, - ) - yield vrt_path, expected, origin_x, origin_y - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def _assert_attrs_ok( - result, - *, - expected_nodata=None, - expected_origin_x=None, - expected_origin_y=None, -): - """Common attr assertions for VRT reads in this module. - - Checks that ``crs`` and ``transform`` are present and consistent - with the fixture constants, and optionally that ``nodata`` matches. - When ``expected_origin_x`` / ``expected_origin_y`` are passed, the - transform's origin entries are checked too -- pixel size alone is - not enough to catch a translation bug. - """ - assert 'crs' in result.attrs, ( - f"crs missing from attrs; have {sorted(result.attrs)}" - ) - # write_vrt may normalise the CRS to a WKT string; accept either - # form as long as it is non-empty and references EPSG:4326. - crs_val = result.attrs['crs'] - if isinstance(crs_val, int): - assert crs_val == _CRS - else: - assert crs_val, "crs attr is present but empty" - # Best-effort string check: WKT for 4326 mentions WGS 84 - assert 'WGS' in str(crs_val) or '4326' in str(crs_val), ( - f"crs attr does not look like EPSG:4326: {crs_val!r}" - ) - - assert 'transform' in result.attrs, ( - f"transform missing from attrs; have {sorted(result.attrs)}" - ) - transform = result.attrs['transform'] - # transform format is (pixel_w, 0, origin_x, 0, pixel_h, origin_y) - # in the affine 6-tuple convention this module uses elsewhere. - assert len(transform) == 6, ( - f"transform should be a 6-tuple, got {transform!r}" - ) - assert transform[0] == pytest.approx(_PIXEL_W), ( - f"transform pixel width = {transform[0]}, expected {_PIXEL_W}" - ) - assert transform[4] == pytest.approx(_PIXEL_H), ( - f"transform pixel height = {transform[4]}, expected {_PIXEL_H}" - ) - if expected_origin_x is not None: - assert transform[2] == pytest.approx(expected_origin_x), ( - f"transform origin_x = {transform[2]}, " - f"expected {expected_origin_x}" - ) - if expected_origin_y is not None: - assert transform[5] == pytest.approx(expected_origin_y), ( - f"transform origin_y = {transform[5]}, " - f"expected {expected_origin_y}" - ) - - if expected_nodata is not None: - assert 'nodata' in result.attrs, ( - f"nodata missing from attrs; have {sorted(result.attrs)}" - ) - assert result.attrs['nodata'] == pytest.approx(expected_nodata) - - -def _assert_coords_monotonic(result, *, expected_origin_x, expected_origin_y): - """Check that x/y coords are monotonic and start at the expected origin - (within half a pixel: TIFF coords are pixel centers, not corners). - """ - x = np.asarray(result['x'].values) - y = np.asarray(result['y'].values) - - # x increases west to east, y decreases north to south. - assert np.all(np.diff(x) > 0), "x coord is not strictly increasing" - assert np.all(np.diff(y) < 0), "y coord is not strictly decreasing" - - # First pixel center: origin + half pixel. - assert x[0] == pytest.approx(expected_origin_x + _PIXEL_W * 0.5) - assert y[0] == pytest.approx(expected_origin_y + _PIXEL_H * 0.5) - - -# --------------------------------------------------------------------------- -# 1. Eager numpy reads of horizontal and 2x2 mosaics -# --------------------------------------------------------------------------- - -def test_eager_2x1_mosaic_values_coords_attrs(mosaic_2x1): - """Eager read of a 2x1 horizontal mosaic returns the concatenated - pixel block, with monotonic coords and the fixture's crs / transform - / nodata on attrs. - """ - vrt_path, expected, ox, oy = mosaic_2x1 - - result = read_vrt(vrt_path) - - assert result.shape == expected.shape, ( - f"eager 2x1 shape {result.shape}, expected {expected.shape}" - ) - np.testing.assert_array_equal(result.values, expected) - _assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) - _assert_attrs_ok( - result, - expected_nodata=_NODATA, - expected_origin_x=ox, - expected_origin_y=oy, - ) - - -def test_eager_2x2_mosaic_values_coords_attrs(mosaic_2x2): - """Eager read of a 2x2 mosaic stitches tiles in the right order. - - Each tile has a distinct constant value, so a misordered placement - surfaces immediately in the value assertion rather than appearing - only as a numeric diff. - """ - vrt_path, expected, ox, oy = mosaic_2x2 - - result = read_vrt(vrt_path) - - assert result.shape == expected.shape, ( - f"eager 2x2 shape {result.shape}, expected {expected.shape}" - ) - np.testing.assert_array_equal(result.values, expected) - _assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) - _assert_attrs_ok( - result, - expected_nodata=_NODATA, - expected_origin_x=ox, - expected_origin_y=oy, - ) - - -# --------------------------------------------------------------------------- -# 2. Windowed read that maps cleanly into source windows -# --------------------------------------------------------------------------- - -def test_windowed_read_aligned_with_source_boundary(mosaic_2x1): - """A window crossing the seam between the two source tiles returns - the same pixels as slicing the full mosaic. - - The window picked here covers the right half of the left tile and - the left half of the right tile: both halves land on whole-pixel - boundaries inside their respective sources, so this is the - "request lines up with source pixels" case from the issue. - """ - vrt_path, expected, ox, oy = mosaic_2x1 - h = expected.shape[0] # 32 - - # Window: full height, last 16 cols of left tile through first - # 16 cols of right tile. 32 / 2 = 16 keeps each side aligned to - # the source's own pixel grid. - r0, c0, r1, c1 = 0, 16, h, 48 - - result = read_vrt(vrt_path, window=(r0, c0, r1, c1)) - - np.testing.assert_array_equal(result.values, expected[r0:r1, c0:c1]) - - # Coords on the windowed result should correspond to the windowed - # slice of the full mosaic's coords. - full = read_vrt(vrt_path) - np.testing.assert_array_equal( - np.asarray(result['x'].values), - np.asarray(full['x'].values)[c0:c1], - ) - np.testing.assert_array_equal( - np.asarray(result['y'].values), - np.asarray(full['y'].values)[r0:r1], - ) - # Windowed transform origin shifts by the row / col offset times - # pixel size. (r0=0 keeps origin_y at the fixture's oy.) - expected_window_ox = ox + _PIXEL_W * c0 - expected_window_oy = oy + _PIXEL_H * r0 - _assert_attrs_ok( - result, - expected_nodata=_NODATA, - expected_origin_x=expected_window_ox, - expected_origin_y=expected_window_oy, - ) - - -# --------------------------------------------------------------------------- -# 3. Dask reads with multiple chunks -# --------------------------------------------------------------------------- - -def test_dask_2x1_mosaic_multi_chunk_matches_eager(mosaic_2x1): - """Dask read with chunks smaller than the mosaic returns the same - pixels as the eager read, and uses a real multi-block dask graph. - """ - vrt_path, expected, ox, oy = mosaic_2x1 - - chunked = read_vrt(vrt_path, chunks=(16, 16)) - - # Real dask graph with the expected number of blocks: 32/16 = 2 - # rows, 64/16 = 4 cols. - assert isinstance(chunked.data, da.Array), ( - f"expected dask Array, got {type(chunked.data).__name__}" - ) - assert chunked.data.numblocks == (2, 4), ( - f"expected 2x4 blocks, got {chunked.data.numblocks}" - ) - - computed = chunked.compute() - np.testing.assert_array_equal(computed.values, expected) - _assert_coords_monotonic(computed, expected_origin_x=ox, expected_origin_y=oy) - _assert_attrs_ok( - computed, - expected_nodata=_NODATA, - expected_origin_x=ox, - expected_origin_y=oy, - ) - - -def test_dask_2x2_mosaic_multi_chunk_matches_eager(mosaic_2x2): - """Dask read of the 2x2 mosaic with chunk size below tile size. - - Chunks of 16 split each 32x32 tile into 2x2 blocks. The full - mosaic is 64x64 so the resulting dask array is 4x4 blocks. - """ - vrt_path, expected, ox, oy = mosaic_2x2 - - chunked = read_vrt(vrt_path, chunks=(16, 16)) - - assert isinstance(chunked.data, da.Array) - assert chunked.data.numblocks == (4, 4), ( - f"expected 4x4 blocks, got {chunked.data.numblocks}" - ) - - computed = chunked.compute() - np.testing.assert_array_equal(computed.values, expected) - _assert_coords_monotonic(computed, expected_origin_x=ox, expected_origin_y=oy) - _assert_attrs_ok( - computed, - expected_nodata=_NODATA, - expected_origin_x=ox, - expected_origin_y=oy, - ) - - -# --------------------------------------------------------------------------- -# 4. Multi-band mosaic -# --------------------------------------------------------------------------- - -def test_eager_multiband_2x1_mosaic(mosaic_multiband_2x1): - """Eager read of a multi-band 2x1 mosaic returns one stitched plane - per band. - - Multi-band VRT reads return shape ``(H, W, B)`` to match the - on-disk layout; assert per-band values against the stack built in - the fixture. - """ - vrt_path, expected, ox, oy = mosaic_multiband_2x1 - - result = read_vrt(vrt_path) - - # Multi-band path: trailing axis is band. - assert result.shape == expected.shape, ( - f"multiband 2x1 shape {result.shape}, expected {expected.shape}" - ) - np.testing.assert_array_equal(result.values, expected) - - # Coords use the same x/y dims as the single-band case; only the - # number of bands changed. - _assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) - _assert_attrs_ok( - result, expected_origin_x=ox, expected_origin_y=oy, - ) - - -def test_dask_multiband_2x1_mosaic_matches_eager(mosaic_multiband_2x1): - """Dask read of the multi-band 2x1 mosaic with sub-tile chunks must - match the eager read pixel-for-pixel across every band. - - Chunking exercises per-block band handling: a bug that loses a - band on one chunk but not another would not appear in the eager - test above. - """ - vrt_path, expected, ox, oy = mosaic_multiband_2x1 - - eager = read_vrt(vrt_path) - chunked = read_vrt(vrt_path, chunks=(16, 16)) - - # The chunked array should be dask-backed; the band axis may or - # may not be split depending on the implementation, but the - # spatial axes must be. - assert isinstance(chunked.data, da.Array), ( - f"expected dask Array, got {type(chunked.data).__name__}" - ) - - computed = chunked.compute() - assert computed.shape == eager.shape - np.testing.assert_array_equal(computed.values, eager.values) - np.testing.assert_array_equal(computed.values, expected) - _assert_coords_monotonic( - computed, expected_origin_x=ox, expected_origin_y=oy, - ) - _assert_attrs_ok( - computed, expected_origin_x=ox, expected_origin_y=oy, - ) diff --git a/xrspatial/geotiff/tests/test_vrt_single_parse_1825.py b/xrspatial/geotiff/tests/test_vrt_single_parse_1825.py deleted file mode 100644 index d98cedee9..000000000 --- a/xrspatial/geotiff/tests/test_vrt_single_parse_1825.py +++ /dev/null @@ -1,263 +0,0 @@ -"""Chunked ``read_vrt`` parses the VRT XML once (issue #1825). - -Before the refactor each per-chunk task re-parsed the VRT XML and -re-validated source-path containment, so an N-chunk read paid an N+1 -parse cost. The dispatcher now parses once and threads the parsed -``VRTDataset`` into every task via the dask graph, removing the -per-task XML parse and allowlist validation. - -These tests pin the new behaviour: - -* the dispatcher calls ``parse_vrt`` exactly once during construction, - and ``.compute()`` does not parse the XML again per task; -* the parsed VRT object survives pickling, so the dask graph can ship - it to workers under any scheduler; -* numerical results match the eager path byte-for-byte (regression - guard for the helper extraction). -""" -from __future__ import annotations - -import os -import pickle -import tempfile - -import dask.array as da -import numpy as np -import pytest -import xarray as xr - -from xrspatial.geotiff import read_vrt, to_geotiff -from xrspatial.geotiff._vrt import write_vrt as _write_vrt_internal - - -@pytest.fixture -def two_by_two_vrt_1825(): - """4-tile mosaic via the to_geotiff(.vrt, ...) dask path.""" - arr = np.arange(256 * 256, dtype=np.float32).reshape(256, 256) - y = np.linspace(41.0, 40.0, 256) - x = np.linspace(-106.0, -105.0, 256) - raster = xr.DataArray(arr, dims=['y', 'x'], - coords={'y': y, 'x': x}, - attrs={'crs': 4326}) - td = tempfile.mkdtemp(prefix='tmp_1825_2x2_') - vrt_path = os.path.join(td, 'mosaic_1825.vrt') - to_geotiff(raster, vrt_path, tile_size=128) - yield vrt_path, arr - - -@pytest.fixture -def single_tile_vrt_1825(): - """One 64x64 float32 tile wrapped in a VRT.""" - arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) - y = np.linspace(41.0, 40.0, 64) - x = np.linspace(-106.0, -105.0, 64) - raster = xr.DataArray(arr, dims=['y', 'x'], - coords={'y': y, 'x': x}, - attrs={'crs': 4326}) - td = tempfile.mkdtemp(prefix='tmp_1825_single_') - tile_path = os.path.join(td, 'tile_1825.tif') - to_geotiff(raster, tile_path) - vrt_path = os.path.join(td, 'single_1825.vrt') - _write_vrt_internal(vrt_path, [tile_path]) - yield vrt_path, arr - - -def test_chunked_path_parses_xml_once(monkeypatch, two_by_two_vrt_1825): - """Construction parses once, and ``.compute()`` adds zero parses. - - The previous implementation re-parsed inside every per-chunk task, - so a 4x4 chunk grid produced 17 parses total. After #1825 the - dispatcher parses once and threads the already-parsed VRTDataset - through the task graph. - """ - vrt_path, _ = two_by_two_vrt_1825 - - from xrspatial.geotiff import _vrt as vrt_module - - counter = {'parses': 0} - real_parse = vrt_module.parse_vrt - - def counting_parse(*args, **kwargs): - counter['parses'] += 1 - return real_parse(*args, **kwargs) - - monkeypatch.setattr(vrt_module, 'parse_vrt', counting_parse) - - result = read_vrt(vrt_path, chunks=(64, 64)) - - # Construction parses exactly once. - assert counter['parses'] == 1, ( - f"expected 1 parse during construction, got {counter['parses']}" - ) - - computed = result.compute() - - # 4x4 chunk grid would re-parse 16 more times under the old code. - assert counter['parses'] == 1, ( - f"expected 1 parse total (construction only); got " - f"{counter['parses']} -- per-chunk tasks are still reparsing" - ) - - # Sanity: the computed array is the original data. - assert computed.shape == (256, 256) - assert computed.dtype == np.float32 - - -def test_chunked_path_reads_xml_file_once(monkeypatch, two_by_two_vrt_1825): - """The chunked dispatcher reads the VRT XML file exactly once. - - Pin the file-read side too: before #1825 every per-chunk task - re-opened the .vrt file via ``_read_vrt_xml``. After the refactor - only the dispatcher reads it. - """ - vrt_path, _ = two_by_two_vrt_1825 - - from xrspatial.geotiff import _vrt as vrt_module - - counter = {'reads': 0} - real_read_xml = vrt_module._read_vrt_xml - - def counting_read_xml(*args, **kwargs): - counter['reads'] += 1 - return real_read_xml(*args, **kwargs) - - monkeypatch.setattr(vrt_module, '_read_vrt_xml', counting_read_xml) - - result = read_vrt(vrt_path, chunks=(64, 64)) - assert counter['reads'] == 1, ( - f"expected 1 XML file read during construction, got " - f"{counter['reads']}" - ) - - result.compute() - assert counter['reads'] == 1, ( - f"expected 1 XML file read total; got {counter['reads']} -- " - f"per-chunk tasks are still re-opening the .vrt file" - ) - - -def test_parsed_vrt_is_picklable(single_tile_vrt_1825): - """The parsed VRTDataset round-trips through pickle. - - The chunked dispatcher embeds the parsed VRT into the dask graph, - so dask must be able to serialise it for the distributed and - process-pool schedulers. Pin picklability with the stdlib pickler - (cloudpickle is a strict superset). - """ - vrt_path, _ = single_tile_vrt_1825 - from xrspatial.geotiff._vrt import _read_vrt_xml, parse_vrt - - xml_str = _read_vrt_xml(vrt_path) - vrt_dir = os.path.dirname(os.path.abspath(vrt_path)) - vrt = parse_vrt(xml_str, vrt_dir) - - blob = pickle.dumps(vrt) - restored = pickle.loads(blob) - - assert restored.width == vrt.width - assert restored.height == vrt.height - assert len(restored.bands) == len(vrt.bands) - assert restored.bands[0].dtype == vrt.bands[0].dtype - assert [s.filename for s in restored.bands[0].sources] == [ - s.filename for s in vrt.bands[0].sources - ] - - -def test_chunked_matches_eager_after_refactor(two_by_two_vrt_1825): - """Byte-identical eager vs chunked results after the helper consolidation. - - The eager path uses ``_apply_integer_sentinel_mask`` / - ``_effective_dtype_for_bands`` / ``_sentinel_for_dtype`` from - ``_vrt`` directly; the chunked path imports the same helpers. A - regression in either call site would surface here. - """ - vrt_path, original = two_by_two_vrt_1825 - eager = read_vrt(vrt_path) - chunked = read_vrt(vrt_path, chunks=(64, 64)).compute() - assert eager.dtype == chunked.dtype - np.testing.assert_array_equal(eager.values, chunked.values) - np.testing.assert_array_equal(eager.values, original) - - -def test_no_path_containment_revalidation_per_chunk(monkeypatch, - two_by_two_vrt_1825): - """Per-chunk tasks skip the source-path containment check. - - ``parse_vrt`` is the only place that resolves and validates source - paths against the VRT directory / ``XRSPATIAL_VRT_ALLOWED_ROOTS``. - Because each task now receives the already-parsed VRT, ``parse_vrt`` - must not run during ``.compute()`` even when the graph is hydrated. - """ - vrt_path, _ = two_by_two_vrt_1825 - - from xrspatial.geotiff import _vrt as vrt_module - - parse_calls = {'n': 0} - real_parse = vrt_module.parse_vrt - - def counting_parse(*args, **kwargs): - parse_calls['n'] += 1 - return real_parse(*args, **kwargs) - - monkeypatch.setattr(vrt_module, 'parse_vrt', counting_parse) - - result = read_vrt(vrt_path, chunks=(64, 64)) - parses_after_construction = parse_calls['n'] - - # Compute one block via dask's sliced API; confirm parse count - # stays at the construction baseline (no extra parses fired). - da_arr = result.data - if isinstance(da_arr, da.Array): - _block = da_arr.blocks[0, 0].compute() - assert _block.shape[0] > 0 and _block.shape[1] > 0 - - assert parse_calls['n'] == parses_after_construction, ( - f"per-block compute triggered extra parses " - f"({parse_calls['n']} vs {parses_after_construction})" - ) - - -def test_parsed_kwarg_does_not_mutate_caller_holes(single_tile_vrt_1825): - """``read_vrt(parsed=...)`` must not mutate the caller's ``holes``. - - The chunked dispatcher threads a single parsed ``VRTDataset`` into - every per-chunk task. ``read_vrt`` appends skipped-source records to - ``vrt.holes`` when a backing file is missing; without a defensive - copy the appends would land on the dispatcher's shared object and - leak across tasks (racy under the threaded scheduler, and - cumulatively across calls if a caller ever reused the parsed - object). Pin that ``parsed.holes`` stays untouched. - """ - vrt_path, _ = single_tile_vrt_1825 - from xrspatial.geotiff._vrt import _read_vrt_xml, parse_vrt - from xrspatial.geotiff._vrt import read_vrt as _read_vrt_internal - - xml_str = _read_vrt_xml(vrt_path) - vrt_dir = os.path.dirname(os.path.abspath(vrt_path)) - parsed = parse_vrt(xml_str, vrt_dir) - - # Point the only source at a path that does not exist so the - # lenient ``missing_sources='warn'`` branch fires and would append - # a record onto ``holes``. - parsed.bands[0].sources[0].filename = os.path.join(vrt_dir, 'gone.tif') - holes_id_before = id(parsed.holes) - - import warnings - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - arr, returned = _read_vrt_internal( - vrt_path, parsed=parsed, missing_sources='warn', - ) - - assert parsed.holes == [], ( - f"parsed.holes was mutated across the read; got {parsed.holes!r}" - ) - assert id(parsed.holes) == holes_id_before, ( - "parsed.holes list object was replaced -- the caller's reference " - "is now stale" - ) - # The returned VRTDataset is the per-call working copy and DID - # collect the skipped-source record. - assert len(returned.holes) == 1 - assert returned.holes[0]['source'].endswith('gone.tif') - assert arr.shape == (64, 64) diff --git a/xrspatial/geotiff/tests/test_vrt_source_max_pixels_1796.py b/xrspatial/geotiff/tests/test_vrt_source_max_pixels_1796.py deleted file mode 100644 index 13a1eb5be..000000000 --- a/xrspatial/geotiff/tests/test_vrt_source_max_pixels_1796.py +++ /dev/null @@ -1,79 +0,0 @@ -"""VRT source reads must honor the caller's max_pixels budget (#1796). - -Originally the source loop in ``read_vrt`` read the full SrcRect of a -SimpleSource with size mismatch, so a tiny VRT output could force a huge -source decode. #1803 forwarded ``max_pixels`` to ``read_to_array`` to -catch that pattern. - -After #1704 / PR #1821 the resample path inverse-maps the clipped -destination sub-window to the minimal SrcRect sub-rect and reads only -that. A tiny VRT output is now bounded structurally: the source read -cannot exceed the dst sub-window size. The per-source ``max_pixels`` -guard still applies (defence in depth) and still bites when the -sub-window itself exceeds the budget. -""" -from __future__ import annotations - -import os - -import numpy as np -import pytest - -from xrspatial.geotiff import read_vrt, to_geotiff - - -def test_tiny_vrt_with_huge_srcrect_now_reads_minimally(tmp_path): - """A 1x1 VRT pointing at a 4x4 SrcRect now reads only the one source - pixel that maps to the single output pixel, so ``max_pixels=1`` is - no longer exceeded. Locks in the structural improvement from #1704.""" - src = tmp_path / "tmp_1796_source.tif" - data = np.arange(16, dtype=np.uint8).reshape(4, 4) - to_geotiff(data, str(src), compression='none') - - vrt = tmp_path / "tmp_1796_source_cap.vrt" - vrt.write_text( - '\n' - ' \n' - ' \n' - f' {os.path.basename(src)}' - '\n' - ' 1\n' - ' \n' - ' \n' - ' \n' - ' \n' - '\n' - ) - - arr = read_vrt(str(vrt), max_pixels=1) - assert arr.shape == (1, 1) - - -def test_source_cap_still_fires_when_sub_window_exceeds_budget(tmp_path): - """The per-source pixel-budget guard still rejects a sub-window that - exceeds ``max_pixels``. With the sub-window-bounded read, the cap is - measured against the clipped destination region rather than the raw - SrcRect; the protection from #1796 carries over to that new - measurement. - """ - src = tmp_path / "tmp_1796_big_source.tif" - data = np.arange(64, dtype=np.uint8).reshape(8, 8) - to_geotiff(data, str(src), compression='none', tiled=False) - - vrt = tmp_path / "tmp_1796_big_cap.vrt" - vrt.write_text( - '\n' - ' \n' - ' \n' - f' {os.path.basename(src)}' - '\n' - ' 1\n' - ' \n' - ' \n' - ' \n' - ' \n' - '\n' - ) - - with pytest.raises(ValueError, match="exceed|safety limit"): - read_vrt(str(vrt), max_pixels=4) diff --git a/xrspatial/geotiff/tests/test_vrt_source_nodata_zero_1655.py b/xrspatial/geotiff/tests/test_vrt_source_nodata_zero_1655.py deleted file mode 100644 index 64b0bde57..000000000 --- a/xrspatial/geotiff/tests/test_vrt_source_nodata_zero_1655.py +++ /dev/null @@ -1,171 +0,0 @@ -"""Regression tests for issue #1655. - -``read_vrt`` used to evaluate the per-source NODATA fallback as -``src.nodata or nodata``. Python treats ``0.0`` as falsy, so a -SimpleSource that declared ``0`` was silently replaced -with the band-level sentinel (or ``None`` when the band had none of its -own). Pixels equal to 0.0 in the source file survived as valid data and -biased every downstream NaN-aware aggregation. - -The fix changes the fallback to an explicit ``is not None`` check so a -legitimate zero sentinel survives. -""" -from __future__ import annotations - -import numpy as np -import pytest - -from xrspatial.geotiff._geotags import GeoTransform -from xrspatial.geotiff._vrt import read_vrt -from xrspatial.geotiff._writer import write - - -def _write_source(tmp_path, arr, name='src_1655.tif'): - """Write a small float32 GeoTIFF without a GDAL_NODATA tag.""" - p = str(tmp_path / name) - write( - arr, p, - geo_transform=GeoTransform( - origin_x=0.0, origin_y=0.0, - pixel_width=1.0, pixel_height=-1.0, - ), - crs_epsg=4326, - compression='none', - tiled=False, - ) - return p - - -def _vrt_with_source_nodata(tmp_path, src_path, nodata_xml, - include_band_nodata=False, - width=4, height=3, - band_nodata='0.0'): - """Write a single-band Float32 VRT with the supplied ```` - on its SimpleSource. ``include_band_nodata`` controls whether a - ```` is emitted on the band as well. - """ - band_nd_elem = ( - f'{band_nodata}' - if include_band_nodata else '') - vrt_xml = ( - f'\n' - f' EPSG:4326\n' - f' 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n' - f' \n' - f' {band_nd_elem}\n' - f' \n' - f' {src_path}\n' - f' 1\n' - f' \n' - f' \n' - f' {nodata_xml}\n' - f' \n' - f' \n' - f'\n' - ) - vrt_path = str(tmp_path / 'src_zero_1655.vrt') - with open(vrt_path, 'w') as f: - f.write(vrt_xml) - return vrt_path - - -class TestVRTSourceNodataZero: - """SimpleSource ``0`` must mask zeros to NaN.""" - - def test_source_nodata_zero_no_band_nodata(self, tmp_path): - """SimpleSource NODATA=0 with no band-level fallback masks zeros.""" - arr = np.array( - [[1.0, 0.0, 3.0, 0.0], - [4.0, 0.0, 6.0, 7.0], - [0.0, 8.0, 9.0, 10.0]], - dtype=np.float32, - ) - src = _write_source(tmp_path, arr) - vrt = _vrt_with_source_nodata(tmp_path, src, '0.0') - - result, _ = read_vrt(vrt) - assert int(np.isnan(result).sum()) == 4 - - def test_source_nodata_zero_integer_xml(self, tmp_path): - """``0`` (integer literal) also masks zeros.""" - arr = np.array( - [[1.0, 0.0, 3.0]], - dtype=np.float32, - ) - src = _write_source(tmp_path, arr, name='int_xml.tif') - vrt = _vrt_with_source_nodata( - tmp_path, src, '0', width=3, height=1) - - result, _ = read_vrt(vrt) - assert int(np.isnan(result).sum()) == 1 - assert np.isnan(result[0, 1]) - - def test_source_nodata_nonzero_unchanged(self, tmp_path): - """SimpleSource NODATA != 0 keeps masking behaviour.""" - arr = np.array( - [[1.0, 0.0, 3.0, 0.0]], - dtype=np.float32, - ) - src = _write_source(tmp_path, arr, name='nonzero.tif') - vrt = _vrt_with_source_nodata( - tmp_path, src, '1.0', width=4, height=1) - - result, _ = read_vrt(vrt) - # Only the literal 1.0 at [0, 0] should be masked. - assert int(np.isnan(result).sum()) == 1 - assert np.isnan(result[0, 0]) - - def test_band_nodata_zero_still_honoured(self, tmp_path): - """Band-level ``0`` keeps working.""" - arr = np.array( - [[1.0, 0.0, 3.0]], - dtype=np.float32, - ) - src = _write_source(tmp_path, arr, name='band_zero.tif') - # Build a VRT where only the band carries nodata=0 (no NODATA - # on the SimpleSource). - vrt_xml = ( - f'\n' - f' EPSG:4326\n' - f' 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n' - f' \n' - f' 0.0\n' - f' \n' - f' {src}\n' - f' 1\n' - f' \n' - f' \n' - f' \n' - f' \n' - f'\n' - ) - vrt = str(tmp_path / 'band_zero_1655.vrt') - with open(vrt, 'w') as f: - f.write(vrt_xml) - - result, _ = read_vrt(vrt) - assert int(np.isnan(result).sum()) == 1 - assert np.isnan(result[0, 1]) - - def test_source_nodata_zero_overrides_band(self, tmp_path): - """SimpleSource NODATA=0 takes precedence over band NoDataValue=99.""" - arr = np.array( - [[1.0, 0.0, 99.0]], - dtype=np.float32, - ) - src = _write_source(tmp_path, arr, name='override.tif') - vrt = _vrt_with_source_nodata( - tmp_path, src, '0.0', - include_band_nodata=True, band_nodata='99.0', - width=3, height=1) - - result, _ = read_vrt(vrt) - # The SimpleSource sentinel (0.0) wins over the band sentinel - # (99.0), so only the 0.0 cell becomes NaN. The 99.0 cell stays - # because the masking is per-source, applied at read time, and - # the band-level fallback never fires when src.nodata is set. - assert int(np.isnan(result).sum()) == 1 - assert np.isnan(result[0, 1]) - assert result[0, 2] == pytest.approx(99.0) diff --git a/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py b/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py deleted file mode 100644 index 14df1c002..000000000 --- a/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Regression tests for #1823. - -PR #1803 forwarded the caller's ``max_pixels`` to ``read_to_array`` inside -the VRT source loop so that a tiny VRT output could not force a huge -source decode (#1796). A separate per-tile dimension check was -incorrectly using the same ``max_pixels`` value, so a caller setting -``max_pixels`` as an output budget (e.g. 10,000) would also fail the -per-tile sanity check on every normal source whose default tile size is -256x256 (= 65,536 pixels). - -After PR #1821 (#1704) the resample path reads only the minimal source -sub-rect that feeds the clipped destination sub-window, so a tiny VRT -output cannot force a huge source decode by construction. The -output-extent check at the top of ``read_vrt`` still rejects requests -whose output itself exceeds ``max_pixels``. -""" -from __future__ import annotations - -import os -import tempfile - -import numpy as np -import pytest - -from xrspatial.geotiff import to_geotiff -from xrspatial.geotiff._reader import PixelSafetyLimitError -from xrspatial.geotiff._vrt import read_vrt - - -def _write_normal_tile_source(td: str) -> str: - """10x10 uint8 source -- ``to_geotiff`` pads to a 256x256 tile.""" - src = os.path.join(td, 'src.tif') - to_geotiff(np.zeros((10, 10), dtype=np.uint8), src, compression='none') - return src - - -def _write_vrt(td: str, *, dst_x_size: int, dst_y_size: int, - raster_x: int = 100, raster_y: int = 100, - src_x_size: int = 10, src_y_size: int = 10) -> str: - vrt = os.path.join(td, 'mosaic.vrt') - xml = ( - f'\n' - f' \n' - f' \n' - f' src.tif\n' - f' 1\n' - f' \n' - f' \n' - f' \n' - f' \n' - f'\n' - ) - with open(vrt, 'w') as f: - f.write(xml) - return vrt - - -class TestPerTileCheckDoesNotUseCallerBudget: - """Per-tile dim sanity must not reject normal 256x256 source tiles - when the caller's ``max_pixels`` is a small output-budget value.""" - - def test_normal_tile_source_with_small_max_pixels(self): - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - _write_normal_tile_source(td) - vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100) - arr, _ = read_vrt(vrt, max_pixels=10_000) - assert arr.shape == (100, 100) - - def test_normal_tile_source_with_tiny_max_pixels(self): - """An output budget below a single tile must still succeed when - the requested output window itself fits.""" - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - _write_normal_tile_source(td) - # Output 5x5 = 25 pixels; max_pixels = 100 fits 25 with room. - vrt = _write_vrt(td, dst_x_size=5, dst_y_size=5, - raster_x=5, raster_y=5) - arr, _ = read_vrt(vrt, max_pixels=100) - assert arr.shape == (5, 5) - - -class TestOutputWindowCheckStillEnforced: - """The output-window check still rejects a read whose VRT extent - exceeds ``max_pixels``. After #1704 the source read is bounded by - the clipped destination sub-window, so the per-source guard now - rarely fires; the top-level ``_check_dimensions`` call against the - output extent catches over-budget requests up front. The #1796 - protection (tiny VRT cannot force huge source decode) is preserved - structurally. - """ - - def test_output_extent_exceeds_max_pixels_still_rejected(self): - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - src = os.path.join(td, 'src.tif') - to_geotiff(np.arange(64, dtype=np.uint8).reshape(8, 8), - src, compression='none', tiled=False) - # VRT output extent is 8x8 = 64 pixels; max_pixels=4 trips - # the dimensions check before any source read runs. - vrt = _write_vrt(td, dst_x_size=8, dst_y_size=8, - raster_x=8, raster_y=8, - src_x_size=4, src_y_size=4) - with pytest.raises(ValueError, match="exceed|safety limit"): - read_vrt(vrt, max_pixels=4) - - -class TestPerTileCheckStillRejectsCraftedHeader: - """A pathological ``TileWidth``/``TileLength`` must still fail at - the per-tile sanity check, which uses ``MAX_PIXELS_DEFAULT``.""" - - def test_per_tile_check_caps_at_default(self, monkeypatch): - """Lower ``MAX_PIXELS_DEFAULT`` to verify the per-tile call site - is wired to it (rather than to the caller's budget).""" - from xrspatial.geotiff import _reader as reader_mod - - monkeypatch.setattr(reader_mod, "MAX_PIXELS_DEFAULT", 100) - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: - _write_normal_tile_source(td) - vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100) - # 256x256 tile > patched MAX_PIXELS_DEFAULT=100 → per-tile - # check fires regardless of caller's max_pixels (1e9 here). - with pytest.raises(PixelSafetyLimitError, match="65,536"): - read_vrt(vrt, max_pixels=1_000_000_000) diff --git a/xrspatial/geotiff/tests/test_vrt_tiled_metadata_1606.py b/xrspatial/geotiff/tests/test_vrt_tiled_metadata_1606.py deleted file mode 100644 index 2aa58f9e6..000000000 --- a/xrspatial/geotiff/tests/test_vrt_tiled_metadata_1606.py +++ /dev/null @@ -1,231 +0,0 @@ -"""Regression tests for issue #1606. - -``to_geotiff(da, 'out.vrt')`` (which dispatches to ``_write_vrt_tiled``) -used to drop a chunk of metadata that ``to_geotiff(da, 'out.tif')`` -preserved: - -* ``attrs['nodatavals']`` / ``attrs['_FillValue']`` -- the VRT path - read ``attrs['nodata']`` directly instead of going through - ``_resolve_nodata_attr``. -* ``attrs['gdal_metadata']`` / ``attrs['gdal_metadata_xml']`` -* ``attrs['extra_tags']`` and the friendly tag attrs folded in by - ``_merge_friendly_extra_tags`` -* ``attrs['x_resolution']`` / ``attrs['y_resolution']`` / - ``attrs['resolution_unit']`` -* ``attrs['raster_type']`` - -Each tile under the VRT now carries the same rich tag set the -equivalent single-file ``.tif`` write would emit. -""" -import glob -import os - -import numpy as np -import pytest -import xarray as xr - -from xrspatial.geotiff import open_geotiff, to_geotiff - - -def _make_rioxarray_style(arr=None): - """DataArray that looks like rioxarray output: nodata only via aliases.""" - if arr is None: - arr = np.arange(64, dtype=np.float32).reshape(8, 8) - arr[0, 0] = -9999.0 - return xr.DataArray( - arr, - dims=('y', 'x'), - coords={'y': np.arange(arr.shape[0], dtype=np.float64), - 'x': np.arange(arr.shape[1], dtype=np.float64)}, - attrs={ - # No bare 'nodata' key -- forces _resolve_nodata_attr to walk - # the alias chain. - 'nodatavals': (-9999.0,), - '_FillValue': -9999.0, - 'crs': 4326, - 'gdal_metadata': {'AREA_OR_POINT': 'Area', 'foo': 'bar'}, - 'x_resolution': 96, - 'y_resolution': 96, - 'resolution_unit': 'inch', - 'raster_type': 'point', - }, - ) - - -def _first_tile_path(vrt_path): - tiles_dir = vrt_path[:-len('.vrt')] + '_tiles' - tiles = sorted(glob.glob(os.path.join(tiles_dir, '*.tif'))) - assert tiles, f'no per-tile .tif files under {tiles_dir}' - return tiles[0] - - -class TestVrtTiledMetadataParity: - def test_nodatavals_alias_propagates_to_tiles(self, tmp_path): - da = _make_rioxarray_style() - vrt = str(tmp_path / 'nodatavals.vrt') - to_geotiff(da, vrt, tile_size=16) - tile_da = open_geotiff(_first_tile_path(vrt)) - # Before the fix this was None: _write_vrt_tiled read - # attrs['nodata'] directly and ignored the nodatavals alias. - assert tile_da.attrs.get('nodata') == -9999.0 - - def test_fill_value_alias_propagates_to_tiles(self, tmp_path): - arr = np.arange(64, dtype=np.float32).reshape(8, 8) - arr[0, 0] = -9999.0 - da = xr.DataArray( - arr, dims=('y', 'x'), - coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, - attrs={'_FillValue': -9999.0, 'crs': 4326}, - ) - vrt = str(tmp_path / 'fillvalue.vrt') - to_geotiff(da, vrt, tile_size=16) - tile_da = open_geotiff(_first_tile_path(vrt)) - assert tile_da.attrs.get('nodata') == -9999.0 - - def test_gdal_metadata_propagates_to_tiles(self, tmp_path): - da = _make_rioxarray_style() - vrt = str(tmp_path / 'gdal_meta.vrt') - to_geotiff(da, vrt, tile_size=16) - tile_da = open_geotiff(_first_tile_path(vrt)) - gm = tile_da.attrs.get('gdal_metadata') - assert gm == {'AREA_OR_POINT': 'Area', 'foo': 'bar'} - - def test_resolution_tags_propagate_to_tiles(self, tmp_path): - da = _make_rioxarray_style() - vrt = str(tmp_path / 'resolution.vrt') - to_geotiff(da, vrt, tile_size=16) - tile_da = open_geotiff(_first_tile_path(vrt)) - assert tile_da.attrs.get('x_resolution') == 96.0 - assert tile_da.attrs.get('y_resolution') == 96.0 - assert tile_da.attrs.get('resolution_unit') == 'inch' - - def test_raster_type_point_propagates_to_tiles(self, tmp_path): - da = _make_rioxarray_style() - vrt = str(tmp_path / 'point.vrt') - to_geotiff(da, vrt, tile_size=16) - tile_da = open_geotiff(_first_tile_path(vrt)) - assert tile_da.attrs.get('raster_type') == 'point' - - def test_tif_vs_vrt_tile_metadata_parity(self, tmp_path): - """Same DataArray, two destinations -- per-tile metadata matches.""" - da = _make_rioxarray_style() - tif_path = str(tmp_path / 'parity.tif') - vrt_path = str(tmp_path / 'parity.vrt') - to_geotiff(da, tif_path, tile_size=16) - to_geotiff(da, vrt_path, tile_size=16) - - tif_da = open_geotiff(tif_path) - tile_da = open_geotiff(_first_tile_path(vrt_path)) - - keys = ('nodata', 'gdal_metadata', 'raster_type', - 'x_resolution', 'y_resolution', 'resolution_unit') - for k in keys: - assert tif_da.attrs.get(k) == tile_da.attrs.get(k), ( - f'{k} mismatch: tif={tif_da.attrs.get(k)!r}, ' - f'vrt-tile={tile_da.attrs.get(k)!r}') - - -class TestVrtTiledRichTagCoverage: - """Cover the XML / extra_tags / friendly-tag paths the bare - ``gdal_metadata`` dict assertion above does not exercise.""" - - def test_gdal_metadata_xml_string_propagates_to_tiles(self, tmp_path): - """``attrs['gdal_metadata_xml']`` (pre-built XML string) bypasses - the dict->XML builder. Verify it still reaches per-tile files.""" - arr = np.arange(64, dtype=np.float32).reshape(8, 8) - xml = ( - '\n' - ' vrt_xml_value\n' - '\n' - ) - da = xr.DataArray( - arr, dims=('y', 'x'), - coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, - attrs={'crs': 4326, 'gdal_metadata_xml': xml}, - ) - vrt = str(tmp_path / 'gdal_xml.vrt') - # Rich-tag write surface (PR 4 of epic #2340). - to_geotiff(da, vrt, tile_size=16, - allow_experimental_codecs=True) - tile_da = open_geotiff(_first_tile_path(vrt)) - # On read, the XML is re-parsed into a dict under - # attrs['gdal_metadata']; the raw XML lands under - # attrs['gdal_metadata_xml']. Assert the item shows up in - # whichever surface the reader emits. - gm = tile_da.attrs.get('gdal_metadata') or {} - gm_xml = tile_da.attrs.get('gdal_metadata_xml') or '' - assert ( - gm.get('VRT_XML_KEY') == 'vrt_xml_value' - or 'VRT_XML_KEY' in gm_xml - ), ( - f'gdal_metadata_xml content lost on VRT-tile round-trip; ' - f'gdal_metadata={gm!r}, gdal_metadata_xml={gm_xml!r}' - ) - - def test_extra_tags_entry_propagates_to_tiles(self, tmp_path): - """A user-supplied ``extra_tags`` entry (Software, tag 305) - must round-trip through the VRT-tiled writer.""" - arr = np.arange(64, dtype=np.float32).reshape(8, 8) - software = 'xrspatial-vrt-test-1606' - da = xr.DataArray( - arr, dims=('y', 'x'), - coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, - attrs={ - 'crs': 4326, - # (tag_id, type_id, count, value); type 2 = ASCII - 'extra_tags': [(305, 2, len(software) + 1, software)], - }, - ) - vrt = str(tmp_path / 'extra_tags.vrt') - # Rich-tag write surface (PR 4 of epic #2340). - to_geotiff(da, vrt, tile_size=16, - allow_experimental_codecs=True) - tile_da = open_geotiff(_first_tile_path(vrt)) - et = tile_da.attrs.get('extra_tags') or [] - tag_ids = {entry[0] for entry in et} - assert 305 in tag_ids, ( - f'Software (305) tag missing from VRT tile extra_tags; ' - f'got tag ids {sorted(tag_ids)!r}' - ) - - def test_image_description_friendly_attr_propagates_to_tiles( - self, tmp_path): - """``attrs['image_description']`` is folded into ``extra_tags`` - as tag 270 by ``_merge_friendly_extra_tags`` and then surfaces - on read as ``attrs['image_description']``.""" - arr = np.arange(64, dtype=np.float32).reshape(8, 8) - da = xr.DataArray( - arr, dims=('y', 'x'), - coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, - attrs={'crs': 4326, - 'image_description': 'vrt-tile-friendly-1606'}, - ) - vrt = str(tmp_path / 'image_desc.vrt') - to_geotiff(da, vrt, tile_size=16) - tile_da = open_geotiff(_first_tile_path(vrt)) - assert (tile_da.attrs.get('image_description') - == 'vrt-tile-friendly-1606') - - -class TestVrtTiledMetadataDask: - def test_nodatavals_alias_dask(self, tmp_path): - pytest.importorskip('dask.array') - import dask.array as dska - arr = np.arange(64, dtype=np.float32).reshape(8, 8) - arr[0, 0] = -9999.0 - da_np = xr.DataArray( - arr, dims=('y', 'x'), - coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, - attrs={'nodatavals': (-9999.0,), 'crs': 4326, - 'gdal_metadata': {'k': 'v'}}, - ) - # Dask-back the data so _write_vrt_tiled takes the dask branch. - da = xr.DataArray( - dska.from_array(arr, chunks=4), - dims=da_np.dims, coords=da_np.coords, attrs=da_np.attrs, - ) - vrt = str(tmp_path / 'dask.vrt') - to_geotiff(da, vrt, tile_size=16) - tile_da = open_geotiff(_first_tile_path(vrt)) - assert tile_da.attrs.get('nodata') == -9999.0 - assert tile_da.attrs.get('gdal_metadata') == {'k': 'v'} diff --git a/xrspatial/geotiff/tests/test_vrt_tiled_scheduler_1714.py b/xrspatial/geotiff/tests/test_vrt_tiled_scheduler_1714.py deleted file mode 100644 index b6fc1e53e..000000000 --- a/xrspatial/geotiff/tests/test_vrt_tiled_scheduler_1714.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Verify ``_write_vrt_tiled`` runs tile writes on dask's threaded scheduler. - -Issue #1714: the prior code called ``dask.compute(*delayed_tasks, -scheduler='synchronous')`` which serialised independent tile writes on -the calling thread. Switching to the threaded scheduler reduces wall -time by ~33% on a 256-tile zstd write. These tests pin the new -scheduler choice and confirm the output is correct (no concurrent-write -races, all tiles present, content matches). -""" -from __future__ import annotations - -import glob -import os -import tempfile -from pathlib import Path -from unittest.mock import patch - -import dask -import dask.array as da -import numpy as np -import xarray as xr - -from xrspatial.geotiff import to_geotiff - - -def _make_dask_da(h: int = 32, w: int = 32, chunk: int = 8) -> xr.DataArray: - """Return a dask-backed 2D DataArray with ``chunk``-sized chunks. - - Using ``da.from_array`` on a pre-built numpy array gives clean - ``(chunk, chunk)`` chunking. ``da.arange(...).reshape(...)`` keeps a - chunk size of 1 along the new axis, which produces a confusing test - setup. - """ - arr = np.arange(h * w, dtype=np.float32).reshape(h, w) - return xr.DataArray( - da.from_array(arr, chunks=(chunk, chunk)), - dims=["y", "x"], - ) - - -def test_vrt_tiled_uses_threaded_scheduler(): - """_write_vrt_tiled passes ``scheduler='threads'`` to dask.compute.""" - da_arr = _make_dask_da() - with tempfile.TemporaryDirectory( - prefix="vrt_sched_1714_", ignore_cleanup_errors=True - ) as td: - vrt = os.path.join(td, "sched_check.vrt") - - # Wrap dask.compute so we can record the scheduler kwarg the - # writer chose. The dask module is imported inside - # _write_vrt_tiled, so we patch on the module object directly. - captured = {} - real_compute = dask.compute - - def spy(*args, **kwargs): - captured["scheduler"] = kwargs.get("scheduler") - return real_compute(*args, **kwargs) - - with patch.object(dask, "compute", side_effect=spy) as p: - to_geotiff(da_arr, vrt) - assert p.called, "_write_vrt_tiled never invoked dask.compute" - - assert captured.get("scheduler") == "threads", ( - "Expected scheduler='threads' on the VRT-tiled write but " - f"got {captured.get('scheduler')!r}" - ) - - -def test_vrt_tiled_threaded_write_produces_all_tiles(): - """All expected tile files exist after the threaded write.""" - da_arr = _make_dask_da(h=32, w=32, chunk=8) # 4x4 = 16 tiles - with tempfile.TemporaryDirectory( - prefix="vrt_sched_1714_", ignore_cleanup_errors=True - ) as td: - vrt = os.path.join(td, "tile_count.vrt") - to_geotiff(da_arr, vrt) - tiles_dir = os.path.join(td, "tile_count_tiles") - tiles = sorted(glob.glob(os.path.join(tiles_dir, "*.tif"))) - assert len(tiles) == 16, ( - f"Expected 16 tile files, got {len(tiles)} in {tiles_dir}" - ) - - -def test_vrt_tiled_threaded_write_is_deterministic(): - """Threaded scheduler must not introduce write ordering races. - - Each delayed task writes to its own file path, so the threaded - scheduler is safe. Run the same write twice and compare byte - contents of every tile to catch any accidental race regression. - """ - da_arr = _make_dask_da(h=32, w=32, chunk=8) - - def _write_and_collect(vrt_path: str) -> dict[str, bytes]: - to_geotiff(da_arr, vrt_path) - stem = os.path.splitext(os.path.basename(vrt_path))[0] - tiles_dir = os.path.join(os.path.dirname(vrt_path), stem + "_tiles") - return { - os.path.basename(p): Path(p).read_bytes() - for p in sorted(glob.glob(os.path.join(tiles_dir, "*.tif"))) - } - - with tempfile.TemporaryDirectory( - prefix="vrt_sched_1714_", ignore_cleanup_errors=True - ) as td1: - with tempfile.TemporaryDirectory( - prefix="vrt_sched_1714_", ignore_cleanup_errors=True - ) as td2: - tiles1 = _write_and_collect(os.path.join(td1, "run1.vrt")) - tiles2 = _write_and_collect(os.path.join(td2, "run2.vrt")) - - assert set(tiles1) == set(tiles2), ( - "Tile file set differs between runs: " - f"{set(tiles1) ^ set(tiles2)}" - ) - for name, blob1 in tiles1.items(): - assert blob1 == tiles2[name], ( - f"Tile {name} differs between runs (race condition?)" - ) diff --git a/xrspatial/geotiff/tests/test_vrt_window_validation_1697.py b/xrspatial/geotiff/tests/test_vrt_window_validation_1697.py deleted file mode 100644 index a730e3805..000000000 --- a/xrspatial/geotiff/tests/test_vrt_window_validation_1697.py +++ /dev/null @@ -1,227 +0,0 @@ -"""Regression tests for issue #1697. - -``read_vrt(path, window=...)`` silently clamped invalid window -coordinates instead of raising ``ValueError``. The local TIFF path -(#1634) and the HTTP COG path (#1669) both reject the same bad -windows up front. VRT was missed when #1634 landed. - -The hidden failure mode is the same one #1634 fixed: ``open_geotiff`` -builds the y/x coord arrays from the caller-supplied window indices, -so a clamped read returns a smaller array than the coord arrays and -xarray raises a ``CoordinateValidationError`` deep inside its -constructor instead of a clear xrspatial-level error. - -These tests pin the VRT path to the same contract as the local and -HTTP paths: out-of-bounds, zero-size, or inverted windows raise -``ValueError`` with a message of the same shape (one "extent" word -swap from ``source extent`` to ``VRT extent``). -""" -from __future__ import annotations - -import os -import uuid - -import numpy as np -import pytest -import xarray as xr - -from xrspatial.geotiff import to_geotiff -from xrspatial.geotiff._reader import read_to_array -from xrspatial.geotiff._vrt import read_vrt - - -def _unique_dir(tmp_path, label: str) -> str: - """Return a sub-path under ``tmp_path`` with a uuid suffix so - parallel test workers cannot collide on the same name.""" - d = tmp_path / f"vrt_1697_{label}_{uuid.uuid4().hex[:8]}" - d.mkdir() - return str(d) - - -def _write_tif(path: str, size: int = 4) -> None: - """Write a ``size``x``size`` float32 GeoTIFF the VRT can wrap.""" - arr = np.arange(size * size, dtype=np.float32).reshape(size, size) - y = np.linspace(float(size) - 0.5, 0.5, size) - x = np.linspace(0.5, float(size) - 0.5, size) - da = xr.DataArray( - arr, dims=['y', 'x'], - coords={'y': y, 'x': x}, - attrs={'crs': 4326}, - ) - to_geotiff(da, path, compression='none') - - -def _write_vrt(vrt_path: str, source_filename: str, size: int = 4) -> None: - """Write a single-band VRT of dimension ``size``x``size`` pointing - at ``source_filename`` (relative to the VRT directory).""" - xml = ( - f'\n' - ' 0, 1, 0, 0, 0, -1\n' - ' \n' - ' \n' - f' {source_filename}' - '\n' - ' 1\n' - f' \n' - f' \n' - ' \n' - ' \n' - '\n' - ) - with open(vrt_path, 'w') as f: - f.write(xml) - - -@pytest.fixture -def vrt_4x4(tmp_path): - """Return a path to a 4x4 single-band VRT wrapping a 4x4 TIFF.""" - d = _unique_dir(tmp_path, "fixture") - tif = os.path.join(d, 'data.tif') - _write_tif(tif, size=4) - vrt = os.path.join(d, 'mosaic.vrt') - _write_vrt(vrt, 'data.tif', size=4) - return vrt - - -# --------------------------------------------------------------------------- -# Negative starts -# --------------------------------------------------------------------------- - - -def test_negative_r0_raises_value_error(vrt_4x4): - """``r0 < 0`` raises ValueError instead of being clamped to 0.""" - with pytest.raises(ValueError, match='outside the VRT extent'): - read_vrt(vrt_4x4, window=(-1, 0, 2, 2)) - - -def test_negative_c0_raises_value_error(vrt_4x4): - """``c0 < 0`` raises ValueError instead of being clamped to 0.""" - with pytest.raises(ValueError, match='outside the VRT extent'): - read_vrt(vrt_4x4, window=(0, -1, 2, 2)) - - -# --------------------------------------------------------------------------- -# Past-edge stops -# --------------------------------------------------------------------------- - - -def test_r1_past_bottom_edge_raises_value_error(vrt_4x4): - """``r1 > vrt.height`` raises instead of being clamped to vrt.height.""" - with pytest.raises(ValueError, match='outside the VRT extent'): - read_vrt(vrt_4x4, window=(0, 0, 5, 4)) - - -def test_c1_past_right_edge_raises_value_error(vrt_4x4): - """``c1 > vrt.width`` raises instead of being clamped to vrt.width.""" - with pytest.raises(ValueError, match='outside the VRT extent'): - read_vrt(vrt_4x4, window=(0, 0, 4, 5)) - - -def test_window_past_both_edges_raises_value_error(vrt_4x4): - """Windows past both right and bottom edges raise the same error.""" - with pytest.raises(ValueError, match='outside the VRT extent'): - read_vrt(vrt_4x4, window=(0, 0, 10, 10)) - - -# --------------------------------------------------------------------------- -# Zero-size and inverted windows -# --------------------------------------------------------------------------- - - -def test_zero_size_row_window_raises_value_error(vrt_4x4): - """``r0 == r1`` produces a zero-height window and must raise.""" - with pytest.raises(ValueError, match='non-positive size'): - read_vrt(vrt_4x4, window=(2, 0, 2, 4)) - - -def test_zero_size_col_window_raises_value_error(vrt_4x4): - """``c0 == c1`` produces a zero-width window and must raise.""" - with pytest.raises(ValueError, match='non-positive size'): - read_vrt(vrt_4x4, window=(0, 2, 4, 2)) - - -def test_fully_zero_size_window_raises_value_error(vrt_4x4): - """``r0 == r1 and c0 == c1`` raises (current code returned a (0, 0) array).""" - with pytest.raises(ValueError, match='non-positive size'): - read_vrt(vrt_4x4, window=(2, 2, 2, 2)) - - -def test_inverted_row_window_raises_value_error(vrt_4x4): - """``r0 > r1`` is degenerate and must raise.""" - with pytest.raises(ValueError, match='non-positive size'): - read_vrt(vrt_4x4, window=(3, 0, 1, 4)) - - -def test_inverted_col_window_raises_value_error(vrt_4x4): - """``c0 > c1`` is degenerate and must raise.""" - with pytest.raises(ValueError, match='non-positive size'): - read_vrt(vrt_4x4, window=(0, 3, 4, 1)) - - -# --------------------------------------------------------------------------- -# Valid in-bounds windows still work (don't over-reject) -# --------------------------------------------------------------------------- - - -def test_full_extent_window_still_works(vrt_4x4): - """A window covering the full VRT extent still reads the full array.""" - arr, _ = read_vrt(vrt_4x4, window=(0, 0, 4, 4)) - assert arr.shape == (4, 4) - - -def test_interior_window_still_works(vrt_4x4): - """An interior window returns the requested subset shape.""" - arr, _ = read_vrt(vrt_4x4, window=(1, 1, 3, 3)) - assert arr.shape == (2, 2) - - -def test_edge_aligned_window_still_works(vrt_4x4): - """A window that touches but does not exceed the edges is accepted.""" - arr, _ = read_vrt(vrt_4x4, window=(2, 2, 4, 4)) - assert arr.shape == (2, 2) - - -def test_none_window_still_returns_full_array(vrt_4x4): - """``window=None`` still returns the full VRT extent.""" - arr, _ = read_vrt(vrt_4x4) - assert arr.shape == (4, 4) - - -# --------------------------------------------------------------------------- -# Cross-path parity: VRT and local-TIFF reject the same bad window -# --------------------------------------------------------------------------- - - -def test_vrt_and_local_paths_share_window_validation(tmp_path): - """Same bad window rejected on both VRT and local-TIFF paths with the - same error class and message shape (one word swap is fine).""" - d = _unique_dir(tmp_path, "parity") - tif = os.path.join(d, 'data.tif') - _write_tif(tif, size=4) - vrt = os.path.join(d, 'mosaic.vrt') - _write_vrt(vrt, 'data.tif', size=4) - - bad_window = (-1, 0, 2, 2) - - with pytest.raises(ValueError) as vrt_exc: - read_vrt(vrt, window=bad_window) - with pytest.raises(ValueError) as local_exc: - read_to_array(tif, window=bad_window) - - vrt_msg = str(vrt_exc.value) - local_msg = str(local_exc.value) - - # Both messages must echo the offending window and the source dims. - assert 'window=' in vrt_msg - assert 'window=' in local_msg - assert '4x4' in vrt_msg - assert '4x4' in local_msg - # And both must signal "out of bounds" via the same wording template, - # with a single word swap on the extent label. - assert 'extent' in vrt_msg - assert 'extent' in local_msg - assert 'non-positive size' in vrt_msg - assert 'non-positive size' in local_msg - # The VRT path says "VRT extent"; the local path says "source extent". - assert 'VRT extent' in vrt_msg - assert 'source extent' in local_msg diff --git a/xrspatial/geotiff/tests/test_vrt_xml_escape_1607.py b/xrspatial/geotiff/tests/test_vrt_xml_escape_1607.py deleted file mode 100644 index b3f4739a9..000000000 --- a/xrspatial/geotiff/tests/test_vrt_xml_escape_1607.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Regression tests for issue #1607: write_vrt must XML-escape -caller-supplied text (CRS WKT, source filenames) so a value carrying -XML special characters cannot break the generated VRT or inject extra -elements that change how the VRT parses when read back. -""" -from __future__ import annotations - -import os - -import numpy as np -import pytest -import xarray as xr - -from xrspatial.geotiff import to_geotiff -from xrspatial.geotiff._vrt import parse_vrt, write_vrt - - -@pytest.fixture -def sample_tif(tmp_path): - """Write a tiny GeoTIFF the VRT writer can introspect for metadata.""" - arr = np.zeros((4, 4), dtype=np.float32) - y = np.linspace(1.0, 0.0, 4) - x = np.linspace(0.0, 1.0, 4) - da = xr.DataArray( - arr, dims=['y', 'x'], - coords={'y': y, 'x': x}, - attrs={'nodata': -9999.0}, - ) - path = str(tmp_path / 'src.tif') - to_geotiff(da, path) - return path - - -def test_crs_wkt_with_xml_special_chars_round_trips(sample_tif, tmp_path): - """A WKT containing ``& < > " '`` must round-trip through write_vrt / - parse_vrt unchanged (the entities are escaped on the way out and - decoded on the way in).""" - nasty_wkt = 'GEOGCS["spec & with \"quotes\" and \'apostrophes\'"]' - vrt_path = str(tmp_path / 'mosaic.vrt') - write_vrt(vrt_path, [sample_tif], crs_wkt=nasty_wkt) - - with open(vrt_path, 'r') as fh: - text = fh.read() - - parsed = parse_vrt(text, vrt_dir=str(tmp_path)) - assert parsed.crs_wkt == nasty_wkt - - -def test_crs_wkt_injection_does_not_change_raster_type(sample_tif, tmp_path): - """The headline #1607 case: a crafted WKT trying to close ```` - and inject ``Point...`` - must NOT change ``raster_type`` from its default 'area' value.""" - injection = ( - 'Point' - '' - ) - vrt_path = str(tmp_path / 'evil.vrt') - write_vrt(vrt_path, [sample_tif], crs_wkt=injection) - - with open(vrt_path, 'r') as fh: - text = fh.read() - - parsed = parse_vrt(text, vrt_dir=str(tmp_path)) - assert parsed.raster_type == 'area' - # And the injection round-trips as literal text inside . - assert parsed.crs_wkt == injection - - -def test_source_filename_with_ampersand_round_trips(tmp_path): - """A source filename containing ``&`` must produce a VRT whose - ```` element decodes back to the original on-disk - path (no double-escape, no corruption).""" - # Build a TIFF on disk whose filename has an ampersand. - arr = np.zeros((4, 4), dtype=np.float32) - da = xr.DataArray( - arr, dims=['y', 'x'], - coords={'y': np.linspace(1, 0, 4), 'x': np.linspace(0, 1, 4)}, - attrs={'nodata': -9999.0}, - ) - src = str(tmp_path / 'a&b.tif') - to_geotiff(da, src) - - vrt_path = str(tmp_path / 'mosaic.vrt') - write_vrt(vrt_path, [src]) - - with open(vrt_path, 'r') as fh: - text = fh.read() - # The on-disk text must carry the escaped form, not the raw '&'. - assert '&' in text - assert ' str: - src_path = os.path.join(td, 'tmp_1815_src.tif') - to_geotiff(np.zeros((10, 10), dtype=np.uint8), src_path, - compression='none') - return src_path - - -def _write_vrt(td: str, *, pad_bytes: int = 0) -> str: - """Write a VRT, optionally padded with a large XML comment.""" - vrt_path = os.path.join(td, 'tmp_1815_mosaic.vrt') - comment = '' - if pad_bytes > 0: - comment = '\n' - vrt_xml = ( - '\n' - + comment + - ' \n' - ' \n' - ' ' - 'tmp_1815_src.tif\n' - ' 1\n' - ' \n' - ' \n' - ' \n' - ' \n' - '\n' - ) - with open(vrt_path, 'w') as f: - f.write(vrt_xml) - return vrt_path - - -def test_small_vrt_parses_under_default_cap(tmp_path): - """A normal-sized VRT parses successfully with the default cap.""" - td = str(tmp_path) - _write_source(td) - vrt_path = _write_vrt(td) - arr, _ = read_vrt(vrt_path) - assert arr.shape == (10, 10) - - -def test_oversized_vrt_raises_value_error(tmp_path, monkeypatch): - """A VRT padded past the cap raises ValueError naming the cap and env var.""" - td = str(tmp_path) - _write_source(td) - # Set a small cap (1 KiB) and pad the comment past it. - monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', '1024') - vrt_path = _write_vrt(td, pad_bytes=4096) - with pytest.raises(ValueError) as exc_info: - read_vrt(vrt_path) - msg = str(exc_info.value) - assert 'XRSPATIAL_VRT_MAX_XML_BYTES' in msg - assert '1,024' in msg - - -def test_raising_cap_lets_padded_vrt_parse(tmp_path, monkeypatch): - """Setting the env var higher allows a padded VRT to parse.""" - td = str(tmp_path) - _write_source(td) - vrt_path = _write_vrt(td, pad_bytes=4096) - # Default cap of 64 MiB is more than enough; verify with an explicit - # higher cap too. - monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', str(1024 * 1024)) - arr, _ = read_vrt(vrt_path) - assert arr.shape == (10, 10) - - -@pytest.mark.parametrize('bad_value', ['not_a_number', '0', '-1', '-1024']) -def test_invalid_cap_raises_value_error(tmp_path, monkeypatch, bad_value): - """Non-numeric, zero, or negative cap values produce a clear error.""" - td = str(tmp_path) - _write_source(td) - vrt_path = _write_vrt(td) - monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', bad_value) - with pytest.raises(ValueError, match='XRSPATIAL_VRT_MAX_XML_BYTES'): - read_vrt(vrt_path) diff --git a/xrspatial/geotiff/tests/test_vrt_xml_size_cap_chunked_1831.py b/xrspatial/geotiff/tests/test_vrt_xml_size_cap_chunked_1831.py deleted file mode 100644 index 2a36514d3..000000000 --- a/xrspatial/geotiff/tests/test_vrt_xml_size_cap_chunked_1831.py +++ /dev/null @@ -1,86 +0,0 @@ -"""VRT XML reads from the chunked dispatcher must honor the size cap. - -Regression test for issue #1831: ``read_vrt(path, chunks=...)`` (added in -#1822) parsed the VRT XML with an unbounded ``open().read()``, bypassing -the 64 MiB cap that #1818 added in ``_vrt._read_vrt_xml``. An attacker -supplying a multi-GB VRT file plus a chunked workflow would exhaust -host memory before any parser-side guard fired. - -The eager path is already covered by ``test_vrt_xml_size_cap_1815.py``; -this file pins the same behavior for ``chunks=``. -""" -from __future__ import annotations - -import os - -import numpy as np -import pytest - -from xrspatial.geotiff import read_vrt, to_geotiff - - -def _write_source(td: str) -> str: - src_path = os.path.join(td, 'tmp_1831_src.tif') - to_geotiff(np.zeros((10, 10), dtype=np.uint8), src_path, - compression='none') - return src_path - - -def _write_vrt(td: str, *, pad_bytes: int = 0) -> str: - """Write a VRT, optionally padded with a large XML comment.""" - vrt_path = os.path.join(td, 'tmp_1831_mosaic.vrt') - comment = '' - if pad_bytes > 0: - comment = '\n' - vrt_xml = ( - '\n' - + comment + - ' \n' - ' \n' - ' ' - 'tmp_1831_src.tif\n' - ' 1\n' - ' \n' - ' \n' - ' \n' - ' \n' - '\n' - ) - with open(vrt_path, 'w') as f: - f.write(vrt_xml) - return vrt_path - - -def test_chunked_read_vrt_honors_xml_cap(tmp_path, monkeypatch): - """``read_vrt(chunks=...)`` rejects oversized VRT XML.""" - td = str(tmp_path) - _write_source(td) - # 1 KiB cap, 4 KiB pad. The cap message must reference the env var so - # operators know how to raise it. - monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', '1024') - vrt_path = _write_vrt(td, pad_bytes=4096) - with pytest.raises(ValueError) as exc_info: - read_vrt(vrt_path, chunks=10) - msg = str(exc_info.value) - assert 'XRSPATIAL_VRT_MAX_XML_BYTES' in msg - assert '1,024' in msg - - -def test_chunked_read_vrt_under_default_cap(tmp_path): - """A normal-sized VRT parses successfully under the default cap.""" - td = str(tmp_path) - _write_source(td) - vrt_path = _write_vrt(td) - arr = read_vrt(vrt_path, chunks=10) - assert arr.shape == (10, 10) - assert arr.dtype == np.uint8 - - -def test_chunked_read_vrt_raised_cap_allows_padded(tmp_path, monkeypatch): - """Raising ``XRSPATIAL_VRT_MAX_XML_BYTES`` lets a padded VRT parse.""" - td = str(tmp_path) - _write_source(td) - vrt_path = _write_vrt(td, pad_bytes=4096) - monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', str(1024 * 1024)) - arr = read_vrt(vrt_path, chunks=10) - assert arr.shape == (10, 10) diff --git a/xrspatial/geotiff/tests/vrt/test_dtype_conversion.py b/xrspatial/geotiff/tests/vrt/test_dtype_conversion.py new file mode 100644 index 000000000..a30f2f512 --- /dev/null +++ b/xrspatial/geotiff/tests/vrt/test_dtype_conversion.py @@ -0,0 +1,1357 @@ +"""Consolidated VRT dtype-conversion test suite. + +Folds seven issue-numbered VRT test files covering source-vs-output +dtype handling, integer-with-nodata promotion, multiband dtype, and +the positive mosaic coverage that exercises dtype passthrough end to +end. Helpers are prefixed (e.g. ``_dtype_validation_*``) to avoid +collisions across folds. + +Sections: +* VRT dataType attribute validation and band-nodata parsing (#1783) +* VRT writer dtype name resolution (#1914) +* Integer source feeding a float VRT (#1616) +* Multiband dtype promotion and band-select (#1696) +* Multiband per-band integer nodata (#1611) +* VRT resample algorithm validation (#1751) +* Simple VRT mosaic positive coverage (#2369) + +See ``CLUSTER_AUDIT_PR6.md`` for the file:test -> section:test mapping. +""" +from __future__ import annotations + +import dask.array as da +import numpy as np +import os +import pytest +import uuid +import xarray as xr +from xrspatial.geotiff import read_vrt, to_geotiff +from xrspatial.geotiff._errors import VRTUnsupportedError +from xrspatial.geotiff._vrt import ( + _NP_TO_VRT_DTYPE, + _parse_band_nodata, + _vrt_dtype_name_for, + parse_vrt, +) +from xrspatial.geotiff._vrt import read_vrt as _resample_alg_read_vrt_internal +from xrspatial.geotiff._vrt import write_vrt as _write_vrt_internal +from xrspatial.geotiff._writer import write + + +# --------------------------------------------------------------------------- +# VRT dataType attribute validation and parsing (#1783) +# Originally: test_vrt_dtype_1783.py +# --------------------------------------------------------------------------- + + +def _dtype_validation_write(arr, path, **kw): + """Write a 2D array to ``path`` with sensible defaults for tests.""" + write(arr, str(path), compression='none', tiled=False, **kw) + + +def _dtype_validation_build_single_band_vrt(tmp_path, *, dtype_attr, src_path, filename='b.vrt', size=2, nodata=None): + """Hand-roll a single-band VRT with an arbitrary ``dataType`` attribute. + + ``dtype_attr`` is rendered verbatim into the ```` + element. Pass an empty string to omit the attribute entirely (the + "GDAL default" case). + + ``nodata`` (when not ``None``) is rendered verbatim into a + ```` child so callers can exercise sentinel-parsing + edge cases (scientific notation, ``nan``, full-range 64-bit + integers). + """ + if dtype_attr: + attr = f' dataType="{dtype_attr}"' + else: + attr = '' + nodata_elem = f'{nodata}' if nodata is not None else '' + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n {nodata_elem}\n \n {src_path}\n 1\n \n \n \n \n' + p = tmp_path / filename + p.write_text(vrt_xml) + return str(p) + + +@pytest.mark.parametrize('cdtype', ['CInt16', 'CInt32', 'CFloat32', 'CFloat64']) +def test_complex_dtype_raises_value_error(tmp_path, cdtype): + """A VRT declaring any complex ``dataType`` must raise ``ValueError`` + rather than silently substituting ``Float32``. The error message + must name both the band number and the offending dataType so the + operator can fix the VRT, and must mention that complex types are + explicitly unsupported. + """ + b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr=cdtype, src_path=str(src)) + with pytest.raises(ValueError) as ei: + read_vrt(vrt) + msg = str(ei.value) + assert cdtype in msg, f'error message must name {cdtype!r}: {msg!r}' + assert 'band=1' in msg or 'band 1' in msg, f'error message must name the band: {msg!r}' + assert 'complex' in msg.lower(), f'error message must mention complex types: {msg!r}' + + +def test_garbage_dtype_raises_value_error(tmp_path): + """An unrecognised non-complex ``dataType`` (e.g. a typo) must also + raise ``ValueError`` rather than collapsing silently to Float32. + """ + b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='Garbage', src_path=str(src)) + with pytest.raises(ValueError, match='Garbage'): + read_vrt(vrt) + + +def test_typo_for_supported_dtype_is_still_rejected(tmp_path): + """``Flaot32`` (typo of ``Float32``) is distinct from the empty / + missing case and must surface as ``ValueError`` instead of silently + falling back. + """ + b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='Flaot32', src_path=str(src)) + with pytest.raises(ValueError, match='Flaot32'): + read_vrt(vrt) + + +def test_uint64_round_trip(tmp_path): + """A VRT declaring ``dataType="UInt64"`` whose source GeoTIFF is + written as uint64 must read back as uint64 with the exact values + preserved, including values past the float32 / int53 boundary. + """ + big = np.iinfo(np.uint64).max + near_big = big - 7 + b = np.array([[1, 2], [near_big, big]], dtype=np.uint64) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='UInt64', src_path=str(src)) + r = read_vrt(vrt) + assert r.dtype == np.uint64, f'UInt64 VRT must read as uint64; got {r.dtype}' + np.testing.assert_array_equal(r.values, b) + assert int(r.values[1, 1]) == big + assert int(r.values[1, 0]) == near_big + + +def test_int64_round_trip(tmp_path): + """A VRT declaring ``dataType="Int64"`` must read back as int64 + with the full int64 range preserved (positive and negative + extremes). + """ + info = np.iinfo(np.int64) + b = np.array([[info.min, -1], [0, info.max]], dtype=np.int64) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='Int64', src_path=str(src)) + r = read_vrt(vrt) + assert r.dtype == np.int64, f'Int64 VRT must read as int64; got {r.dtype}' + np.testing.assert_array_equal(r.values, b) + + +def test_missing_dtype_attribute_defaults_to_float32(tmp_path): + """```` with no ``dataType`` attribute must + still default to ``Float32``. This is GDAL's documented default + and the previous fallback handled it correctly; the new + "unknown-attribute raises" path must not regress the + "missing-attribute defaults" path. + """ + b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='', src_path=str(src)) + r = read_vrt(vrt) + assert r.dtype == np.float32, f'missing dataType must default to Float32; got {r.dtype}' + np.testing.assert_allclose(r.values, b) + + +def test_byte_dtype_still_works(tmp_path): + """``Byte`` reads back as uint8 with values preserved. Smoke check + to confirm the rewritten dtype resolution did not break the + common-case integer path. + """ + b = np.array([[10, 11], [12, 13]], dtype=np.uint8) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='Byte', src_path=str(src)) + r = read_vrt(vrt) + assert r.dtype == np.uint8 + np.testing.assert_array_equal(r.values, b) + + +def test_float64_dtype_still_works(tmp_path): + """``Float64`` reads back as float64 with values preserved. Smoke + check for the wider floating-point path. + """ + b = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float64) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='Float64', src_path=str(src)) + r = read_vrt(vrt) + assert r.dtype == np.float64 + np.testing.assert_allclose(r.values, b) + + +def test_parse_band_nodata_uint64_max_exact(): + """``_parse_band_nodata`` must return the exact ``int`` for + ``2**64 - 1`` (UInt64 max), not a float64 that rounds up to + ``2**64``. + """ + big = 2 ** 64 - 1 + nd = _parse_band_nodata(str(big), np.dtype(np.uint64)) + assert isinstance(nd, int), f'UInt64 nodata must parse as int, got {type(nd).__name__}' + assert nd == big + assert nd != int(float(big)) + + +def test_parse_band_nodata_int64_min_exact(): + """``INT64_MIN`` (``-2**63``) must survive parsing as an int.""" + info = np.iinfo(np.int64) + nd = _parse_band_nodata(str(info.min), np.dtype(np.int64)) + assert isinstance(nd, int) + assert nd == info.min + + +def test_parse_band_nodata_int32_negative(): + """Common GDAL sentinel ``-9999`` for an Int32 band parses as int.""" + nd = _parse_band_nodata('-9999', np.dtype(np.int32)) + assert isinstance(nd, int) + assert nd == -9999 + + +def test_parse_band_nodata_int_scientific_notation(): + """GDAL occasionally emits integer nodata in scientific or + ``-9999.0`` form. Parsing should still land an int when the + value is integer-valued and in-range. + """ + nd = _parse_band_nodata('-9999.0', np.dtype(np.int32)) + assert isinstance(nd, int) and nd == -9999 + nd = _parse_band_nodata('1e3', np.dtype(np.int32)) + assert isinstance(nd, int) and nd == 1000 + + +def test_parse_band_nodata_int_out_of_range_falls_back(): + """An out-of-range sentinel for the band dtype is returned as the + parsed float so it surfaces via ``attrs['nodata']`` for round-trip + but can never match an integer pixel (mirroring + ``_resolve_masked_fill``'s tolerant behaviour). + """ + nd = _parse_band_nodata('-9999', np.dtype(np.uint16)) + assert nd == -9999 + + +def test_parse_band_nodata_float_nan(): + """Float bands keep NaN sentinels working (no integer-parse + regression for the floating path). + """ + nd = _parse_band_nodata('nan', np.dtype(np.float32)) + assert isinstance(nd, float) + assert np.isnan(nd) + + +def test_parse_band_nodata_float_scientific(): + """Float bands preserve scientific-notation sentinels.""" + nd = _parse_band_nodata('-1.5e10', np.dtype(np.float64)) + assert isinstance(nd, float) + assert nd == -15000000000.0 + + +def test_parse_band_nodata_empty_or_none(): + """Empty / whitespace / ``None`` input returns ``None`` regardless + of dtype. + """ + assert _parse_band_nodata(None, np.dtype(np.int32)) is None + assert _parse_band_nodata('', np.dtype(np.int32)) is None + assert _parse_band_nodata(' ', np.dtype(np.float32)) is None + + +def _dtype_validation_make_minimal_vrt_xml(dtype_attr, nodata_text): + """Tiny VRT XML string suitable for direct ``parse_vrt`` calls. + + The SourceFilename here is intentionally minimal -- ``parse_vrt`` + only does the containment check after canonicalising the path, so + we pass a path inside the temp dir at the call site. + """ + return f'0.0, 1.0, 0.0, 0.0, 0.0, -1.0{nodata_text}' + + +def test_parse_vrt_uint64_nodata_is_int(tmp_path): + """The dataclass stored on ``_VRTBand.nodata`` is a Python ``int`` + for an integer-dtype band, with the exact 64-bit value. + """ + big = 2 ** 64 - 1 + xml = _dtype_validation_make_minimal_vrt_xml('UInt64', str(big)) + vrt = parse_vrt(xml, vrt_dir=str(tmp_path)) + assert len(vrt.bands) == 1 + nd = vrt.bands[0].nodata + assert isinstance(nd, int) + assert nd == big + + +def test_parse_vrt_int64_min_nodata_is_int(tmp_path): + info = np.iinfo(np.int64) + xml = _dtype_validation_make_minimal_vrt_xml('Int64', str(info.min)) + vrt = parse_vrt(xml, vrt_dir=str(tmp_path)) + nd = vrt.bands[0].nodata + assert isinstance(nd, int) + assert nd == info.min + + +def test_parse_vrt_float32_nan_nodata_is_float(tmp_path): + xml = _dtype_validation_make_minimal_vrt_xml('Float32', 'nan') + vrt = parse_vrt(xml, vrt_dir=str(tmp_path)) + nd = vrt.bands[0].nodata + assert isinstance(nd, float) + assert np.isnan(nd) + + +def test_uint64_nodata_round_trip_preserves_max_sentinel(tmp_path): + """A VRT declaring UInt64 + ``2**64 - 1`` + must surface ``attrs['nodata']`` as the exact integer value, not a + float that has rounded past the dtype's range. Downstream + consumers rely on exact equality. + """ + big = 2 ** 64 - 1 + b = np.array([[1, 2], [3, 4]], dtype=np.uint64) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='UInt64', src_path=str(src), nodata=big) + r = read_vrt(vrt) + assert 'nodata' in r.attrs + assert int(r.attrs['nodata']) == big + assert isinstance(r.attrs['nodata'], (int, np.integer)) + + +def test_uint64_nodata_masks_max_sentinel_in_data(tmp_path): + """When the source pixel actually contains ``2**64 - 1``, the + masking pipeline must catch it: the result is promoted to float64 + with NaN at the sentinel position. This is the precision- + preservation acid test -- if the nodata was rounded to a float + that doesn't equal the source pixel, the mask never fires and the + sentinel survives as a 1.8e19 float. + """ + big = 2 ** 64 - 1 + b = np.array([[1, 2], [3, big]], dtype=np.uint64) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='UInt64', src_path=str(src), nodata=big) + r = read_vrt(vrt) + assert r.dtype == np.float64, f'sentinel hit must promote to float64, got {r.dtype}' + assert np.isnan(r.values[1, 1]), f'the 2**64-1 cell must be masked to NaN; got {r.values[1, 1]!r}' + assert r.values[0, 0] == 1.0 + assert r.values[0, 1] == 2.0 + assert r.values[1, 0] == 3.0 + + +def test_int64_min_nodata_masks_correctly(tmp_path): + """``INT64_MIN`` as both the nodata sentinel and a real pixel value + masks correctly without int64 -> float64 rounding aliasing the + sentinel onto adjacent values. + """ + info = np.iinfo(np.int64) + b = np.array([[info.min, -1], [0, info.max]], dtype=np.int64) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='Int64', src_path=str(src), nodata=info.min) + r = read_vrt(vrt) + assert r.dtype == np.float64 + assert np.isnan(r.values[0, 0]) + assert r.values[0, 1] == -1.0 + assert r.values[1, 0] == 0.0 + assert r.values[1, 1] == float(info.max) + + +def test_int32_negative_nodata_still_masks(tmp_path): + """Smoke regression for the common Int32 + ``-9999`` case. The + integer parsing path must not break this when there is no precision + pressure -- ``-9999`` survives ``float()`` fine but we still want + the new int-typed parse to mask the same way the old float-typed + parse did. + """ + b = np.array([[10, -9999], [-9999, 20]], dtype=np.int32) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='Int32', src_path=str(src), nodata=-9999) + r = read_vrt(vrt) + assert r.dtype == np.float64 + assert np.isnan(r.values[0, 1]) + assert np.isnan(r.values[1, 0]) + assert r.values[0, 0] == 10.0 + assert r.values[1, 1] == 20.0 + + +def test_float32_nan_nodata_still_works(tmp_path): + """``Float32`` + ``nan`` still parses and + surfaces NaN via ``attrs['nodata']`` (no regression on the float + path). + """ + b = np.array([[1.0, np.nan], [3.0, 4.0]], dtype=np.float32) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='Float32', src_path=str(src), nodata='nan') + r = read_vrt(vrt) + assert r.dtype == np.float32 + assert np.isnan(r.attrs['nodata']) + assert np.isnan(r.values[0, 1]) + + +def test_float64_scientific_nodata_still_works(tmp_path): + """``Float64`` + scientific-notation ```` survives as + float (no integer-parse regression for the float path). + """ + b = np.array([[1.0, -15000000000.0], [3.0, 4.0]], dtype=np.float64) + src = tmp_path / 'src.tif' + _dtype_validation_write(b, src) + vrt = _dtype_validation_build_single_band_vrt(tmp_path, dtype_attr='Float64', src_path=str(src), nodata='-1.5e10') + r = read_vrt(vrt) + assert r.dtype == np.float64 + assert r.attrs['nodata'] == -15000000000.0 + + +# --------------------------------------------------------------------------- +# VRT writer dtype name resolution (#1914) +# Originally: test_vrt_dtype_12bit_1914.py +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize('bps,sf,expected', [(12, 1, 'UInt16'), (12, 4, 'UInt16'), (1, 1, 'Byte'), (2, 1, 'Byte'), (4, 1, 'Byte'), (8, 1, 'Byte'), (16, 1, 'UInt16'), (32, 1, 'UInt32'), (64, 1, 'UInt64'), (8, 2, 'Int8'), (16, 2, 'Int16'), (32, 2, 'Int32'), (64, 2, 'Int64'), (32, 3, 'Float32'), (64, 3, 'Float64')]) +def test_vrt_dtype_name_for_supported(bps, sf, expected): + assert _vrt_dtype_name_for(bps, sf) == expected + + +def test_vrt_dtype_name_for_sample_format_sequence_resolves(): + assert _vrt_dtype_name_for(8, [1, 1]) == 'Byte' + assert _vrt_dtype_name_for(16, (2, 2)) == 'Int16' + + +def test_vrt_dtype_name_for_unsupported_raises(): + with pytest.raises(ValueError): + _vrt_dtype_name_for(24, 2) + + +def test_np_to_vrt_dtype_table_covers_all_resolver_outputs(): + from xrspatial.geotiff._dtypes import tiff_dtype_to_numpy + pairs = [(8, 1), (8, 2), (16, 1), (16, 2), (32, 1), (32, 2), (32, 3), (64, 1), (64, 2), (64, 3), (1, 1), (2, 1), (4, 1), (12, 1)] + for bps, sf in pairs: + np_dtype = tiff_dtype_to_numpy(bps, sf) + assert np_dtype.type in _NP_TO_VRT_DTYPE, f'resolver yields {np_dtype} for bps={bps}, sf={sf} but _NP_TO_VRT_DTYPE has no entry for it' + + +def _dtype_12bit_unique_dir(tmp_path, label: str) -> str: + d = tmp_path / f'vrt_1914_{label}_{uuid.uuid4().hex[:8]}' + d.mkdir() + return str(d) + + +def _dtype_12bit_write_uint16_tif(path: str, *, h: int=4, w: int=4, origin_x: float=0.0) -> None: + arr = np.arange(h * w, dtype=np.uint16).reshape(h, w) + y = 100.0 + (np.arange(h) + 0.5) * -1.0 + x = origin_x + (np.arange(w) + 0.5) * 1.0 + da = xr.DataArray(arr, dims=['y', 'x'], coords={'y': y, 'x': x}, attrs={'crs': 4326}) + to_geotiff(da, path, compression='none') + + +def test_uint16_source_writes_uint16_vrt_datatype(tmp_path): + d = _dtype_12bit_unique_dir(tmp_path, 'u16') + a = os.path.join(d, 'a.tif') + b = os.path.join(d, 'b.tif') + _dtype_12bit_write_uint16_tif(a) + _dtype_12bit_write_uint16_tif(b, origin_x=4.0) + vrt = os.path.join(d, 'out.vrt') + _write_vrt_internal(vrt, [a, b]) + with open(vrt) as f: + xml = f.read() + assert 'dataType="UInt16"' in xml + assert 'dataType="Byte"' not in xml + + +def test_int16_source_writes_int16_vrt_datatype(tmp_path): + d = _dtype_12bit_unique_dir(tmp_path, 'i16') + a = os.path.join(d, 'a.tif') + arr = np.arange(16, dtype=np.int16).reshape(4, 4) + y = 100.0 + (np.arange(4) + 0.5) * -1.0 + x = (np.arange(4) + 0.5) * 1.0 + da = xr.DataArray(arr, dims=['y', 'x'], coords={'y': y, 'x': x}, attrs={'crs': 4326}) + to_geotiff(da, a, compression='none') + vrt = os.path.join(d, 'out.vrt') + _write_vrt_internal(vrt, [a]) + with open(vrt) as f: + xml = f.read() + assert 'dataType="Int16"' in xml + + +# --------------------------------------------------------------------------- +# integer source feeding float VRT (#1616) +# Originally: test_vrt_int_source_float_dtype_1616.py +# --------------------------------------------------------------------------- + + +def _int_source_float_dtype_write_uint16_with_sentinel(tmp_path, sentinel=65535, filename='b0.tif'): + band = np.array([[1, 2], [3, sentinel]], dtype=np.uint16) + p = str(tmp_path / filename) + write(band, p, nodata=sentinel, compression='none', tiled=False) + return p + + +def _int_source_float_dtype_write_int16_with_sentinel(tmp_path, sentinel=-1, filename='b0.tif'): + band = np.array([[1, 2], [3, sentinel]], dtype=np.int16) + p = str(tmp_path / filename) + write(band, p, nodata=sentinel, compression='none', tiled=False) + return p + + +def _int_source_float_dtype_build_vrt(tmp_path, source_path, vrt_dtype, nodata_value, filename='mismatch.vrt'): + """Hand-roll a VRT with the requested dataType / NoDataValue pair.""" + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n {nodata_value}\n \n {source_path}\n 1\n \n \n \n \n' + p = str(tmp_path / filename) + with open(p, 'w') as f: + f.write(vrt_xml) + return p + + +def test_float32_vrt_uint16_source_masks_in_range_sentinel(tmp_path): + """Float32 VRT, uint16 source with in-range sentinel: pixel becomes NaN. + + Before the fix this returned dtype=float32 with values[1, 1] == 65535.0 + while ``attrs['nodata']`` advertised the sentinel. + """ + src = _int_source_float_dtype_write_uint16_with_sentinel(tmp_path) + vrt = _int_source_float_dtype_build_vrt(tmp_path, src, 'Float32', 65535) + r = read_vrt(vrt) + assert r.dtype == np.float32, f'Float32-declared VRT should return float32, got {r.dtype}' + assert np.isnan(r.values[1, 1]), f'Sentinel pixel (uint16 65535 -> float32) should be NaN-masked; got values[1, 1]={r.values[1, 1]}' + assert r.attrs.get('nodata') == 65535.0 + assert r.values[0, 0] == 1.0 + + +def test_float64_vrt_int16_source_masks_negative_sentinel(tmp_path): + """Float64 VRT, int16 source with negative sentinel: pixel becomes NaN.""" + src = _int_source_float_dtype_write_int16_with_sentinel(tmp_path, sentinel=-1) + vrt = _int_source_float_dtype_build_vrt(tmp_path, src, 'Float64', -1) + r = read_vrt(vrt) + assert r.dtype == np.float64 + assert np.isnan(r.values[1, 1]), f'Sentinel pixel (-1) should be NaN-masked; got values[1, 1]={r.values[1, 1]}' + assert r.attrs.get('nodata') == -1.0 + + +def test_float32_vrt_out_of_range_sentinel_is_noop(tmp_path): + """An out-of-range sentinel (e.g. uint16 source + NoDataValue=-9999) + stays unmasked rather than raising ``OverflowError`` from the + ``uint16(-9999)`` cast. The pixel data is returned as-is and + ``attrs['nodata']`` still surfaces the declared sentinel so callers + can mask in user code or write through. + """ + arr = np.array([[1, 2], [3, 4]], dtype=np.uint16) + p = str(tmp_path / 'b0_no_nodata.tif') + write(arr, p, compression='none', tiled=False) + vrt = _int_source_float_dtype_build_vrt(tmp_path, p, 'Float32', -9999) + r = read_vrt(vrt) + assert r.dtype == np.float32 + assert not np.isnan(r.values).any() + assert r.attrs.get('nodata') == -9999.0 + + +def test_float32_vrt_uint16_source_no_sentinel_pixels(tmp_path): + """Float32 VRT, uint16 source whose pixels do not match the sentinel: + the result is a clean float array with no NaNs introduced. + + This exercises the early-out path inside the new mask branch -- a + declared sentinel that matches no pixels must not perturb the data + or cause an extra copy that would surface as a different dtype. + """ + arr = np.array([[1, 2], [3, 4]], dtype=np.uint16) + p = str(tmp_path / 'b0_clean.tif') + write(arr, p, compression='none', tiled=False) + vrt = _int_source_float_dtype_build_vrt(tmp_path, p, 'Float32', 65535) + r = read_vrt(vrt) + assert r.dtype == np.float32 + assert not np.isnan(r.values).any() + np.testing.assert_array_equal(r.values, arr.astype(np.float32)) + + +def test_float_vrt_int_source_dask_path_masks_sentinel(tmp_path): + """The dask wrapper path (``chunks=...``) also returns NaN at the + sentinel pixel. The dask reader chunks the eager result after decode, + so the bug propagates if the eager path leaks the sentinel. + """ + src = _int_source_float_dtype_write_uint16_with_sentinel(tmp_path) + vrt = _int_source_float_dtype_build_vrt(tmp_path, src, 'Float32', 65535) + r = read_vrt(vrt, chunks=2) + assert r.dtype == np.float32 + val = r.values + assert np.isnan(val[1, 1]) + + +def test_float_vrt_int_source_round_trip_nodata_attr(tmp_path): + """Even though the masking promotes pixels to NaN, the + ``attrs['nodata']`` value still carries the original sentinel so a + downstream write can restore the literal sentinel byte pattern. + """ + src = _int_source_float_dtype_write_uint16_with_sentinel(tmp_path) + vrt = _int_source_float_dtype_build_vrt(tmp_path, src, 'Float32', 65535) + r = read_vrt(vrt) + assert r.attrs.get('nodata') == 65535.0 + + +def test_float_vrt_int_source_with_band_select(tmp_path): + """The band=N selection path also masks integer sentinels for a + float-declared VRT. The per-band ``NoDataValue`` from the VRT XML + must reach the source-side masking step, not just ``attrs['nodata']``. + """ + src_a = _int_source_float_dtype_write_uint16_with_sentinel(tmp_path, sentinel=65535, filename='ba.tif') + src_b = _int_source_float_dtype_write_uint16_with_sentinel(tmp_path, sentinel=65000, filename='bb.tif') + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n 65535\n \n {src_a}\n 1\n \n \n \n \n \n 65000\n \n {src_b}\n 1\n \n \n \n \n' + vrt_path = str(tmp_path / 'mb.vrt') + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + r0 = read_vrt(vrt_path, band=0, band_nodata='first') + assert r0.dtype == np.float32 + assert np.isnan(r0.values[1, 1]) + assert r0.attrs.get('nodata') == 65535.0 + r1 = read_vrt(vrt_path, band=1, band_nodata='first') + assert r1.dtype == np.float32 + assert np.isnan(r1.values[1, 1]) + assert r1.attrs.get('nodata') == 65000.0 + + +# --------------------------------------------------------------------------- +# multiband dtype promotion (#1696) +# Originally: test_vrt_multiband_dtype_1696.py +# --------------------------------------------------------------------------- + + +def _multiband_dtype_write(arr, path, **kw): + """Write a 2D array to ``path`` with sensible defaults for tests.""" + write(arr, str(path), compression='none', tiled=False, **kw) + + +def _multiband_dtype_build_two_band_vrt(tmp_path, *, b0_dtype_str, b0_path, b1_dtype_str, b1_path, b1_extra='', b0_extra='', filename='mb.vrt', size=2): + """Hand-roll a two-band VRT with arbitrary dataType strings.""" + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n{b0_extra} \n {b0_path}\n 1\n \n \n \n \n \n{b1_extra} \n {b1_path}\n 1\n \n \n \n \n' + p = tmp_path / filename + p.write_text(vrt_xml) + return str(p) + + +def _multiband_dtype_build_complex_source_vrt(tmp_path, *, dtype_str, src_path, scale_ratio=None, scale_offset=None, filename='cs.vrt', size=2, band_num=2, other_band_dtype='Byte', other_band_path=None, extra_band=True): + """Hand-roll a VRT where band 2 (or the only band) uses ComplexSource. + + ``extra_band=False`` writes a single-band VRT. + """ + cs_lines = [] + if scale_ratio is not None: + cs_lines.append(f' {scale_ratio}') + if scale_offset is not None: + cs_lines.append(f' {scale_offset}') + cs_inner = '\n'.join(cs_lines) + complex_block = f' \n {src_path}\n 1\n{cs_inner}\n \n \n \n' + if extra_band and other_band_path is not None: + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n \n {other_band_path}\n 1\n \n \n \n \n \n{complex_block} \n' + else: + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n{complex_block} \n' + p = tmp_path / filename + p.write_text(vrt_xml) + return str(p) + + +def test_mixed_byte_and_float32_bands_preserve_fractional(tmp_path): + """``Byte`` band 0 + ``Float32`` band 1: band 1's fractional values + must survive the read. Before the fix the buffer was allocated as + uint8 and ``1.5, 2.5, 3.5, 4.5`` truncated to ``1, 2, 3, 4``. + """ + b0 = np.array([[10, 11], [12, 13]], dtype=np.uint8) + b1 = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float32) + p0 = tmp_path / 'b0.tif' + p1 = tmp_path / 'b1.tif' + _multiband_dtype_write(b0, p0) + _multiband_dtype_write(b1, p1) + vrt_path = _multiband_dtype_build_two_band_vrt(tmp_path, b0_dtype_str='Byte', b0_path=str(p0), b1_dtype_str='Float32', b1_path=str(p1)) + r = read_vrt(vrt_path) + assert r.dtype.kind == 'f', f'Mixed Byte+Float32 must widen to float; got {r.dtype}' + np.testing.assert_allclose(r.values[..., 1], b1.astype(r.dtype)) + np.testing.assert_array_equal(r.values[..., 0], b0.astype(r.dtype)) + + +def test_complex_source_scale_promotes_buffer_to_float(tmp_path): + """Both bands declare ``Byte`` but band 1 has ``0.5``. + The scaled source values include fractional results (11 * 0.5 = 5.5) + which must survive. Before the fix the buffer stayed uint8 and the + fractional values rounded down to 5. + """ + b = np.array([[10, 11], [12, 13]], dtype=np.uint8) + p0 = tmp_path / 'b0.tif' + p1 = tmp_path / 'b1.tif' + _multiband_dtype_write(b, p0) + _multiband_dtype_write(b, p1) + vrt_path = _multiband_dtype_build_complex_source_vrt(tmp_path, dtype_str='Byte', src_path=str(p1), scale_ratio=0.5, other_band_dtype='Byte', other_band_path=str(p0)) + r = read_vrt(vrt_path) + assert r.dtype.kind == 'f', f'ScaleRatio on a Byte band must widen the buffer to float; got {r.dtype}' + expected = b.astype(np.float64) * 0.5 + np.testing.assert_allclose(r.values[..., 1], expected) + np.testing.assert_array_equal(r.values[..., 0].astype(np.uint8), b) + + +def test_all_byte_no_scaling_stays_uint8(tmp_path): + """Two ``Byte`` bands with no ``ComplexSource`` scaling: the result + must stay uint8 (memory regression guard). The fix must not widen + unconditionally to float64. + """ + b0 = np.array([[10, 20], [30, 40]], dtype=np.uint8) + b1 = np.array([[50, 60], [70, 80]], dtype=np.uint8) + p0 = tmp_path / 'b0.tif' + p1 = tmp_path / 'b1.tif' + _multiband_dtype_write(b0, p0) + _multiband_dtype_write(b1, p1) + vrt_path = _multiband_dtype_build_two_band_vrt(tmp_path, b0_dtype_str='Byte', b0_path=str(p0), b1_dtype_str='Byte', b1_path=str(p1)) + r = read_vrt(vrt_path) + assert r.dtype == np.uint8, f'All-Byte VRT with no scaling must stay uint8; got {r.dtype}' + np.testing.assert_array_equal(r.values[..., 0], b0) + np.testing.assert_array_equal(r.values[..., 1], b1) + + +def test_complex_source_scale_and_offset_preserve_precision(tmp_path): + """``ScaleRatio=0.25`` plus ``ScaleOffset=1.5`` on a uint8 band: + the scaled-and-offset values (e.g. ``10 * 0.25 + 1.5 = 4.0``, + ``11 * 0.25 + 1.5 = 4.25``) must survive without truncation. + + Note: the ``ComplexSource`` branch of ``parse_vrt`` in ``_vrt.py`` + maps the XML ```` to the dataclass ``scale`` attribute + and ```` to the ``offset`` attribute, then the + ``# Apply ComplexSource scaling`` block in ``read_vrt`` applies + ``src_arr = src_arr * scale + offset``. + """ + b = np.array([[10, 11], [12, 13]], dtype=np.uint8) + p0 = tmp_path / 'b0.tif' + p1 = tmp_path / 'b1.tif' + _multiband_dtype_write(b, p0) + _multiband_dtype_write(b, p1) + vrt_path = _multiband_dtype_build_complex_source_vrt(tmp_path, dtype_str='Byte', src_path=str(p1), scale_ratio=0.25, scale_offset=1.5, other_band_dtype='Byte', other_band_path=str(p0)) + r = read_vrt(vrt_path) + assert r.dtype.kind == 'f' + expected = b.astype(np.float64) * 0.25 + 1.5 + np.testing.assert_allclose(r.values[..., 1], expected) + + +def test_nodata_round_trip_through_widened_int_dtype(tmp_path): + """Band 0 = uint8 with NoData=255; band 1 = int16 with NoData=-9999. + ``np.result_type(uint8, int16) = int16``. Band 0's value 255 is + representable as int16 so the nodata fast-path still fires; the + surviving values must be preserved through the wider buffer. + """ + b0 = np.array([[1, 2], [3, 255]], dtype=np.uint8) + b1 = np.array([[100, 200], [300, -9999]], dtype=np.int16) + p0 = tmp_path / 'b0.tif' + p1 = tmp_path / 'b1.tif' + _multiband_dtype_write(b0, p0, nodata=255) + _multiband_dtype_write(b1, p1, nodata=-9999) + vrt_path = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n 255\n \n {p0}\n 1\n \n \n \n \n \n -9999\n \n {p1}\n 1\n \n \n \n \n' + out = tmp_path / 'mixed.vrt' + out.write_text(vrt_path) + r = read_vrt(str(out), band_nodata='first') + if r.dtype.kind == 'f': + assert r.values[0, 0, 0] == 1 + assert r.values[0, 1, 0] == 2 + assert r.values[1, 0, 0] == 3 + assert np.isnan(r.values[1, 1, 0]) + assert np.isnan(r.values[1, 1, 1]) + assert r.values[0, 0, 1] == 100 + else: + assert r.dtype == np.int16 + assert r.values[0, 0, 0] == 1 + assert r.values[0, 0, 1] == 100 + + +def test_single_band_complex_source_scale_widens_buffer(tmp_path): + """Single-band ``Byte`` VRT with ``0.5``. + The single-band branch in ``read_vrt`` must mirror the multi-band + widening logic; previously it used ``selected_bands[0].dtype`` + directly, so the scaled source values truncated back to uint8. + """ + b = np.array([[10, 11], [12, 13]], dtype=np.uint8) + p = tmp_path / 'b.tif' + _multiband_dtype_write(b, p) + vrt_path = _multiband_dtype_build_complex_source_vrt(tmp_path, dtype_str='Byte', src_path=str(p), scale_ratio=0.5, extra_band=False) + r = read_vrt(vrt_path) + assert r.ndim == 2, f'Single-band VRT must return a 2D array; got shape {r.shape}' + assert r.dtype.kind == 'f', f'Single-band scaled VRT must widen to float; got {r.dtype}' + expected = b.astype(np.float64) * 0.5 + np.testing.assert_allclose(r.values, expected) + + +def test_band_select_uint8_first_then_float_returns_float_for_band_1(tmp_path): + """When the caller selects ``band=1`` from a ``Byte`` + ``Float32`` VRT, + the result dtype must be float (the selected band's declared dtype), + not uint8 carried over from band 0. The previous code allocated based + on ``selected_bands[0].dtype`` -- which is correct after band selection + -- so this is the non-regression check that the new code still does + the right thing when only one band is selected. + """ + b0 = np.array([[10, 11], [12, 13]], dtype=np.uint8) + b1 = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float32) + p0 = tmp_path / 'b0.tif' + p1 = tmp_path / 'b1.tif' + _multiband_dtype_write(b0, p0) + _multiband_dtype_write(b1, p1) + vrt_path = _multiband_dtype_build_two_band_vrt(tmp_path, b0_dtype_str='Byte', b0_path=str(p0), b1_dtype_str='Float32', b1_path=str(p1)) + r = read_vrt(vrt_path, band=1) + assert r.dtype == np.float32 + np.testing.assert_allclose(r.values, b1) + + +def test_band_select_uint8_first_then_float_returns_uint8_for_band_0(tmp_path): + """Selecting ``band=0`` from a ``Byte`` + ``Float32`` VRT must return + uint8 (band 0's declared dtype) without widening. + """ + b0 = np.array([[10, 11], [12, 13]], dtype=np.uint8) + b1 = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float32) + p0 = tmp_path / 'b0.tif' + p1 = tmp_path / 'b1.tif' + _multiband_dtype_write(b0, p0) + _multiband_dtype_write(b1, p1) + vrt_path = _multiband_dtype_build_two_band_vrt(tmp_path, b0_dtype_str='Byte', b0_path=str(p0), b1_dtype_str='Float32', b1_path=str(p1)) + r = read_vrt(vrt_path, band=0) + assert r.dtype == np.uint8 + np.testing.assert_array_equal(r.values, b0) + + +def test_all_float32_multiband_stays_float32(tmp_path): + """Two ``Float32`` bands with no scaling: the buffer must stay + float32 rather than widening to float64. ``np.result_type`` of two + identical dtypes returns the same dtype. + """ + b0 = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float32) + b1 = np.array([[5.5, 6.5], [7.5, 8.5]], dtype=np.float32) + p0 = tmp_path / 'b0.tif' + p1 = tmp_path / 'b1.tif' + _multiband_dtype_write(b0, p0) + _multiband_dtype_write(b1, p1) + vrt_path = _multiband_dtype_build_two_band_vrt(tmp_path, b0_dtype_str='Float32', b0_path=str(p0), b1_dtype_str='Float32', b1_path=str(p1)) + r = read_vrt(vrt_path) + assert r.dtype == np.float32 + np.testing.assert_allclose(r.values[..., 0], b0) + np.testing.assert_allclose(r.values[..., 1], b1) + + +def test_zero_band_vrt_raises_value_error(tmp_path): + """A malformed VRT with zero ```` children must + surface a clear ``ValueError`` from ``read_vrt`` rather than the + generic ``"at least one array or dtype is required"`` message + raised by ``np.result_type`` when called with no arguments. + """ + import pytest + vrt_xml = '\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n' + p = tmp_path / 'empty.vrt' + p.write_text(vrt_xml) + with pytest.raises(ValueError, match='no '): + read_vrt(str(p)) + + +# --------------------------------------------------------------------------- +# multiband per-band int nodata (#1611) +# Originally: test_vrt_multiband_int_nodata_1611.py +# --------------------------------------------------------------------------- + + +def _multiband_int_nodata_write_two_band_per_band_nodata_vrt(tmp_path, *, dtype_str='UInt16', np_dtype=np.uint16, band0_sentinel=65535, band1_sentinel=65000, band0_other=(1, 2, 3), band1_other=(7, 8, 9)): + """Two single-band integer sources, each with a distinct nodata + sentinel, exposed as bands 1 and 2 of a hand-rolled VRT. + + Used to be band 0's sentinel was the only one masked. Now every + band gets its own sentinel. + """ + b0_arr = np.array([[band0_other[0], band0_other[1]], [band0_other[2], band0_sentinel]], dtype=np_dtype) + b1_arr = np.array([[band1_other[0], band1_other[1]], [band1_other[2], band1_sentinel]], dtype=np_dtype) + p0 = str(tmp_path / 'vrt_b0_1611.tif') + p1 = str(tmp_path / 'vrt_b1_1611.tif') + write(b0_arr, p0, nodata=band0_sentinel, compression='none', tiled=False) + write(b1_arr, p1, nodata=band1_sentinel, compression='none', tiled=False) + vrt_path = str(tmp_path / 'two_band_per_band_nodata_1611.vrt') + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n {band0_sentinel}\n \n {p0}\n 1\n \n \n \n \n \n {band1_sentinel}\n \n {p1}\n 1\n \n \n \n \n' + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +def test_multiband_uint16_per_band_sentinel_each_masked(tmp_path): + """The previously-broken case: every band's sentinel must be NaN. + + Before the fix this returned dtype=float64 with band 0's (1,1) cell + as NaN but band 1's (1,1) cell as the literal 65000.0. + """ + vrt_path = _multiband_int_nodata_write_two_band_per_band_nodata_vrt(tmp_path) + r = read_vrt(vrt_path, band_nodata='first') + assert r.shape == (2, 2, 2) + assert r.dtype == np.float64, f'expected float64 promotion, got {r.dtype}' + assert np.isnan(r.values[1, 1, 0]), "band 0's sentinel pixel was not NaN-masked." + assert np.isnan(r.values[1, 1, 1]), "band 1's sentinel pixel was not NaN-masked; the regression from issue #1611 has returned." + assert r.values[0, 0, 0] == 1 + assert r.values[0, 0, 1] == 7 + assert r.values[1, 0, 0] == 3 + assert r.values[1, 0, 1] == 9 + + +def test_multiband_int32_negative_per_band_sentinel(tmp_path): + """Negative sentinels in a signed integer VRT also mask per-band. + + The original bug was dtype-independent: any integer dtype with + per-band would have hit it. Cover int32 + negative + sentinels to make sure the helper handles signed types and the + range guard accepts negatives. + """ + vrt_path = _multiband_int_nodata_write_two_band_per_band_nodata_vrt(tmp_path, dtype_str='Int32', np_dtype=np.int32, band0_sentinel=-9999, band1_sentinel=-7777, band0_other=(10, 20, 30), band1_other=(40, 50, 60)) + r = read_vrt(vrt_path, band_nodata='first') + assert r.dtype == np.float64 + assert np.isnan(r.values[1, 1, 0]) + assert np.isnan(r.values[1, 1, 1]) + assert r.values[0, 0, 0] == 10 + assert r.values[0, 0, 1] == 40 + + +def test_multiband_only_one_band_has_sentinel_present(tmp_path): + """If only one band's sentinel actually appears in the data, only + that band should change. The non-hitting band stays the same float64 + value (no spurious NaN introduced). + + Force band 1's sentinel never to appear by writing 99 instead. + """ + vrt_path = _multiband_int_nodata_write_two_band_per_band_nodata_vrt(tmp_path, band0_sentinel=65535, band1_sentinel=65000, band1_other=(7, 8, 9)) + b1_no_sentinel = np.array([[7, 8], [9, 99]], dtype=np.uint16) + import os + p1 = os.path.join(os.path.dirname(vrt_path), 'vrt_b1_1611.tif') + write(b1_no_sentinel, p1, nodata=65000, compression='none', tiled=False) + r = read_vrt(vrt_path, band_nodata='first') + assert r.dtype == np.float64, "Even when only band 0 has a present sentinel, the array still needs promotion so band 0's NaN can be expressed." + assert np.isnan(r.values[1, 1, 0]) + assert r.values[1, 1, 1] == 99.0 + + +def test_multiband_no_sentinel_present_anywhere_keeps_int_dtype(tmp_path): + """When no band actually contains its declared sentinel, skip + promotion entirely. Avoids a needless float64 cast on integer data. + """ + vrt_path = _multiband_int_nodata_write_two_band_per_band_nodata_vrt(tmp_path, band0_sentinel=65535, band1_sentinel=65000, band0_other=(1, 2, 3), band1_other=(7, 8, 9)) + import os + b0 = np.array([[1, 2], [3, 4]], dtype=np.uint16) + b1 = np.array([[7, 8], [9, 10]], dtype=np.uint16) + p0 = os.path.join(os.path.dirname(vrt_path), 'vrt_b0_1611.tif') + p1 = os.path.join(os.path.dirname(vrt_path), 'vrt_b1_1611.tif') + write(b0, p0, nodata=65535, compression='none', tiled=False) + write(b1, p1, nodata=65000, compression='none', tiled=False) + r = read_vrt(vrt_path, band_nodata='first') + assert r.dtype == np.uint16 + assert r.values[1, 1, 0] == 4 + assert r.values[1, 1, 1] == 10 + + +def test_multiband_per_band_out_of_range_sentinel_is_no_op(tmp_path): + """A sentinel out of the integer dtype's range should be a no-op + for that band rather than raising. Mirrors PR #1583's behaviour + (#1581): the helper ``_int_nodata_in_range`` gates the cast. + """ + vrt_path = _multiband_int_nodata_write_two_band_per_band_nodata_vrt(tmp_path, dtype_str='UInt16', np_dtype=np.uint16, band0_sentinel=65535, band1_sentinel=10, band0_other=(1, 2, 3), band1_other=(7, 8, 9)) + with open(vrt_path) as f: + xml = f.read() + xml = xml.replace('10', '-9999') + with open(vrt_path, 'w') as f: + f.write(xml) + r = read_vrt(vrt_path, band_nodata='first') + assert np.isnan(r.values[1, 1, 0]) + assert r.values[1, 1, 1] == 10.0 or r.values[1, 1, 1] == 10 + + +def test_multiband_band_kwarg_still_per_band_post_pr1602(tmp_path): + """Non-regression check that PR #1602's band=N path still works. + + The fix here only changes the ``band is None`` branch; ``band=N`` + must still route through the single-band masking with its own + sentinel. + """ + vrt_path = _multiband_int_nodata_write_two_band_per_band_nodata_vrt(tmp_path) + r0 = read_vrt(vrt_path, band=0, band_nodata='first') + r1 = read_vrt(vrt_path, band=1, band_nodata='first') + assert r0.dtype == np.float64 + assert r1.dtype == np.float64 + assert r0.attrs.get('nodata') == 65535.0 + assert r1.attrs.get('nodata') == 65000.0 + assert np.isnan(r0.values[1, 1]) + assert np.isnan(r1.values[1, 1]) + + +def test_multiband_attrs_nodata_still_band0(tmp_path): + """``attrs['nodata']`` for band=None reads is documented as band + 0's sentinel (the canonical attr cannot encode per-band values). + The pixel-level fix must not change that contract. + """ + vrt_path = _multiband_int_nodata_write_two_band_per_band_nodata_vrt(tmp_path) + r = read_vrt(vrt_path, band_nodata='first') + assert r.attrs.get('nodata') == 65535.0 + + +# --------------------------------------------------------------------------- +# VRT resample algorithm validation (#1751) +# Originally: test_vrt_resample_alg_1751.py +# --------------------------------------------------------------------------- + + +_UNSUPPORTED_RESAMPLE_EXC = (NotImplementedError, VRTUnsupportedError) + + +def _resample_alg_write_src(tmp_path) -> str: + """Write a 4x4 uint16 source TIFF and return its path.""" + src = np.arange(16, dtype=np.uint16).reshape(4, 4) + src_path = str(tmp_path / 'src.tif') + write(src, src_path, compression='none', tiled=False) + return src_path + + +def _resample_alg_write_vrt(tmp_path, xml: str, name: str='test.vrt') -> str: + p = str(tmp_path / name) + with open(p, 'w') as f: + f.write(xml) + return p + + +def _resample_alg_vrt_xml(src_path: str, *, alg_elem: str, dst_x: int=2, dst_y: int=2) -> str: + """Render a VRT XML with a 4x4 SrcRect and configurable DstRect+Alg. + + ``alg_elem`` is the raw ``...`` element + to splice into the ````, or the empty string to + omit it entirely. + """ + return f'\n 0.0, 2.0, 0.0, 0.0, 0.0, -2.0\n \n \n {src_path}\n 1\n \n \n {alg_elem}\n \n \n' + + +@pytest.mark.parametrize('alg', ['Bilinear', 'Cubic', 'CubicSpline', 'Lanczos', 'Average', 'Mode']) +def test_unsupported_resample_alg_raises(tmp_path, alg): + """A ComplexSource declaring any non-nearest algorithm with a size + change must raise ``NotImplementedError`` rather than return + silently nearest-sampled pixels.""" + src_path = _resample_alg_write_src(tmp_path) + xml = _resample_alg_vrt_xml(src_path, alg_elem=f'{alg}') + vrt_path = _resample_alg_write_vrt(tmp_path, xml, f'{alg.lower()}.vrt') + with pytest.raises(_UNSUPPORTED_RESAMPLE_EXC) as excinfo: + _resample_alg_read_vrt_internal(vrt_path) + msg = str(excinfo.value) + assert alg in msg + assert '1751' in msg + + +def test_unsupported_resample_alg_case_insensitive(tmp_path): + """Algorithm names are matched case-insensitively: ``bilinear`` + (lowercase) is the same unsupported request as ``Bilinear``.""" + src_path = _resample_alg_write_src(tmp_path) + xml = _resample_alg_vrt_xml(src_path, alg_elem='bilinear') + vrt_path = _resample_alg_write_vrt(tmp_path, xml, 'lower.vrt') + with pytest.raises(_UNSUPPORTED_RESAMPLE_EXC, match='bilinear'): + _resample_alg_read_vrt_internal(vrt_path) + + +@pytest.mark.parametrize('alg', ['Nearest', 'NearestNeighbour', 'NearestNeighbor', 'NEAR', 'nearest', 'NEAREST', '']) +def test_nearest_variants_accepted(tmp_path, alg): + """Nearest (and its case / spelling variants, plus empty text) is + the implemented algorithm and must round-trip without raising.""" + src_path = _resample_alg_write_src(tmp_path) + xml = _resample_alg_vrt_xml(src_path, alg_elem=f'{alg}') + vrt_path = _resample_alg_write_vrt(tmp_path, xml, f"near_{alg or 'empty'}.vrt") + arr, _ = _resample_alg_read_vrt_internal(vrt_path) + assert arr.shape == (2, 2) + + +def test_missing_resample_alg_accepted(tmp_path): + """Absent ```` (GDAL's nearest default) must still + round-trip without raising.""" + src_path = _resample_alg_write_src(tmp_path) + xml = _resample_alg_vrt_xml(src_path, alg_elem='') + vrt_path = _resample_alg_write_vrt(tmp_path, xml, 'absent.vrt') + arr, _ = _resample_alg_read_vrt_internal(vrt_path) + assert arr.shape == (2, 2) + + +def test_bilinear_at_same_size_does_not_raise(tmp_path): + """A ``Bilinear`` declaration with matching SrcRect/DstRect sizes + is nearest-equivalent (no resample step runs) so the read is + accepted. This pins down the resample-site placement of the + check -- a parse-time check would have rejected this case too.""" + src_path = _resample_alg_write_src(tmp_path) + xml = _resample_alg_vrt_xml(src_path, alg_elem='Bilinear', dst_x=4, dst_y=4) + vrt_path = _resample_alg_write_vrt(tmp_path, xml, 'bilinear_1to1.vrt') + arr, _ = _resample_alg_read_vrt_internal(vrt_path) + assert arr.shape == (4, 4) + + +# --------------------------------------------------------------------------- +# simple VRT mosaic positive coverage (#2369) +# Originally: test_vrt_simple_mosaic_2369.py +# --------------------------------------------------------------------------- + + +_PIXEL_W = 0.001 + + +_PIXEL_H = -0.001 + + +_CRS = 4326 + + +_NODATA = -9999.0 + + +def _simple_mosaic_make_tile(tmp_dir, name: str, data: np.ndarray, origin_x: float, origin_y: float, *, nodata: float | None=_NODATA) -> str: + """Write ``data`` as a single-band GeoTIFF anchored at the given origin. + + Returns the on-disk path. ``data`` shape is ``(H, W)``. + """ + height, width = data.shape + y = np.array([origin_y + _PIXEL_H * (i + 0.5) for i in range(height)]) + x = np.array([origin_x + _PIXEL_W * (j + 0.5) for j in range(width)]) + attrs = {'crs': _CRS} + if nodata is not None: + attrs['nodata'] = nodata + raster = xr.DataArray(data, dims=['y', 'x'], coords={'y': y, 'x': x}, attrs=attrs) + path = os.path.join(tmp_dir, name) + to_geotiff(raster, path, nodata=nodata) + return path + + +def _simple_mosaic_make_multiband_tile(tmp_dir, name: str, data: np.ndarray, origin_x: float, origin_y: float) -> str: + """Write a multi-band GeoTIFF anchored at the given origin. + + ``data`` shape is ``(H, W, B)``. + """ + height, width, nbands = data.shape + y = np.array([origin_y + _PIXEL_H * (i + 0.5) for i in range(height)]) + x = np.array([origin_x + _PIXEL_W * (j + 0.5) for j in range(width)]) + raster = xr.DataArray(data, dims=['y', 'x', 'band'], coords={'y': y, 'x': x, 'band': np.arange(nbands)}, attrs={'crs': _CRS}) + path = os.path.join(tmp_dir, name) + to_geotiff(raster, path) + return path + + +@pytest.fixture +def simple_mosaic_mosaic_2x1(tmp_path): + """Two 32x32 float32 tiles side-by-side, west and east. + + Yields ``(vrt_path, expected_array, origin_x, origin_y)``. The + expected array is the horizontal concatenation of the two source + arrays. + """ + td = tmp_path / 'tmp_2369_2x1' + td.mkdir() + td = str(td) + height, width = (32, 32) + left_data = np.arange(height * width, dtype=np.float32).reshape(height, width) + right_data = (left_data + height * width).astype(np.float32) + origin_x, origin_y = (-120.0, 45.0) + left_path = _simple_mosaic_make_tile(td, 'left.tif', left_data, origin_x, origin_y) + right_path = _simple_mosaic_make_tile(td, 'right.tif', right_data, origin_x + _PIXEL_W * width, origin_y) + vrt_path = os.path.join(td, 'simple_mosaic_mosaic_2x1.vrt') + _write_vrt_internal(vrt_path, [left_path, right_path]) + expected = np.concatenate([left_data, right_data], axis=1) + yield (vrt_path, expected, origin_x, origin_y) + + +@pytest.fixture +def simple_mosaic_mosaic_2x2(tmp_path): + """Four 32x32 float32 tiles arranged 2 rows by 2 cols. + + Yields ``(vrt_path, expected_array, origin_x, origin_y)`` with the + expected array stitched in (row, col) order. + """ + td = tmp_path / 'tmp_2369_2x2' + td.mkdir() + td = str(td) + h, w = (32, 32) + tile_nw = np.full((h, w), 1.0, dtype=np.float32) + tile_ne = np.full((h, w), 2.0, dtype=np.float32) + tile_sw = np.full((h, w), 3.0, dtype=np.float32) + tile_se = np.full((h, w), 4.0, dtype=np.float32) + origin_x, origin_y = (-120.0, 45.0) + nw_path = _simple_mosaic_make_tile(td, 'nw.tif', tile_nw, origin_x, origin_y) + ne_path = _simple_mosaic_make_tile(td, 'ne.tif', tile_ne, origin_x + _PIXEL_W * w, origin_y) + sw_path = _simple_mosaic_make_tile(td, 'sw.tif', tile_sw, origin_x, origin_y + _PIXEL_H * h) + se_path = _simple_mosaic_make_tile(td, 'se.tif', tile_se, origin_x + _PIXEL_W * w, origin_y + _PIXEL_H * h) + vrt_path = os.path.join(td, 'simple_mosaic_mosaic_2x2.vrt') + _write_vrt_internal(vrt_path, [nw_path, ne_path, sw_path, se_path]) + top = np.concatenate([tile_nw, tile_ne], axis=1) + bottom = np.concatenate([tile_sw, tile_se], axis=1) + expected = np.concatenate([top, bottom], axis=0) + yield (vrt_path, expected, origin_x, origin_y) + + +@pytest.fixture +def simple_mosaic_mosaic_multiband_2x1(tmp_path): + """Two 3-band 32x32 float32 tiles side-by-side.""" + td = tmp_path / 'tmp_2369_mb_2x1' + td.mkdir() + td = str(td) + h, w, b = (32, 32, 3) + rng = np.random.default_rng(2369) + left_data = rng.random((h, w, b), dtype=np.float32) + right_data = rng.random((h, w, b), dtype=np.float32) + origin_x, origin_y = (-120.0, 45.0) + left_path = _simple_mosaic_make_multiband_tile(td, 'left_mb.tif', left_data, origin_x, origin_y) + right_path = _simple_mosaic_make_multiband_tile(td, 'right_mb.tif', right_data, origin_x + _PIXEL_W * w, origin_y) + vrt_path = os.path.join(td, 'mosaic_mb.vrt') + _write_vrt_internal(vrt_path, [left_path, right_path]) + expected = np.stack([np.concatenate([left_data[..., k], right_data[..., k]], axis=1) for k in range(b)], axis=-1) + yield (vrt_path, expected, origin_x, origin_y) + + +def _simple_mosaic_assert_attrs_ok(result, *, expected_nodata=None, expected_origin_x=None, expected_origin_y=None): + """Common attr assertions for VRT reads in this module. + + Checks that ``crs`` and ``transform`` are present and consistent + with the fixture constants, and optionally that ``nodata`` matches. + When ``expected_origin_x`` / ``expected_origin_y`` are passed, the + transform's origin entries are checked too -- pixel size alone is + not enough to catch a translation bug. + """ + assert 'crs' in result.attrs, f'crs missing from attrs; have {sorted(result.attrs)}' + crs_val = result.attrs['crs'] + if isinstance(crs_val, int): + assert crs_val == _CRS + else: + assert crs_val, 'crs attr is present but empty' + assert 'WGS' in str(crs_val) or '4326' in str(crs_val), f'crs attr does not look like EPSG:4326: {crs_val!r}' + assert 'transform' in result.attrs, f'transform missing from attrs; have {sorted(result.attrs)}' + transform = result.attrs['transform'] + assert len(transform) == 6, f'transform should be a 6-tuple, got {transform!r}' + assert transform[0] == pytest.approx(_PIXEL_W), f'transform pixel width = {transform[0]}, expected {_PIXEL_W}' + assert transform[4] == pytest.approx(_PIXEL_H), f'transform pixel height = {transform[4]}, expected {_PIXEL_H}' + if expected_origin_x is not None: + assert transform[2] == pytest.approx(expected_origin_x), f'transform origin_x = {transform[2]}, expected {expected_origin_x}' + if expected_origin_y is not None: + assert transform[5] == pytest.approx(expected_origin_y), f'transform origin_y = {transform[5]}, expected {expected_origin_y}' + if expected_nodata is not None: + assert 'nodata' in result.attrs, f'nodata missing from attrs; have {sorted(result.attrs)}' + assert result.attrs['nodata'] == pytest.approx(expected_nodata) + + +def _simple_mosaic_assert_coords_monotonic(result, *, expected_origin_x, expected_origin_y): + """Check that x/y coords are monotonic and start at the expected origin + (within half a pixel: TIFF coords are pixel centers, not corners). + """ + x = np.asarray(result['x'].values) + y = np.asarray(result['y'].values) + assert np.all(np.diff(x) > 0), 'x coord is not strictly increasing' + assert np.all(np.diff(y) < 0), 'y coord is not strictly decreasing' + assert x[0] == pytest.approx(expected_origin_x + _PIXEL_W * 0.5) + assert y[0] == pytest.approx(expected_origin_y + _PIXEL_H * 0.5) + + +def test_eager_2x1_mosaic_values_coords_attrs(simple_mosaic_mosaic_2x1): + """Eager read of a 2x1 horizontal mosaic returns the concatenated + pixel block, with monotonic coords and the fixture's crs / transform + / nodata on attrs. + """ + vrt_path, expected, ox, oy = simple_mosaic_mosaic_2x1 + result = read_vrt(vrt_path) + assert result.shape == expected.shape, f'eager 2x1 shape {result.shape}, expected {expected.shape}' + np.testing.assert_array_equal(result.values, expected) + _simple_mosaic_assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) + _simple_mosaic_assert_attrs_ok(result, expected_nodata=_NODATA, expected_origin_x=ox, expected_origin_y=oy) + + +def test_eager_2x2_mosaic_values_coords_attrs(simple_mosaic_mosaic_2x2): + """Eager read of a 2x2 mosaic stitches tiles in the right order. + + Each tile has a distinct constant value, so a misordered placement + surfaces immediately in the value assertion rather than appearing + only as a numeric diff. + """ + vrt_path, expected, ox, oy = simple_mosaic_mosaic_2x2 + result = read_vrt(vrt_path) + assert result.shape == expected.shape, f'eager 2x2 shape {result.shape}, expected {expected.shape}' + np.testing.assert_array_equal(result.values, expected) + _simple_mosaic_assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) + _simple_mosaic_assert_attrs_ok(result, expected_nodata=_NODATA, expected_origin_x=ox, expected_origin_y=oy) + + +def test_windowed_read_aligned_with_source_boundary(simple_mosaic_mosaic_2x1): + """A window crossing the seam between the two source tiles returns + the same pixels as slicing the full mosaic. + + The window picked here covers the right half of the left tile and + the left half of the right tile: both halves land on whole-pixel + boundaries inside their respective sources, so this is the + "request lines up with source pixels" case from the issue. + """ + vrt_path, expected, ox, oy = simple_mosaic_mosaic_2x1 + h = expected.shape[0] + r0, c0, r1, c1 = (0, 16, h, 48) + result = read_vrt(vrt_path, window=(r0, c0, r1, c1)) + np.testing.assert_array_equal(result.values, expected[r0:r1, c0:c1]) + full = read_vrt(vrt_path) + np.testing.assert_array_equal(np.asarray(result['x'].values), np.asarray(full['x'].values)[c0:c1]) + np.testing.assert_array_equal(np.asarray(result['y'].values), np.asarray(full['y'].values)[r0:r1]) + expected_window_ox = ox + _PIXEL_W * c0 + expected_window_oy = oy + _PIXEL_H * r0 + _simple_mosaic_assert_attrs_ok(result, expected_nodata=_NODATA, expected_origin_x=expected_window_ox, expected_origin_y=expected_window_oy) + + +def test_dask_2x1_mosaic_multi_chunk_matches_eager(simple_mosaic_mosaic_2x1): + """Dask read with chunks smaller than the mosaic returns the same + pixels as the eager read, and uses a real multi-block dask graph. + """ + vrt_path, expected, ox, oy = simple_mosaic_mosaic_2x1 + chunked = read_vrt(vrt_path, chunks=(16, 16)) + assert isinstance(chunked.data, da.Array), f'expected dask Array, got {type(chunked.data).__name__}' + assert chunked.data.numblocks == (2, 4), f'expected 2x4 blocks, got {chunked.data.numblocks}' + computed = chunked.compute() + np.testing.assert_array_equal(computed.values, expected) + _simple_mosaic_assert_coords_monotonic(computed, expected_origin_x=ox, expected_origin_y=oy) + _simple_mosaic_assert_attrs_ok(computed, expected_nodata=_NODATA, expected_origin_x=ox, expected_origin_y=oy) + + +def test_dask_2x2_mosaic_multi_chunk_matches_eager(simple_mosaic_mosaic_2x2): + """Dask read of the 2x2 mosaic with chunk size below tile size. + + Chunks of 16 split each 32x32 tile into 2x2 blocks. The full + mosaic is 64x64 so the resulting dask array is 4x4 blocks. + """ + vrt_path, expected, ox, oy = simple_mosaic_mosaic_2x2 + chunked = read_vrt(vrt_path, chunks=(16, 16)) + assert isinstance(chunked.data, da.Array) + assert chunked.data.numblocks == (4, 4), f'expected 4x4 blocks, got {chunked.data.numblocks}' + computed = chunked.compute() + np.testing.assert_array_equal(computed.values, expected) + _simple_mosaic_assert_coords_monotonic(computed, expected_origin_x=ox, expected_origin_y=oy) + _simple_mosaic_assert_attrs_ok(computed, expected_nodata=_NODATA, expected_origin_x=ox, expected_origin_y=oy) + + +def test_eager_multiband_2x1_mosaic(simple_mosaic_mosaic_multiband_2x1): + """Eager read of a multi-band 2x1 mosaic returns one stitched plane + per band. + + Multi-band VRT reads return shape ``(H, W, B)`` to match the + on-disk layout; assert per-band values against the stack built in + the fixture. + """ + vrt_path, expected, ox, oy = simple_mosaic_mosaic_multiband_2x1 + result = read_vrt(vrt_path) + assert result.shape == expected.shape, f'multiband 2x1 shape {result.shape}, expected {expected.shape}' + np.testing.assert_array_equal(result.values, expected) + _simple_mosaic_assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) + _simple_mosaic_assert_attrs_ok(result, expected_origin_x=ox, expected_origin_y=oy) + + +def test_dask_multiband_2x1_mosaic_matches_eager(simple_mosaic_mosaic_multiband_2x1): + """Dask read of the multi-band 2x1 mosaic with sub-tile chunks must + match the eager read pixel-for-pixel across every band. + + Chunking exercises per-block band handling: a bug that loses a + band on one chunk but not another would not appear in the eager + test above. + """ + vrt_path, expected, ox, oy = simple_mosaic_mosaic_multiband_2x1 + eager = read_vrt(vrt_path) + chunked = read_vrt(vrt_path, chunks=(16, 16)) + assert isinstance(chunked.data, da.Array), f'expected dask Array, got {type(chunked.data).__name__}' + computed = chunked.compute() + assert computed.shape == eager.shape + np.testing.assert_array_equal(computed.values, eager.values) + np.testing.assert_array_equal(computed.values, expected) + _simple_mosaic_assert_coords_monotonic(computed, expected_origin_x=ox, expected_origin_y=oy) + _simple_mosaic_assert_attrs_ok(computed, expected_origin_x=ox, expected_origin_y=oy) diff --git a/xrspatial/geotiff/tests/vrt/test_metadata.py b/xrspatial/geotiff/tests/vrt/test_metadata.py new file mode 100644 index 000000000..6a8df74bb --- /dev/null +++ b/xrspatial/geotiff/tests/vrt/test_metadata.py @@ -0,0 +1,1803 @@ +"""Consolidated VRT metadata test suite. + +Folds twelve issue-numbered VRT test files under +``xrspatial/geotiff/tests/`` into one place, organised by sub-concern. +Each section preserves the helpers, fixtures, and assertions of its +originating file; helpers are prefixed (e.g. ``_holes_attr_*``) so the +cross-file folds do not collide. Test names dropped their trailing +issue number where the originating file already namespaced them. + +Sections: +* ``vrt_holes`` attr on missing-source reads (#1734) +* ``masked_nodata`` attr honours ``mask_nodata`` kwarg (#2159) +* Per-band ```` selection (#1598) +* SimpleSource ``0`` survives the falsy-zero bug (#1655) +* Integer-with-nodata promotion through ``read_vrt`` (#1564) +* ``mask_nodata=False`` preserves float sentinels (#2158) +* Tile-level metadata parity for VRT tiled writes (#1606) +* VRT XML parsed once on the chunked path (#1825) +* ``write_vrt`` escapes XML special characters (#1607) +* XML size cap on eager ``read_vrt`` (#1815) +* XML size cap on chunked ``read_vrt`` (#1831) +* VRT metadata parity across backends (#2321 sub-PR 3) + +See ``CLUSTER_AUDIT_PR6.md`` for the file:test -> section:test mapping. +""" +from __future__ import annotations + +import dask.array as da +import glob +import numpy as np +import os +import pathlib +import pickle +import pytest +import tempfile +import warnings +import xarray as xr +from xrspatial.geotiff import ( + GeoTIFFFallbackWarning, + MixedBandMetadataError, + open_geotiff, + read_geotiff_dask, + read_vrt, + to_geotiff, + write_vrt, +) +from xrspatial.geotiff._attrs import GEOREF_STATUS_FULL, GEOREF_STATUS_TRANSFORM_ONLY +from xrspatial.geotiff._errors import VRTUnsupportedError +from xrspatial.geotiff._geotags import GeoTransform +from xrspatial.geotiff._vrt import parse_vrt +from xrspatial.geotiff._vrt import read_vrt as _source_nodata_zero_read_vrt_internal +from xrspatial.geotiff._vrt import read_vrt as _xml_size_cap_read_vrt_internal +from xrspatial.geotiff._vrt import write_vrt as _write_vrt_internal +from xrspatial.geotiff._writer import write +from xrspatial.geotiff.tests.conftest import requires_gpu + + +# --------------------------------------------------------------------------- +# vrt_holes attr on missing-source reads (#1734) +# Originally: test_vrt_holes_attr_1734.py +# --------------------------------------------------------------------------- + + +@pytest.fixture +def holes_attr_clear_strict_env(monkeypatch): + monkeypatch.delenv('XRSPATIAL_GEOTIFF_STRICT', raising=False) + + +@pytest.fixture +def holes_attr_set_strict_env(monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_STRICT', '1') + + +def _holes_attr_write_vrt_with_missing_source(vrt_path, missing_src) -> None: + """Write a VRT with an Int32 band whose only source is missing. + + Integer ``dataType`` is the failure mode issue #1734 was about: the + pre-fix lenient path zero-fills the output buffer (``fill = 0`` for + integer dtypes) and the user cannot distinguish that hole from real + zero-valued data. ``NoDataValue`` is omitted on purpose -- having + one would let downstream code mask the hole and side-step the + regression. See the module docstring. + """ + vrt_path.write_text(f'\n \n 0, 1, 0, 0, 0, -1\n \n \n {missing_src}\n 1\n \n \n \n \n\n') + + +def test_skipped_source_records_vrt_holes_attr(holes_attr_clear_strict_env, tmp_path): + """A VRT with a missing source returns a DataArray whose attrs + carry a ``vrt_holes`` entry naming the source, band, dst_rect, + and underlying error. + + Uses an Int32 VRT so the hole is zero-filled (the exact failure + mode #1734 was about): without the attr there is no way to tell + the all-zeros tile from real data. + """ + import numpy as np + vrt_path = tmp_path / 'mosaic_1734_missing.vrt' + missing_src = f'{tmp_path}/does_not_exist_1734.tif' + _holes_attr_write_vrt_with_missing_source(vrt_path, missing_src) + with warnings.catch_warnings(): + warnings.simplefilter('ignore', GeoTIFFFallbackWarning) + da = read_vrt(str(vrt_path), missing_sources='warn') + assert np.issubdtype(da.dtype, np.integer) + assert (da.values == 0).all() + assert 'vrt_holes' in da.attrs + holes = da.attrs['vrt_holes'] + assert isinstance(holes, list) + assert len(holes) == 1 + h = holes[0] + assert h['source'].endswith('does_not_exist_1734.tif') + assert h['band'] == 1 + assert h['dst_rect'] == (0, 0, 4, 4) + assert 'error' in h + assert h['error'] + + +def test_no_holes_attr_when_all_sources_read(holes_attr_clear_strict_env, tmp_path): + """A successful VRT read does not advertise an empty ``vrt_holes`` + attr; the key is omitted entirely so ``"vrt_holes" in attrs`` is a + cheap completeness check.""" + import numpy as np + import xarray as xr + from xrspatial.geotiff import to_geotiff + src_path = tmp_path / 'src_1734.tif' + arr = np.arange(16, dtype=np.float32).reshape(4, 4) + da_src = xr.DataArray(arr, dims=['y', 'x'], coords={'y': np.linspace(3.5, 0.5, 4), 'x': np.linspace(0.5, 3.5, 4)}, attrs={'crs': 4326}) + to_geotiff(da_src, str(src_path), compression='none') + vrt_path = tmp_path / 'mosaic_1734_ok.vrt' + vrt_path.write_text(f'\n \n 0, 1, 0, 0, 0, -1\n \n \n {src_path}\n 1\n \n \n \n \n\n') + with warnings.catch_warnings(): + warnings.simplefilter('error', GeoTIFFFallbackWarning) + da = read_vrt(str(vrt_path)) + assert 'vrt_holes' not in da.attrs + + +def test_strict_mode_still_raises(holes_attr_set_strict_env, tmp_path): + """Strict mode is unchanged: the missing source surfaces the + underlying ``FileNotFoundError`` (an ``OSError`` subclass) from + ``read_to_array`` instead of warning-and-skipping. + + Asserting the concrete exception class -- not a bare ``Exception`` + -- keeps the regression test honest: an unrelated bug somewhere in + the read path that happens to raise a different exception will + fail this test instead of silently satisfying it. + """ + vrt_path = tmp_path / 'mosaic_1734_strict.vrt' + missing_src = f'{tmp_path}/does_not_exist_1734_strict.tif' + _holes_attr_write_vrt_with_missing_source(vrt_path, missing_src) + with pytest.raises(FileNotFoundError, match='does_not_exist_1734_strict.tif'): + read_vrt(str(vrt_path)) + + +def test_warning_mentions_how_to_detect_holes(holes_attr_clear_strict_env, tmp_path): + """The fallback warning now points callers at the attr or the + strict env var so the recovery path is discoverable from a single + captured warning.""" + vrt_path = tmp_path / 'mosaic_1734_msg.vrt' + missing_src = f'{tmp_path}/does_not_exist_1734_msg.tif' + _holes_attr_write_vrt_with_missing_source(vrt_path, missing_src) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + read_vrt(str(vrt_path), missing_sources='warn') + fallback = [x for x in w if issubclass(x.category, GeoTIFFFallbackWarning)] + assert fallback, 'expected at least one GeoTIFFFallbackWarning' + msg = ' '.join((str(x.message) for x in fallback)) + assert 'vrt_holes' in msg or 'XRSPATIAL_GEOTIFF_STRICT' in msg + + +# --------------------------------------------------------------------------- +# masked_nodata attr honours mask_nodata kwarg (#2159) +# Originally: test_vrt_masked_nodata_attr_2159.py +# --------------------------------------------------------------------------- + + +def _masked_nodata_attr_write_float_vrt(tmp_path, src_basename, vrt_basename, sentinel=-9999.0): + """Build a single-band float32 VRT with a declared sentinel. + + Layout mirrors the working pattern from + ``test_masked_nodata_attr_2092.py``: ``GeoTransform`` plus explicit + ``SrcRect`` / ``DstRect`` are required by the in-repo VRT reader. + """ + tifffile = pytest.importorskip('tifffile') + src = str(tmp_path / src_basename) + tifffile.imwrite(src, np.array([[1.0, 2.0, sentinel], [4.0, sentinel, 6.0]], dtype=np.float32), metadata=None) + vrt = str(tmp_path / vrt_basename) + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n {sentinel}\n \n {src}\n 1\n \n \n \n \n\n' + with open(vrt, 'w') as fh: + fh.write(vrt_xml) + return vrt + + +def _masked_nodata_attr_write_int_vrt(tmp_path, src_basename, vrt_basename, sentinel=30): + """Single-band int16 VRT with a declared sentinel.""" + tifffile = pytest.importorskip('tifffile') + src = str(tmp_path / src_basename) + tifffile.imwrite(src, np.array([[10, 20, 30], [40, 50, 60]], dtype=np.int16), metadata=None) + vrt = str(tmp_path / vrt_basename) + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n {sentinel}\n \n {src}\n 1\n \n \n \n \n\n' + with open(vrt, 'w') as fh: + fh.write(vrt_xml) + return vrt + + +def test_vrt_eager_float_source_mask_off_reports_false(tmp_path): + """Eager VRT + float source + ``mask_nodata=False`` must report + ``masked_nodata=False``. Pre-fix rule (dtype alone) said ``True``.""" + vrt = _masked_nodata_attr_write_float_vrt(tmp_path, 'tmp_2159_eager_float_src.tif', 'tmp_2159_eager_unmasked.vrt') + out = open_geotiff(vrt, mask_nodata=False) + assert out.attrs.get('nodata') == -9999.0 + assert out.attrs.get('masked_nodata') is False, f"caller opted out of masking but attrs say masked_nodata={out.attrs.get('masked_nodata')!r}" + + +def test_vrt_eager_float_source_mask_on_reports_true(tmp_path): + """Canonical direction: float source + masking on. The masking + step runs, attr says True. Regression guard.""" + vrt = _masked_nodata_attr_write_float_vrt(tmp_path, 'tmp_2159_eager_float_src_masked.tif', 'tmp_2159_eager_masked.vrt') + out = open_geotiff(vrt) + assert out.attrs.get('nodata') == -9999.0 + assert out.attrs.get('masked_nodata') is True + + +def test_vrt_eager_int_source_mask_off_reports_false(tmp_path): + """Eager VRT + int source + ``mask_nodata=False``: integer helper + skipped, dtype stays int, attr says False. Pre-fix rule already + got this right (int dtype -> False); keep it green under the + new ``mask_nodata and dtype.kind == 'f'`` rule.""" + vrt = _masked_nodata_attr_write_int_vrt(tmp_path, 'tmp_2159_eager_int_src.tif', 'tmp_2159_eager_int_unmasked.vrt') + out = open_geotiff(vrt, mask_nodata=False) + assert out.dtype.kind == 'i' + assert out.attrs.get('masked_nodata') is False + + +def test_vrt_eager_float_source_mask_off_with_cast_reports_false(tmp_path): + """Eager VRT + float source + ``mask_nodata=False`` + ``dtype=float64`` + cast. Pre-fix used ``pre_cast_dtype.kind == 'f'`` so pre-cast is + float anyway and the rule said True. New rule short-circuits on + ``mask_nodata=False`` and says False. The caller-supplied cast is + still recorded via ``nodata_dtype_cast``.""" + vrt = _masked_nodata_attr_write_float_vrt(tmp_path, 'tmp_2159_eager_float_src_cast.tif', 'tmp_2159_eager_unmasked_cast.vrt') + out = open_geotiff(vrt, mask_nodata=False, dtype=np.float64) + assert out.dtype == np.float64 + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_dtype_cast') == 'float64' + + +def test_vrt_chunked_float_source_mask_off_reports_false(tmp_path): + """Chunked VRT path (``chunks=`` triggers ``_read_vrt_chunked``) + + float source + ``mask_nodata=False`` must report False.""" + vrt = _masked_nodata_attr_write_float_vrt(tmp_path, 'tmp_2159_chunked_float_src.tif', 'tmp_2159_chunked_unmasked.vrt') + out = read_geotiff_dask(vrt, chunks=2, mask_nodata=False) + assert out.attrs.get('nodata') == -9999.0 + assert out.attrs.get('masked_nodata') is False, f"chunked VRT path: caller opted out of masking but attrs say masked_nodata={out.attrs.get('masked_nodata')!r}" + + +def test_vrt_chunked_float_source_mask_on_reports_true(tmp_path): + """Canonical direction on the chunked path: masking on, attr True.""" + vrt = _masked_nodata_attr_write_float_vrt(tmp_path, 'tmp_2159_chunked_float_src_masked.tif', 'tmp_2159_chunked_masked.vrt') + out = read_geotiff_dask(vrt, chunks=2) + assert out.attrs.get('nodata') == -9999.0 + assert out.attrs.get('masked_nodata') is True + + +def test_vrt_chunked_int_source_mask_off_reports_false(tmp_path): + """Chunked VRT + int source + ``mask_nodata=False``. ``declared_dtype`` + stays integer because the masking-driven float-promotion gate + earlier in the function is itself gated on ``mask_nodata``. + The attr says False under both the old and the new rule.""" + vrt = _masked_nodata_attr_write_int_vrt(tmp_path, 'tmp_2159_chunked_int_src.tif', 'tmp_2159_chunked_int_unmasked.vrt') + out = read_geotiff_dask(vrt, chunks=2, mask_nodata=False) + assert out.dtype.kind == 'i' + assert out.attrs.get('masked_nodata') is False + + +def test_vrt_chunked_float_source_mask_off_with_cast_reports_false(tmp_path): + """Chunked VRT + float source + ``mask_nodata=False`` + ``dtype=float64`` + cast. Same logic as the eager equivalent: caller opted out of + masking, attr is False even though the lazy graph dtype is float.""" + vrt = _masked_nodata_attr_write_float_vrt(tmp_path, 'tmp_2159_chunked_float_src_cast.tif', 'tmp_2159_chunked_unmasked_cast.vrt') + out = read_geotiff_dask(vrt, chunks=2, mask_nodata=False, dtype=np.float64) + assert out.dtype == np.float64 + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_dtype_cast') == 'float64' + + +def test_vrt_attr_matches_dask_backend_under_mask_off(tmp_path): + """Both VRT backends should report the same ``masked_nodata`` as + the regular dask backend does for an equivalent input. Pins the + cross-backend invariant the contract at + ``_attrs._set_nodata_attrs`` calls out.""" + vrt = _masked_nodata_attr_write_float_vrt(tmp_path, 'tmp_2159_xbackend_src.tif', 'tmp_2159_xbackend.vrt') + eager = open_geotiff(vrt, mask_nodata=False, dtype=np.float64) + chunked = read_geotiff_dask(vrt, chunks=2, mask_nodata=False, dtype=np.float64) + assert eager.attrs.get('masked_nodata') is False + assert chunked.attrs.get('masked_nodata') is False + assert eager.attrs.get('masked_nodata') == chunked.attrs.get('masked_nodata') + + +# --------------------------------------------------------------------------- +# per-band selection (#1598) +# Originally: test_vrt_band_nodata_1598.py +# --------------------------------------------------------------------------- + + +def _band_nodata_write_two_band_per_band_nodata_vrt(tmp_path): + """Two single-band uint16 sources, each with a distinct nodata + sentinel, exposed as bands 1 and 2 of a hand-rolled VRT. + """ + band0 = np.array([[1, 2], [3, 65535]], dtype=np.uint16) + band1 = np.array([[7, 8], [9, 65000]], dtype=np.uint16) + p0 = str(tmp_path / 'vrt_band0_1598.tif') + p1 = str(tmp_path / 'vrt_band1_1598.tif') + write(band0, p0, nodata=65535, compression='none', tiled=False) + write(band1, p1, nodata=65000, compression='none', tiled=False) + vrt_path = str(tmp_path / 'two_band_per_band_nodata_1598.vrt') + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n 65535\n \n {p0}\n 1\n \n \n \n \n \n 65000\n \n {p1}\n 1\n \n \n \n \n' + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +def test_read_vrt_band0_uses_band0_nodata(tmp_path): + """Sanity check the band-0 selection still works after the fix. + + Confirms the refactor did not flip the index. + + The fixture mosaics two bands with distinct per-band sentinels, so + after #1987 PR 5 the default read raises ``MixedBandMetadataError``. + The pre-#1987 flatten-to-first-band semantics this regression tests + are still reachable via ``band_nodata='first'``; the opt-in surfaces + at the call site that the test is exercising the legacy behaviour. + """ + vrt_path = _band_nodata_write_two_band_per_band_nodata_vrt(tmp_path) + r = read_vrt(vrt_path, band=0, band_nodata='first') + assert r.dtype == np.float64 + assert r.attrs.get('nodata') == 65535.0 + assert np.isnan(r.values[1, 1]) + assert r.values[0, 0] == 1 + + +def test_read_vrt_band1_uses_band1_nodata(tmp_path): + """The previously-broken case: band=1 must use band 1's sentinel. + + Before the fix this returned dtype=uint16 with values=[[7,8], + [9,65000]] and attrs['nodata']=65535. + """ + vrt_path = _band_nodata_write_two_band_per_band_nodata_vrt(tmp_path) + r = read_vrt(vrt_path, band=1, band_nodata='first') + assert r.dtype == np.float64, 'band=1 read kept uint16 dtype; per-band nodata regression.' + assert r.attrs.get('nodata') == 65000.0, f"attrs['nodata'] was {r.attrs.get('nodata')}, expected 65000 from band 1's ." + assert np.isnan(r.values[1, 1]), "band 1's sentinel pixel was not NaN-masked; promotion ran against the wrong sentinel." + assert r.values[0, 0] == 7 + assert r.values[1, 0] == 9 + + +def test_read_vrt_no_band_keeps_band0_nodata_attr(tmp_path): + """Unselected reads still surface band 0's sentinel. + + Multi-band VRTs with mixed sentinels return all bands stacked, and + the canonical attr cannot encode per-band values; advertising + band 0's sentinel matches the prior behavior and the documented + "first band wins" contract for multi-band reads. + """ + vrt_path = _band_nodata_write_two_band_per_band_nodata_vrt(tmp_path) + r = read_vrt(vrt_path, band_nodata='first') + assert r.attrs.get('nodata') == 65535.0 + + +def test_read_vrt_negative_band_raises(tmp_path): + """Negative band indices used to be silently accepted via Python + list indexing (``vrt.bands[-1]`` returned the last band) while the + public reader's nodata lookup rejected them, producing band-N data + with no nodata sentinel. They are now a clear ValueError up front. + """ + vrt_path = _band_nodata_write_two_band_per_band_nodata_vrt(tmp_path) + with pytest.raises(ValueError, match='band'): + read_vrt(vrt_path, band=-1) + + +def test_read_vrt_out_of_range_band_raises(tmp_path): + """Out-of-range band indices used to raise IndexError from deep in + the read path. They are now a ValueError that names the available + band count. + """ + vrt_path = _band_nodata_write_two_band_per_band_nodata_vrt(tmp_path) + with pytest.raises(ValueError, match='out of range'): + read_vrt(vrt_path, band=5, band_nodata='first') + + +def test_read_vrt_non_integer_band_raises(tmp_path): + """A non-int ``band`` would previously have raised TypeError on the + list index. ValueError here matches the rest of the input + validation surface. + """ + vrt_path = _band_nodata_write_two_band_per_band_nodata_vrt(tmp_path) + with pytest.raises(ValueError, match='band'): + read_vrt(vrt_path, band='1') + with pytest.raises(ValueError, match='band'): + read_vrt(vrt_path, band=True) + + +# --------------------------------------------------------------------------- +# SimpleSource 0 survives (#1655) +# Originally: test_vrt_source_nodata_zero_1655.py +# --------------------------------------------------------------------------- + + +def _source_nodata_zero_write_source(tmp_path, arr, name='src_1655.tif'): + """Write a small float32 GeoTIFF without a GDAL_NODATA tag.""" + p = str(tmp_path / name) + write(arr, p, geo_transform=GeoTransform(origin_x=0.0, origin_y=0.0, pixel_width=1.0, pixel_height=-1.0), crs_epsg=4326, compression='none', tiled=False) + return p + + +def _source_nodata_zero_vrt_with_source_nodata(tmp_path, src_path, nodata_xml, include_band_nodata=False, width=4, height=3, band_nodata='0.0'): + """Write a single-band Float32 VRT with the supplied ```` + on its SimpleSource. ``include_band_nodata`` controls whether a + ```` is emitted on the band as well. + """ + band_nd_elem = f'{band_nodata}' if include_band_nodata else '' + vrt_xml = f'\n EPSG:4326\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n {band_nd_elem}\n \n {src_path}\n 1\n \n \n {nodata_xml}\n \n \n\n' + vrt_path = str(tmp_path / 'src_zero_1655.vrt') + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +class TestVRTSourceNodataZero: + """SimpleSource ``0`` must mask zeros to NaN.""" + + def test_source_nodata_zero_no_band_nodata(self, tmp_path): + """SimpleSource NODATA=0 with no band-level fallback masks zeros.""" + arr = np.array([[1.0, 0.0, 3.0, 0.0], [4.0, 0.0, 6.0, 7.0], [0.0, 8.0, 9.0, 10.0]], dtype=np.float32) + src = _source_nodata_zero_write_source(tmp_path, arr) + vrt = _source_nodata_zero_vrt_with_source_nodata(tmp_path, src, '0.0') + result, _ = _source_nodata_zero_read_vrt_internal(vrt) + assert int(np.isnan(result).sum()) == 4 + + def test_source_nodata_zero_integer_xml(self, tmp_path): + """``0`` (integer literal) also masks zeros.""" + arr = np.array([[1.0, 0.0, 3.0]], dtype=np.float32) + src = _source_nodata_zero_write_source(tmp_path, arr, name='int_xml.tif') + vrt = _source_nodata_zero_vrt_with_source_nodata(tmp_path, src, '0', width=3, height=1) + result, _ = _source_nodata_zero_read_vrt_internal(vrt) + assert int(np.isnan(result).sum()) == 1 + assert np.isnan(result[0, 1]) + + def test_source_nodata_nonzero_unchanged(self, tmp_path): + """SimpleSource NODATA != 0 keeps masking behaviour.""" + arr = np.array([[1.0, 0.0, 3.0, 0.0]], dtype=np.float32) + src = _source_nodata_zero_write_source(tmp_path, arr, name='nonzero.tif') + vrt = _source_nodata_zero_vrt_with_source_nodata(tmp_path, src, '1.0', width=4, height=1) + result, _ = _source_nodata_zero_read_vrt_internal(vrt) + assert int(np.isnan(result).sum()) == 1 + assert np.isnan(result[0, 0]) + + def test_band_nodata_zero_still_honoured(self, tmp_path): + """Band-level ``0`` keeps working.""" + arr = np.array([[1.0, 0.0, 3.0]], dtype=np.float32) + src = _source_nodata_zero_write_source(tmp_path, arr, name='band_zero.tif') + vrt_xml = f'\n EPSG:4326\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n 0.0\n \n {src}\n 1\n \n \n \n \n\n' + vrt = str(tmp_path / 'band_zero_1655.vrt') + with open(vrt, 'w') as f: + f.write(vrt_xml) + result, _ = _source_nodata_zero_read_vrt_internal(vrt) + assert int(np.isnan(result).sum()) == 1 + assert np.isnan(result[0, 1]) + + def test_source_nodata_zero_overrides_band(self, tmp_path): + """SimpleSource NODATA=0 takes precedence over band NoDataValue=99.""" + arr = np.array([[1.0, 0.0, 99.0]], dtype=np.float32) + src = _source_nodata_zero_write_source(tmp_path, arr, name='override.tif') + vrt = _source_nodata_zero_vrt_with_source_nodata(tmp_path, src, '0.0', include_band_nodata=True, band_nodata='99.0', width=3, height=1) + result, _ = _source_nodata_zero_read_vrt_internal(vrt) + assert int(np.isnan(result).sum()) == 1 + assert np.isnan(result[0, 1]) + assert result[0, 2] == pytest.approx(99.0) + + +# --------------------------------------------------------------------------- +# integer-with-nodata promotion (#1564) +# Originally: test_vrt_int_nodata_1564.py +# --------------------------------------------------------------------------- + + +def _int_nodata_write_uint16_with_nodata_tif(path, sentinel): + """Write a small uint16 GeoTIFF with a nodata sentinel.""" + arr = np.array([[1, 2, 3], [sentinel, 5, 6]], dtype=np.uint16) + da = xr.DataArray(arr, dims=['y', 'x'], coords={'y': np.arange(2), 'x': np.arange(3)}, attrs={'crs': 4326, 'nodata': sentinel}) + to_geotiff(da, path, compression='none', nodata=sentinel) + return arr + + +def test_vrt_uint16_nodata_promotes_to_float64(tmp_path): + """VRT route NaN-masks integer-with-nodata, matching open_geotiff.""" + tif = str(tmp_path / 'src_1564.tif') + _int_nodata_write_uint16_with_nodata_tif(tif, sentinel=65535) + eager = open_geotiff(tif) + assert eager.dtype == np.float64 + assert np.isnan(eager.values[1, 0]) + vrt_path = str(tmp_path / 'src_1564.vrt') + write_vrt(vrt_path, [tif]) + via_vrt = read_vrt(vrt_path) + assert via_vrt.dtype == np.float64, f'VRT integer-with-nodata should promote to float64; got {via_vrt.dtype}' + assert np.isnan(via_vrt.values[1, 0]), f'VRT sentinel pixel should be NaN; got {via_vrt.values[1, 0]} (literal sentinel survived)' + assert via_vrt.attrs.get('nodata') == 65535.0 + + +def test_vrt_uint16_no_nodata_keeps_dtype(tmp_path): + """Without a nodata sentinel, the dtype stays integer.""" + arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint16) + da = xr.DataArray(arr, dims=['y', 'x'], coords={'y': np.arange(2), 'x': np.arange(3)}, attrs={'crs': 4326}) + tif = str(tmp_path / 'src_no_nodata_1564.tif') + to_geotiff(da, tif, compression='none') + vrt_path = str(tmp_path / 'src_no_nodata_1564.vrt') + write_vrt(vrt_path, [tif]) + via_vrt = read_vrt(vrt_path) + assert via_vrt.dtype == np.uint16 + np.testing.assert_array_equal(via_vrt.values, arr) + + +def test_vrt_float_nodata_still_masks(tmp_path): + """Regression guard: the existing float-with-nodata branch still + works after the integer-branch addition.""" + arr = np.array([[1.0, 2.0, -9999.0], [4.0, -9999.0, 6.0]], dtype=np.float32) + da = xr.DataArray(arr, dims=['y', 'x'], coords={'y': np.arange(2), 'x': np.arange(3)}, attrs={'crs': 4326, 'nodata': -9999.0}) + tif = str(tmp_path / 'srcf_1564.tif') + to_geotiff(da, tif, compression='none', nodata=-9999.0) + vrt_path = str(tmp_path / 'srcf_1564.vrt') + write_vrt(vrt_path, [tif]) + via_vrt = read_vrt(vrt_path) + assert via_vrt.dtype == np.float32 + assert np.isnan(via_vrt.values[0, 2]) + assert np.isnan(via_vrt.values[1, 1]) + + +def _int_nodata_rewrite_vrt_nodata(vrt_path, new_nodata_text): + """Rewrite the element of an existing VRT to a literal + string so we can exercise fractional / out-of-range cases without + going through ``write_vrt`` (which only accepts numeric values).""" + with open(vrt_path, 'r') as f: + xml = f.read() + import re + new_xml, n = re.subn('[^<]*', f'{new_nodata_text}', xml) + assert n == 1, f'expected 1 NoDataValue element, found {n}' + with open(vrt_path, 'w') as f: + f.write(new_xml) + + +def test_vrt_fractional_nodata_is_not_masked(tmp_path): + """Fractional VRT NoDataValue against an integer band must NOT mask: + truncating to int would alias a real pixel value as nodata.""" + arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint16) + da = xr.DataArray(arr, dims=['y', 'x'], coords={'y': np.arange(2), 'x': np.arange(3)}, attrs={'crs': 4326, 'nodata': 1}) + tif = str(tmp_path / 'frac_1564.tif') + to_geotiff(da, tif, compression='none', nodata=1) + vrt_path = str(tmp_path / 'frac_1564.vrt') + write_vrt(vrt_path, [tif]) + _int_nodata_rewrite_vrt_nodata(vrt_path, '1.9') + via_vrt = read_vrt(vrt_path) + assert via_vrt.dtype == np.uint16, f'Fractional NoDataValue must not trigger integer masking (got dtype {via_vrt.dtype}, pixel @[0,0]={via_vrt.values[0, 0]})' + np.testing.assert_array_equal(via_vrt.values, arr) + + +def test_vrt_out_of_range_nodata_is_not_masked(tmp_path): + """NoDataValue outside the dtype range must NOT mask: casting would + wrap and alias an in-range pixel.""" + arr = np.array([[0, 1, 2], [3, 4, 5]], dtype=np.uint16) + da = xr.DataArray(arr, dims=['y', 'x'], coords={'y': np.arange(2), 'x': np.arange(3)}, attrs={'crs': 4326, 'nodata': 0}) + tif = str(tmp_path / 'oor_1564.tif') + to_geotiff(da, tif, compression='none', nodata=0) + vrt_path = str(tmp_path / 'oor_1564.vrt') + write_vrt(vrt_path, [tif]) + _int_nodata_rewrite_vrt_nodata(vrt_path, '-1') + via_vrt = read_vrt(vrt_path) + assert via_vrt.dtype == np.uint16, f'Out-of-range NoDataValue must not trigger integer masking (got dtype {via_vrt.dtype})' + np.testing.assert_array_equal(via_vrt.values, arr) + + +def test_vrt_open_geotiff_parity_uint16_nodata(tmp_path): + """open_geotiff routing a .vrt path should produce the same dtype + and masked positions as a direct GeoTIFF read.""" + tif = str(tmp_path / 'parity_1564.tif') + _int_nodata_write_uint16_with_nodata_tif(tif, sentinel=65535) + direct = open_geotiff(tif) + vrt_path = str(tmp_path / 'parity_1564.vrt') + write_vrt(vrt_path, [tif]) + via_vrt = open_geotiff(vrt_path) + assert direct.dtype == via_vrt.dtype + np.testing.assert_array_equal(np.isnan(direct.values), np.isnan(via_vrt.values), err_msg='VRT route should NaN-mask the same pixels as direct read') + mask = ~np.isnan(direct.values) + np.testing.assert_array_equal(direct.values[mask], via_vrt.values[mask]) + + +# --------------------------------------------------------------------------- +# mask_nodata=False preserves float sentinels (#2158) +# Originally: test_vrt_mask_nodata_float_source_2158.py +# --------------------------------------------------------------------------- + + +def _mask_nodata_float_write_float32_with_sentinel(tmp_path, sentinel=-9999.0, filename='float_2158.tif'): + """float32 GeoTIFF with a non-NaN sentinel and matching pixels. + + The middle row has a literal ``-9999.0`` so the inline masking + actually has something to rewrite. + """ + band = np.array([[1.0, 2.0, 3.0], [4.0, sentinel, 6.0], [7.0, sentinel, 9.0]], dtype=np.float32) + p = str(tmp_path / filename) + write(band, p, nodata=sentinel, compression='none', tiled=False) + return (p, band) + + +def _mask_nodata_float_write_float64_with_fractional_sentinel(tmp_path, sentinel=-9999.25, filename='float64_2158.tif'): + """float64 GeoTIFF with a fractional sentinel. + + Float32's exact-cast rounding would clobber a fractional value + like ``-9999.25``; the float64 path is the only one where the + sentinel survives lossless. + """ + band = np.array([[1.0, 2.0], [sentinel, 4.0]], dtype=np.float64) + p = str(tmp_path / filename) + write(band, p, nodata=sentinel, compression='none', tiled=False) + return (p, band) + + +def _mask_nodata_float_build_vrt(tmp_path, source_path, vrt_dtype, nodata_value, filename='float_2158.vrt', shape=(3, 3)): + """Hand-roll a single-source VRT pointing at the float source.""" + h, w = shape + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n {nodata_value}\n \n {source_path}\n 1\n \n \n \n \n' + p = str(tmp_path / filename) + with open(p, 'w') as f: + f.write(vrt_xml) + return p + + +def test_default_mask_nodata_true_rewrites_float_sentinel(tmp_path): + """The default behaviour (mask_nodata=True) still substitutes NaN. + + Pins the existing contract so the fix below does not regress the + masking happy path. + """ + src, _ = _mask_nodata_float_write_float32_with_sentinel(tmp_path) + vrt = _mask_nodata_float_build_vrt(tmp_path, src, 'Float32', -9999.0) + r = read_vrt(vrt) + assert r.dtype == np.float32 + assert np.isnan(r.values[1, 1]) + assert np.isnan(r.values[2, 1]) + assert r.values[0, 0] == 1.0 + assert r.values[1, 0] == 4.0 + assert r.attrs.get('nodata') == -9999.0 + assert r.attrs.get('masked_nodata') is True + + +def test_eager_mask_nodata_false_preserves_float_sentinel(tmp_path): + """Eager VRT path: ``mask_nodata=False`` keeps the literal sentinel. + + Before #2158 this assertion failed -- the sentinel pixels were + silently rewritten to NaN inside ``_vrt._read_data`` regardless + of the kwarg. + """ + src, original = _mask_nodata_float_write_float32_with_sentinel(tmp_path) + vrt = _mask_nodata_float_build_vrt(tmp_path, src, 'Float32', -9999.0) + r = read_vrt(vrt, mask_nodata=False) + assert r.dtype == np.float32 + assert not np.isnan(r.values).any() + assert r.values[1, 1] == np.float32(-9999.0) + assert r.values[2, 1] == np.float32(-9999.0) + np.testing.assert_array_equal(r.values, original) + assert r.attrs.get('nodata') == -9999.0 + assert r.attrs.get('masked_nodata') is False + + +def test_chunked_mask_nodata_false_preserves_float_sentinel(tmp_path): + """Chunked VRT path: ``mask_nodata=False`` keeps the literal sentinel. + + The chunked path used to call ``_read_vrt_internal`` from + ``_vrt_chunk_read`` without forwarding the kwarg, so per-chunk + decodes silently rewrote float sentinels too. With #2158 the + kwarg is forwarded into the internal reader and both paths agree. + """ + src, original = _mask_nodata_float_write_float32_with_sentinel(tmp_path) + vrt = _mask_nodata_float_build_vrt(tmp_path, src, 'Float32', -9999.0) + r = read_vrt(vrt, chunks=2, mask_nodata=False) + assert r.dtype == np.float32 + computed = r.compute() + assert not np.isnan(computed.values).any() + assert computed.values[1, 1] == np.float32(-9999.0) + assert computed.values[2, 1] == np.float32(-9999.0) + np.testing.assert_array_equal(computed.values, original) + assert computed.attrs.get('nodata') == -9999.0 + assert computed.attrs.get('masked_nodata') is False + + +def test_eager_and_chunked_agree_under_mask_nodata_false(tmp_path): + """Cross-path parity: eager and chunked produce the same buffer. + + Before #2158 the two paths could disagree because both rewrote + the sentinel inline but at slightly different points in the + pipeline. With the opt-out honored, both paths land on the + untouched source array. + """ + src, _ = _mask_nodata_float_write_float32_with_sentinel(tmp_path) + vrt = _mask_nodata_float_build_vrt(tmp_path, src, 'Float32', -9999.0) + eager = read_vrt(vrt, mask_nodata=False) + chunked = read_vrt(vrt, chunks=2, mask_nodata=False).compute() + np.testing.assert_array_equal(eager.values, chunked.values) + assert eager.attrs.get('masked_nodata') == chunked.attrs.get('masked_nodata') + + +def test_mask_nodata_false_float64_fractional_sentinel(tmp_path): + """A fractional sentinel survives the float64 opt-out path. + + Float32 would round ``-9999.25`` to the nearest representable + value, so this corner is float64-only. With the opt-out honored + the pixel keeps its exact bit pattern. + """ + src, original = _mask_nodata_float_write_float64_with_fractional_sentinel(tmp_path) + vrt = _mask_nodata_float_build_vrt(tmp_path, src, 'Float64', -9999.25, filename='float64_2158.vrt', shape=(2, 2)) + r = read_vrt(vrt, mask_nodata=False) + assert r.dtype == np.float64 + assert r.values[1, 0] == -9999.25 + np.testing.assert_array_equal(r.values, original) + + +def test_masked_vs_unmasked_differ_only_at_sentinels(tmp_path): + """``mask_nodata=True`` and ``=False`` differ only where the sentinel hits. + + Every pixel that is NaN in the masked output equals the declared + sentinel in the unmasked output, and every non-sentinel pixel is + bit-identical between the two reads. This pins the contract that + the opt-out is a pure passthrough on the non-sentinel positions. + """ + src, _ = _mask_nodata_float_write_float32_with_sentinel(tmp_path) + vrt = _mask_nodata_float_build_vrt(tmp_path, src, 'Float32', -9999.0) + masked = read_vrt(vrt).values + unmasked = read_vrt(vrt, mask_nodata=False).values + nan_positions = np.isnan(masked) + sentinel_positions = unmasked == np.float32(-9999.0) + np.testing.assert_array_equal(nan_positions, sentinel_positions) + np.testing.assert_array_equal(masked[~nan_positions], unmasked[~sentinel_positions]) + + +def _mask_nodata_float_write_uint16_with_sentinel(tmp_path, sentinel=65535, filename='uint16_2158.tif'): + """uint16 GeoTIFF with a matching sentinel. + + Used to exercise the integer-source-feeding-float-VRT promotion at + ``_vrt.py:1351-1390``. With ``mask_nodata=True`` the sentinel pixel + surfaces as NaN in the float buffer; with ``mask_nodata=False`` the + literal integer value flows through the int->float cast and lands + as ``65535.0``. + """ + band = np.array([[1, 2], [3, sentinel]], dtype=np.uint16) + p = str(tmp_path / filename) + write(band, p, nodata=sentinel, compression='none', tiled=False) + return (p, band) + + +def test_int_source_float_vrt_mask_nodata_false_keeps_literal(tmp_path): + """Integer source feeding a Float32 VRT preserves the literal sentinel. + + Pins the second branch of the inline masking that #2158 gated. + Before the fix, ``_vrt._read_data`` ran the int->float-with-NaN + promotion unconditionally, so even ``mask_nodata=False`` lost the + sentinel. After the fix the integer source pixel survives the + int->float cast as ``65535.0`` and ``masked_nodata`` reflects + that no masking ran. + """ + src, _ = _mask_nodata_float_write_uint16_with_sentinel(tmp_path) + vrt = _mask_nodata_float_build_vrt(tmp_path, src, 'Float32', 65535, filename='int_float_2158.vrt', shape=(2, 2)) + r = read_vrt(vrt, mask_nodata=False) + assert r.dtype == np.float32 + assert not np.isnan(r.values).any() + assert r.values[1, 1] == np.float32(65535.0) + assert r.values[0, 0] == 1.0 + assert r.attrs.get('nodata') == 65535.0 + assert r.attrs.get('masked_nodata') is False + + +def test_int_source_float_vrt_default_still_promotes(tmp_path): + """Default ``mask_nodata=True`` still NaN-masks the int->float promotion. + + Baseline that documents the pre-#2158 contract for the integer + source path: the existing #1616 behavior is unchanged when the + opt-out is not requested. + """ + src, _ = _mask_nodata_float_write_uint16_with_sentinel(tmp_path) + vrt = _mask_nodata_float_build_vrt(tmp_path, src, 'Float32', 65535, filename='int_float_default_2158.vrt', shape=(2, 2)) + r = read_vrt(vrt) + assert r.dtype == np.float32 + assert np.isnan(r.values[1, 1]) + assert r.values[0, 0] == 1.0 + assert r.attrs.get('nodata') == 65535.0 + assert r.attrs.get('masked_nodata') is True + + +# --------------------------------------------------------------------------- +# tile metadata parity for VRT tiled writes (#1606) +# Originally: test_vrt_tiled_metadata_1606.py +# --------------------------------------------------------------------------- + + +def _tiled_metadata_make_rioxarray_style(arr=None): + """DataArray that looks like rioxarray output: nodata only via aliases.""" + if arr is None: + arr = np.arange(64, dtype=np.float32).reshape(8, 8) + arr[0, 0] = -9999.0 + return xr.DataArray(arr, dims=('y', 'x'), coords={'y': np.arange(arr.shape[0], dtype=np.float64), 'x': np.arange(arr.shape[1], dtype=np.float64)}, attrs={'nodatavals': (-9999.0,), '_FillValue': -9999.0, 'crs': 4326, 'gdal_metadata': {'AREA_OR_POINT': 'Area', 'foo': 'bar'}, 'x_resolution': 96, 'y_resolution': 96, 'resolution_unit': 'inch', 'raster_type': 'point'}) + + +def _tiled_metadata_first_tile_path(vrt_path): + tiles_dir = vrt_path[:-len('.vrt')] + '_tiles' + tiles = sorted(glob.glob(os.path.join(tiles_dir, '*.tif'))) + assert tiles, f'no per-tile .tif files under {tiles_dir}' + return tiles[0] + + +class TestVrtTiledMetadataParity: + + def test_nodatavals_alias_propagates_to_tiles(self, tmp_path): + da = _tiled_metadata_make_rioxarray_style() + vrt = str(tmp_path / 'nodatavals.vrt') + to_geotiff(da, vrt, tile_size=16) + tile_da = open_geotiff(_tiled_metadata_first_tile_path(vrt)) + assert tile_da.attrs.get('nodata') == -9999.0 + + def test_fill_value_alias_propagates_to_tiles(self, tmp_path): + arr = np.arange(64, dtype=np.float32).reshape(8, 8) + arr[0, 0] = -9999.0 + da = xr.DataArray(arr, dims=('y', 'x'), coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, attrs={'_FillValue': -9999.0, 'crs': 4326}) + vrt = str(tmp_path / 'fillvalue.vrt') + to_geotiff(da, vrt, tile_size=16) + tile_da = open_geotiff(_tiled_metadata_first_tile_path(vrt)) + assert tile_da.attrs.get('nodata') == -9999.0 + + def test_gdal_metadata_propagates_to_tiles(self, tmp_path): + da = _tiled_metadata_make_rioxarray_style() + vrt = str(tmp_path / 'gdal_meta.vrt') + to_geotiff(da, vrt, tile_size=16) + tile_da = open_geotiff(_tiled_metadata_first_tile_path(vrt)) + gm = tile_da.attrs.get('gdal_metadata') + assert gm == {'AREA_OR_POINT': 'Area', 'foo': 'bar'} + + def test_resolution_tags_propagate_to_tiles(self, tmp_path): + da = _tiled_metadata_make_rioxarray_style() + vrt = str(tmp_path / 'resolution.vrt') + to_geotiff(da, vrt, tile_size=16) + tile_da = open_geotiff(_tiled_metadata_first_tile_path(vrt)) + assert tile_da.attrs.get('x_resolution') == 96.0 + assert tile_da.attrs.get('y_resolution') == 96.0 + assert tile_da.attrs.get('resolution_unit') == 'inch' + + def test_raster_type_point_propagates_to_tiles(self, tmp_path): + da = _tiled_metadata_make_rioxarray_style() + vrt = str(tmp_path / 'point.vrt') + to_geotiff(da, vrt, tile_size=16) + tile_da = open_geotiff(_tiled_metadata_first_tile_path(vrt)) + assert tile_da.attrs.get('raster_type') == 'point' + + def test_tif_vs_vrt_tile_metadata_parity(self, tmp_path): + """Same DataArray, two destinations -- per-tile metadata matches.""" + da = _tiled_metadata_make_rioxarray_style() + tif_path = str(tmp_path / 'parity.tif') + vrt_path = str(tmp_path / 'parity.vrt') + to_geotiff(da, tif_path, tile_size=16) + to_geotiff(da, vrt_path, tile_size=16) + tif_da = open_geotiff(tif_path) + tile_da = open_geotiff(_tiled_metadata_first_tile_path(vrt_path)) + keys = ('nodata', 'gdal_metadata', 'raster_type', 'x_resolution', 'y_resolution', 'resolution_unit') + for k in keys: + assert tif_da.attrs.get(k) == tile_da.attrs.get(k), f'{k} mismatch: tif={tif_da.attrs.get(k)!r}, vrt-tile={tile_da.attrs.get(k)!r}' + + +class TestVrtTiledRichTagCoverage: + """Cover the XML / extra_tags / friendly-tag paths the bare + ``gdal_metadata`` dict assertion above does not exercise.""" + + def test_gdal_metadata_xml_string_propagates_to_tiles(self, tmp_path): + """``attrs['gdal_metadata_xml']`` (pre-built XML string) bypasses + the dict->XML builder. Verify it still reaches per-tile files.""" + arr = np.arange(64, dtype=np.float32).reshape(8, 8) + xml = '\n vrt_xml_value\n\n' + da = xr.DataArray(arr, dims=('y', 'x'), coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, attrs={'crs': 4326, 'gdal_metadata_xml': xml}) + vrt = str(tmp_path / 'gdal_xml.vrt') + to_geotiff(da, vrt, tile_size=16, allow_experimental_codecs=True) + tile_da = open_geotiff(_tiled_metadata_first_tile_path(vrt)) + gm = tile_da.attrs.get('gdal_metadata') or {} + gm_xml = tile_da.attrs.get('gdal_metadata_xml') or '' + assert gm.get('VRT_XML_KEY') == 'vrt_xml_value' or 'VRT_XML_KEY' in gm_xml, f'gdal_metadata_xml content lost on VRT-tile round-trip; gdal_metadata={gm!r}, gdal_metadata_xml={gm_xml!r}' + + def test_extra_tags_entry_propagates_to_tiles(self, tmp_path): + """A user-supplied ``extra_tags`` entry (Software, tag 305) + must round-trip through the VRT-tiled writer.""" + arr = np.arange(64, dtype=np.float32).reshape(8, 8) + software = 'xrspatial-vrt-test-1606' + da = xr.DataArray(arr, dims=('y', 'x'), coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, attrs={'crs': 4326, 'extra_tags': [(305, 2, len(software) + 1, software)]}) + vrt = str(tmp_path / 'extra_tags.vrt') + to_geotiff(da, vrt, tile_size=16, allow_experimental_codecs=True) + tile_da = open_geotiff(_tiled_metadata_first_tile_path(vrt)) + et = tile_da.attrs.get('extra_tags') or [] + tag_ids = {entry[0] for entry in et} + assert 305 in tag_ids, f'Software (305) tag missing from VRT tile extra_tags; got tag ids {sorted(tag_ids)!r}' + + def test_image_description_friendly_attr_propagates_to_tiles(self, tmp_path): + """``attrs['image_description']`` is folded into ``extra_tags`` + as tag 270 by ``_merge_friendly_extra_tags`` and then surfaces + on read as ``attrs['image_description']``.""" + arr = np.arange(64, dtype=np.float32).reshape(8, 8) + da = xr.DataArray(arr, dims=('y', 'x'), coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, attrs={'crs': 4326, 'image_description': 'vrt-tile-friendly-1606'}) + vrt = str(tmp_path / 'image_desc.vrt') + to_geotiff(da, vrt, tile_size=16) + tile_da = open_geotiff(_tiled_metadata_first_tile_path(vrt)) + assert tile_da.attrs.get('image_description') == 'vrt-tile-friendly-1606' + + +class TestVrtTiledMetadataDask: + + def test_nodatavals_alias_dask(self, tmp_path): + pytest.importorskip('dask.array') + import dask.array as dska + arr = np.arange(64, dtype=np.float32).reshape(8, 8) + arr[0, 0] = -9999.0 + da_np = xr.DataArray(arr, dims=('y', 'x'), coords={'y': np.arange(8.0), 'x': np.arange(8.0)}, attrs={'nodatavals': (-9999.0,), 'crs': 4326, 'gdal_metadata': {'k': 'v'}}) + da = xr.DataArray(dska.from_array(arr, chunks=4), dims=da_np.dims, coords=da_np.coords, attrs=da_np.attrs) + vrt = str(tmp_path / 'dask.vrt') + to_geotiff(da, vrt, tile_size=16) + tile_da = open_geotiff(_tiled_metadata_first_tile_path(vrt)) + assert tile_da.attrs.get('nodata') == -9999.0 + assert tile_da.attrs.get('gdal_metadata') == {'k': 'v'} + + +# --------------------------------------------------------------------------- +# VRT XML parsed once on the chunked path (#1825) +# Originally: test_vrt_single_parse_1825.py +# --------------------------------------------------------------------------- + + +@pytest.fixture +def single_parse_two_by_two_vrt_1825(): + """4-tile mosaic via the to_geotiff(.vrt, ...) dask path.""" + arr = np.arange(256 * 256, dtype=np.float32).reshape(256, 256) + y = np.linspace(41.0, 40.0, 256) + x = np.linspace(-106.0, -105.0, 256) + raster = xr.DataArray(arr, dims=['y', 'x'], coords={'y': y, 'x': x}, attrs={'crs': 4326}) + td = tempfile.mkdtemp(prefix='tmp_1825_2x2_') + vrt_path = os.path.join(td, 'mosaic_1825.vrt') + to_geotiff(raster, vrt_path, tile_size=128) + yield (vrt_path, arr) + + +@pytest.fixture +def single_parse_single_tile_vrt_1825(): + """One 64x64 float32 tile wrapped in a VRT.""" + arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) + y = np.linspace(41.0, 40.0, 64) + x = np.linspace(-106.0, -105.0, 64) + raster = xr.DataArray(arr, dims=['y', 'x'], coords={'y': y, 'x': x}, attrs={'crs': 4326}) + td = tempfile.mkdtemp(prefix='tmp_1825_single_') + tile_path = os.path.join(td, 'tile_1825.tif') + to_geotiff(raster, tile_path) + vrt_path = os.path.join(td, 'single_1825.vrt') + _write_vrt_internal(vrt_path, [tile_path]) + yield (vrt_path, arr) + + +def test_chunked_path_parses_xml_once(monkeypatch, single_parse_two_by_two_vrt_1825): + """Construction parses once, and ``.compute()`` adds zero parses. + + The previous implementation re-parsed inside every per-chunk task, + so a 4x4 chunk grid produced 17 parses total. After #1825 the + dispatcher parses once and threads the already-parsed VRTDataset + through the task graph. + """ + vrt_path, _ = single_parse_two_by_two_vrt_1825 + from xrspatial.geotiff import _vrt as vrt_module + counter = {'parses': 0} + real_parse = vrt_module.parse_vrt + + def counting_parse(*args, **kwargs): + counter['parses'] += 1 + return real_parse(*args, **kwargs) + monkeypatch.setattr(vrt_module, 'parse_vrt', counting_parse) + result = read_vrt(vrt_path, chunks=(64, 64)) + assert counter['parses'] == 1, f"expected 1 parse during construction, got {counter['parses']}" + computed = result.compute() + assert counter['parses'] == 1, f"expected 1 parse total (construction only); got {counter['parses']} -- per-chunk tasks are still reparsing" + assert computed.shape == (256, 256) + assert computed.dtype == np.float32 + + +def test_chunked_path_reads_xml_file_once(monkeypatch, single_parse_two_by_two_vrt_1825): + """The chunked dispatcher reads the VRT XML file exactly once. + + Pin the file-read side too: before #1825 every per-chunk task + re-opened the .vrt file via ``_read_vrt_xml``. After the refactor + only the dispatcher reads it. + """ + vrt_path, _ = single_parse_two_by_two_vrt_1825 + from xrspatial.geotiff import _vrt as vrt_module + counter = {'reads': 0} + real_read_xml = vrt_module._read_vrt_xml + + def counting_read_xml(*args, **kwargs): + counter['reads'] += 1 + return real_read_xml(*args, **kwargs) + monkeypatch.setattr(vrt_module, '_read_vrt_xml', counting_read_xml) + result = read_vrt(vrt_path, chunks=(64, 64)) + assert counter['reads'] == 1, f"expected 1 XML file read during construction, got {counter['reads']}" + result.compute() + assert counter['reads'] == 1, f"expected 1 XML file read total; got {counter['reads']} -- per-chunk tasks are still re-opening the .vrt file" + + +def test_parsed_vrt_is_picklable(single_parse_single_tile_vrt_1825): + """The parsed VRTDataset round-trips through pickle. + + The chunked dispatcher embeds the parsed VRT into the dask graph, + so dask must be able to serialise it for the distributed and + process-pool schedulers. Pin picklability with the stdlib pickler + (cloudpickle is a strict superset). + """ + vrt_path, _ = single_parse_single_tile_vrt_1825 + from xrspatial.geotiff._vrt import _read_vrt_xml, parse_vrt + xml_str = _read_vrt_xml(vrt_path) + vrt_dir = os.path.dirname(os.path.abspath(vrt_path)) + vrt = parse_vrt(xml_str, vrt_dir) + blob = pickle.dumps(vrt) + restored = pickle.loads(blob) + assert restored.width == vrt.width + assert restored.height == vrt.height + assert len(restored.bands) == len(vrt.bands) + assert restored.bands[0].dtype == vrt.bands[0].dtype + assert [s.filename for s in restored.bands[0].sources] == [s.filename for s in vrt.bands[0].sources] + + +def test_chunked_matches_eager_after_refactor(single_parse_two_by_two_vrt_1825): + """Byte-identical eager vs chunked results after the helper consolidation. + + The eager path uses ``_apply_integer_sentinel_mask`` / + ``_effective_dtype_for_bands`` / ``_sentinel_for_dtype`` from + ``_vrt`` directly; the chunked path imports the same helpers. A + regression in either call site would surface here. + """ + vrt_path, original = single_parse_two_by_two_vrt_1825 + eager = read_vrt(vrt_path) + chunked = read_vrt(vrt_path, chunks=(64, 64)).compute() + assert eager.dtype == chunked.dtype + np.testing.assert_array_equal(eager.values, chunked.values) + np.testing.assert_array_equal(eager.values, original) + + +def test_no_path_containment_revalidation_per_chunk(monkeypatch, single_parse_two_by_two_vrt_1825): + """Per-chunk tasks skip the source-path containment check. + + ``parse_vrt`` is the only place that resolves and validates source + paths against the VRT directory / ``XRSPATIAL_VRT_ALLOWED_ROOTS``. + Because each task now receives the already-parsed VRT, ``parse_vrt`` + must not run during ``.compute()`` even when the graph is hydrated. + """ + vrt_path, _ = single_parse_two_by_two_vrt_1825 + from xrspatial.geotiff import _vrt as vrt_module + parse_calls = {'n': 0} + real_parse = vrt_module.parse_vrt + + def counting_parse(*args, **kwargs): + parse_calls['n'] += 1 + return real_parse(*args, **kwargs) + monkeypatch.setattr(vrt_module, 'parse_vrt', counting_parse) + result = read_vrt(vrt_path, chunks=(64, 64)) + parses_after_construction = parse_calls['n'] + da_arr = result.data + if isinstance(da_arr, da.Array): + _block = da_arr.blocks[0, 0].compute() + assert _block.shape[0] > 0 and _block.shape[1] > 0 + assert parse_calls['n'] == parses_after_construction, f"per-block compute triggered extra parses ({parse_calls['n']} vs {parses_after_construction})" + + +def test_parsed_kwarg_does_not_mutate_caller_holes(single_parse_single_tile_vrt_1825): + """``read_vrt(parsed=...)`` must not mutate the caller's ``holes``. + + The chunked dispatcher threads a single parsed ``VRTDataset`` into + every per-chunk task. ``read_vrt`` appends skipped-source records to + ``vrt.holes`` when a backing file is missing; without a defensive + copy the appends would land on the dispatcher's shared object and + leak across tasks (racy under the threaded scheduler, and + cumulatively across calls if a caller ever reused the parsed + object). Pin that ``parsed.holes`` stays untouched. + """ + vrt_path, _ = single_parse_single_tile_vrt_1825 + from xrspatial.geotiff._vrt import _read_vrt_xml, parse_vrt + from xrspatial.geotiff._vrt import read_vrt as _read_vrt_internal + xml_str = _read_vrt_xml(vrt_path) + vrt_dir = os.path.dirname(os.path.abspath(vrt_path)) + parsed = parse_vrt(xml_str, vrt_dir) + parsed.bands[0].sources[0].filename = os.path.join(vrt_dir, 'gone.tif') + holes_id_before = id(parsed.holes) + import warnings + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + arr, returned = _read_vrt_internal(vrt_path, parsed=parsed, missing_sources='warn') + assert parsed.holes == [], f'parsed.holes was mutated across the read; got {parsed.holes!r}' + assert id(parsed.holes) == holes_id_before, "parsed.holes list object was replaced -- the caller's reference is now stale" + assert len(returned.holes) == 1 + assert returned.holes[0]['source'].endswith('gone.tif') + assert arr.shape == (64, 64) + + +# --------------------------------------------------------------------------- +# write_vrt escapes XML special chars (#1607) +# Originally: test_vrt_xml_escape_1607.py +# --------------------------------------------------------------------------- + + +@pytest.fixture +def xml_escape_sample_tif(tmp_path): + """Write a tiny GeoTIFF the VRT writer can introspect for metadata.""" + arr = np.zeros((4, 4), dtype=np.float32) + y = np.linspace(1.0, 0.0, 4) + x = np.linspace(0.0, 1.0, 4) + da = xr.DataArray(arr, dims=['y', 'x'], coords={'y': y, 'x': x}, attrs={'nodata': -9999.0}) + path = str(tmp_path / 'src.tif') + to_geotiff(da, path) + return path + + +def test_crs_wkt_with_xml_special_chars_round_trips(xml_escape_sample_tif, tmp_path): + """A WKT containing ``& < > " '`` must round-trip through write_vrt / + parse_vrt unchanged (the entities are escaped on the way out and + decoded on the way in).""" + nasty_wkt = 'GEOGCS["spec & with "quotes" and \'apostrophes\'"]' + vrt_path = str(tmp_path / 'mosaic.vrt') + _write_vrt_internal(vrt_path, [xml_escape_sample_tif], crs_wkt=nasty_wkt) + with open(vrt_path, 'r') as fh: + text = fh.read() + parsed = parse_vrt(text, vrt_dir=str(tmp_path)) + assert parsed.crs_wkt == nasty_wkt + + +def test_crs_wkt_injection_does_not_change_raster_type(xml_escape_sample_tif, tmp_path): + """The headline #1607 case: a crafted WKT trying to close ```` + and inject ``Point...`` + must NOT change ``raster_type`` from its default 'area' value.""" + injection = 'Point' + vrt_path = str(tmp_path / 'evil.vrt') + _write_vrt_internal(vrt_path, [xml_escape_sample_tif], crs_wkt=injection) + with open(vrt_path, 'r') as fh: + text = fh.read() + parsed = parse_vrt(text, vrt_dir=str(tmp_path)) + assert parsed.raster_type == 'area' + assert parsed.crs_wkt == injection + + +def test_source_filename_with_ampersand_round_trips(tmp_path): + """A source filename containing ``&`` must produce a VRT whose + ```` element decodes back to the original on-disk + path (no double-escape, no corruption).""" + arr = np.zeros((4, 4), dtype=np.float32) + da = xr.DataArray(arr, dims=['y', 'x'], coords={'y': np.linspace(1, 0, 4), 'x': np.linspace(0, 1, 4)}, attrs={'nodata': -9999.0}) + src = str(tmp_path / 'a&b.tif') + to_geotiff(da, src) + vrt_path = str(tmp_path / 'mosaic.vrt') + _write_vrt_internal(vrt_path, [src]) + with open(vrt_path, 'r') as fh: + text = fh.read() + assert '&' in text + assert ' str: + src_path = os.path.join(td, 'tmp_1815_src.tif') + to_geotiff(np.zeros((10, 10), dtype=np.uint8), src_path, compression='none') + return src_path + + +def _xml_size_cap_write_vrt(td: str, *, pad_bytes: int=0) -> str: + """Write a VRT, optionally padded with a large XML comment.""" + vrt_path = os.path.join(td, 'tmp_1815_mosaic.vrt') + comment = '' + if pad_bytes > 0: + comment = '\n' + vrt_xml = '\n' + comment + ' \n \n tmp_1815_src.tif\n 1\n \n \n \n \n\n' + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +def test_small_vrt_parses_under_default_cap(tmp_path): + """A normal-sized VRT parses successfully with the default cap.""" + td = str(tmp_path) + _xml_size_cap_write_source(td) + vrt_path = _xml_size_cap_write_vrt(td) + arr, _ = _xml_size_cap_read_vrt_internal(vrt_path) + assert arr.shape == (10, 10) + + +def test_oversized_vrt_raises_value_error(tmp_path, monkeypatch): + """A VRT padded past the cap raises ValueError naming the cap and env var.""" + td = str(tmp_path) + _xml_size_cap_write_source(td) + monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', '1024') + vrt_path = _xml_size_cap_write_vrt(td, pad_bytes=4096) + with pytest.raises(ValueError) as exc_info: + _xml_size_cap_read_vrt_internal(vrt_path) + msg = str(exc_info.value) + assert 'XRSPATIAL_VRT_MAX_XML_BYTES' in msg + assert '1,024' in msg + + +def test_raising_cap_lets_padded_vrt_parse(tmp_path, monkeypatch): + """Setting the env var higher allows a padded VRT to parse.""" + td = str(tmp_path) + _xml_size_cap_write_source(td) + vrt_path = _xml_size_cap_write_vrt(td, pad_bytes=4096) + monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', str(1024 * 1024)) + arr, _ = _xml_size_cap_read_vrt_internal(vrt_path) + assert arr.shape == (10, 10) + + +@pytest.mark.parametrize('bad_value', ['not_a_number', '0', '-1', '-1024']) +def test_invalid_cap_raises_value_error(tmp_path, monkeypatch, bad_value): + """Non-numeric, zero, or negative cap values produce a clear error.""" + td = str(tmp_path) + _xml_size_cap_write_source(td) + vrt_path = _xml_size_cap_write_vrt(td) + monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', bad_value) + with pytest.raises(ValueError, match='XRSPATIAL_VRT_MAX_XML_BYTES'): + _xml_size_cap_read_vrt_internal(vrt_path) + + +# --------------------------------------------------------------------------- +# XML size cap on chunked read_vrt (#1831) +# Originally: test_vrt_xml_size_cap_chunked_1831.py +# --------------------------------------------------------------------------- + + +def _xml_size_cap_chunked_write_source(td: str) -> str: + src_path = os.path.join(td, 'tmp_1831_src.tif') + to_geotiff(np.zeros((10, 10), dtype=np.uint8), src_path, compression='none') + return src_path + + +def _xml_size_cap_chunked_write_vrt(td: str, *, pad_bytes: int=0) -> str: + """Write a VRT, optionally padded with a large XML comment.""" + vrt_path = os.path.join(td, 'tmp_1831_mosaic.vrt') + comment = '' + if pad_bytes > 0: + comment = '\n' + vrt_xml = '\n' + comment + ' \n \n tmp_1831_src.tif\n 1\n \n \n \n \n\n' + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +def test_chunked_read_vrt_honors_xml_cap(tmp_path, monkeypatch): + """``read_vrt(chunks=...)`` rejects oversized VRT XML.""" + td = str(tmp_path) + _xml_size_cap_chunked_write_source(td) + monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', '1024') + vrt_path = _xml_size_cap_chunked_write_vrt(td, pad_bytes=4096) + with pytest.raises(ValueError) as exc_info: + read_vrt(vrt_path, chunks=10) + msg = str(exc_info.value) + assert 'XRSPATIAL_VRT_MAX_XML_BYTES' in msg + assert '1,024' in msg + + +def test_chunked_read_vrt_under_default_cap(tmp_path): + """A normal-sized VRT parses successfully under the default cap.""" + td = str(tmp_path) + _xml_size_cap_chunked_write_source(td) + vrt_path = _xml_size_cap_chunked_write_vrt(td) + arr = read_vrt(vrt_path, chunks=10) + assert arr.shape == (10, 10) + assert arr.dtype == np.uint8 + + +def test_chunked_read_vrt_raised_cap_allows_padded(tmp_path, monkeypatch): + """Raising ``XRSPATIAL_VRT_MAX_XML_BYTES`` lets a padded VRT parse.""" + td = str(tmp_path) + _xml_size_cap_chunked_write_source(td) + vrt_path = _xml_size_cap_chunked_write_vrt(td, pad_bytes=4096) + monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', str(1024 * 1024)) + arr = read_vrt(vrt_path, chunks=10) + assert arr.shape == (10, 10) + + +# --------------------------------------------------------------------------- +# VRT metadata parity across backends (#2321 sub-PR 3) +# Originally: test_vrt_metadata_parity_2321.py +# --------------------------------------------------------------------------- + + +_WGS84_WKT = 'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]' + + +_VRT_OMITTED_ATTR_KEYS = frozenset({'extra_tags', 'image_description', 'extra_samples', 'gdal_metadata', 'gdal_metadata_xml', 'x_resolution', 'y_resolution', 'resolution_unit', 'colormap'}) + + +_REPRESENTATION_KEYS = frozenset({'crs_wkt'}) + + +_BACKEND_LIFECYCLE_KEYS = frozenset({'nodata_pixels_present'}) + + +def _metadata_parity_write_single_source_vrt(tiff_path: str, vrt_path: str, *, width: int, height: int, dtype_xml: str='Float32', nodata: float | int | None=None, geo_transform: str | None='0.0, 1.0, 0.0, 0.0, 0.0, -1.0', srs: str | None=None) -> None: + """Write a 1-band VRT pointing at ``tiff_path``. + + Same writer style as ``test_vrt_finalization_parity_2162`` so the + two test modules share fixture geometry conventions. + """ + nodata_xml = f' {nodata}\n' if nodata is not None else '' + srs_xml = f' {srs}\n' if srs is not None else '' + gt_xml = f' {geo_transform}\n' if geo_transform is not None else '' + vrt_xml = f'\n{gt_xml}{srs_xml} \n{nodata_xml} \n {tiff_path}\n 1\n \n \n \n \n\n' + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + + +def _metadata_parity_build_full_georef_vrt(tmp_path: pathlib.Path) -> str: + """4x4 float32 single-source VRT with full georef + nodata.""" + import xarray as xr + tiff = str(tmp_path / 'tmp_2321_full_src.tif') + vrt = str(tmp_path / 'tmp_2321_full.vrt') + data = np.arange(16, dtype=np.float32).reshape(4, 4) + da = xr.DataArray(data, coords={'y': np.array([200.0, 199.0, 198.0, 197.0]), 'x': np.array([100.0, 101.0, 102.0, 103.0])}, dims=('y', 'x'), attrs={'crs': 4326}) + to_geotiff(da, tiff) + _metadata_parity_write_single_source_vrt(tiff, vrt, width=4, height=4, dtype_xml='Float32', nodata=-9999.0, geo_transform='100.0, 1.0, 0.0, 200.0, 0.0, -1.0', srs=_WGS84_WKT) + return vrt + + +def _metadata_parity_build_transform_only_vrt(tmp_path: pathlib.Path) -> str: + """4x4 single-source VRT with transform but no SRS (CRS absent).""" + import xarray as xr + tiff = str(tmp_path / 'tmp_2321_tonly_src.tif') + vrt = str(tmp_path / 'tmp_2321_tonly.vrt') + data = np.arange(16, dtype=np.float32).reshape(4, 4) + da = xr.DataArray(data, coords={'y': np.array([200.0, 199.0, 198.0, 197.0]), 'x': np.array([100.0, 101.0, 102.0, 103.0])}, dims=('y', 'x')) + to_geotiff(da, tiff) + _metadata_parity_write_single_source_vrt(tiff, vrt, width=4, height=4, dtype_xml='Float32', geo_transform='100.0, 1.0, 0.0, 200.0, 0.0, -1.0', srs=None) + return vrt + + +def _metadata_parity_build_integer_with_nodata_vrt(tmp_path: pathlib.Path) -> str: + """4x4 uint16 single-source VRT with declared nodata sentinel. + + Used for ``masked_nodata`` parity: the integer-with-sentinel source + must promote to float64 with NaN-masked sentinel pixels in every + backend and stamp ``attrs['masked_nodata']=True``. + """ + src_arr = np.array([[1, 2, 3, 4], [5, 6, 7, 65535], [9, 10, 11, 12], [13, 14, 15, 16]], dtype=np.uint16) + tiff = str(tmp_path / 'tmp_2321_int_src.tif') + vrt = str(tmp_path / 'tmp_2321_int.vrt') + write(src_arr, tiff, nodata=65535, compression='none', tiled=False) + _metadata_parity_write_single_source_vrt(tiff, vrt, width=4, height=4, dtype_xml='UInt16', nodata=65535, geo_transform='0.0, 1.0, 0.0, 0.0, 0.0, -1.0', srs=_WGS84_WKT) + return vrt + + +def _metadata_parity_read_eager_numpy(vrt_path: str): + """Eager numpy via the dispatcher (mirrors public surface).""" + return open_geotiff(vrt_path) + + +def _metadata_parity_read_dask(vrt_path: str): + """Dask via the dispatcher, then ``compute()`` for value parity.""" + lazy = open_geotiff(vrt_path, chunks=2) + return lazy.compute() + + +def _metadata_parity_read_dask_chunks_2(vrt_path: str): + """Dask via the dispatcher, lazy (no compute). + + Used for negative-tests that pin the build-time raise contract + (e.g., ``test_mixed_nodata_vrt_fails_closed_by_default``). Named + at module scope so pytest test ids render as + ``[dask_chunks_2-_metadata_parity_read_dask_chunks_2]`` rather than the cryptic + ``[dask_chunks_2-]`` an inline lambda would produce. + """ + return open_geotiff(vrt_path, chunks=2) + + +def _metadata_parity_read_gpu_eager(vrt_path: str): + """GPU eager via ``read_vrt(gpu=True)``. + + ``open_geotiff(..., gpu=True)`` rejects ``.vrt`` sources up front + (the dispatcher routes ``.vrt`` to ``read_vrt`` and ``read_vrt`` + owns the ``gpu`` kwarg, see ``_backends/vrt.py``). Use the direct + entry point here so the GPU eager path is exercised. + """ + return read_vrt(vrt_path, gpu=True) + + +_BACKENDS = [pytest.param('numpy', _metadata_parity_read_eager_numpy, id='numpy'), pytest.param('dask', _metadata_parity_read_dask, id='dask'), pytest.param('gpu', _metadata_parity_read_gpu_eager, id='gpu', marks=requires_gpu)] + + +def _metadata_parity_comparable_attrs(attrs: dict) -> dict: + """Filter attrs down to the cross-backend comparable subset. + + Drops the documented VRT-omitted keys (which may differ if one + backend stamps a TIFF-specific key while another does not) and the + representation-only keys (``crs_wkt``). + """ + return {k: v for k, v in attrs.items() if k not in _VRT_OMITTED_ATTR_KEYS and k not in _REPRESENTATION_KEYS and (k not in _BACKEND_LIFECYCLE_KEYS)} + + +def _metadata_parity_to_numpy(arr) -> np.ndarray: + """Return a host-side numpy view of ``arr.values`` regardless of + backend. + + CuPy DataArrays have a ``.values`` accessor that triggers an + implicit host transfer in some xarray versions but not others; use + the explicit ``.data.get()`` path for cupy buffers per CLAUDE.md. + """ + data = arr.data + if hasattr(data, 'get'): + return data.get() + return np.asarray(data) + + +@pytest.mark.parametrize('_label, reader', _BACKENDS) +def test_full_georef_vrt_attrs_match_eager_numpy(tmp_path, _label, reader): + """Each non-numpy backend's attrs must match the eager numpy baseline. + + The full-georef VRT carries CRS, transform, nodata, and an + integer-source-promotes-to-float lifecycle. Every attr the contract + promises (``transform``, ``crs``, ``nodata``, ``masked_nodata``, + ``georef_status``, ``raster_type``) must compare equal across + backends. ``crs_wkt`` is compared via the ``crs`` integer instead + because the WKT text can re-emit under pyproj normalisation. + + Without this assertion a backend regression that drops one of + these attrs but still returns correct pixels would slip through + every existing pixel-only test. + """ + vrt = _metadata_parity_build_full_georef_vrt(tmp_path) + baseline = _metadata_parity_read_eager_numpy(vrt) + candidate = reader(vrt) + base_attrs = _metadata_parity_comparable_attrs(dict(baseline.attrs)) + cand_attrs = _metadata_parity_comparable_attrs(dict(candidate.attrs)) + base_keys = set(base_attrs) + cand_keys = set(cand_attrs) + assert base_keys == cand_keys, f'Attr-key drift between numpy and {_label}: numpy-only={base_keys - cand_keys}, {_label}-only={cand_keys - base_keys}' + differing = [k for k in base_keys if base_attrs[k] != cand_attrs[k]] + assert not differing, f'Attr value drift between numpy and {_label}: {[(k, base_attrs[k], cand_attrs[k]) for k in differing]}' + for key in ('transform', 'crs', 'georef_status'): + assert key in cand_attrs, f'{_label} backend missing required attr {key!r}' + assert cand_attrs['georef_status'] == GEOREF_STATUS_FULL + assert cand_attrs['crs'] == 4326 + assert len(cand_attrs['transform']) == 6 + + +@pytest.mark.parametrize('_label, reader', _BACKENDS) +def test_full_georef_vrt_pixels_match_eager_numpy(tmp_path, _label, reader): + """Pixel-value parity for the full-georef VRT. + + Twin of the attrs test above: a regression that fixed attrs but + broke pixels (or vice versa) must surface on at least one of the + two. Asserting both side-by-side keeps the surface explicit. + """ + vrt = _metadata_parity_build_full_georef_vrt(tmp_path) + base = _metadata_parity_to_numpy(_metadata_parity_read_eager_numpy(vrt)) + cand = _metadata_parity_to_numpy(reader(vrt)) + assert base.shape == cand.shape, f'shape drift numpy vs {_label}: {base.shape} vs {cand.shape}' + np.testing.assert_array_equal(base, cand) + + +@pytest.mark.parametrize('_label, reader', _BACKENDS) +def test_full_georef_vrt_coords_match_eager_numpy(tmp_path, _label, reader): + """Coord-array parity for the full-georef VRT. + + The transform attr alone does not guarantee correct coords: the + half-pixel AREA_OR_POINT shift can drift between backends. Compare + the actual coord arrays so a coord regression surfaces directly. + """ + vrt = _metadata_parity_build_full_georef_vrt(tmp_path) + base = _metadata_parity_read_eager_numpy(vrt) + cand = reader(vrt) + assert list(cand.dims) == list(base.dims), f'dim drift numpy vs {_label}: {base.dims} vs {cand.dims}' + for axis in ('y', 'x'): + np.testing.assert_array_equal(np.asarray(cand[axis].values), np.asarray(base[axis].values)) + + +@pytest.mark.parametrize('_label, reader', _BACKENDS) +def test_transform_only_vrt_attrs_match_eager_numpy(tmp_path, _label, reader): + """Same parity sweep on a transform-only VRT (no CRS). + + ``georef_status`` must be ``transform_only`` on every backend and + ``attrs['crs']`` must be absent on every backend. A regression + that emits a stale CRS from a TIFF-tag fallback would show up here + as a key-set diff. + """ + vrt = _metadata_parity_build_transform_only_vrt(tmp_path) + baseline = _metadata_parity_read_eager_numpy(vrt) + candidate = reader(vrt) + base_attrs = _metadata_parity_comparable_attrs(dict(baseline.attrs)) + cand_attrs = _metadata_parity_comparable_attrs(dict(candidate.attrs)) + assert set(base_attrs) == set(cand_attrs) + assert base_attrs == cand_attrs + assert cand_attrs['georef_status'] == GEOREF_STATUS_TRANSFORM_ONLY + assert 'crs' not in cand_attrs + + +@pytest.mark.parametrize('_label, reader', _BACKENDS) +def test_integer_nodata_vrt_attrs_match_eager_numpy(tmp_path, _label, reader): + """``masked_nodata`` and ``nodata`` lifecycle parity on integer VRT. + + The integer-with-sentinel source must promote to float on every + backend and stamp ``attrs['masked_nodata']=True`` plus + ``attrs['nodata']=65535`` (the original sentinel). A backend that + forgets to stamp ``masked_nodata`` would silently mislead callers + who branch on the attr to decide whether NaN is real or a mask. + """ + vrt = _metadata_parity_build_integer_with_nodata_vrt(tmp_path) + baseline = _metadata_parity_read_eager_numpy(vrt) + candidate = reader(vrt) + base_attrs = _metadata_parity_comparable_attrs(dict(baseline.attrs)) + cand_attrs = _metadata_parity_comparable_attrs(dict(candidate.attrs)) + assert set(base_attrs) == set(cand_attrs) + assert base_attrs == cand_attrs + assert cand_attrs.get('masked_nodata') is True + assert cand_attrs.get('nodata') == 65535 + + +@pytest.mark.parametrize('_label, reader', _BACKENDS) +def test_integer_nodata_vrt_pixels_match_eager_numpy(tmp_path, _label, reader): + """Pixel parity for the integer-VRT case. + + Twin of the attrs test so a backend regression that masks but + forgets the attr (or stamps the attr but masks the wrong cell) + fails one assertion or the other, never both silently. + """ + vrt = _metadata_parity_build_integer_with_nodata_vrt(tmp_path) + base = _metadata_parity_to_numpy(_metadata_parity_read_eager_numpy(vrt)) + cand = _metadata_parity_to_numpy(reader(vrt)) + assert base.shape == cand.shape + np.testing.assert_array_equal(np.isnan(base), np.isnan(cand)) + base_finite = base[~np.isnan(base)] + cand_finite = cand[~np.isnan(cand)] + np.testing.assert_array_equal(base_finite, cand_finite) + + +def _metadata_parity_write_mixed_crs_vrt(tmp_path: pathlib.Path) -> str: + """Two single-band sources with disagreeing CRS at the VRT. + + The VRT XML carries one SRS (WGS84) but the second underlying TIFF + carries a UTM CRS. The fail-closed contract calls for the read to + reject this up front, but today the per-source CRS check does NOT + surface the conflict: the read succeeds and silently flattens to + the VRT-declared SRS. See the xfail on + ``test_mixed_crs_vrt_does_not_silently_flatten`` for the + consumer-side pin and the gap PR 2 must close. + """ + import xarray as xr + src0 = tmp_path / 'tmp_2321_mix_crs_src0.tif' + src1 = tmp_path / 'tmp_2321_mix_crs_src1.tif' + data = np.arange(16, dtype=np.float32).reshape(4, 4) + da0 = xr.DataArray(data, coords={'y': np.array([200.0, 199.0, 198.0, 197.0]), 'x': np.array([100.0, 101.0, 102.0, 103.0])}, dims=('y', 'x'), attrs={'crs': 4326}) + da1 = xr.DataArray(data, coords={'y': np.array([200.0, 199.0, 198.0, 197.0]), 'x': np.array([104.0, 105.0, 106.0, 107.0])}, dims=('y', 'x'), attrs={'crs': 32633}) + to_geotiff(da0, str(src0)) + to_geotiff(da1, str(src1)) + vrt_path = tmp_path / 'tmp_2321_mixed_crs.vrt' + vrt_xml = f'\n 100.0, 1.0, 0.0, 200.0, 0.0, -1.0\n {_WGS84_WKT}\n \n \n {src0}\n 1\n \n \n \n \n {src1}\n 1\n \n \n \n \n\n' + vrt_path.write_text(vrt_xml) + return str(vrt_path) + + +@pytest.mark.xfail(reason="Mixed-CRS VRT currently silently flattens to the VRT-declared SRS (#2321 gap). The validator from sub-PR 2 must reject this with a typed error at graph build / eager-read setup; once that lands, drop the xfail and tighten the assertion to VRTUnsupportedError. Today the read produces a mosaic whose attrs['crs'] reports only the VRT-declared CRS while the second source's UTM data has been silently incorporated.", strict=True) +def test_mixed_crs_vrt_does_not_silently_flatten(tmp_path): + """A mixed-CRS VRT must not return a mosaic that silently inherits + one source's CRS while pixels came from a CRS-incompatible source. + + This is the gap that motivates sub-PR 2 of the parent epic: the + VRT XML declares one SRS, the underlying sources disagree, and + the reader hands back a single ``attrs['crs']`` as if everything + were homogeneous. The pixel content is no longer geospatially + meaningful once the underlying CRSs disagree, but no error fires. + + ``strict=True`` so the test flips to XPASS the moment the gap is + fixed -- CI will fail loudly, prompting the upgrade to a proper + raise assertion. That is the desired posture: a finding pinned in + test form, not silently passing. + """ + vrt = _metadata_parity_write_mixed_crs_vrt(tmp_path) + with pytest.raises(Exception): + read_vrt(vrt) + + +def _metadata_parity_write_mixed_nodata_vrt(tmp_path: pathlib.Path) -> str: + """Two-band uint16 VRT with disagreeing per-band ````. + + Mirrors the fixture in ``test_vrt_multiband_int_nodata_1611``: the + fail-closed default (band_nodata=None) must raise + ``MixedBandMetadataError``. The opt-out + ``band_nodata='first'`` is the explicit escape hatch. + """ + b0_arr = np.array([[1, 2], [3, 65535]], dtype=np.uint16) + b1_arr = np.array([[7, 8], [9, 65000]], dtype=np.uint16) + p0 = tmp_path / 'tmp_2321_mix_nodata_b0.tif' + p1 = tmp_path / 'tmp_2321_mix_nodata_b1.tif' + write(b0_arr, str(p0), nodata=65535, compression='none', tiled=False) + write(b1_arr, str(p1), nodata=65000, compression='none', tiled=False) + vrt_path = tmp_path / 'tmp_2321_mix_nodata.vrt' + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n 65535\n \n {p0}\n 1\n \n \n \n \n \n 65000\n \n {p1}\n 1\n \n \n \n \n\n' + vrt_path.write_text(vrt_xml) + return str(vrt_path) + + +@pytest.mark.parametrize('reader_label, reader', [('eager_numpy', _metadata_parity_read_eager_numpy), ('dask_chunks_2', _metadata_parity_read_dask_chunks_2)]) +def test_mixed_nodata_vrt_fails_closed_by_default(tmp_path, reader_label, reader): + """Per-band disagreeing nodata raises ``MixedBandMetadataError`` + by default on every backend route. + + The dask path's check fires at graph-build time (the metadata + sweep runs before dask materialises any chunk). The eager path + raises during the dispatcher's metadata validation. Both must + refuse rather than flattening to band 0's sentinel. + """ + vrt = _metadata_parity_write_mixed_nodata_vrt(tmp_path) + with pytest.raises(MixedBandMetadataError): + result = reader(vrt) + if hasattr(result, 'compute'): + result.compute() + + +def test_mixed_nodata_vrt_opt_in_first_succeeds(tmp_path): + """``band_nodata='first'`` is the documented opt-out for the + mixed-nodata fail-closed check. + + Positive pin so a future change that breaks the escape hatch + surfaces here. The opt-out flattens to band 0's sentinel, which + is the legacy behaviour callers may explicitly want. + """ + vrt = _metadata_parity_write_mixed_nodata_vrt(tmp_path) + result = read_vrt(vrt, band_nodata='first') + assert result.shape == (2, 2, 2) + + +def _metadata_parity_write_unsupported_resample_vrt(tmp_path: pathlib.Path) -> str: + """VRT with ``Bilinear`` and a size-changing DstRect. + + A 4x4 source projected into a 2x2 destination with Bilinear must + raise because the implementation only honours nearest-neighbour + resampling at the placement site. See #1751. + """ + src_arr = np.arange(16, dtype=np.uint16).reshape(4, 4) + src_path = tmp_path / 'tmp_2321_resample_src.tif' + write(src_arr, str(src_path), compression='none', tiled=False) + vrt_path = tmp_path / 'tmp_2321_unsupported_resample.vrt' + vrt_xml = f'\n 0.0, 2.0, 0.0, 0.0, 0.0, -2.0\n \n \n {src_path}\n 1\n \n \n Bilinear\n \n \n\n' + vrt_path.write_text(vrt_xml) + return str(vrt_path) + + +def test_unsupported_resample_alg_raises(tmp_path): + """A non-nearest resampling algorithm with a size-changing DstRect + must raise ``NotImplementedError`` rather than return + silently-nearest-sampled pixels mislabelled as Bilinear. + + The ``match=`` clause pins the algorithm name and the issue number + so an unrelated ``NotImplementedError`` from some other VRT code + path cannot keep the test green. See ``_vrt.py`` for the existing + raise that names both fields. Sub-PR 2 (#2329) added + ``VRTUnsupportedError`` to the centralised validator; the + assertion below accepts either type. + """ + vrt = _metadata_parity_write_unsupported_resample_vrt(tmp_path) + with pytest.raises((NotImplementedError, VRTUnsupportedError), match='Bilinear'): + read_vrt(vrt) + + +def _metadata_parity_write_bad_srcrect_vrt(tmp_path: pathlib.Path, *, x_size: int=-50) -> str: + """VRT with a negative-size ````. + + See #1784: the validator must reject this up front rather than + swallow it in the missing-source ``try/except``. + """ + src_arr = np.zeros((10, 10), dtype=np.uint8) + src_path = tmp_path / 'tmp_2321_bad_srcrect_src.tif' + to_geotiff(src_arr, str(src_path), compression='none') + vrt_path = tmp_path / 'tmp_2321_bad_srcrect.vrt' + vrt_xml = f'\n \n \n {src_path}\n 1\n \n \n \n \n\n' + vrt_path.write_text(vrt_xml) + return str(vrt_path) + + +def test_negative_srcrect_size_rejected(tmp_path): + """Malformed ``SrcRect`` rejected with a ``ValueError`` (legacy + path) or ``VRTUnsupportedError`` (centralised validator from + sub-PR 2 of #2321) that names the offending field. + """ + vrt = _metadata_parity_write_bad_srcrect_vrt(tmp_path, x_size=-50) + with pytest.raises((ValueError, VRTUnsupportedError), match='SrcRect.*negative'): + read_vrt(vrt) + + +def _metadata_parity_write_bad_dstrect_vrt(tmp_path: pathlib.Path, *, x_size: int=-10) -> str: + """VRT with a negative-size ```` for the negative test. + + Mirrors the DstRect rejection added for #1737; the regression + coverage today targets oversized DstRects, this test pins the + sister case for negative dimensions. + """ + src_arr = np.zeros((10, 10), dtype=np.uint8) + src_path = tmp_path / 'tmp_2321_bad_dstrect_src.tif' + to_geotiff(src_arr, str(src_path), compression='none') + vrt_path = tmp_path / 'tmp_2321_bad_dstrect.vrt' + vrt_xml = f'\n \n \n {src_path}\n 1\n \n \n \n \n\n' + vrt_path.write_text(vrt_xml) + return str(vrt_path) + + +def test_negative_dstrect_size_rejected(tmp_path): + """Malformed ``DstRect`` must not survive into the read path. + + Accept ``ValueError`` (today's posture; the SimpleSource DstRect + validator raises ``VRT SimpleSource DstRect has negative size + (...)`` before any pixel work begins). The ``match=`` clause pins + the field name and the rejection reason so an unrelated + ``ValueError`` from some other VRT code path cannot silently keep + the test green. The centralised validator from sub-PR 2 of #2321 + raises ``VRTUnsupportedError`` for the same case; both are accepted. + """ + vrt = _metadata_parity_write_bad_dstrect_vrt(tmp_path, x_size=-10) + with pytest.raises((ValueError, VRTUnsupportedError), match='DstRect.*negative'): + read_vrt(vrt) + + +def _metadata_parity_write_missing_source_vrt(tmp_path: pathlib.Path, *, name: str='tmp_2321_missing.vrt') -> str: + """VRT pointing at a single source path that does not exist. + + The dispatcher's static missing-source sweep (#2265) raises at + construction time for both eager and dask routes when + ``missing_sources='raise'`` is in effect. + """ + vrt_path = tmp_path / name + missing = tmp_path / 'tmp_2321_missing_src.tif' + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n \n {missing}\n 1\n \n \n \n \n\n' + vrt_path.write_text(vrt_xml) + assert not os.path.exists(str(missing)), 'fixture leak: missing-source path exists on disk' + return str(vrt_path) + + +def test_missing_sources_raise_eager(tmp_path): + """``missing_sources='raise'`` (the public default since #1860) + must abort the read up front on the eager path.""" + vrt = _metadata_parity_write_missing_source_vrt(tmp_path, name='tmp_2321_miss_eager.vrt') + with pytest.raises((OSError, ValueError, FileNotFoundError)): + read_vrt(vrt) + + +def test_missing_sources_raise_dask(tmp_path): + """``missing_sources='raise'`` (default) on the dask path raises + at graph-build time per #2265, not at ``.compute()``. + + Pin both the build-time raise and the value path so a regression + that defers the check to compute surfaces here. + """ + vrt = _metadata_parity_write_missing_source_vrt(tmp_path, name='tmp_2321_miss_dask.vrt') + with pytest.raises((OSError, ValueError, FileNotFoundError)): + lazy = open_geotiff(vrt, chunks=2) + lazy.compute() + + +def test_missing_sources_warn_records_holes(tmp_path): + """``missing_sources='warn'`` is the documented escape hatch. + + The lenient path must emit ``GeoTIFFFallbackWarning`` and populate + ``attrs['vrt_holes']`` so callers branching on the attr can detect + a partial mosaic. This is the contract documented in #1734 / #1843; + the test pins it via the public ``read_vrt`` entry point so a + regression in the warn-policy attr emission surfaces. + + The plan calls for parity tests against ``missing_sources='skip'``; + the public API exposes ``'warn'`` as the lenient option (skip is + used internally inside ``_vrt.read_vrt``). Use the documented public + value here so the test pins the user-facing contract. + """ + vrt = _metadata_parity_write_missing_source_vrt(tmp_path, name='tmp_2321_miss_warn.vrt') + with pytest.warns(GeoTIFFFallbackWarning, match='could not be read'): + result = read_vrt(vrt, missing_sources='warn') + assert 'vrt_holes' in result.attrs, "missing_sources='warn' did not stamp attrs['vrt_holes']" + holes = result.attrs['vrt_holes'] + assert len(holes) == 1 + assert isinstance(holes[0], dict), f'vrt_holes entry type drifted: {type(holes[0]).__name__}; #1734 documents a dict shape' + hole_source = holes[0]['source'] + assert 'tmp_2321_missing_src.tif' in hole_source, f'hole source path drifted: {hole_source!r}' diff --git a/xrspatial/geotiff/tests/vrt/test_window.py b/xrspatial/geotiff/tests/vrt/test_window.py new file mode 100644 index 000000000..d5975d86e --- /dev/null +++ b/xrspatial/geotiff/tests/vrt/test_window.py @@ -0,0 +1,1150 @@ +"""Consolidated VRT window / scaling / tiling / chunking tests. + +Folds nine issue-numbered VRT test files into one place. Each section +preserves its originating file's helpers, fixtures and assertions; +helpers are prefixed (e.g. ``_window_validation_*``) so the folds do +not collide. Test names dropped their trailing issue numbers where +the originating file already namespaced them. + +Sections: +* Window kwarg validation (#1697) +* Resample + window inverse parity (#1704) +* DstRect resample-time cap (#1737) +* Scaled SrcRect / DstRect nearest resampling (#1694) +* Per-source tile-size sanity check (#1823) +* Per-source max-pixel cap (#1796) +* Lazy chunks construction (#1814) +* Shared parsed VRTDataset in the chunked graph (#1923) +* Tiled VRT writer uses the threaded scheduler (#1714) + +See ``CLUSTER_AUDIT_PR6.md`` for the file:test -> section:test mapping. +""" +from __future__ import annotations + +import dask +import dask.array as da +import glob +import numpy as np +import os +import pytest +import tempfile +import uuid +import xarray as xr +from pathlib import Path +from unittest import mock +from unittest.mock import patch +from xrspatial.geotiff import read_vrt, to_geotiff +from xrspatial.geotiff._reader import PixelSafetyLimitError, read_to_array +from xrspatial.geotiff._vrt import _resample_nearest, read_vrt as _scaled_rects_read_vrt_internal +from xrspatial.geotiff._vrt import read_vrt as _dstrect_cap_read_vrt_internal +from xrspatial.geotiff._vrt import read_vrt as _resample_window_inverse_read_vrt_internal +from xrspatial.geotiff._vrt import read_vrt as _source_tile_check_read_vrt_internal +from xrspatial.geotiff._vrt import read_vrt as _window_validation_read_vrt_internal +from xrspatial.geotiff._vrt import write_vrt as _write_vrt_internal +from xrspatial.geotiff._writer import write + + +# --------------------------------------------------------------------------- +# window kwarg validation (#1697) +# Originally: test_vrt_window_validation_1697.py +# --------------------------------------------------------------------------- + + +def _window_validation_unique_dir(tmp_path, label: str) -> str: + """Return a sub-path under ``tmp_path`` with a uuid suffix so + parallel test workers cannot collide on the same name.""" + d = tmp_path / f'vrt_1697_{label}_{uuid.uuid4().hex[:8]}' + d.mkdir() + return str(d) + + +def _window_validation_write_tif(path: str, size: int=4) -> None: + """Write a ``size``x``size`` float32 GeoTIFF the VRT can wrap.""" + arr = np.arange(size * size, dtype=np.float32).reshape(size, size) + y = np.linspace(float(size) - 0.5, 0.5, size) + x = np.linspace(0.5, float(size) - 0.5, size) + da = xr.DataArray(arr, dims=['y', 'x'], coords={'y': y, 'x': x}, attrs={'crs': 4326}) + to_geotiff(da, path, compression='none') + + +def _window_validation_write_vrt(vrt_path: str, source_filename: str, size: int=4) -> None: + """Write a single-band VRT of dimension ``size``x``size`` pointing + at ``source_filename`` (relative to the VRT directory).""" + xml = f'\n 0, 1, 0, 0, 0, -1\n \n \n {source_filename}\n 1\n \n \n \n \n\n' + with open(vrt_path, 'w') as f: + f.write(xml) + + +@pytest.fixture +def window_validation_vrt_4x4(tmp_path): + """Return a path to a 4x4 single-band VRT wrapping a 4x4 TIFF.""" + d = _window_validation_unique_dir(tmp_path, 'fixture') + tif = os.path.join(d, 'data.tif') + _window_validation_write_tif(tif, size=4) + vrt = os.path.join(d, 'mosaic.vrt') + _window_validation_write_vrt(vrt, 'data.tif', size=4) + return vrt + + +def test_negative_r0_raises_value_error(window_validation_vrt_4x4): + """``r0 < 0`` raises ValueError instead of being clamped to 0.""" + with pytest.raises(ValueError, match='outside the VRT extent'): + _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(-1, 0, 2, 2)) + + +def test_negative_c0_raises_value_error(window_validation_vrt_4x4): + """``c0 < 0`` raises ValueError instead of being clamped to 0.""" + with pytest.raises(ValueError, match='outside the VRT extent'): + _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(0, -1, 2, 2)) + + +def test_r1_past_bottom_edge_raises_value_error(window_validation_vrt_4x4): + """``r1 > vrt.height`` raises instead of being clamped to vrt.height.""" + with pytest.raises(ValueError, match='outside the VRT extent'): + _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(0, 0, 5, 4)) + + +def test_c1_past_right_edge_raises_value_error(window_validation_vrt_4x4): + """``c1 > vrt.width`` raises instead of being clamped to vrt.width.""" + with pytest.raises(ValueError, match='outside the VRT extent'): + _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(0, 0, 4, 5)) + + +def test_window_past_both_edges_raises_value_error(window_validation_vrt_4x4): + """Windows past both right and bottom edges raise the same error.""" + with pytest.raises(ValueError, match='outside the VRT extent'): + _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(0, 0, 10, 10)) + + +def test_zero_size_row_window_raises_value_error(window_validation_vrt_4x4): + """``r0 == r1`` produces a zero-height window and must raise.""" + with pytest.raises(ValueError, match='non-positive size'): + _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(2, 0, 2, 4)) + + +def test_zero_size_col_window_raises_value_error(window_validation_vrt_4x4): + """``c0 == c1`` produces a zero-width window and must raise.""" + with pytest.raises(ValueError, match='non-positive size'): + _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(0, 2, 4, 2)) + + +def test_fully_zero_size_window_raises_value_error(window_validation_vrt_4x4): + """``r0 == r1 and c0 == c1`` raises (current code returned a (0, 0) array).""" + with pytest.raises(ValueError, match='non-positive size'): + _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(2, 2, 2, 2)) + + +def test_inverted_row_window_raises_value_error(window_validation_vrt_4x4): + """``r0 > r1`` is degenerate and must raise.""" + with pytest.raises(ValueError, match='non-positive size'): + _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(3, 0, 1, 4)) + + +def test_inverted_col_window_raises_value_error(window_validation_vrt_4x4): + """``c0 > c1`` is degenerate and must raise.""" + with pytest.raises(ValueError, match='non-positive size'): + _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(0, 3, 4, 1)) + + +def test_full_extent_window_still_works(window_validation_vrt_4x4): + """A window covering the full VRT extent still reads the full array.""" + arr, _ = _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(0, 0, 4, 4)) + assert arr.shape == (4, 4) + + +def test_interior_window_still_works(window_validation_vrt_4x4): + """An interior window returns the requested subset shape.""" + arr, _ = _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(1, 1, 3, 3)) + assert arr.shape == (2, 2) + + +def test_edge_aligned_window_still_works(window_validation_vrt_4x4): + """A window that touches but does not exceed the edges is accepted.""" + arr, _ = _window_validation_read_vrt_internal(window_validation_vrt_4x4, window=(2, 2, 4, 4)) + assert arr.shape == (2, 2) + + +def test_none_window_still_returns_full_array(window_validation_vrt_4x4): + """``window=None`` still returns the full VRT extent.""" + arr, _ = _window_validation_read_vrt_internal(window_validation_vrt_4x4) + assert arr.shape == (4, 4) + + +def test_vrt_and_local_paths_share_window_validation(tmp_path): + """Same bad window rejected on both VRT and local-TIFF paths with the + same error class and message shape (one word swap is fine).""" + d = _window_validation_unique_dir(tmp_path, 'parity') + tif = os.path.join(d, 'data.tif') + _window_validation_write_tif(tif, size=4) + vrt = os.path.join(d, 'mosaic.vrt') + _window_validation_write_vrt(vrt, 'data.tif', size=4) + bad_window = (-1, 0, 2, 2) + with pytest.raises(ValueError) as vrt_exc: + _window_validation_read_vrt_internal(vrt, window=bad_window) + with pytest.raises(ValueError) as local_exc: + read_to_array(tif, window=bad_window) + vrt_msg = str(vrt_exc.value) + local_msg = str(local_exc.value) + assert 'window=' in vrt_msg + assert 'window=' in local_msg + assert '4x4' in vrt_msg + assert '4x4' in local_msg + assert 'extent' in vrt_msg + assert 'extent' in local_msg + assert 'non-positive size' in vrt_msg + assert 'non-positive size' in local_msg + assert 'VRT extent' in vrt_msg + assert 'source extent' in local_msg + + +# --------------------------------------------------------------------------- +# resample+window inverse parity (#1704) +# Originally: test_vrt_resample_window_inverse_1704.py +# --------------------------------------------------------------------------- + + +def _resample_window_inverse_write_vrt_xml(tmp_path, xml: str, name: str) -> str: + p = str(tmp_path / name) + with open(p, 'w') as f: + f.write(xml) + return p + + +def _resample_window_inverse_write_src(tmp_path, arr: np.ndarray, name: str='tmp_1704_src.tif') -> str: + src_path = str(tmp_path / name) + write(arr, src_path, compression='none', tiled=False) + return src_path + + +def _resample_window_inverse_single_source_vrt(src_path: str, *, raster_x: int, raster_y: int, src_x: int, src_y: int, src_xsize: int, src_ysize: int, dst_x: int, dst_y: int, dst_xsize: int, dst_ysize: int, dtype: str='UInt16', nodata: str | None=None) -> str: + nodata_xml = f' {nodata}\n' if nodata is not None else '' + return f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n \n {src_path}\n 1\n \n \n{nodata_xml} \n \n\n' + + +def test_upsample_window_matches_full_then_slice(tmp_path): + """4x upsample, then read a small window from the middle. The + windowed read must equal the full read sliced at the same offsets.""" + src = np.arange(10 * 10, dtype=np.uint16).reshape(10, 10) + 1 + src_path = _resample_window_inverse_write_src(tmp_path, src) + vrt_xml = _resample_window_inverse_single_source_vrt(src_path, raster_x=40, raster_y=40, src_x=0, src_y=0, src_xsize=10, src_ysize=10, dst_x=0, dst_y=0, dst_xsize=40, dst_ysize=40) + vrt_path = _resample_window_inverse_write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_up.vrt') + full, _ = _resample_window_inverse_read_vrt_internal(vrt_path) + assert full.shape == (40, 40) + r0, c0, r1, c1 = (7, 11, 33, 29) + windowed, _ = _resample_window_inverse_read_vrt_internal(vrt_path, window=(r0, c0, r1, c1)) + np.testing.assert_array_equal(windowed, full[r0:r1, c0:c1]) + + +def test_downsample_window_matches_full_then_slice(tmp_path): + """4x downsample, windowed read parity with full-then-slice.""" + src = np.arange(40 * 40, dtype=np.uint16).reshape(40, 40) + 1 + src_path = _resample_window_inverse_write_src(tmp_path, src) + vrt_xml = _resample_window_inverse_single_source_vrt(src_path, raster_x=10, raster_y=10, src_x=0, src_y=0, src_xsize=40, src_ysize=40, dst_x=0, dst_y=0, dst_xsize=10, dst_ysize=10) + vrt_path = _resample_window_inverse_write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_down.vrt') + full, _ = _resample_window_inverse_read_vrt_internal(vrt_path) + assert full.shape == (10, 10) + r0, c0, r1, c1 = (2, 3, 9, 8) + windowed, _ = _resample_window_inverse_read_vrt_internal(vrt_path, window=(r0, c0, r1, c1)) + np.testing.assert_array_equal(windowed, full[r0:r1, c0:c1]) + + +@pytest.mark.parametrize('r0,c0,r1,c1', [(0, 0, 11, 11), (1, 1, 10, 10), (3, 2, 7, 9), (0, 0, 1, 1), (10, 10, 11, 11), (5, 0, 6, 11), (0, 5, 11, 6)]) +def test_non_integer_ratio_7_to_11_window_parity(tmp_path, r0, c0, r1, c1): + """SrcRect 7x7, DstRect 11x11 (irrational ratio 7/11). The + nearest-neighbour mapping has uneven step sizes so the inverse + mapping has to handle each output index individually; this is the + case that breaks Option-2 "resample sub-shape into sub-shape" + implementations. + """ + src = np.arange(7 * 7, dtype=np.uint16).reshape(7, 7) + 100 + src_path = _resample_window_inverse_write_src(tmp_path, src) + vrt_xml = _resample_window_inverse_single_source_vrt(src_path, raster_x=11, raster_y=11, src_x=0, src_y=0, src_xsize=7, src_ysize=7, dst_x=0, dst_y=0, dst_xsize=11, dst_ysize=11) + vrt_path = _resample_window_inverse_write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_7_11.vrt') + full, _ = _resample_window_inverse_read_vrt_internal(vrt_path) + windowed, _ = _resample_window_inverse_read_vrt_internal(vrt_path, window=(r0, c0, r1, c1)) + np.testing.assert_array_equal(windowed, full[r0:r1, c0:c1]) + + +def test_window_starting_at_origin(tmp_path): + src = np.arange(8 * 8, dtype=np.uint16).reshape(8, 8) + 1 + src_path = _resample_window_inverse_write_src(tmp_path, src) + vrt_xml = _resample_window_inverse_single_source_vrt(src_path, raster_x=20, raster_y=20, src_x=0, src_y=0, src_xsize=8, src_ysize=8, dst_x=0, dst_y=0, dst_xsize=20, dst_ysize=20) + vrt_path = _resample_window_inverse_write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_origin.vrt') + full, _ = _resample_window_inverse_read_vrt_internal(vrt_path) + windowed, _ = _resample_window_inverse_read_vrt_internal(vrt_path, window=(0, 0, 5, 5)) + np.testing.assert_array_equal(windowed, full[0:5, 0:5]) + + +def test_window_ending_at_last_pixel(tmp_path): + src = np.arange(8 * 8, dtype=np.uint16).reshape(8, 8) + 1 + src_path = _resample_window_inverse_write_src(tmp_path, src) + vrt_xml = _resample_window_inverse_single_source_vrt(src_path, raster_x=20, raster_y=20, src_x=0, src_y=0, src_xsize=8, src_ysize=8, dst_x=0, dst_y=0, dst_xsize=20, dst_ysize=20) + vrt_path = _resample_window_inverse_write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_last.vrt') + full, _ = _resample_window_inverse_read_vrt_internal(vrt_path) + windowed, _ = _resample_window_inverse_read_vrt_internal(vrt_path, window=(15, 15, 20, 20)) + np.testing.assert_array_equal(windowed, full[15:20, 15:20]) + + +def test_window_crossing_multiple_sources(tmp_path): + """Two SimpleSources tiled side by side, each with non-1:1 SrcRect / + DstRect. A window that spans both sources must equal the full read + sliced over the same range. Both sources go through the new windowed + resample path. + """ + left = np.arange(5 * 5, dtype=np.uint16).reshape(5, 5) + 1 + right = np.arange(5 * 5, dtype=np.uint16).reshape(5, 5) + 1000 + left_path = _resample_window_inverse_write_src(tmp_path, left, 'tmp_1704_left.tif') + right_path = _resample_window_inverse_write_src(tmp_path, right, 'tmp_1704_right.tif') + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n \n {left_path}\n 1\n \n \n \n \n {right_path}\n 1\n \n \n \n \n\n' + vrt_path = _resample_window_inverse_write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_multi.vrt') + full, _ = _resample_window_inverse_read_vrt_internal(vrt_path) + assert full.shape == (10, 20) + windowed, _ = _resample_window_inverse_read_vrt_internal(vrt_path, window=(2, 7, 8, 14)) + np.testing.assert_array_equal(windowed, full[2:8, 7:14]) + + +def test_nodata_round_trip_through_window(tmp_path): + """SimpleSource with ````; the sentinel inside the windowed + region must surface as NaN in a float-typed VRT. Both the full read + and the windowed read must agree on which pixels are NaN. + """ + src = (np.arange(8 * 8, dtype=np.uint16).reshape(8, 8) + 1).astype(np.uint16) + src[3, 4] = 65535 + src[5, 2] = 65535 + src_path = _resample_window_inverse_write_src(tmp_path, src) + vrt_xml = _resample_window_inverse_single_source_vrt(src_path, raster_x=16, raster_y=16, src_x=0, src_y=0, src_xsize=8, src_ysize=8, dst_x=0, dst_y=0, dst_xsize=16, dst_ysize=16, dtype='Float32', nodata='65535') + vrt_path = _resample_window_inverse_write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_nodata.vrt') + full, _ = _resample_window_inverse_read_vrt_internal(vrt_path) + windowed, _ = _resample_window_inverse_read_vrt_internal(vrt_path, window=(4, 4, 12, 12)) + np.testing.assert_array_equal(windowed, full[4:12, 4:12]) + assert np.isnan(windowed).any() + + +def test_only_minimal_source_rect_is_read(tmp_path): + """Patch ``read_to_array`` to record the windows requested. Under + the new path the source window must be much smaller than the full + SrcRect when the caller asks for a small sub-window. + """ + src = np.arange(40 * 40, dtype=np.uint16).reshape(40, 40) + 1 + src_path = _resample_window_inverse_write_src(tmp_path, src) + vrt_xml = _resample_window_inverse_single_source_vrt(src_path, raster_x=160, raster_y=160, src_x=0, src_y=0, src_xsize=40, src_ysize=40, dst_x=0, dst_y=0, dst_xsize=160, dst_ysize=160) + vrt_path = _resample_window_inverse_write_vrt_xml(tmp_path, vrt_xml, 'tmp_1704_bound.vrt') + seen_windows: list[tuple[int, int, int, int]] = [] + from xrspatial.geotiff import _reader as _reader_mod + real_read = _reader_mod.read_to_array + + def spy(filename, *, window, **kw): + seen_windows.append(tuple(window)) + return real_read(filename, window=window, **kw) + with mock.patch.object(_reader_mod, 'read_to_array', spy): + arr, _ = _resample_window_inverse_read_vrt_internal(vrt_path, window=(80, 80, 88, 88)) + assert arr.shape == (8, 8) + assert len(seen_windows) == 1 + r0, c0, r1, c1 = seen_windows[0] + read_h = r1 - r0 + read_w = c1 - c0 + assert read_h < 10, f'expected a small source row range, got {read_h} rows; the full SrcRect is 40 rows so the fix is not reducing the read.' + assert read_w < 10 + + +# --------------------------------------------------------------------------- +# DstRect resample cap (#1737) +# Originally: test_vrt_dstrect_resample_cap_1737.py +# --------------------------------------------------------------------------- + + +def _dstrect_cap_write_source(td: str) -> str: + """Write a 10x10 uint8 source GeoTIFF and return its path. + + Stripped (non-tiled) so the source read does not allocate a 256x256 + tile that trips ``_check_dimensions`` under the small ``max_pixels`` + values these tests pass. + """ + src_path = os.path.join(td, 'src.tif') + to_geotiff(np.zeros((10, 10), dtype=np.uint8), src_path, compression='none', tiled=False) + return src_path + + +def _dstrect_cap_write_vrt(td: str, *, dst_x_size: int, dst_y_size: int, raster_x: int=100, raster_y: int=100) -> str: + """Write a VRT with a single SimpleSource using the given DstRect size.""" + vrt_path = os.path.join(td, 'mosaic.vrt') + vrt_xml = f'\n \n \n src.tif\n 1\n \n \n \n \n\n' + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +def test_huge_dstrect_no_longer_allocates_full_intermediate(): + """After #1704 the windowed read clips a 50000x50000 DstRect down to + the 100x100 VRT extent, so the resample intermediate is 100x100 and + no longer hits the pixel-budget cap. The earlier behaviour rejected + the read up front; the new behaviour just returns the assembled + 100x100 mosaic. + """ + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _dstrect_cap_write_source(td) + vrt_path = _dstrect_cap_write_vrt(td, dst_x_size=50000, dst_y_size=50000) + arr, _ = _dstrect_cap_read_vrt_internal(vrt_path) + assert arr.shape == (100, 100) + + +def test_huge_dstrect_y_axis_clipped_to_extent(): + """Asymmetric blow-up: ``ySize`` declared as 10 billion but the VRT + extent caps the clipped sub-window at 100 rows. Read succeeds with + the bounded intermediate.""" + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _dstrect_cap_write_source(td) + vrt_path = _dstrect_cap_write_vrt(td, dst_x_size=10, dst_y_size=10000000000) + arr, _ = _dstrect_cap_read_vrt_internal(vrt_path) + assert arr.shape == (100, 100) + + +def test_legitimate_upsample_still_works(): + """A legitimate upsample stays under the cap and must succeed.""" + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _dstrect_cap_write_source(td) + vrt_path = _dstrect_cap_write_vrt(td, dst_x_size=100, dst_y_size=100) + arr, _ = _dstrect_cap_read_vrt_internal(vrt_path) + assert arr.shape == (100, 100) + + +def test_per_source_cap_bites_when_sub_window_exceeds_budget(): + """The per-source pixel-budget guard applies to the clipped + sub-window, not the raw DstRect. Pick a VRT and ``max_pixels`` where + the sub-window itself exceeds the cap so the per-source check fires + even after the windowed-read change. + + The output buffer dimension check (``_check_dimensions``) is also + bounded by ``max_pixels``, so to isolate the per-source branch we + request a window whose sub-window product crosses the cap. Both + guards use the same threshold; the per-source one provides defence + in depth. + """ + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _dstrect_cap_write_source(td) + vrt_path = _dstrect_cap_write_vrt(td, dst_x_size=2000, dst_y_size=2000, raster_x=2000, raster_y=2000) + with pytest.raises(ValueError, match='resample intermediate|safety limit'): + _dstrect_cap_read_vrt_internal(vrt_path, max_pixels=1000000) + arr, _ = _dstrect_cap_read_vrt_internal(vrt_path, max_pixels=4000000) + assert arr.shape == (2000, 2000) + + +def test_per_source_cap_inclusive_boundary(): + """The per-source cap is inclusive: exactly ``max_pixels`` succeeds, + one below rejects. Mirrors the boundary the original #1737 test + pinned down, on the new sub-window semantics.""" + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _dstrect_cap_write_source(td) + vrt_path = _dstrect_cap_write_vrt(td, dst_x_size=100, dst_y_size=100, raster_x=100, raster_y=100) + with pytest.raises(ValueError, match='resample intermediate|safety limit'): + _dstrect_cap_read_vrt_internal(vrt_path, max_pixels=9999) + arr, _ = _dstrect_cap_read_vrt_internal(vrt_path, max_pixels=10000) + assert arr.shape == (100, 100) + + +def test_negative_dstrect_rejected(): + """Negative ``xSize`` / ``ySize`` must surface as ``ValueError`` + rather than be silently skipped by the overlap check. The error + message must call out the malformed negative size, not the pixel + budget.""" + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _dstrect_cap_write_source(td) + vrt_path = _dstrect_cap_write_vrt(td, dst_x_size=-5, dst_y_size=100) + with pytest.raises(ValueError, match='negative size'): + _dstrect_cap_read_vrt_internal(vrt_path) + + +def test_negative_dstrect_y_size_rejected(): + """Negative ``ySize`` is also rejected with the same tailored error.""" + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _dstrect_cap_write_source(td) + vrt_path = _dstrect_cap_write_vrt(td, dst_x_size=100, dst_y_size=-5) + with pytest.raises(ValueError, match='negative size'): + _dstrect_cap_read_vrt_internal(vrt_path) + + +# --------------------------------------------------------------------------- +# scaled SrcRect/DstRect resampling (#1694) +# Originally: test_vrt_scaled_rects_1694.py +# --------------------------------------------------------------------------- + + +def _scaled_rects_write_vrt(tmp_path, xml: str, name: str='test.vrt') -> str: + p = str(tmp_path / name) + with open(p, 'w') as f: + f.write(xml) + return p + + +def test_downsample_4x4_to_2x2_does_not_raise_and_uses_nearest(tmp_path): + """SrcRect 4x4 -> DstRect 2x2: result is (2,2), nearest-neighbour. + + Before the fix the source (4,4) array was assigned directly into the + (2,2) destination slice, raising the broadcast error documented in + issue #1694. + """ + src = np.arange(16, dtype=np.uint16).reshape(4, 4) + src_path = str(tmp_path / 'src.tif') + write(src, src_path, compression='none', tiled=False) + vrt_xml = f'\n 0.0, 2.0, 0.0, 0.0, 0.0, -2.0\n \n \n {src_path}\n 1\n \n \n \n \n' + vrt_path = _scaled_rects_write_vrt(tmp_path, vrt_xml, 'down.vrt') + result, _ = _scaled_rects_read_vrt_internal(vrt_path) + assert result.shape == (2, 2), f'expected (2,2), got {result.shape}; resample step missing.' + expected = np.array([[src[1, 1], src[1, 3]], [src[3, 1], src[3, 3]]], dtype=np.uint16) + np.testing.assert_array_equal(result, expected) + + +def test_upsample_2x2_to_4x4_repeats_each_source_pixel(tmp_path): + """SrcRect 2x2 -> DstRect 4x4: each source pixel repeated 2x2. + + Before the fix only the top-left 2x2 of the destination was written + and the rest stayed at the fill value (0 for integer, NaN for + float). + """ + src = np.array([[1, 2], [3, 4]], dtype=np.uint16) + src_path = str(tmp_path / 'src.tif') + write(src, src_path, compression='none', tiled=False) + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n \n {src_path}\n 1\n \n \n \n \n' + vrt_path = _scaled_rects_write_vrt(tmp_path, vrt_xml, 'up.vrt') + result, _ = _scaled_rects_read_vrt_internal(vrt_path) + assert result.shape == (4, 4) + expected = np.array([[1, 1, 2, 2], [1, 1, 2, 2], [3, 3, 4, 4], [3, 3, 4, 4]], dtype=np.uint16) + np.testing.assert_array_equal(result, expected) + assert not (result == 0).any(), 'upsample left zero-filled cells; resample not propagated.' + + +def test_non_integer_scale_3x3_to_2x2_no_holes(tmp_path): + """Non-integer source / destination ratio: covers index-mapping path. + + With src=(3,3) -> dst=(2,2), neither integer-ratio fast path applies. + Confirms the general nearest-neighbour gather produces the correct + shape, no holes, no out-of-bounds writes. + """ + src = np.arange(9, dtype=np.uint16).reshape(3, 3) + src_path = str(tmp_path / 'src.tif') + write(src, src_path, compression='none', tiled=False) + vrt_xml = f'\n 0.0, 1.5, 0.0, 0.0, 0.0, -1.5\n \n \n {src_path}\n 1\n \n \n \n \n' + vrt_path = _scaled_rects_write_vrt(tmp_path, vrt_xml, 'nonint.vrt') + result, _ = _scaled_rects_read_vrt_internal(vrt_path) + assert result.shape == (2, 2) + expected = np.array([[src[0, 0], src[0, 2]], [src[2, 0], src[2, 2]]], dtype=np.uint16) + np.testing.assert_array_equal(result, expected) + + +def test_per_band_scale_mix(tmp_path): + """Mixed: band 1 downsampled, band 2 at native resolution. + + Both bands must land in the right places without a broadcast error + and without bleeding band 1's resampled values into band 2. + """ + band1_src = (np.arange(16, dtype=np.uint16) * 10).reshape(4, 4) + band2_src = np.array([[100, 200], [300, 400]], dtype=np.uint16) + p1 = str(tmp_path / 'b1.tif') + p2 = str(tmp_path / 'b2.tif') + write(band1_src, p1, compression='none', tiled=False) + write(band2_src, p2, compression='none', tiled=False) + vrt_xml = f'\n 0.0, 1.0, 0.0, 0.0, 0.0, -1.0\n \n \n {p1}\n 1\n \n \n \n \n \n \n {p2}\n 1\n \n \n \n \n' + vrt_path = _scaled_rects_write_vrt(tmp_path, vrt_xml, 'mix.vrt') + result, _ = _scaled_rects_read_vrt_internal(vrt_path) + assert result.shape == (2, 2, 2) + expected_b1 = np.array([[band1_src[1, 1], band1_src[1, 3]], [band1_src[3, 1], band1_src[3, 3]]], dtype=np.uint16) + np.testing.assert_array_equal(result[..., 0], expected_b1) + np.testing.assert_array_equal(result[..., 1], band2_src) + + +def test_window_on_downsampled_source_returns_correct_subwindow(tmp_path): + """``window=(0,0,1,1)`` on a 4x4 -> 2x2 source returns the (0,0) cell. + + The destination cell maps to the source pixel that the resample + routine would sample for that location. Confirms the clip-after- + resample ordering: clipping in source coordinates first (as the old + code effectively did) would feed the wrong source slice into the + resampler. + """ + src = np.arange(16, dtype=np.uint16).reshape(4, 4) + src_path = str(tmp_path / 'src.tif') + write(src, src_path, compression='none', tiled=False) + vrt_xml = f'\n 0.0, 2.0, 0.0, 0.0, 0.0, -2.0\n \n \n {src_path}\n 1\n \n \n \n \n' + vrt_path = _scaled_rects_write_vrt(tmp_path, vrt_xml, 'win.vrt') + result, _ = _scaled_rects_read_vrt_internal(vrt_path, window=(0, 0, 1, 1)) + assert result.shape == (1, 1) + assert result[0, 0] == src[1, 1] + + +def test_nodata_preserved_across_downsample(tmp_path): + """Source sentinel pixels survive the resample as NaN in the result. + + Source is uint16 with sentinel=65535. Pixels at the sampled-from + positions whose values are 65535 must appear as NaN in the float64 + VRT output. + """ + sentinel = np.uint16(65535) + src = np.array([[10, 20, 30, 40], [50, sentinel, 70, sentinel], [90, 100, 110, 120], [130, sentinel, 150, sentinel]], dtype=np.uint16) + src_path = str(tmp_path / 'src_nd.tif') + write(src, src_path, nodata=int(sentinel), compression='none', tiled=False) + vrt_xml = f'\n 0.0, 2.0, 0.0, 0.0, 0.0, -2.0\n \n -9999\n \n {src_path}\n 1\n \n \n 65535\n \n \n' + vrt_path = _scaled_rects_write_vrt(tmp_path, vrt_xml, 'nd.vrt') + result, _ = _scaled_rects_read_vrt_internal(vrt_path) + assert result.shape == (2, 2) + assert result.dtype == np.float64 + assert np.isnan(result).all(), f'sentinel did not survive resample as NaN; got {result!r}' + + +def test_nodata_with_mixed_sentinel_and_valid_pixels(tmp_path): + """Mixed sentinel / valid source -> mixed NaN / valid destination. + + Confirms the mask resamples *with* the data, not against the + pre-resampled source. + """ + sentinel = np.uint16(65535) + src = np.zeros((4, 4), dtype=np.uint16) + src[1, 1] = 11 + src[1, 3] = sentinel + src[3, 1] = 31 + src[3, 3] = sentinel + src_path = str(tmp_path / 'src_mixed.tif') + write(src, src_path, nodata=int(sentinel), compression='none', tiled=False) + vrt_xml = f'\n 0.0, 2.0, 0.0, 0.0, 0.0, -2.0\n \n -9999\n \n {src_path}\n 1\n \n \n 65535\n \n \n' + vrt_path = _scaled_rects_write_vrt(tmp_path, vrt_xml, 'nd_mixed.vrt') + result, _ = _scaled_rects_read_vrt_internal(vrt_path) + assert result.shape == (2, 2) + assert result[0, 0] == 11.0 + assert np.isnan(result[0, 1]) + assert result[1, 0] == 31.0 + assert np.isnan(result[1, 1]) + + +@pytest.mark.parametrize('shape', [(0, 5), (5, 0), (0, 0)]) +def test_resample_nearest_rejects_empty_source(shape): + """``_resample_nearest`` raises ValueError on an empty source array. + + A SimpleSource with ``SrcRect xSize=0`` or ``ySize=0`` -- or a + windowed read that clamps to an empty slice -- would otherwise feed + a zero-dim array to the integer-ratio fast paths, which compute + ``out_h % src_h`` and divide by ``src_h``/``src_w`` and so would + raise an opaque ``ZeroDivisionError``. Surface the bad input with + a clear ``ValueError`` instead. + """ + src_arr = np.zeros(shape, dtype=np.float64) + with pytest.raises(ValueError, match='empty source array'): + _resample_nearest(src_arr, 2, 2) + + +# --------------------------------------------------------------------------- +# per-source tile-size sanity check (#1823) +# Originally: test_vrt_source_tile_check_1823.py +# --------------------------------------------------------------------------- + + +def _source_tile_check_write_normal_tile_source(td: str) -> str: + """10x10 uint8 source -- ``to_geotiff`` pads to a 256x256 tile.""" + src = os.path.join(td, 'src.tif') + to_geotiff(np.zeros((10, 10), dtype=np.uint8), src, compression='none') + return src + + +def _source_tile_check_write_vrt(td: str, *, dst_x_size: int, dst_y_size: int, raster_x: int=100, raster_y: int=100, src_x_size: int=10, src_y_size: int=10) -> str: + vrt = os.path.join(td, 'mosaic.vrt') + xml = f'\n \n \n src.tif\n 1\n \n \n \n \n\n' + with open(vrt, 'w') as f: + f.write(xml) + return vrt + + +class TestPerTileCheckDoesNotUseCallerBudget: + """Per-tile dim sanity must not reject normal 256x256 source tiles + when the caller's ``max_pixels`` is a small output-budget value.""" + + def test_normal_tile_source_with_small_max_pixels(self): + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _source_tile_check_write_normal_tile_source(td) + vrt = _source_tile_check_write_vrt(td, dst_x_size=100, dst_y_size=100) + arr, _ = _source_tile_check_read_vrt_internal(vrt, max_pixels=10000) + assert arr.shape == (100, 100) + + def test_normal_tile_source_with_tiny_max_pixels(self): + """An output budget below a single tile must still succeed when + the requested output window itself fits.""" + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _source_tile_check_write_normal_tile_source(td) + vrt = _source_tile_check_write_vrt(td, dst_x_size=5, dst_y_size=5, raster_x=5, raster_y=5) + arr, _ = _source_tile_check_read_vrt_internal(vrt, max_pixels=100) + assert arr.shape == (5, 5) + + +class TestOutputWindowCheckStillEnforced: + """The output-window check still rejects a read whose VRT extent + exceeds ``max_pixels``. After #1704 the source read is bounded by + the clipped destination sub-window, so the per-source guard now + rarely fires; the top-level ``_check_dimensions`` call against the + output extent catches over-budget requests up front. The #1796 + protection (tiny VRT cannot force huge source decode) is preserved + structurally. + """ + + def test_output_extent_exceeds_max_pixels_still_rejected(self): + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + src = os.path.join(td, 'src.tif') + to_geotiff(np.arange(64, dtype=np.uint8).reshape(8, 8), src, compression='none', tiled=False) + vrt = _source_tile_check_write_vrt(td, dst_x_size=8, dst_y_size=8, raster_x=8, raster_y=8, src_x_size=4, src_y_size=4) + with pytest.raises(ValueError, match='exceed|safety limit'): + _source_tile_check_read_vrt_internal(vrt, max_pixels=4) + + +class TestPerTileCheckStillRejectsCraftedHeader: + """A pathological ``TileWidth``/``TileLength`` must still fail at + the per-tile sanity check, which uses ``MAX_PIXELS_DEFAULT``.""" + + def test_per_tile_check_caps_at_default(self, monkeypatch): + """Lower ``MAX_PIXELS_DEFAULT`` to verify the per-tile call site + is wired to it (rather than to the caller's budget).""" + from xrspatial.geotiff import _reader as reader_mod + monkeypatch.setattr(reader_mod, 'MAX_PIXELS_DEFAULT', 100) + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _source_tile_check_write_normal_tile_source(td) + vrt = _source_tile_check_write_vrt(td, dst_x_size=100, dst_y_size=100) + with pytest.raises(PixelSafetyLimitError, match='65,536'): + _source_tile_check_read_vrt_internal(vrt, max_pixels=1000000000) + + +# --------------------------------------------------------------------------- +# per-source max-pixel cap (#1796) +# Originally: test_vrt_source_max_pixels_1796.py +# --------------------------------------------------------------------------- + + +def test_tiny_vrt_with_huge_srcrect_now_reads_minimally(tmp_path): + """A 1x1 VRT pointing at a 4x4 SrcRect now reads only the one source + pixel that maps to the single output pixel, so ``max_pixels=1`` is + no longer exceeded. Locks in the structural improvement from #1704.""" + src = tmp_path / 'tmp_1796_source.tif' + data = np.arange(16, dtype=np.uint8).reshape(4, 4) + to_geotiff(data, str(src), compression='none') + vrt = tmp_path / 'tmp_1796_source_cap.vrt' + vrt.write_text(f'\n \n \n {os.path.basename(src)}\n 1\n \n \n \n \n\n') + arr = read_vrt(str(vrt), max_pixels=1) + assert arr.shape == (1, 1) + + +def test_source_cap_still_fires_when_sub_window_exceeds_budget(tmp_path): + """The per-source pixel-budget guard still rejects a sub-window that + exceeds ``max_pixels``. With the sub-window-bounded read, the cap is + measured against the clipped destination region rather than the raw + SrcRect; the protection from #1796 carries over to that new + measurement. + """ + src = tmp_path / 'tmp_1796_big_source.tif' + data = np.arange(64, dtype=np.uint8).reshape(8, 8) + to_geotiff(data, str(src), compression='none', tiled=False) + vrt = tmp_path / 'tmp_1796_big_cap.vrt' + vrt.write_text(f'\n \n \n {os.path.basename(src)}\n 1\n \n \n \n \n\n') + with pytest.raises(ValueError, match='exceed|safety limit'): + read_vrt(str(vrt), max_pixels=4) + + +# --------------------------------------------------------------------------- +# lazy chunks construction (#1814) +# Originally: test_vrt_lazy_chunks_1814.py +# --------------------------------------------------------------------------- + + +def _lazy_chunks_gpu_available() -> bool: + try: + import cupy + except ImportError: + return False + try: + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _lazy_chunks_gpu_available() + + +@pytest.fixture +def lazy_chunks_single_tile_vrt(): + """One 128x128 float32 tile wrapped in a VRT.""" + arr = np.arange(128 * 128, dtype=np.float32).reshape(128, 128) + y = np.linspace(41.0, 40.0, 128) + x = np.linspace(-106.0, -105.0, 128) + raster = xr.DataArray(arr, dims=['y', 'x'], coords={'y': y, 'x': x}, attrs={'crs': 4326}) + td = tempfile.mkdtemp(prefix='tmp_1814_single_') + tile_path = os.path.join(td, 'tile.tif') + to_geotiff(raster, tile_path) + vrt_path = os.path.join(td, 'mosaic.vrt') + _write_vrt_internal(vrt_path, [tile_path]) + yield (vrt_path, arr) + + +@pytest.fixture +def lazy_chunks_two_by_two_vrt(): + """4-tile mosaic via the to_geotiff(.vrt, ...) dask path.""" + arr = np.arange(256 * 256, dtype=np.float32).reshape(256, 256) + y = np.linspace(41.0, 40.0, 256) + x = np.linspace(-106.0, -105.0, 256) + raster = xr.DataArray(arr, dims=['y', 'x'], coords={'y': y, 'x': x}, attrs={'crs': 4326}) + td = tempfile.mkdtemp(prefix='tmp_1814_2x2_') + vrt_path = os.path.join(td, 'mosaic.vrt') + to_geotiff(raster, vrt_path, tile_size=128) + yield (vrt_path, arr) + + +@pytest.fixture +def lazy_chunks_multiband_vrt(): + """3-band single-tile VRT.""" + rng = np.random.default_rng(1814) + arr = rng.random((64, 64, 3), dtype=np.float32) + y = np.linspace(41.0, 40.0, 64) + x = np.linspace(-106.0, -105.0, 64) + raster = xr.DataArray(arr, dims=['y', 'x', 'band'], coords={'y': y, 'x': x, 'band': np.arange(3)}, attrs={'crs': 4326}) + td = tempfile.mkdtemp(prefix='tmp_1814_mb_') + tile_path = os.path.join(td, 'tile.tif') + to_geotiff(raster, tile_path) + vrt_path = os.path.join(td, 'mosaic.vrt') + _write_vrt_internal(vrt_path, [tile_path]) + yield (vrt_path, arr) + + +def test_chunks_builds_dask_array_with_multiple_blocks(lazy_chunks_two_by_two_vrt): + """``read_vrt(chunks=(N,N))`` returns a dask-backed DataArray + whose underlying array has more than one chunk along each spatial + axis. Before the fix the array was numpy-backed under + ``result.chunk()``, so this asserts the new lazy graph is in + play. + """ + vrt_path, _ = lazy_chunks_two_by_two_vrt + result = read_vrt(vrt_path, chunks=(64, 64)) + assert isinstance(result.data, da.Array), f'expected dask Array, got {type(result.data).__name__}' + assert result.data.numblocks == (4, 4), f'expected 4x4 blocks, got {result.data.numblocks}' + + +def test_chunks_is_lazy_does_not_call_internal_reader(monkeypatch, lazy_chunks_two_by_two_vrt): + """Construction-time call count of the internal VRT reader is zero; + after ``.compute()`` it equals the chunk count. + """ + vrt_path, _ = lazy_chunks_two_by_two_vrt + from xrspatial.geotiff import _vrt as vrt_module + counter = {'calls': 0} + real_read = vrt_module.read_vrt + + def counting_read(*args, **kwargs): + counter['calls'] += 1 + return real_read(*args, **kwargs) + monkeypatch.setattr(vrt_module, 'read_vrt', counting_read) + result = read_vrt(vrt_path, chunks=(64, 64)) + assert counter['calls'] == 0, f"_read_vrt_internal called {counter['calls']} times before .compute(); the chunked path leaked an eager decode" + computed = result.compute() + assert counter['calls'] == 16, f"expected 16 per-chunk decodes after compute, got {counter['calls']}" + assert computed.shape == (256, 256) + + +def test_chunked_compute_matches_eager(lazy_chunks_two_by_two_vrt): + vrt_path, _ = lazy_chunks_two_by_two_vrt + eager = read_vrt(vrt_path) + chunked = read_vrt(vrt_path, chunks=(64, 64)).compute() + assert eager.shape == chunked.shape + assert np.array_equal(eager.values, chunked.values), 'chunked compute diverged from eager read' + np.testing.assert_array_equal(eager['x'].values, chunked['x'].values) + np.testing.assert_array_equal(eager['y'].values, chunked['y'].values) + assert eager.attrs.get('transform') == chunked.attrs.get('transform') + assert eager.attrs.get('crs') == chunked.attrs.get('crs') + + +def test_chunked_single_tile_matches_eager(lazy_chunks_single_tile_vrt): + """Single-tile VRT (one source) should still match eager when + chunked. Exercises the path where many chunk windows hit the + same single source. + """ + vrt_path, _ = lazy_chunks_single_tile_vrt + eager = read_vrt(vrt_path) + chunked = read_vrt(vrt_path, chunks=(32, 32)).compute() + assert np.array_equal(eager.values, chunked.values) + + +def test_chunks_task_cap_raises(lazy_chunks_two_by_two_vrt): + """``chunks=(1, 1)`` on a 256x256 VRT would build 65,536 tasks, + blowing past the 50,000-task cap. The reader should refuse with + a ValueError that names ``chunks=`` and suggests a larger size. + """ + vrt_path, _ = lazy_chunks_two_by_two_vrt + with pytest.raises(ValueError, match='chunks=.*task'): + read_vrt(vrt_path, chunks=(1, 1)) + + +def test_window_plus_chunks_matches_eager(lazy_chunks_two_by_two_vrt): + """When both ``window=`` and ``chunks=`` are passed, the dask + graph must tile the window (not the full VRT extent). The output + shape and pixel values match an eager windowed read. + """ + vrt_path, _ = lazy_chunks_two_by_two_vrt + window = (32, 48, 160, 192) + eager = read_vrt(vrt_path, window=window) + chunked = read_vrt(vrt_path, window=window, chunks=(64, 64)) + assert isinstance(chunked.data, da.Array) + assert chunked.data.numblocks == (2, 3), f'expected (2, 3) numblocks over the window, got {chunked.data.numblocks}' + computed = chunked.compute() + assert computed.shape == eager.shape == (128, 144) + assert np.array_equal(eager.values, computed.values) + + +@pytest.mark.skipif(not _HAS_GPU, reason='cupy + CUDA required') +def test_gpu_plus_chunks_returns_dask_on_cupy(lazy_chunks_two_by_two_vrt): + """``read_vrt(gpu=True, chunks=...)`` must build a dask graph whose + blocks are cupy-backed (not numpy that gets cupy-wrapped at + compute time on the host). + """ + import cupy + vrt_path, _ = lazy_chunks_two_by_two_vrt + result = read_vrt(vrt_path, gpu=True, chunks=(64, 64)) + assert isinstance(result.data, da.Array) + assert isinstance(result.data._meta, cupy.ndarray), f'expected cupy _meta, got {type(result.data._meta).__module__}.{type(result.data._meta).__name__}' + computed = result.compute() + assert isinstance(computed.data, cupy.ndarray) + + +def test_multiband_plus_chunks_preserves_band_dim(lazy_chunks_multiband_vrt): + """3-band VRT read with ``chunks=`` keeps the band dimension on + every block and the assembled DataArray. + """ + vrt_path, src = lazy_chunks_multiband_vrt + result = read_vrt(vrt_path, chunks=(32, 32)) + assert isinstance(result.data, da.Array) + assert result.dims == ('y', 'x', 'band') + assert result.shape == (64, 64, 3) + assert result.data.chunks[2] == (3,) + computed = result.compute() + np.testing.assert_allclose(computed.values, src, rtol=0, atol=0) + + +def test_chunked_propagates_vrt_holes_when_source_missing(lazy_chunks_two_by_two_vrt): + """When a source referenced by the VRT does not exist on disk and + the caller opts into the lenient ``missing_sources='warn'`` path, + the chunked reader must populate ``attrs['vrt_holes']`` with the + same schema the eager reader uses, so callers can branch on + ``"vrt_holes" in da.attrs`` regardless of which code path produced + the DataArray. + + Note: the default ``missing_sources='raise'`` raises at build time + under #2265, so this test exercises the explicit ``'warn'`` opt-in. + """ + import warnings + from xrspatial.geotiff import GeoTIFFFallbackWarning + from xrspatial.geotiff._reader import _mmap_cache + vrt_path, _ = lazy_chunks_two_by_two_vrt + vrt_dir = os.path.dirname(vrt_path) + tile_files = [] + for root, _dirs, files in os.walk(vrt_dir): + for f in files: + if f.endswith('.tif'): + tile_files.append(os.path.join(root, f)) + assert len(tile_files) >= 1 + _mmap_cache.clear() + os.unlink(tile_files[0]) + with warnings.catch_warnings(): + warnings.simplefilter('ignore', GeoTIFFFallbackWarning) + result = read_vrt(vrt_path, chunks=(64, 64), missing_sources='warn') + assert 'vrt_holes' in result.attrs, 'chunked path dropped vrt_holes contract from #1734' + holes = result.attrs['vrt_holes'] + assert isinstance(holes, list) and len(holes) >= 1 + entry = holes[0] + assert set(entry.keys()) >= {'source', 'band', 'dst_rect', 'error'} + assert isinstance(entry['dst_rect'], tuple) + assert len(entry['dst_rect']) == 4 + + +def test_chunked_no_vrt_holes_attr_when_complete(lazy_chunks_two_by_two_vrt): + """When every source is on disk the chunked reader must not set + ``attrs['vrt_holes']`` (eager parity: empty hole list is omitted). + """ + vrt_path, _ = lazy_chunks_two_by_two_vrt + result = read_vrt(vrt_path, chunks=(64, 64)) + assert 'vrt_holes' not in result.attrs + + +def test_chunked_integer_no_nodata_keeps_source_dtype(): + """A uint16 source with no declared must produce a + uint16 chunked DataArray, not float64. The eager path stays integer + in this case because its runtime ``mask.any()`` is False; the + chunked path approximates with a static "any band declares nodata?" + check, which yields the same answer here. + """ + arr = np.arange(128 * 128, dtype=np.uint16).reshape(128, 128) + y = np.linspace(41.0, 40.0, 128) + x = np.linspace(-106.0, -105.0, 128) + raster = xr.DataArray(arr, dims=['y', 'x'], coords={'y': y, 'x': x}, attrs={'crs': 4326}) + td = tempfile.mkdtemp(prefix='tmp_1814_uint16_nonodata_') + tile_path = os.path.join(td, 'tile.tif') + to_geotiff(raster, tile_path) + vrt_path = os.path.join(td, 'mosaic.vrt') + _write_vrt_internal(vrt_path, [tile_path]) + result = read_vrt(vrt_path, chunks=(32, 32)) + assert result.dtype == np.uint16, f'expected uint16 (source dtype), got {result.dtype}; chunked path promoted to float64 despite no declared nodata' + computed = result.compute() + assert computed.dtype == np.uint16 + np.testing.assert_array_equal(computed.values, arr) + + +# --------------------------------------------------------------------------- +# shared parsed VRT in chunked graph (#1923) +# Originally: test_vrt_chunked_shared_dataset_1923.py +# --------------------------------------------------------------------------- + + +def _chunked_shared_dataset_make_tile_vrt(tmp_path, n_tiles_per_side=4): + """Build a small multi-source VRT for testing the chunked path. + + Each source is a 64x64 tile written to a temp directory; the VRT + stitches them into a ``(64*N, 64*N)`` mosaic. ``N=4`` keeps the + fixture cheap while still producing a multi-source VRT whose + embedded metadata is measurable. + """ + tile_dir = os.path.join(tmp_path, 'tiles') + os.makedirs(tile_dir, exist_ok=True) + tile_size = 64 + sources = [] + for r in range(n_tiles_per_side): + for c in range(n_tiles_per_side): + arr = np.full((tile_size, tile_size), fill_value=r * n_tiles_per_side + c, dtype=np.float32) + ox = c * tile_size + oy = -(r * tile_size) + da = xr.DataArray(arr, dims=['y', 'x'], attrs={'transform': (1.0, 0.0, ox, 0.0, -1.0, oy)}) + path = os.path.join(tile_dir, f'tile_{r}_{c}.tif') + to_geotiff(da, path, compression='deflate', tiled=True, tile_size=64) + sources.append((path, r, c, tile_size)) + vrt_path = os.path.join(tmp_path, 'mosaic.vrt') + width = n_tiles_per_side * tile_size + height = n_tiles_per_side * tile_size + lines = [f'', '', '0.0, 1.0, 0.0, 0.0, 0.0, -1.0', ''] + for path, r, c, ts in sources: + lines.extend(['', f'{path}', '1', f'', f'', '']) + lines.extend(['', '']) + with open(vrt_path, 'w') as f: + f.write('\n'.join(lines)) + return (vrt_path, len(sources)) + + +def test_vrt_chunked_dataset_is_shared_graph_input(tmp_path): + """Issue #1923: parsed VRTDataset is wrapped as a single Delayed. + + Walks each ``_vrt_chunk_read`` task's kwargs dict in the dask graph + and verifies that ``parsed_vrt`` is NOT an inline ``VRTDataset`` + instance (the pre-fix shape). With the fix, ``parsed_vrt`` is + routed through ``dask.delayed(vrt, pure=True)`` so each task's + ``kwargs['parsed_vrt']`` is a graph reference (an ``Alias`` / + ``TaskRef``-style placeholder pointing into a single shared + ``from-value`` layer) rather than a literal embedded dataset. + + Under the synchronous / threaded scheduler tasks are not pickled + at all, so an embedded copy is harmless in that path. The bug + surfaces under the distributed / multi-process scheduler where + each task is serialised independently and the full dataset is + shipped once per task -- so the structural shape of the graph, + not in-process behaviour, is what matters. + """ + from xrspatial.geotiff._vrt import VRTDataset + vrt_path, n_sources = _chunked_shared_dataset_make_tile_vrt(str(tmp_path), n_tiles_per_side=4) + result = read_vrt(vrt_path, chunks=32) + graph = result.__dask_graph__() + assert n_sources == 16, 'fixture build sanity check' + chunk_task_count = 0 + embedded_vrt_count = 0 + for layer_name, layer in graph.layers.items(): + if '_vrt_chunk_read' not in layer_name: + continue + for _key, task in layer.items(): + kwargs = getattr(task, 'kwargs', None) + if kwargs is None: + continue + parsed_vrt = kwargs.get('parsed_vrt') + if parsed_vrt is None: + continue + chunk_task_count += 1 + if isinstance(parsed_vrt, VRTDataset): + embedded_vrt_count += 1 + assert chunk_task_count > 1, f'fixture sanity: expected multiple chunk tasks, got {chunk_task_count}' + assert embedded_vrt_count == 0, f"#1923 regression: {embedded_vrt_count} of {chunk_task_count} _vrt_chunk_read tasks still embed an inline VRTDataset in kwargs['parsed_vrt']. The fix wraps the dataset in dask.delayed(vrt, pure=True) so kwargs['parsed_vrt'] should be a TaskRef-style graph reference, not a VRTDataset." + + +def test_vrt_chunked_decode_unchanged_after_shared_wrap(tmp_path): + """The shared-Delayed wrap must not change decoded pixel values.""" + vrt_path, _ = _chunked_shared_dataset_make_tile_vrt(str(tmp_path), n_tiles_per_side=3) + eager = read_vrt(vrt_path) + chunked = read_vrt(vrt_path, chunks=32).compute() + np.testing.assert_array_equal(np.asarray(eager), np.asarray(chunked)) + + +def test_vrt_chunked_band_kwarg_still_validates(tmp_path): + """Wrapping the dataset must not change band validation behaviour.""" + vrt_path, _ = _chunked_shared_dataset_make_tile_vrt(str(tmp_path), n_tiles_per_side=2) + with pytest.raises(ValueError): + read_vrt(vrt_path, chunks=32, band=5) + + +# --------------------------------------------------------------------------- +# tiled writer uses threaded scheduler (#1714) +# Originally: test_vrt_tiled_scheduler_1714.py +# --------------------------------------------------------------------------- + + +def _tiled_scheduler_make_dask_da(h: int=32, w: int=32, chunk: int=8) -> xr.DataArray: + """Return a dask-backed 2D DataArray with ``chunk``-sized chunks. + + Using ``da.from_array`` on a pre-built numpy array gives clean + ``(chunk, chunk)`` chunking. ``da.arange(...).reshape(...)`` keeps a + chunk size of 1 along the new axis, which produces a confusing test + setup. + """ + arr = np.arange(h * w, dtype=np.float32).reshape(h, w) + return xr.DataArray(da.from_array(arr, chunks=(chunk, chunk)), dims=['y', 'x']) + + +def test_vrt_tiled_uses_threaded_scheduler(): + """_write_vrt_tiled passes ``scheduler='threads'`` to dask.compute.""" + da_arr = _tiled_scheduler_make_dask_da() + with tempfile.TemporaryDirectory(prefix='vrt_sched_1714_', ignore_cleanup_errors=True) as td: + vrt = os.path.join(td, 'sched_check.vrt') + captured = {} + real_compute = dask.compute + + def spy(*args, **kwargs): + captured['scheduler'] = kwargs.get('scheduler') + return real_compute(*args, **kwargs) + with patch.object(dask, 'compute', side_effect=spy) as p: + to_geotiff(da_arr, vrt) + assert p.called, '_write_vrt_tiled never invoked dask.compute' + assert captured.get('scheduler') == 'threads', f"Expected scheduler='threads' on the VRT-tiled write but got {captured.get('scheduler')!r}" + + +def test_vrt_tiled_threaded_write_produces_all_tiles(): + """All expected tile files exist after the threaded write.""" + da_arr = _tiled_scheduler_make_dask_da(h=32, w=32, chunk=8) + with tempfile.TemporaryDirectory(prefix='vrt_sched_1714_', ignore_cleanup_errors=True) as td: + vrt = os.path.join(td, 'tile_count.vrt') + to_geotiff(da_arr, vrt) + tiles_dir = os.path.join(td, 'tile_count_tiles') + tiles = sorted(glob.glob(os.path.join(tiles_dir, '*.tif'))) + assert len(tiles) == 16, f'Expected 16 tile files, got {len(tiles)} in {tiles_dir}' + + +def test_vrt_tiled_threaded_write_is_deterministic(): + """Threaded scheduler must not introduce write ordering races. + + Each delayed task writes to its own file path, so the threaded + scheduler is safe. Run the same write twice and compare byte + contents of every tile to catch any accidental race regression. + """ + da_arr = _tiled_scheduler_make_dask_da(h=32, w=32, chunk=8) + + def _write_and_collect(vrt_path: str) -> dict[str, bytes]: + to_geotiff(da_arr, vrt_path) + stem = os.path.splitext(os.path.basename(vrt_path))[0] + tiles_dir = os.path.join(os.path.dirname(vrt_path), stem + '_tiles') + return {os.path.basename(p): Path(p).read_bytes() for p in sorted(glob.glob(os.path.join(tiles_dir, '*.tif')))} + with tempfile.TemporaryDirectory(prefix='vrt_sched_1714_', ignore_cleanup_errors=True) as td1: + with tempfile.TemporaryDirectory(prefix='vrt_sched_1714_', ignore_cleanup_errors=True) as td2: + tiles1 = _write_and_collect(os.path.join(td1, 'run1.vrt')) + tiles2 = _write_and_collect(os.path.join(td2, 'run2.vrt')) + assert set(tiles1) == set(tiles2), f'Tile file set differs between runs: {set(tiles1) ^ set(tiles2)}' + for name, blob1 in tiles1.items(): + assert blob1 == tiles2[name], f'Tile {name} differs between runs (race condition?)'