From cca00947f685e1f8ec5e921e5f5d965b1369ec76 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 15 May 2026 07:37:47 -0700 Subject: [PATCH 1/2] geotiff: reject predictor=3 paired with integer SampleFormat on read (#1933) The reader accepted a malformed IFD that claimed Predictor=3 (Floating-Point Predictor, TIFF Technical Note 3) with an integer SampleFormat without complaint. The byte-swizzle unshuffle ran on integer bytes and produced garbage pixel values that look like valid integers, with no warning. The writer already rejected this combination via _writer._resolve_predictor. The fix adds _validate_predictor_sample_format in _validation.py and calls it at every IFD-read site (eager numpy, dask, GPU tiled, GPU stripped, HTTP/cloud metadata parse) so reader and writer share one contract. Regression test in test_predictor3_int_dtype_1933.py covers the helper, end-to-end eager and dask reads of a synthetic file, and a control that legitimate predictor=3 float files still round-trip. --- xrspatial/geotiff/__init__.py | 3 + xrspatial/geotiff/_backends/dask.py | 3 + xrspatial/geotiff/_backends/gpu.py | 8 +- xrspatial/geotiff/_reader.py | 5 + xrspatial/geotiff/_validation.py | 42 ++++++ .../tests/test_predictor3_int_dtype_1933.py | 141 ++++++++++++++++++ 6 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 xrspatial/geotiff/tests/test_predictor3_int_dtype_1933.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 7f61e514a..e1fbbd77d 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -136,6 +136,7 @@ def _read_geo_info(source, *, overview_level: int | None = None): _CloudSource, _coerce_path, _is_file_like, _is_fsspec_uri, _parse_cog_http_meta, ) + from ._validation import _validate_predictor_sample_format source = _coerce_path(source) if isinstance(source, str) and _is_fsspec_uri(source): @@ -155,6 +156,7 @@ def _read_geo_info(source, *, overview_level: int | None = None): _src.close() bps = resolve_bits_per_sample(_ifd.bits_per_sample) file_dtype = tiff_dtype_to_numpy(bps, _ifd.sample_format) + _validate_predictor_sample_format(_ifd.predictor, _ifd.sample_format) n_bands = ( _ifd.samples_per_pixel if _ifd.samples_per_pixel > 1 else 0 ) @@ -199,6 +201,7 @@ def _read_geo_info(source, *, overview_level: int | None = None): ifd, ifds, data, header.byte_order) bps = resolve_bits_per_sample(ifd.bits_per_sample) file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format) + _validate_predictor_sample_format(ifd.predictor, ifd.sample_format) n_bands = ifd.samples_per_pixel if ifd.samples_per_pixel > 1 else 0 # Stash photometric + samples_per_pixel so the dask graph builder # can detect MinIsWhite and invert ``geo_info.nodata`` before diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index da930cbd5..4b5ef7673 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -149,8 +149,11 @@ def read_geotiff_dask(source: str, *, full_h = http_ifd.height full_w = http_ifd.width from .._dtypes import resolve_bits_per_sample, tiff_dtype_to_numpy + from .._validation import _validate_predictor_sample_format bps = resolve_bits_per_sample(http_ifd.bits_per_sample) file_dtype = tiff_dtype_to_numpy(bps, http_ifd.sample_format) + _validate_predictor_sample_format( + http_ifd.predictor, http_ifd.sample_format) n_bands = ( http_ifd.samples_per_pixel if http_ifd.samples_per_pixel > 1 else 0 diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index 30a61f28c..176a361f4 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -30,7 +30,11 @@ _ON_GPU_FAILURE_SENTINEL, _geotiff_strict_mode, ) -from .._validation import _validate_chunks_arg, _validate_dtype_cast +from .._validation import ( + _validate_chunks_arg, + _validate_dtype_cast, + _validate_predictor_sample_format, +) from ._gpu_helpers import ( _apply_nodata_mask_gpu, _apply_orientation_geo_info, @@ -429,6 +433,7 @@ def read_geotiff_gpu(source: str, *, byte_counts = ifd.tile_byte_counts compression = ifd.compression predictor = ifd.predictor + _validate_predictor_sample_format(predictor, ifd.sample_format) samples = ifd.samples_per_pixel planar = ifd.planar_config tw = ifd.tile_width @@ -973,6 +978,7 @@ def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *, th = ifd.tile_height compression = ifd.compression predictor = ifd.predictor + _validate_predictor_sample_format(predictor, ifd.sample_format) byte_order = header.byte_order offsets = list(ifd.tile_offsets) byte_counts = list(ifd.tile_byte_counts) diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index cd27338a5..ca0510940 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -36,6 +36,7 @@ select_overview_ifd, validate_tile_layout, ) +from ._validation import _validate_predictor_sample_format # --------------------------------------------------------------------------- # Allocation guard: reject TIFF dimensions that would exhaust memory @@ -1647,6 +1648,7 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, offsets = ifd.strip_offsets byte_counts = ifd.strip_byte_counts pred = ifd.predictor + _validate_predictor_sample_format(pred, ifd.sample_format) bps = resolve_bits_per_sample(ifd.bits_per_sample) bytes_per_sample = bps // 8 is_sub_byte = bps in SUB_BYTE_BPS @@ -1837,6 +1839,7 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, samples = ifd.samples_per_pixel compression = ifd.compression pred = ifd.predictor + _validate_predictor_sample_format(pred, ifd.sample_format) bps = resolve_bits_per_sample(ifd.bits_per_sample) bytes_per_sample = bps // 8 is_sub_byte = bps in SUB_BYTE_BPS @@ -2321,6 +2324,7 @@ def _fetch_decode_cog_http_strips( offsets = ifd.strip_offsets byte_counts = ifd.strip_byte_counts pred = ifd.predictor + _validate_predictor_sample_format(pred, ifd.sample_format) bytes_per_sample = bps // 8 is_sub_byte = bps in SUB_BYTE_BPS jpeg_tables = ifd.jpeg_tables @@ -2530,6 +2534,7 @@ def _fetch_decode_cog_http_tiles( planar = ifd.planar_config compression = ifd.compression pred = ifd.predictor + _validate_predictor_sample_format(pred, ifd.sample_format) bytes_per_sample = bps // 8 is_sub_byte = bps in SUB_BYTE_BPS jpeg_tables = ifd.jpeg_tables diff --git a/xrspatial/geotiff/_validation.py b/xrspatial/geotiff/_validation.py index bde6773c6..a454091b5 100644 --- a/xrspatial/geotiff/_validation.py +++ b/xrspatial/geotiff/_validation.py @@ -179,3 +179,45 @@ def _validate_tile_size_arg(tile_size): validation path (positive int + multiple-of-16 for tiled output). """ _validate_tile_size(tile_size) + + +def _validate_predictor_sample_format(predictor, sample_format) -> None: + """Reject ``Predictor=3`` paired with a non-float ``SampleFormat`` (issue #1933). + + TIFF Technical Note 3 defines the floating-point predictor for IEEE + float samples only. A reader-side input file (malformed, hand-crafted, + or adversarial) that claims ``Predictor=3`` with an integer + ``SampleFormat`` (1=uint, 2=int) used to be accepted silently: the + byte-swizzle unshuffle ran on integer bytes and produced garbage + pixel values that look like valid integers, with no warning. + + The writer side already enforces this contract in + ``_writer._resolve_predictor`` (raises ``ValueError`` on non-float + dtypes), so this validator gives the reader symmetric behaviour. + + Parameters + ---------- + predictor : int + The IFD ``Predictor`` tag value (1=none, 2=horizontal, 3=float). + sample_format : int + The IFD ``SampleFormat`` tag value (1=uint, 2=int, 3=float, + 4=undefined). + + Raises + ------ + ValueError + If ``predictor == 3`` and ``sample_format != 3``. + """ + # Only the float-predictor case is asymmetric; predictor=1 (none) and + # predictor=2 (horizontal) are sample-format-agnostic by design. + if predictor == 3 and sample_format != 3: + raise ValueError( + f"Predictor=3 (floating-point) requires SampleFormat=3 " + f"(IEEE float), got SampleFormat={sample_format}. The TIFF " + f"file is malformed: the floating-point horizontal predictor " + f"(TIFF Technical Note 3) is only defined for float samples. " + f"Decoding integer data through it would produce garbage. " + f"Re-encode the file with a matching predictor/sample-format " + f"pair, e.g. `gdal_translate -co PREDICTOR=2` for integers or " + f"`-co PREDICTOR=1` to drop the predictor." + ) diff --git a/xrspatial/geotiff/tests/test_predictor3_int_dtype_1933.py b/xrspatial/geotiff/tests/test_predictor3_int_dtype_1933.py new file mode 100644 index 000000000..3573d55a3 --- /dev/null +++ b/xrspatial/geotiff/tests/test_predictor3_int_dtype_1933.py @@ -0,0 +1,141 @@ +"""Regression tests for issue #1933. + +A malformed TIFF that claims ``Predictor=3`` (Floating-Point Predictor, +TIFF Technical Note 3) paired with an integer ``SampleFormat`` used to +decode silently to garbage. The byte-swizzle unshuffle ran on integer +bytes and produced values that look like valid integers, with no +warning. The writer side already rejects this combination in +``_writer._resolve_predictor``; these tests assert reader symmetry. + +The fix lives in ``xrspatial.geotiff._validation`` (helper +``_validate_predictor_sample_format``) and is invoked at every IFD-read +site (eager numpy, dask, GPU tiled, GPU stripped). +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff import open_geotiff +from xrspatial.geotiff._compression import COMPRESSION_NONE +from xrspatial.geotiff._dtypes import LONG, SHORT, numpy_to_tiff_dtype +from xrspatial.geotiff._header import ( + TAG_BITS_PER_SAMPLE, + TAG_COMPRESSION, + TAG_IMAGE_LENGTH, + TAG_IMAGE_WIDTH, + TAG_PHOTOMETRIC, + TAG_PREDICTOR, + TAG_ROWS_PER_STRIP, + TAG_SAMPLE_FORMAT, + TAG_SAMPLES_PER_PIXEL, + TAG_STRIP_BYTE_COUNTS, + TAG_STRIP_OFFSETS, +) +from xrspatial.geotiff._validation import _validate_predictor_sample_format +from xrspatial.geotiff._writer import ( + _assemble_standard_layout, + _write_stripped, +) + + +def _build_predictor3_uint32_tiff(arr: np.ndarray) -> bytes: + """Build a malformed TIFF: predictor=3 + uint32 (integer) sample format. + + Uses the in-repo ``_assemble_standard_layout`` so we can write tags + the public writer would reject. Compression is COMPRESSION_NONE so + the strip bytes are exactly the raw integer values; the bug is then + visible by comparing the round-tripped values against the originals. + """ + rel_off, bc, chunks = _write_stripped(arr, COMPRESSION_NONE, False) + bits_per_sample, _ = numpy_to_tiff_dtype(arr.dtype) + # SampleFormat=1 (UINT) is the *integer* tag that the malformed file + # advertises; predictor=3 is the *float* predictor. Pair them. + tags = [ + (TAG_IMAGE_WIDTH, LONG, 1, arr.shape[1]), + (TAG_IMAGE_LENGTH, LONG, 1, arr.shape[0]), + (TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample), + (TAG_COMPRESSION, SHORT, 1, 1), + (TAG_PHOTOMETRIC, SHORT, 1, 1), + (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1), + (TAG_SAMPLE_FORMAT, SHORT, 1, 1), # UINT + (TAG_PREDICTOR, SHORT, 1, 3), # floating-point predictor + (TAG_ROWS_PER_STRIP, SHORT, 1, arr.shape[0]), + (TAG_STRIP_OFFSETS, LONG, len(rel_off), rel_off), + (TAG_STRIP_BYTE_COUNTS, LONG, len(bc), bc), + ] + parts = [(arr, arr.shape[1], arr.shape[0], rel_off, bc, chunks)] + return _assemble_standard_layout(8, [tags], parts, bigtiff=False) + + +class TestPredictor3IntegerSampleFormatRejected: + """The validator rejects predictor=3 with non-float sample formats.""" + + def test_helper_rejects_pred3_sf1_uint(self): + with pytest.raises(ValueError, match="Predictor=3"): + _validate_predictor_sample_format(3, 1) + + def test_helper_rejects_pred3_sf2_int(self): + with pytest.raises(ValueError, match="Predictor=3"): + _validate_predictor_sample_format(3, 2) + + def test_helper_rejects_pred3_sf4_undefined(self): + # SampleFormat=4 (UNDEFINED) is treated as uint by the dtype map; + # routing it through predictor=3 also produces garbage. + with pytest.raises(ValueError, match="Predictor=3"): + _validate_predictor_sample_format(3, 4) + + def test_helper_accepts_pred3_sf3_float(self): + # The legitimate combination remains a no-op. + _validate_predictor_sample_format(3, 3) + + def test_helper_accepts_pred1_with_any_sf(self): + # Predictor=1 (none) is sample-format-agnostic. + _validate_predictor_sample_format(1, 1) + _validate_predictor_sample_format(1, 2) + _validate_predictor_sample_format(1, 3) + + def test_helper_accepts_pred2_with_any_sf(self): + # Predictor=2 (horizontal) is sample-format-agnostic. + _validate_predictor_sample_format(2, 1) + _validate_predictor_sample_format(2, 2) + _validate_predictor_sample_format(2, 3) + + +class TestEagerReadRejectsMalformedFile: + """``open_geotiff`` raises on the malformed predictor=3 + uint32 file.""" + + def test_open_geotiff_eager_raises(self, tmp_path): + arr = np.array( + [[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.uint32) + path = tmp_path / "pred3_uint32.tif" + path.write_bytes(_build_predictor3_uint32_tiff(arr)) + + with pytest.raises(ValueError, match="Predictor=3"): + open_geotiff(str(path)) + + def test_open_geotiff_dask_raises(self, tmp_path): + arr = np.array( + [[10, 20, 30, 40], [50, 60, 70, 80]], dtype=np.uint32) + path = tmp_path / "pred3_uint32_dask.tif" + path.write_bytes(_build_predictor3_uint32_tiff(arr)) + + from xrspatial.geotiff import read_geotiff_dask + with pytest.raises(ValueError, match="Predictor=3"): + read_geotiff_dask(str(path), chunks=64) + + +class TestValidPredictor3StillWorks: + """The fix does not break legitimate predictor=3 float files.""" + + def test_predictor3_float32_round_trip(self, tmp_path): + tifffile = pytest.importorskip("tifffile") + pytest.importorskip("imagecodecs") + + arr = np.linspace(-1.0, 1.0, 16, dtype=np.float32).reshape(4, 4) + path = tmp_path / "pred3_float32.tif" + tifffile.imwrite( + str(path), arr, predictor=3, compression="deflate") + + result = open_geotiff(str(path)) + np.testing.assert_array_equal(result.values, arr) From 086fd65498ac177c050560d88a5cc2a33643b8fc Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 15 May 2026 08:07:49 -0700 Subject: [PATCH 2/2] geotiff: address copilot review on #1940 - _validate_predictor_sample_format now normalizes a tuple-valued predictor (1-element or longer) to its first element, then falls back to predictor=1 on the empty-tuple case. IFD.predictor delegates to IFD.get_value, which can return a tuple for a malformed Predictor tag with count > 1; without this normalization the (3,) + integer-SampleFormat combo would slip past the guard. - Add a regression test that asserts (3,) + SF=1 still raises and that an empty tuple / non-3 tuple is a no-op. - Replace the hardcoded Compression tag value in the synthetic-TIFF builder with the already-imported COMPRESSION_NONE constant. --- xrspatial/geotiff/_validation.py | 11 ++++++++++- .../geotiff/tests/test_predictor3_int_dtype_1933.py | 13 ++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/xrspatial/geotiff/_validation.py b/xrspatial/geotiff/_validation.py index a454091b5..790a3ea32 100644 --- a/xrspatial/geotiff/_validation.py +++ b/xrspatial/geotiff/_validation.py @@ -197,8 +197,11 @@ def _validate_predictor_sample_format(predictor, sample_format) -> None: Parameters ---------- - predictor : int + predictor : int or tuple The IFD ``Predictor`` tag value (1=none, 2=horizontal, 3=float). + Accepts a single-element tuple (the resolved value of a malformed + ``count > 1`` tag) and normalizes to its first element; the TIFF + spec defines ``Predictor`` as a single SHORT. sample_format : int The IFD ``SampleFormat`` tag value (1=uint, 2=int, 3=float, 4=undefined). @@ -208,6 +211,12 @@ def _validate_predictor_sample_format(predictor, sample_format) -> None: ValueError If ``predictor == 3`` and ``sample_format != 3``. """ + # IFD.predictor delegates to IFD.get_value, which can return a tuple + # for a malformed Predictor tag with count > 1. tuple == 3 is always + # False, so a tuple-valued predictor would bypass the guard. Normalize + # to int first so the (3, non-3) case still fires. + if isinstance(predictor, tuple): + predictor = predictor[0] if predictor else 1 # Only the float-predictor case is asymmetric; predictor=1 (none) and # predictor=2 (horizontal) are sample-format-agnostic by design. if predictor == 3 and sample_format != 3: diff --git a/xrspatial/geotiff/tests/test_predictor3_int_dtype_1933.py b/xrspatial/geotiff/tests/test_predictor3_int_dtype_1933.py index 3573d55a3..5f28eba99 100644 --- a/xrspatial/geotiff/tests/test_predictor3_int_dtype_1933.py +++ b/xrspatial/geotiff/tests/test_predictor3_int_dtype_1933.py @@ -55,7 +55,7 @@ def _build_predictor3_uint32_tiff(arr: np.ndarray) -> bytes: (TAG_IMAGE_WIDTH, LONG, 1, arr.shape[1]), (TAG_IMAGE_LENGTH, LONG, 1, arr.shape[0]), (TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample), - (TAG_COMPRESSION, SHORT, 1, 1), + (TAG_COMPRESSION, SHORT, 1, COMPRESSION_NONE), (TAG_PHOTOMETRIC, SHORT, 1, 1), (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1), (TAG_SAMPLE_FORMAT, SHORT, 1, 1), # UINT @@ -101,6 +101,17 @@ def test_helper_accepts_pred2_with_any_sf(self): _validate_predictor_sample_format(2, 2) _validate_predictor_sample_format(2, 3) + def test_helper_normalizes_tuple_predictor(self): + # IFD.predictor delegates to get_value, which returns a tuple for + # a malformed Predictor tag with count > 1. The validator must + # still fire on the (3,) + non-float pair. + with pytest.raises(ValueError, match="Predictor=3"): + _validate_predictor_sample_format((3,), 1) + # Empty tuple falls back to predictor=1 (none) -> no-op. + _validate_predictor_sample_format((), 1) + # Non-3 tuple predictor + non-float sample_format -> no-op. + _validate_predictor_sample_format((2,), 1) + class TestEagerReadRejectsMalformedFile: """``open_geotiff`` raises on the malformed predictor=3 + uint32 file."""