Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def _read_geo_info(source, *, overview_level: int | None = None):
_CloudSource, _coerce_path, _is_file_like, _is_fsspec_uri,
_parse_cog_http_meta,
)
from ._validation import _validate_predictor_sample_format

source = _coerce_path(source)
if isinstance(source, str) and _is_fsspec_uri(source):
Expand All @@ -155,6 +156,7 @@ def _read_geo_info(source, *, overview_level: int | None = None):
_src.close()
bps = resolve_bits_per_sample(_ifd.bits_per_sample)
file_dtype = tiff_dtype_to_numpy(bps, _ifd.sample_format)
_validate_predictor_sample_format(_ifd.predictor, _ifd.sample_format)
n_bands = (
_ifd.samples_per_pixel if _ifd.samples_per_pixel > 1 else 0
)
Expand Down Expand Up @@ -199,6 +201,7 @@ def _read_geo_info(source, *, overview_level: int | None = None):
ifd, ifds, data, header.byte_order)
bps = resolve_bits_per_sample(ifd.bits_per_sample)
file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format)
_validate_predictor_sample_format(ifd.predictor, ifd.sample_format)
n_bands = ifd.samples_per_pixel if ifd.samples_per_pixel > 1 else 0
# Stash photometric + samples_per_pixel so the dask graph builder
# can detect MinIsWhite and invert ``geo_info.nodata`` before
Expand Down
3 changes: 3 additions & 0 deletions xrspatial/geotiff/_backends/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,11 @@ def read_geotiff_dask(source: str, *,
full_h = http_ifd.height
full_w = http_ifd.width
from .._dtypes import resolve_bits_per_sample, tiff_dtype_to_numpy
from .._validation import _validate_predictor_sample_format
bps = resolve_bits_per_sample(http_ifd.bits_per_sample)
file_dtype = tiff_dtype_to_numpy(bps, http_ifd.sample_format)
_validate_predictor_sample_format(
http_ifd.predictor, http_ifd.sample_format)
n_bands = (
http_ifd.samples_per_pixel
if http_ifd.samples_per_pixel > 1 else 0
Expand Down
8 changes: 7 additions & 1 deletion xrspatial/geotiff/_backends/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@
_ON_GPU_FAILURE_SENTINEL,
_geotiff_strict_mode,
)
from .._validation import _validate_chunks_arg, _validate_dtype_cast
from .._validation import (
_validate_chunks_arg,
_validate_dtype_cast,
_validate_predictor_sample_format,
)
from ._gpu_helpers import (
_apply_nodata_mask_gpu,
_apply_orientation_geo_info,
Expand Down Expand Up @@ -429,6 +433,7 @@ def read_geotiff_gpu(source: str, *,
byte_counts = ifd.tile_byte_counts
compression = ifd.compression
predictor = ifd.predictor
_validate_predictor_sample_format(predictor, ifd.sample_format)
samples = ifd.samples_per_pixel
planar = ifd.planar_config
tw = ifd.tile_width
Expand Down Expand Up @@ -973,6 +978,7 @@ def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *,
th = ifd.tile_height
compression = ifd.compression
predictor = ifd.predictor
_validate_predictor_sample_format(predictor, ifd.sample_format)
byte_order = header.byte_order
offsets = list(ifd.tile_offsets)
byte_counts = list(ifd.tile_byte_counts)
Expand Down
5 changes: 5 additions & 0 deletions xrspatial/geotiff/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
select_overview_ifd,
validate_tile_layout,
)
from ._validation import _validate_predictor_sample_format

# ---------------------------------------------------------------------------
# Allocation guard: reject TIFF dimensions that would exhaust memory
Expand Down Expand Up @@ -1647,6 +1648,7 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader,
offsets = ifd.strip_offsets
byte_counts = ifd.strip_byte_counts
pred = ifd.predictor
_validate_predictor_sample_format(pred, ifd.sample_format)
bps = resolve_bits_per_sample(ifd.bits_per_sample)
bytes_per_sample = bps // 8
is_sub_byte = bps in SUB_BYTE_BPS
Expand Down Expand Up @@ -1837,6 +1839,7 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader,
samples = ifd.samples_per_pixel
compression = ifd.compression
pred = ifd.predictor
_validate_predictor_sample_format(pred, ifd.sample_format)
bps = resolve_bits_per_sample(ifd.bits_per_sample)
bytes_per_sample = bps // 8
is_sub_byte = bps in SUB_BYTE_BPS
Expand Down Expand Up @@ -2321,6 +2324,7 @@ def _fetch_decode_cog_http_strips(
offsets = ifd.strip_offsets
byte_counts = ifd.strip_byte_counts
pred = ifd.predictor
_validate_predictor_sample_format(pred, ifd.sample_format)
bytes_per_sample = bps // 8
is_sub_byte = bps in SUB_BYTE_BPS
jpeg_tables = ifd.jpeg_tables
Expand Down Expand Up @@ -2530,6 +2534,7 @@ def _fetch_decode_cog_http_tiles(
planar = ifd.planar_config
compression = ifd.compression
pred = ifd.predictor
_validate_predictor_sample_format(pred, ifd.sample_format)
bytes_per_sample = bps // 8
is_sub_byte = bps in SUB_BYTE_BPS
jpeg_tables = ifd.jpeg_tables
Expand Down
51 changes: 51 additions & 0 deletions xrspatial/geotiff/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,54 @@ def _validate_tile_size_arg(tile_size):
validation path (positive int + multiple-of-16 for tiled output).
"""
_validate_tile_size(tile_size)


def _validate_predictor_sample_format(predictor, sample_format) -> None:
"""Reject ``Predictor=3`` paired with a non-float ``SampleFormat`` (issue #1933).

TIFF Technical Note 3 defines the floating-point predictor for IEEE
float samples only. A reader-side input file (malformed, hand-crafted,
or adversarial) that claims ``Predictor=3`` with an integer
``SampleFormat`` (1=uint, 2=int) used to be accepted silently: the
byte-swizzle unshuffle ran on integer bytes and produced garbage
pixel values that look like valid integers, with no warning.

The writer side already enforces this contract in
``_writer._resolve_predictor`` (raises ``ValueError`` on non-float
dtypes), so this validator gives the reader symmetric behaviour.

Parameters
----------
predictor : int or tuple
The IFD ``Predictor`` tag value (1=none, 2=horizontal, 3=float).
Accepts a single-element tuple (the resolved value of a malformed
``count > 1`` tag) and normalizes to its first element; the TIFF
spec defines ``Predictor`` as a single SHORT.
sample_format : int
The IFD ``SampleFormat`` tag value (1=uint, 2=int, 3=float,
4=undefined).

Raises
------
ValueError
If ``predictor == 3`` and ``sample_format != 3``.
"""
# IFD.predictor delegates to IFD.get_value, which can return a tuple
# for a malformed Predictor tag with count > 1. tuple == 3 is always
# False, so a tuple-valued predictor would bypass the guard. Normalize
# to int first so the (3, non-3) case still fires.
if isinstance(predictor, tuple):
predictor = predictor[0] if predictor else 1
# Only the float-predictor case is asymmetric; predictor=1 (none) and
# predictor=2 (horizontal) are sample-format-agnostic by design.
if predictor == 3 and sample_format != 3:
raise ValueError(
f"Predictor=3 (floating-point) requires SampleFormat=3 "
f"(IEEE float), got SampleFormat={sample_format}. The TIFF "
Comment on lines +220 to +225
f"file is malformed: the floating-point horizontal predictor "
f"(TIFF Technical Note 3) is only defined for float samples. "
f"Decoding integer data through it would produce garbage. "
f"Re-encode the file with a matching predictor/sample-format "
f"pair, e.g. `gdal_translate -co PREDICTOR=2` for integers or "
f"`-co PREDICTOR=1` to drop the predictor."
)
152 changes: 152 additions & 0 deletions xrspatial/geotiff/tests/test_predictor3_int_dtype_1933.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""Regression tests for issue #1933.

A malformed TIFF that claims ``Predictor=3`` (Floating-Point Predictor,
TIFF Technical Note 3) paired with an integer ``SampleFormat`` used to
decode silently to garbage. The byte-swizzle unshuffle ran on integer
bytes and produced values that look like valid integers, with no
warning. The writer side already rejects this combination in
``_writer._resolve_predictor``; these tests assert reader symmetry.

The fix lives in ``xrspatial.geotiff._validation`` (helper
``_validate_predictor_sample_format``) and is invoked at every IFD-read
site (eager numpy, dask, GPU tiled, GPU stripped).
"""
from __future__ import annotations

import numpy as np
import pytest

from xrspatial.geotiff import open_geotiff
from xrspatial.geotiff._compression import COMPRESSION_NONE
from xrspatial.geotiff._dtypes import LONG, SHORT, numpy_to_tiff_dtype
from xrspatial.geotiff._header import (
TAG_BITS_PER_SAMPLE,
TAG_COMPRESSION,
TAG_IMAGE_LENGTH,
TAG_IMAGE_WIDTH,
TAG_PHOTOMETRIC,
TAG_PREDICTOR,
TAG_ROWS_PER_STRIP,
TAG_SAMPLE_FORMAT,
TAG_SAMPLES_PER_PIXEL,
TAG_STRIP_BYTE_COUNTS,
TAG_STRIP_OFFSETS,
)
from xrspatial.geotiff._validation import _validate_predictor_sample_format
from xrspatial.geotiff._writer import (
_assemble_standard_layout,
_write_stripped,
)


def _build_predictor3_uint32_tiff(arr: np.ndarray) -> bytes:
"""Build a malformed TIFF: predictor=3 + uint32 (integer) sample format.

Uses the in-repo ``_assemble_standard_layout`` so we can write tags
the public writer would reject. Compression is COMPRESSION_NONE so
the strip bytes are exactly the raw integer values; the bug is then
visible by comparing the round-tripped values against the originals.
"""
rel_off, bc, chunks = _write_stripped(arr, COMPRESSION_NONE, False)
bits_per_sample, _ = numpy_to_tiff_dtype(arr.dtype)
# SampleFormat=1 (UINT) is the *integer* tag that the malformed file
# advertises; predictor=3 is the *float* predictor. Pair them.
tags = [
(TAG_IMAGE_WIDTH, LONG, 1, arr.shape[1]),
(TAG_IMAGE_LENGTH, LONG, 1, arr.shape[0]),
(TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample),
(TAG_COMPRESSION, SHORT, 1, COMPRESSION_NONE),
(TAG_PHOTOMETRIC, SHORT, 1, 1),
(TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1),
(TAG_SAMPLE_FORMAT, SHORT, 1, 1), # UINT
Comment on lines +56 to +61
(TAG_PREDICTOR, SHORT, 1, 3), # floating-point predictor
(TAG_ROWS_PER_STRIP, SHORT, 1, arr.shape[0]),
(TAG_STRIP_OFFSETS, LONG, len(rel_off), rel_off),
(TAG_STRIP_BYTE_COUNTS, LONG, len(bc), bc),
]
parts = [(arr, arr.shape[1], arr.shape[0], rel_off, bc, chunks)]
return _assemble_standard_layout(8, [tags], parts, bigtiff=False)


class TestPredictor3IntegerSampleFormatRejected:
"""The validator rejects predictor=3 with non-float sample formats."""

def test_helper_rejects_pred3_sf1_uint(self):
with pytest.raises(ValueError, match="Predictor=3"):
_validate_predictor_sample_format(3, 1)

def test_helper_rejects_pred3_sf2_int(self):
with pytest.raises(ValueError, match="Predictor=3"):
_validate_predictor_sample_format(3, 2)

def test_helper_rejects_pred3_sf4_undefined(self):
# SampleFormat=4 (UNDEFINED) is treated as uint by the dtype map;
# routing it through predictor=3 also produces garbage.
with pytest.raises(ValueError, match="Predictor=3"):
_validate_predictor_sample_format(3, 4)

def test_helper_accepts_pred3_sf3_float(self):
# The legitimate combination remains a no-op.
_validate_predictor_sample_format(3, 3)

def test_helper_accepts_pred1_with_any_sf(self):
# Predictor=1 (none) is sample-format-agnostic.
_validate_predictor_sample_format(1, 1)
_validate_predictor_sample_format(1, 2)
_validate_predictor_sample_format(1, 3)

def test_helper_accepts_pred2_with_any_sf(self):
# Predictor=2 (horizontal) is sample-format-agnostic.
_validate_predictor_sample_format(2, 1)
_validate_predictor_sample_format(2, 2)
_validate_predictor_sample_format(2, 3)

def test_helper_normalizes_tuple_predictor(self):
# IFD.predictor delegates to get_value, which returns a tuple for
# a malformed Predictor tag with count > 1. The validator must
# still fire on the (3,) + non-float pair.
with pytest.raises(ValueError, match="Predictor=3"):
_validate_predictor_sample_format((3,), 1)
# Empty tuple falls back to predictor=1 (none) -> no-op.
_validate_predictor_sample_format((), 1)
# Non-3 tuple predictor + non-float sample_format -> no-op.
_validate_predictor_sample_format((2,), 1)


class TestEagerReadRejectsMalformedFile:
"""``open_geotiff`` raises on the malformed predictor=3 + uint32 file."""

def test_open_geotiff_eager_raises(self, tmp_path):
arr = np.array(
[[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.uint32)
path = tmp_path / "pred3_uint32.tif"
path.write_bytes(_build_predictor3_uint32_tiff(arr))

with pytest.raises(ValueError, match="Predictor=3"):
open_geotiff(str(path))

def test_open_geotiff_dask_raises(self, tmp_path):
arr = np.array(
[[10, 20, 30, 40], [50, 60, 70, 80]], dtype=np.uint32)
path = tmp_path / "pred3_uint32_dask.tif"
path.write_bytes(_build_predictor3_uint32_tiff(arr))

from xrspatial.geotiff import read_geotiff_dask
with pytest.raises(ValueError, match="Predictor=3"):
read_geotiff_dask(str(path), chunks=64)


class TestValidPredictor3StillWorks:
"""The fix does not break legitimate predictor=3 float files."""

def test_predictor3_float32_round_trip(self, tmp_path):
tifffile = pytest.importorskip("tifffile")
pytest.importorskip("imagecodecs")

arr = np.linspace(-1.0, 1.0, 16, dtype=np.float32).reshape(4, 4)
path = tmp_path / "pred3_float32.tif"
tifffile.imwrite(
str(path), arr, predictor=3, compression="deflate")

result = open_geotiff(str(path))
np.testing.assert_array_equal(result.values, arr)
Loading