Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ def _read_geo_info(source, *, overview_level: int | None = None,
# binding it into the chunk closure (#1809).
geo_info._ifd_photometric = _ifd.photometric
geo_info._ifd_samples_per_pixel = _ifd.samples_per_pixel
geo_info._ifd_compression = _ifd.compression
return geo_info, _ifd.height, _ifd.width, file_dtype, n_bands
if _is_file_like(source):
# File-like: read its full bytes; we don't try to mmap arbitrary
Expand Down Expand Up @@ -316,6 +317,11 @@ def _read_geo_info(source, *, overview_level: int | None = None,
# binding it into the chunk closure (#1809).
geo_info._ifd_photometric = ifd.photometric
geo_info._ifd_samples_per_pixel = ifd.samples_per_pixel
# Stash compression so the dask graph builder can fire the
# experimental / internal-only codec opt-in gate at graph build
# rather than waiting for the per-chunk task to fail (PR 4 of
# epic #2340).
geo_info._ifd_compression = ifd.compression
return geo_info, ifd.height, ifd.width, file_dtype, n_bands
finally:
if close_data:
Expand All @@ -339,6 +345,8 @@ def open_geotiff(source: str | BinaryIO, *,
missing_sources: str = _MISSING_SOURCES_SENTINEL,
allow_rotated: bool = False,
allow_unparseable_crs: bool = False,
allow_experimental_codecs: bool = False,
allow_internal_only_jpeg: bool = False,
band_nodata: str | None = None,
mask_nodata: bool = True,
) -> xr.DataArray:
Expand Down Expand Up @@ -535,6 +543,23 @@ def open_geotiff(source: str | BinaryIO, *,
behaviour where the citation field passes through unchanged.
Matches the same kwarg on ``to_geotiff`` / ``write_geotiff_gpu``
so a value the reader accepted can survive a round-trip.
allow_experimental_codecs : bool, default False
Read-side opt-in for sources compressed with the Tier 3
experimental codecs (``lerc``, ``jpeg2000`` / ``j2k``, ``lz4``).
Default ``False`` rejects the read with ``ValueError`` naming
the flag; cross-backend numerical parity is not claimed and
reader support across GDAL versions is uneven. Matches the
same kwarg on the writers so a round-trip through a Tier 3
codec stays opt-in on both sides. See SUPPORTED_FEATURES tier
``'experimental'`` (epic #2340 PR 4).
allow_internal_only_jpeg : bool, default False
Read-side opt-in for JPEG-in-TIFF sources. The encoder writes
self-contained JFIF tiles without the TIFF JPEGTables tag
(347), so the read path is not interoperable with libtiff /
GDAL / rasterio. ``allow_experimental_codecs=True`` does NOT
cover this codec; the dedicated flag is its only gate. See
SUPPORTED_FEATURES tier ``'internal_only'`` for ``codec.jpeg``
(epic #2340 PR 4, original writer gate #1845).

Returns
-------
Expand Down Expand Up @@ -665,6 +690,8 @@ def open_geotiff(source: str | BinaryIO, *,
max_pixels=max_pixels,
allow_rotated=allow_rotated,
allow_unparseable_crs=allow_unparseable_crs,
allow_experimental_codecs=allow_experimental_codecs,
allow_internal_only_jpeg=allow_internal_only_jpeg,
band_nodata=band_nodata,
mask_nodata=mask_nodata,
**vrt_kwargs)
Expand All @@ -685,6 +712,10 @@ def open_geotiff(source: str | BinaryIO, *,
max_pixels=max_pixels,
allow_rotated=allow_rotated,
allow_unparseable_crs=allow_unparseable_crs,
allow_experimental_codecs=(
allow_experimental_codecs),
allow_internal_only_jpeg=(
allow_internal_only_jpeg),
mask_nodata=mask_nodata,
**gpu_kwargs)

Expand All @@ -696,6 +727,10 @@ def open_geotiff(source: str | BinaryIO, *,
max_pixels=max_pixels, name=name,
allow_rotated=allow_rotated,
allow_unparseable_crs=allow_unparseable_crs,
allow_experimental_codecs=(
allow_experimental_codecs),
allow_internal_only_jpeg=(
allow_internal_only_jpeg),
mask_nodata=mask_nodata)

kwargs = {}
Expand All @@ -714,6 +749,8 @@ def open_geotiff(source: str | BinaryIO, *,
source, window=window,
overview_level=overview_level, band=band,
allow_rotated=allow_rotated,
allow_experimental_codecs=allow_experimental_codecs,
allow_internal_only_jpeg=allow_internal_only_jpeg,
**kwargs,
)

Expand Down
160 changes: 160 additions & 0 deletions xrspatial/geotiff/_attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,166 @@
)


# Map TIFF compression tag values to codec names so the read-side opt-in
# gate (PR 4 of epic #2340) can name the codec in the rejection message
# without each call site repeating the integer-to-name table. The keys
# are the TIFF 6 Compression tag values (tag 259) used inside
# ``_compression.py``; the values match the codec names that appear on
# the ``SUPPORTED_FEATURES`` keys (``codec.<name>``).
#
# A tag that is absent from this table is not rejected by
# ``_validate_read_codec_optin``: the gate returns early and leaves the
# unknown-tag error to the decoder. Adding a codec to the reader
# therefore needs a matching line here, otherwise the opt-in gate
# never sees it.
_COMPRESSION_TAG_TO_NAME = {
    1: 'none',
    5: 'lzw',
    # Gated by the dedicated ``allow_internal_only_jpeg`` flag.
    7: 'jpeg',
    8: 'deflate',
    32773: 'packbits',
    # Adobe Deflate (32946) decodes through the same zlib path as
    # plain Deflate (8) and is collapsed onto the same codec name on
    # purpose: both tags share the stable-tier classification in
    # ``SUPPORTED_FEATURES`` (``codec.deflate``). A future Adobe-
    # Deflate-specific tier would need its own ``codec.<name>`` entry
    # AND its own mapping line here; the collapse is deliberate, not
    # a passthrough.
    32946: 'deflate',
    34712: 'jpeg2000',
    34887: 'lerc',
    50000: 'zstd',
    50004: 'lz4',
}


def _validate_read_codec_optin(
    compression: int,
    *,
    allow_experimental_codecs: bool,
    allow_internal_only_jpeg: bool,
    entry_point: str = "open_geotiff",
) -> None:
    """Gate reads of experimental / internal-only codecs behind opt-ins.

    Read-side counterpart of the writer gates in ``_writers/eager.py``
    and ``_writers/gpu.py`` (PR 4 of epic #2340; writer gates date from
    #2137 / #1845). The rejection message spells out both the codec and
    the flag that unlocks it, so the caller can fix the call from the
    error text alone.

    Parameters
    ----------
    compression : int
        TIFF Compression tag value (tag 259) from the parsed IFD. It is
        coerced with ``int()`` before the table lookup.
    allow_experimental_codecs : bool
        Opt-in for codecs listed in ``_EXPERIMENTAL_CODECS`` (the
        Tier 3 read paths: LERC, JPEG2000 / J2K, LZ4).
    allow_internal_only_jpeg : bool
        Opt-in for the JPEG-in-TIFF read path. Kept separate from
        ``allow_experimental_codecs`` because internal-only is the
        stricter tier; the experimental flag alone never unlocks JPEG.
    entry_point : str
        Name of the public read function, used as the message prefix.

    Raises
    ------
    ValueError
        If the source is JPEG-compressed and
        ``allow_internal_only_jpeg`` is falsy, or if it uses an
        experimental codec and ``allow_experimental_codecs`` is falsy.

    Notes
    -----
    Tags missing from ``_COMPRESSION_TAG_TO_NAME`` pass through
    untouched; rejecting unknown compression values is the decoder's
    job, not this gate's.
    """
    tag = int(compression)
    if tag not in _COMPRESSION_TAG_TO_NAME:
        # Not a codec the reader knows by name: nothing to gate here.
        return
    name = _COMPRESSION_TAG_TO_NAME[tag]

    # Internal-only tier is checked first and has its own flag.
    jpeg_blocked = name == 'jpeg' and not allow_internal_only_jpeg
    if jpeg_blocked:
        jpeg_message = (
            f"{entry_point}: source uses compression='jpeg' (TIFF "
            "tag 259 = 7), which is internal-only: the encoder writes "
            "self-contained JFIF tiles without the TIFF JPEGTables tag "
            "(347), so the read path is not interoperable with libtiff "
            "/ GDAL / rasterio. Pass allow_internal_only_jpeg=True to "
            "opt in to the internal-reader-only decode path. See "
            "SUPPORTED_FEATURES tier 'internal_only' for codec.jpeg "
            "(epic #2340, original gate #1845).")
        raise ValueError(jpeg_message)

    # Everything that is either non-experimental or explicitly opted in
    # is accepted.
    if name not in _EXPERIMENTAL_CODECS or allow_experimental_codecs:
        return

    experimental_message = (
        f"{entry_point}: source uses compression={name!r} "
        "which is experimental on the read side: cross-backend "
        "numerical parity is not claimed and reader support across "
        "GDAL versions is uneven. Pass allow_experimental_codecs="
        "True to opt in, or re-encode the source with a stable "
        "lossless codec ('deflate', 'zstd', or 'lzw'). See "
        f"SUPPORTED_FEATURES tier 'experimental' for codec.{name} "
        "(epic #2340, original writer gate #2137).")
    raise ValueError(experimental_message)


# Writer rich-tag attrs that ride the Experimental tier (PR 4 of epic
# #2340). ``writer.gdal_metadata_xml`` and ``writer.extra_tags`` carry
# free-form payloads through to the on-disk TIFF; their interop with
# other readers (rasterio, libtiff, GDAL) depends on the payload and is
# not part of the release promise. The opt-in keeps the surface narrow
# without removing the capability.
#
# Round-trip exemption: when the attrs carry the
# ``_xrspatial_geotiff_contract`` marker, they came from a previous
# xrspatial read. The reader populated ``gdal_metadata_xml`` /
# ``extra_tags`` from the source file; gating the write would force
# every read-then-write caller to opt in. Skip the gate on
# round-tripped attrs so the canonical contract from #1984 stays a
# no-flag operation. The gate still fires when a caller adds those
# attrs to a fresh DataArray that did not come from a read.
def _validate_write_rich_tag_optin(
attrs: dict,
*,
gdal_metadata_xml_kwarg: object = None,
extra_tags_kwarg: object = None,
allow_experimental_codecs: bool,
entry_point: str = "to_geotiff",
) -> None:
"""Reject writes that include ``gdal_metadata_xml`` or ``extra_tags``
unless the caller opted in via ``allow_experimental_codecs=True``.

Part of PR 4 of epic #2340. Mirrors the existing codec-flag shape
so the rejection names the same opt-in the caller already learned
from the LERC / J2K / LZ4 paths. Round-tripped attrs (carrying
the ``_xrspatial_geotiff_contract`` marker) are exempt so the
canonical attrs round-trip (#1984) stays a no-flag operation; the
gate fires only when a caller constructs a fresh DataArray with
one of the rich-tag attrs set.
"""
if allow_experimental_codecs:
return
# Round-trip exemption: a DataArray that came from
# ``open_geotiff`` / ``read_geotiff_dask`` / ``read_geotiff_gpu``
# carries the contract marker. Writing it back is the canonical
# round-trip and should not require a new flag (issue #1984).
#
# This is a soft gate by design: a caller who hand-builds an
# attrs dict with the contract marker could bypass it. Forging
# the marker is a deliberate act, and the alternative (gating
# every read-then-write call) would break the canonical attrs
# round-trip that downstream code already depends on. The hard
# guarantee is "fresh DataArrays carrying these attrs need the
# opt-in"; the soft exemption keeps round-trips frictionless.
if '_xrspatial_geotiff_contract' in attrs:
return
triggered: list[str] = []
if attrs.get('gdal_metadata_xml') is not None:
triggered.append("attrs['gdal_metadata_xml']")
if attrs.get('extra_tags') is not None:
triggered.append("attrs['extra_tags']")
if gdal_metadata_xml_kwarg is not None:
triggered.append('gdal_metadata_xml kwarg')
if extra_tags_kwarg is not None:
triggered.append('extra_tags kwarg')
if not triggered:
return
raise ValueError(
f"{entry_point}: {', '.join(triggered)} pass-through is "
"experimental: the on-disk bytes are written verbatim and "
"interop with other readers (rasterio, libtiff, GDAL) depends "
"on the payload. Pass allow_experimental_codecs=True to opt "
"in to the rich-tag write path, or drop the attr before the "
"write. See SUPPORTED_FEATURES tier 'experimental' for "
"writer.gdal_metadata_xml / writer.extra_tags (epic #2340).")


# TIFF type ids needed when synthesizing extra_tags entries from attrs.
_TIFF_BYTE = 1
_TIFF_ASCII = 2
Expand Down
58 changes: 51 additions & 7 deletions xrspatial/geotiff/_backends/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def read_geotiff_dask(source: str, *,
missing_sources: str = _MISSING_SOURCES_SENTINEL,
allow_rotated: bool = False,
allow_unparseable_crs: bool = False,
allow_experimental_codecs: bool = False,
allow_internal_only_jpeg: bool = False,
band_nodata: str | None = None,
mask_nodata: bool = True) -> xr.DataArray:
"""Read a GeoTIFF as a dask-backed DataArray for out-of-core processing.
Expand Down Expand Up @@ -124,6 +126,16 @@ def read_geotiff_dask(source: str, *,
instead of carrying the unrecognised payload through
``attrs['crs_wkt']``. See ``open_geotiff`` for the full
description.
allow_experimental_codecs : bool, default False
[advanced] Read-side opt-in for Tier 3 experimental codecs
(``lerc``, ``jpeg2000`` / ``j2k``, ``lz4``). Fires at graph
build, before any chunk task is scheduled. See ``open_geotiff``
for the full description (epic #2340 PR 4).
allow_internal_only_jpeg : bool, default False
[advanced] Read-side opt-in for JPEG-in-TIFF sources. Not
covered by ``allow_experimental_codecs``. See ``open_geotiff``
for the full description (epic #2340 PR 4, original writer gate
#1845).
on_gpu_failure : str, optional
[internal-only] Accepted for cross-backend signature symmetry
only. The dask path runs CPU decoders, so passing this kwarg
Expand Down Expand Up @@ -315,6 +327,7 @@ def read_geotiff_dask(source: str, *,
# Stash IFD photometric for the MinIsWhite nodata-inversion check below.
geo_info._ifd_photometric = http_ifd.photometric
geo_info._ifd_samples_per_pixel = http_ifd.samples_per_pixel
geo_info._ifd_compression = http_ifd.compression
else:
# Metadata-only read: O(1) memory via mmap, no pixel decompression.
# Lazy import for the same circular-import reason as ``read_vrt``
Expand All @@ -323,6 +336,28 @@ def read_geotiff_dask(source: str, *,
geo_info, full_h, full_w, file_dtype, n_bands = _read_geo_info(
source, overview_level=overview_level,
allow_rotated=allow_rotated)

# Reject experimental / internal-only codecs at graph build, before
# any chunk task is scheduled. The compression tag is stashed on
# ``geo_info`` by ``_read_geo_info`` (local / fsspec) and by the
# HTTP / fsspec branch above. PR 4 of epic #2340.
#
# ``getattr(..., None)`` is intentional: a synthesised geo_info
# (non-TIFF source) carries no compression tag, so the gate must
# skip rather than reject. Every TIFF source path stashes
# ``_ifd_compression`` in lockstep with ``_ifd_photometric`` and
# ``_ifd_samples_per_pixel`` so the skip never silently bypasses
# a real TIFF read.
_compression_tag = getattr(geo_info, '_ifd_compression', None)
if _compression_tag is not None:
from .._attrs import _validate_read_codec_optin
_validate_read_codec_optin(
_compression_tag,
allow_experimental_codecs=allow_experimental_codecs,
allow_internal_only_jpeg=allow_internal_only_jpeg,
entry_point="read_geotiff_dask",
)

# PR-C #2226: centralize the nodata lifecycle in one value object.
# ``raw_sentinel`` carries the pre-inversion sentinel that
# ``attrs['nodata']`` must preserve; ``effective_sentinel`` is what
Expand Down Expand Up @@ -553,7 +588,11 @@ def read_geotiff_dask(source: str, *,
target_dtype=target_dtype,
http_meta_key=http_meta_key,
max_pixels=max_pixels,
allow_rotated=allow_rotated),
allow_rotated=allow_rotated,
allow_experimental_codecs=(
allow_experimental_codecs),
allow_internal_only_jpeg=(
allow_internal_only_jpeg)),
shape=block_shape,
dtype=target_dtype,
)
Expand All @@ -575,7 +614,9 @@ def read_geotiff_dask(source: str, *,

def _delayed_read_window(source, r0, c0, r1, c1, overview_level, nodata,
band, *, target_dtype=None, http_meta_key=None,
max_pixels=None, allow_rotated=False):
max_pixels=None, allow_rotated=False,
allow_experimental_codecs=False,
allow_internal_only_jpeg=False):
"""Dask-delayed function to read a single window.

*http_meta_key* is an optional ``Delayed[(TIFFHeader, IFD)]`` parsed
Expand Down Expand Up @@ -631,11 +672,14 @@ def _read(http_meta):
_r2a_kwargs = {}
if max_pixels is not None:
_r2a_kwargs['max_pixels'] = max_pixels
arr, _ = _read_to_array(source, window=(r0, c0, r1, c1),
overview_level=overview_level,
band=band,
allow_rotated=allow_rotated,
**_r2a_kwargs)
arr, _ = _read_to_array(
source, window=(r0, c0, r1, c1),
overview_level=overview_level,
band=band,
allow_rotated=allow_rotated,
allow_experimental_codecs=allow_experimental_codecs,
allow_internal_only_jpeg=allow_internal_only_jpeg,
**_r2a_kwargs)
if nodata is not None:
# ``arr`` was just decoded by ``_fetch_decode_cog_http_tiles``
# or ``read_to_array``; both return freshly-allocated buffers
Expand Down
Loading
Loading