Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions xrspatial/geotiff/_backends/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,8 @@ def read_geotiff_dask(source: str, *,
# and ``_CloudSource`` satisfies that contract. Going through it
# bounds metadata reads to ``MAX_HTTP_HEADER_BYTES`` instead of
# fetching the whole remote object up front. See PR #1755 review.
is_http = (
isinstance(source, str)
and source.startswith(('http://', 'https://'))
)
from .._reader import _is_fsspec_uri
from .._reader import _is_fsspec_uri, _is_http_url
is_http = _is_http_url(source)
is_fsspec = isinstance(source, str) and _is_fsspec_uri(source)
http_meta = None
http_meta_key = None
Expand Down Expand Up @@ -576,8 +573,8 @@ def _read(http_meta):
# fsspec-addressable remotes (s3://, gs://, az://, memory://, ...).
# Both source classes expose ``read_range``, which is all
# ``_fetch_decode_cog_http_tiles`` needs.
_is_http_src = isinstance(source, str) and source.startswith(
('http://', 'https://'))
from .._reader import _is_http_url as _ihu
_is_http_src = _ihu(source)
_is_fsspec_src = False
if http_meta is not None and isinstance(source, str) and \
not _is_http_src:
Expand Down
11 changes: 9 additions & 2 deletions xrspatial/geotiff/_backends/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,9 @@ def read_geotiff_gpu(source: str, *,
# whole image either way for the eager path; the trade-off is a CPU
# decode instead of nvCOMP-on-GPU. Callers who want bounded GPU
# memory should pass ``chunks=...``.
from .._reader import _is_http_url
if isinstance(source, str) and (
source.startswith(('http://', 'https://'))
_is_http_url(source)
or _is_fsspec_uri(source)):
return _read_geotiff_gpu_eager_via_cpu(
source, dtype=dtype, window=window,
Expand Down Expand Up @@ -1053,7 +1054,13 @@ def _gds_chunk_path_available(source, ifd, has_sparse_tile, orientation):
"""
if not isinstance(source, str):
return False
if source.startswith(('http://', 'https://')):
# The http(s) gate must NOT live inside a ``try/except`` -- a hidden
# import failure would silently let an HTTP URL into the kvikio
# branch (which opens the path as a local file and panics). The
# canonical case-insensitive helper is a sibling module, so the
# import is safe at module load time (#2323).
from .._sources import _is_http_url
if _is_http_url(source):
return False
try:
from .._reader import _is_fsspec_uri
Expand Down
4 changes: 2 additions & 2 deletions xrspatial/geotiff/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
_BytesIOSource, _CloudSource, _coerce_path, _FileSource, _get_http_pool,
_get_pinned_conn_classes, _http_allow_private_hosts, _http_connect_timeout,
_http_read_timeout, _http_timeout_from_env, _HTTPSource, _ip_is_private,
_is_file_like, _is_fsspec_uri, _make_pinned_pool,
_is_file_like, _is_fsspec_uri, _is_http_url, _make_pinned_pool,
_max_coalesced_range_bytes_from_env, _max_tile_bytes_from_env, _mmap_cache,
_mmap_cache_size_from_env, _MmapCache, _open_source,
_resolve_max_cloud_bytes, _validate_http_url, coalesce_ranges,
Expand Down Expand Up @@ -142,7 +142,7 @@ def _read_to_array(source, *, window=None, overview_level: int | None = None,
(np.ndarray, GeoInfo) tuple
"""
source = _coerce_path(source)
if isinstance(source, str) and source.startswith(('http://', 'https://')):
if _is_http_url(source):
return _read_cog_http(source, overview_level=overview_level, band=band,
max_pixels=max_pixels, window=window,
allow_rotated=allow_rotated)
Expand Down
7 changes: 6 additions & 1 deletion xrspatial/geotiff/_sidecar.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
# ``_reader`` imports ``_sidecar`` lazily (inside functions), so this
# top-level import does not form a cycle at module load time.
from ._reader import _is_fsspec_uri
# Canonical case-insensitive http(s) check (#2323). Reused via the
# wrapper ``_is_http_url`` below so existing imports keep working.
from ._sources import _is_http_url as _canonical_is_http_url

#: Type of the bytes-like buffer a sidecar carries: an mmap for local
#: files, bytes for HTTP / fsspec downloads. Narrowed from ``object``
Expand All @@ -43,7 +46,9 @@ class SidecarOverviews(NamedTuple):


def _is_http_url(source: str) -> bool:
return source.startswith(("http://", "https://"))
# Delegate to the canonical case-insensitive check so uppercase
# ``HTTP://`` URLs cannot dodge SSRF validation (issue #2323).
return _canonical_is_http_url(source)


def find_sidecar(source) -> str | None:
Expand Down
28 changes: 25 additions & 3 deletions xrspatial/geotiff/_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import threading
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse

import urllib3

Expand Down Expand Up @@ -1451,11 +1452,32 @@ def close(self):
_CLOUD_SCHEMES = ('s3://', 'gs://', 'az://', 'abfs://')


def _is_http_url(path) -> bool:
"""Return True if *path* is an ``http://`` or ``https://`` URL.

Case-insensitive: URL schemes are case-insensitive per RFC 3986, so an
uppercase ``HTTP://`` or mixed-case ``Http://`` must dispatch to the
SSRF-validating :class:`_HTTPSource`, not to the fsspec branch. See
issue #2323.
"""
if not isinstance(path, str):
return False
try:
scheme = urlparse(path).scheme
except (ValueError, TypeError):
return False
return scheme.lower() in ('http', 'https')


def _is_fsspec_uri(path: str) -> bool:
"""Check if a path is a fsspec-compatible URI (not http/https/local)."""
"""Check if a path is a fsspec-compatible URI (not http/https/local).

Excludes http(s) case-insensitively so uppercase URLs cannot dodge the
SSRF allow-list and pinned DNS in :class:`_HTTPSource` (issue #2323).
"""
if not isinstance(path, str):
return False
if path.startswith(('http://', 'https://')):
if _is_http_url(path):
return False
return '://' in path

Expand Down Expand Up @@ -1636,7 +1658,7 @@ def _open_source(source):
raise TypeError(
f"source must be a str path/URL or a binary file-like object "
f"with read+seek methods, got {type(source).__name__}")
if source.startswith(('http://', 'https://')):
if _is_http_url(source):
return _HTTPSource(source)
if _is_fsspec_uri(source):
return _CloudSource(source)
Expand Down
11 changes: 9 additions & 2 deletions xrspatial/geotiff/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@
_assemble_tiff, _build_ifd, _compute_classic_ifd_overhead,
_float_to_rational, _pack_tag_value, _promote_offsets_to_long8,
_serialize_tag_value, _should_use_bigtiff_streaming)
# Canonical case-insensitive http(s) check (#2323) so the writer-side
# fsspec gate cannot be tricked by uppercase URLs.
from ._sources import _is_http_url as _is_http_url_canonical

# Tag IDs the writer must never accept from ``extra_tags``. NewSubfileType
# (254) is a per-IFD status flag the writer emits on its own for overview
Expand Down Expand Up @@ -1238,10 +1241,14 @@ def _write_streaming(dask_data, path: str, *,


def _is_fsspec_uri(path) -> bool:
"""Check if a path is a fsspec-compatible URI (string only)."""
"""Check if a path is a fsspec-compatible URI (string only).

Excludes http(s) case-insensitively so uppercase URLs are not routed
through fsspec on the writer side (issue #2323).
"""
if not isinstance(path, str):
return False
if path.startswith(('http://', 'https://')):
if _is_http_url_canonical(path):
return False
return '://' in path

Expand Down
Loading
Loading