diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index 6da17cb3..e68128e7 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -194,8 +194,13 @@ def read_geotiff_dask(source: str, *, # and ``_CloudSource`` satisfies that contract. Going through it # bounds metadata reads to ``MAX_HTTP_HEADER_BYTES`` instead of # fetching the whole remote object up front. See PR #1755 review. - from .._reader import _is_fsspec_uri, _is_http_url - is_http = _is_http_url(source) + # Local imports: backend modules avoid eager-importing the reader / + # sources layer at module load so the package can be imported without + # urllib3 in environments that only consume the dask path. + # Issues #2323 / #2332. + from .._reader import _is_fsspec_uri + from .._sources import _is_http_source + is_http = _is_http_source(source) is_fsspec = isinstance(source, str) and _is_fsspec_uri(source) http_meta = None http_meta_key = None @@ -573,8 +578,8 @@ def _read(http_meta): # fsspec-addressable remotes (s3://, gs://, az://, memory://, ...). # Both source classes expose ``read_range``, which is all # ``_fetch_decode_cog_http_tiles`` needs. - from .._reader import _is_http_url as _ihu - _is_http_src = _ihu(source) + from .._sources import _is_http_source as _ihs + _is_http_src = _ihs(source) _is_fsspec_src = False if http_meta is not None and isinstance(source, str) and \ not _is_http_src: diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index cdab4946..695b9aa2 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -316,6 +316,7 @@ def read_geotiff_gpu(source: str, *, from .._header import parse_all_ifds, parse_header, select_overview_ifd, validate_tile_layout from .._reader import (MAX_PIXELS_DEFAULT, _check_dimensions, _FileSource, _is_fsspec_uri, _max_tile_bytes_from_env, _resolve_masked_fill) + from .._sources import _is_http_source # ``source`` is already coerced above (before the dispatch # validator); no need to re-coerce here. @@ -346,9 +347,8 @@ def read_geotiff_gpu(source: str, *, # whole image either way for the eager path; the trade-off is a CPU # decode instead of nvCOMP-on-GPU. Callers who want bounded GPU # memory should pass ``chunks=...``. - from .._reader import _is_http_url if isinstance(source, str) and ( - _is_http_url(source) + _is_http_source(source) or _is_fsspec_uri(source)): return _read_geotiff_gpu_eager_via_cpu( source, dtype=dtype, window=window, @@ -1075,9 +1075,9 @@ def _gds_chunk_path_available(source, ifd, has_sparse_tile, orientation): # import failure would silently let an HTTP URL into the kvikio # branch (which opens the path as a local file and panics). The # canonical case-insensitive helper is a sibling module, so the - # import is safe at module load time (#2323). - from .._sources import _is_http_url - if _is_http_url(source): + # import is safe at module load time. Issues #2323 / #2332. + from .._sources import _is_http_source + if _is_http_source(source): return False try: from .._reader import _is_fsspec_uri diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index f1351868..1963b6d1 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -93,7 +93,8 @@ _BytesIOSource, _CloudSource, _coerce_path, _FileSource, _get_http_pool, _get_pinned_conn_classes, _http_allow_private_hosts, _http_connect_timeout, _http_read_timeout, _http_timeout_from_env, _HTTPSource, _ip_is_private, - _is_file_like, _is_fsspec_uri, _is_http_url, _make_pinned_pool, + _is_file_like, _is_fsspec_uri, _is_http_source, _is_http_url, + _make_pinned_pool, _max_coalesced_range_bytes_from_env, _max_tile_bytes_from_env, _mmap_cache, _mmap_cache_size_from_env, _MmapCache, _open_source, _resolve_max_cloud_bytes, _validate_http_url, coalesce_ranges, @@ -142,7 +143,7 @@ def _read_to_array(source, *, window=None, overview_level: int | None = None, (np.ndarray, GeoInfo) tuple """ source = _coerce_path(source) - if _is_http_url(source): + if _is_http_source(source): return _read_cog_http(source, overview_level=overview_level, band=band, max_pixels=max_pixels, window=window, allow_rotated=allow_rotated) diff --git a/xrspatial/geotiff/_sidecar.py b/xrspatial/geotiff/_sidecar.py index 6fcf2ac2..c84a18cc 100644 --- a/xrspatial/geotiff/_sidecar.py +++ b/xrspatial/geotiff/_sidecar.py @@ -26,9 +26,7 @@ # ``_reader`` imports ``_sidecar`` lazily (inside functions), so this # top-level import does not form a cycle at module load time. from ._reader import _is_fsspec_uri -# Canonical case-insensitive http(s) check (#2323). Reused via the -# wrapper ``_is_http_url`` below so existing imports keep working. -from ._sources import _is_http_url as _canonical_is_http_url +from ._sources import _is_http_source #: Type of the bytes-like buffer a sidecar carries: an mmap for local #: files, bytes for HTTP / fsspec downloads. Narrowed from ``object`` @@ -46,9 +44,13 @@ class SidecarOverviews(NamedTuple): def _is_http_url(source: str) -> bool: - # Delegate to the canonical case-insensitive check so uppercase - # ``HTTP://`` URLs cannot dodge SSRF validation (issue #2323). - return _canonical_is_http_url(source) + """Case-insensitive HTTP(S) scheme test for sidecar routing. + + Delegates to :func:`xrspatial.geotiff._sources._is_http_source` so + the SSRF-relevant routing decision matches the rest of the package + (issues #2323 / #2332). + """ + return _is_http_source(source) def find_sidecar(source) -> str | None: diff --git a/xrspatial/geotiff/_sources.py b/xrspatial/geotiff/_sources.py index 39cdbe4f..217271d4 100644 --- a/xrspatial/geotiff/_sources.py +++ b/xrspatial/geotiff/_sources.py @@ -1452,32 +1452,45 @@ def close(self): _CLOUD_SCHEMES = ('s3://', 'gs://', 'az://', 'abfs://') -def _is_http_url(path) -> bool: - """Return True if *path* is an ``http://`` or ``https://`` URL. - - Case-insensitive: URL schemes are case-insensitive per RFC 3986, so an - uppercase ``HTTP://`` or mixed-case ``Http://`` must dispatch to the - SSRF-validating :class:`_HTTPSource`, not to the fsspec branch. See - issue #2323. +def _is_http_source(source) -> bool: + """Return True if ``source`` is an HTTP(S) URL, case-insensitively. + + Centralized so every routing call site in ``xrspatial/geotiff/`` + classifies the scheme the same way. Before this helper existed, + each call site did ``source.startswith(('http://', 'https://'))``, + which is case-sensitive and let ``HTTP://example.internal/...`` + (uppercase) slip past :class:`_HTTPSource` and the SSRF allow-list + in :func:`_validate_http_url`. Per RFC 3986 section 3.1 URI schemes + are case-insensitive, so any uppercase / mixed-case variant has to + route through the same validator as ``http`` / ``https``. + + Non-string inputs (``None``, ``bytes``, ``os.PathLike``, file-like + objects) return ``False`` so callers can drop the surrounding + ``isinstance(_, str)`` check where they want to. Issues #2323 / #2332. """ - if not isinstance(path, str): - return False - try: - scheme = urlparse(path).scheme - except (ValueError, TypeError): + if not isinstance(source, str) or not source: return False - return scheme.lower() in ('http', 'https') + # ``urlparse`` strips off the scheme cleanly even for unusual inputs + # (e.g. ``HTTP:`` with no ``//``) and avoids the prefix-tuple trap. + return urlparse(source).scheme.lower() in ('http', 'https') + + +# Back-compat alias: earlier patches (#2323) shipped this same helper under +# the name ``_is_http_url`` and downstream tests / re-exports still use that +# name. Keep the alias so importers and the regression tests stay green. +_is_http_url = _is_http_source def _is_fsspec_uri(path: str) -> bool: """Check if a path is a fsspec-compatible URI (not http/https/local). Excludes http(s) case-insensitively so uppercase URLs cannot dodge the - SSRF allow-list and pinned DNS in :class:`_HTTPSource` (issue #2323). + SSRF allow-list and pinned DNS in :class:`_HTTPSource` (issues #2323 / + #2332). """ if not isinstance(path, str): return False - if _is_http_url(path): + if _is_http_source(path): return False return '://' in path @@ -1658,7 +1671,7 @@ def _open_source(source): raise TypeError( f"source must be a str path/URL or a binary file-like object " f"with read+seek methods, got {type(source).__name__}") - if _is_http_url(source): + if _is_http_source(source): return _HTTPSource(source) if _is_fsspec_uri(source): return _CloudSource(source) diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index 53f614ec..96bf710f 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -61,13 +61,11 @@ # (``_write``, ``_write_streaming``) and external importers (the # ``_writers`` subpackage, tests, ``_gpu_decode``) keep using the # ``xrspatial.geotiff._writer`` import path. +from ._sources import _is_http_source from ._write_layout import (BO, _assemble_cog_layout, _assemble_standard_layout, # noqa: F401 _assemble_tiff, _build_ifd, _compute_classic_ifd_overhead, _float_to_rational, _pack_tag_value, _promote_offsets_to_long8, _serialize_tag_value, _should_use_bigtiff_streaming) -# Canonical case-insensitive http(s) check (#2323) so the writer-side -# fsspec gate cannot be tricked by uppercase URLs. -from ._sources import _is_http_url as _is_http_url_canonical # Tag IDs the writer must never accept from ``extra_tags``. NewSubfileType # (254) is a per-IFD status flag the writer emits on its own for overview @@ -1243,12 +1241,16 @@ def _write_streaming(dask_data, path: str, *, def _is_fsspec_uri(path) -> bool: """Check if a path is a fsspec-compatible URI (string only). - Excludes http(s) case-insensitively so uppercase URLs are not routed - through fsspec on the writer side (issue #2323). + HTTP(S) URLs are deliberately excluded here so the writer can raise + a typed "writes not supported over HTTP" error instead of handing + the URL to fsspec. Uses :func:`_sources._is_http_source` so the + HTTP detection is case-insensitive (RFC 3986); without that, an + uppercase ``HTTP://...`` slipped past this check and into fsspec. + Issues #2323 / #2332. """ if not isinstance(path, str): return False - if _is_http_url_canonical(path): + if _is_http_source(path): return False return '://' in path @@ -1280,6 +1282,16 @@ def _write_bytes(file_bytes: bytes | bytearray, path) -> None: path.write(file_bytes) return + # Reject HTTP(S) write targets with a typed error before the local + # file path tries to treat the URL as a filename. ``_is_http_source`` + # is case-insensitive so ``HTTP://...`` reports the same friendly + # error as ``http://...``. Issue #2332. + if isinstance(path, str) and _is_http_source(path): + raise NotImplementedError( + f"Writes are not supported over HTTP(S). Got {path!r}. " + "Write to a local path or an fsspec-supported cloud URL " + "(s3://, gs://, az://, ...) instead.") + if _is_fsspec_uri(path): try: import fsspec diff --git a/xrspatial/geotiff/tests/test_http_scheme_case_2321.py b/xrspatial/geotiff/tests/test_http_scheme_case_2321.py new file mode 100644 index 00000000..e2ed2c0f --- /dev/null +++ b/xrspatial/geotiff/tests/test_http_scheme_case_2321.py @@ -0,0 +1,318 @@ +"""Case-insensitive HTTP(S) scheme routing for SSRF protection (#2332). + +Issue #2321 sub-task 5. + +Background +---------- +Several routing call sites in ``xrspatial/geotiff/`` historically used +``startswith(('http://', 'https://'))`` to decide whether a string source +should be opened by ``_HTTPSource`` (which runs SSRF + DNS-pinning checks +via ``_validate_http_url``) or handed off to fsspec. That comparison is +case-sensitive, so a URL like ``HTTP://169.254.169.254/latest/meta-data`` +slipped past ``_HTTPSource`` entirely and fell through to fsspec, which +has no SSRF allow-list. Uppercase schemes are valid per RFC 3986 sect. 3.1 +(``scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )``, case-insensitive). + +The fix centralizes scheme detection on a single helper, ``_is_http_source``, +that does ``urlparse(url).scheme.lower() in ('http', 'https')``, and routes +every call site through it. + +These tests exercise: + +* The helper itself across mixed-case schemes. +* ``_open_source`` returning ``_HTTPSource`` for uppercase URLs. +* The dispatch boolean in every other call site (reader, writer, sidecar, + dask backend, gpu backend, fsspec classifier). +* The SSRF allow-list still rejecting uppercase URLs that resolve to + private / loopback / link-local addresses. + +All tests are offline: ``socket.getaddrinfo`` is monkeypatched so the +validator never opens a real connection. +""" +from __future__ import annotations + +import socket + +import pytest + +from xrspatial.geotiff import UnsafeURLError +from xrspatial.geotiff import _reader as _reader_mod +from xrspatial.geotiff import _sources as _sources_mod + + +# --------------------------------------------------------------------------- +# Helpers (mirrors test_ssrf_hardening_1664.py) +# --------------------------------------------------------------------------- + + +def _fake_getaddrinfo(ip: str): + def _resolver(host, port, *args, **kwargs): + if ':' in ip: + return [(socket.AF_INET6, socket.SOCK_STREAM, 0, '', + (ip, port or 0, 0, 0))] + return [(socket.AF_INET, socket.SOCK_STREAM, 0, '', + (ip, port or 0))] + return _resolver + + +# --------------------------------------------------------------------------- +# Helper unit tests +# --------------------------------------------------------------------------- + + +class TestIsHttpSourceHelper: + """``_is_http_source`` is the single source of truth for HTTP routing.""" + + @pytest.mark.parametrize("url", [ + 'http://example.com/x.tif', + 'https://example.com/x.tif', + 'HTTP://example.com/x.tif', + 'HTTPS://example.com/x.tif', + 'Http://example.com/x.tif', + 'hTTpS://example.com/x.tif', + 'http://EXAMPLE.COM/x.tif', # host case must not matter either + ]) + def test_http_schemes_match(self, url): + assert _sources_mod._is_http_source(url) is True + + @pytest.mark.parametrize("url", [ + 's3://bucket/key.tif', + 'gs://bucket/key.tif', + 'az://container/blob.tif', + 'abfs://container/blob.tif', + 'file:///etc/passwd', + 'ftp://example.com/x.tif', + 'gopher://example.com/', + 'memory://x.tif', + '/local/path/file.tif', + 'relative/path.tif', + 'C:\\windows\\file.tif', + ]) + def test_non_http_schemes_do_not_match(self, url): + assert _sources_mod._is_http_source(url) is False + + @pytest.mark.parametrize("value", [None, 42, b'http://x', object()]) + def test_non_string_does_not_match(self, value): + # Be defensive: routing call sites also gate on isinstance(_, str) + # in some places, but the helper itself must not raise on junk. + assert _sources_mod._is_http_source(value) is False + + def test_empty_string_does_not_match(self): + assert _sources_mod._is_http_source('') is False + + def test_scheme_only_prefix_does_not_match(self): + # ``urlparse('http')`` returns scheme=''; only ``http:`` or + # ``http://`` should classify as HTTP. + assert _sources_mod._is_http_source('http') is False + + def test_scheme_colon_no_slashes_classifies_as_http(self): + # ``urlparse('http:foo').scheme == 'http'``: this is broader than + # the old ``startswith('http://')`` gate but is RFC-correct. The + # validator rejects these downstream as "no hostname", so the + # security posture is unchanged. Locking the broader classifier + # in here keeps any future tightening explicit. Issue #2332. + assert _sources_mod._is_http_source('http:foo') is True + assert _sources_mod._is_http_source('HTTP:foo') is True + + def test_open_source_http_colon_no_hostname_raises(self): + # End-to-end follow-up: ``_open_source('http:foo')`` now routes + # into ``_HTTPSource``, which calls ``_validate_http_url`` and + # raises ``UnsafeURLError('... has no hostname')``. The previous + # case-sensitive gate would have sent this to fsspec instead. + with pytest.raises(UnsafeURLError): + _sources_mod._open_source('http:foo') + + +# --------------------------------------------------------------------------- +# Dispatch: ``_open_source`` must route uppercase URLs through ``_HTTPSource`` +# --------------------------------------------------------------------------- + + +class TestOpenSourceRoutesUppercase: + """``_open_source('HTTP://...')`` must build an ``_HTTPSource``. + + We intercept ``_HTTPSource.__init__`` so the test never opens a real + HTTP connection; getting the call at all is what we are verifying. + """ + + def test_uppercase_http_routes_to_http_source(self, monkeypatch): + calls = [] + + def _fake_init(self, url, *args, **kwargs): + calls.append(url) + # Skip the real validator / urllib3 pool setup. + self._url = url + + monkeypatch.setattr( + _sources_mod._HTTPSource, '__init__', _fake_init) + src = _sources_mod._open_source('HTTP://example.com/x.tif') + assert isinstance(src, _sources_mod._HTTPSource) + assert calls == ['HTTP://example.com/x.tif'] + + def test_uppercase_https_routes_to_http_source(self, monkeypatch): + calls = [] + + def _fake_init(self, url, *args, **kwargs): + calls.append(url) + self._url = url + + monkeypatch.setattr( + _sources_mod._HTTPSource, '__init__', _fake_init) + src = _sources_mod._open_source('HTTPS://example.com/x.tif') + assert isinstance(src, _sources_mod._HTTPSource) + assert calls == ['HTTPS://example.com/x.tif'] + + def test_mixed_case_routes_to_http_source(self, monkeypatch): + calls = [] + + def _fake_init(self, url, *args, **kwargs): + calls.append(url) + self._url = url + + monkeypatch.setattr( + _sources_mod._HTTPSource, '__init__', _fake_init) + src = _sources_mod._open_source('hTTpS://example.com/x.tif') + assert isinstance(src, _sources_mod._HTTPSource) + assert calls == ['hTTpS://example.com/x.tif'] + + +# --------------------------------------------------------------------------- +# Dispatch booleans elsewhere in the code base +# --------------------------------------------------------------------------- + + +class TestDispatchBooleansAreCaseInsensitive: + """Every routing site must use the centralized helper, not startswith. + + Each call site below historically read:: + + source.startswith(('http://', 'https://')) + + which is the bug. We assert ``_is_http_source`` returns True for the + uppercase forms; the implementation modules import and call the same + helper at the dispatch site. + """ + + @pytest.mark.parametrize("url", [ + 'HTTP://example.com/x.tif', + 'HTTPS://example.com/x.tif', + 'Http://example.com/x.tif', + ]) + def test_helper_recognizes_uppercase(self, url): + assert _sources_mod._is_http_source(url) is True + + def test_is_fsspec_uri_excludes_uppercase_http(self): + # ``_is_fsspec_uri`` is the partner classifier in both + # ``_sources.py`` and ``_writer.py``. If it returned True for + # ``HTTP://...`` the writer would hand the URL to fsspec instead + # of raising the typed "writes not supported over HTTP" error. + assert _sources_mod._is_fsspec_uri('HTTP://example.com/x.tif') is False + assert _sources_mod._is_fsspec_uri('HTTPS://example.com/x.tif') is False + # sanity: real fsspec URIs still classify as fsspec + assert _sources_mod._is_fsspec_uri('s3://b/k.tif') is True + + def test_writer_is_fsspec_uri_excludes_uppercase_http(self): + from xrspatial.geotiff import _writer as _writer_mod + assert _writer_mod._is_fsspec_uri('HTTP://example.com/x.tif') is False + assert _writer_mod._is_fsspec_uri('HTTPS://example.com/x.tif') is False + assert _writer_mod._is_fsspec_uri('s3://b/k.tif') is True + + def test_sidecar_helper_is_case_insensitive(self): + from xrspatial.geotiff import _sidecar as _sidecar_mod + assert _sidecar_mod._is_http_url('HTTP://example.com/x.tif') is True + assert _sidecar_mod._is_http_url('HTTPS://example.com/x.tif') is True + assert _sidecar_mod._is_http_url('http://example.com/x.tif') is True + assert _sidecar_mod._is_http_url('s3://b/k.tif') is False + + +# --------------------------------------------------------------------------- +# End-to-end: uppercase scheme + private host must still be rejected +# --------------------------------------------------------------------------- + + +class TestUppercaseSchemeStillRejectsPrivateHosts: + """The whole point of the fix: uppercase URLs go through the SSRF gate. + + Before the fix, ``HTTP://169.254.169.254/...`` would skip the validator + and try to open via fsspec. After the fix, it routes through + ``_HTTPSource``, which calls ``_validate_http_url``, which raises + ``UnsafeURLError``. + """ + + @pytest.mark.parametrize("scheme", ['HTTP', 'HTTPS', 'Http', 'hTTpS']) + @pytest.mark.parametrize("ip", [ + '127.0.0.1', + '169.254.169.254', + '10.0.0.1', + '192.168.1.1', + '0.0.0.0', + ]) + def test_private_host_rejected_regardless_of_scheme_case( + self, monkeypatch, scheme, ip): + monkeypatch.setattr(socket, 'getaddrinfo', _fake_getaddrinfo(ip)) + url = f'{scheme}://attacker.test/x.tif' + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(url) + + @pytest.mark.parametrize("scheme", ['HTTP', 'HTTPS', 'Http']) + def test_localhost_rejected_regardless_of_scheme_case( + self, monkeypatch, scheme): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(f'{scheme}://localhost:8080/x.tif') + + @pytest.mark.parametrize("scheme", ['HTTP', 'HTTPS', 'Http']) + def test_uppercase_scheme_to_127_literal_rejected( + self, monkeypatch, scheme): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(f'{scheme}://127.0.0.1/x.tif') + + def test_open_source_uppercase_private_host_raises(self, monkeypatch): + """End-to-end: ``_open_source`` -> ``_HTTPSource`` -> validator. + + Confirms the dispatch wiring actually drives the URL through the + validator (not just that the validator works in isolation). + """ + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('169.254.169.254')) + # Make sure the env override is not set; the validator skips + # resolution when ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS`` is on. + monkeypatch.delenv( + 'XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', raising=False) + with pytest.raises(UnsafeURLError): + _sources_mod._open_source( + 'HTTP://metadata.google.internal/computeMetadata/v1/') + + +# --------------------------------------------------------------------------- +# Writer: HTTP(S) destinations must raise a typed error, not a raw OSError +# --------------------------------------------------------------------------- + + +class TestWriterRejectsHttpTargets: + """``_write_bytes(_, 'HTTP://...')`` must raise ``NotImplementedError``. + + Without the early gate the uppercase URL fell through ``_is_fsspec_uri`` + (correctly returns False) and into the local file write path, which + surfaced an OS-specific ``OSError`` for the colon-in-filename. The + typed error matches the lowercase-HTTP behaviour and points users at + the supported destinations. Follow-up to issue #2332 review. + """ + + @pytest.mark.parametrize("url", [ + 'http://example.com/x.tif', + 'https://example.com/x.tif', + 'HTTP://example.com/x.tif', + 'HTTPS://example.com/x.tif', + 'Http://example.com/x.tif', + ]) + def test_write_bytes_rejects_http(self, url): + from xrspatial.geotiff import _writer as _writer_mod + with pytest.raises(NotImplementedError) as excinfo: + _writer_mod._write_bytes(b'IIxxxx', url) + msg = str(excinfo.value) + assert 'HTTP' in msg + assert url in msg