From 58eb65f1c6d13e3ad06435e29699b2974f01ebb3 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 22 May 2026 19:42:33 -0700 Subject: [PATCH 1/2] Route HTTP(S) scheme detection through a case-insensitive helper (#2332) Several call sites in xrspatial/geotiff/ classified HTTP(S) sources with startswith(('http://', 'https://')). That comparison is case-sensitive, so URLs like HTTP://169.254.169.254/... slipped past _HTTPSource and the SSRF / DNS-pinning checks in _validate_http_url and fell through to fsspec. RFC 3986 section 3.1 defines URI schemes as case-insensitive. Adds _is_http_source(s) in _sources.py, backed by urlparse(s).scheme.lower() in ('http', 'https'), and routes every dispatch site through it: _sources.py (_is_fsspec_uri + _open_source), _reader.py, _writer.py (_is_fsspec_uri), _backends/dask.py, _backends/gpu.py (eager-via-CPU branch + GDS opt-in gate), and the _sidecar.py _is_http_url helper. New regression tests in test_http_scheme_case_2321.py cover the helper, _open_source dispatch for HTTP / HTTPS / Http / hTTpS, the partner _is_fsspec_uri classifiers in both _sources and _writer, the sidecar helper, and the end-to-end SSRF rejection of uppercase URLs that resolve to 127.0.0.1, 169.254.169.254, 10.0.0.1, 192.168.1.1, 0.0.0.0, and localhost. socket.getaddrinfo is monkeypatched so the tests stay offline. Parent: #2321 sub-task 5. --- xrspatial/geotiff/_backends/dask.py | 6 +- xrspatial/geotiff/_backends/gpu.py | 6 +- xrspatial/geotiff/_reader.py | 4 +- xrspatial/geotiff/_sidecar.py | 9 +- xrspatial/geotiff/_sources.py | 28 +- xrspatial/geotiff/_writer.py | 13 +- .../tests/test_http_scheme_case_2321.py | 270 ++++++++++++++++++ 7 files changed, 323 insertions(+), 13 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_http_scheme_case_2321.py diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index 95f7b09a..548b7f1e 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -194,11 +194,9 @@ def read_geotiff_dask(source: str, *, # and ``_CloudSource`` satisfies that contract. Going through it # bounds metadata reads to ``MAX_HTTP_HEADER_BYTES`` instead of # fetching the whole remote object up front. See PR #1755 review. - is_http = ( - isinstance(source, str) - and source.startswith(('http://', 'https://')) - ) + from .._sources import _is_http_source from .._reader import _is_fsspec_uri + is_http = _is_http_source(source) is_fsspec = isinstance(source, str) and _is_fsspec_uri(source) http_meta = None http_meta_key = None diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index 92ba91f0..65ff6ed3 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -316,6 +316,7 @@ def read_geotiff_gpu(source: str, *, from .._header import parse_all_ifds, parse_header, select_overview_ifd, validate_tile_layout from .._reader import (MAX_PIXELS_DEFAULT, _check_dimensions, _FileSource, _is_fsspec_uri, _max_tile_bytes_from_env, _resolve_masked_fill) + from .._sources import _is_http_source # ``source`` is already coerced above (before the dispatch # validator); no need to re-coerce here. @@ -347,7 +348,7 @@ def read_geotiff_gpu(source: str, *, # decode instead of nvCOMP-on-GPU. Callers who want bounded GPU # memory should pass ``chunks=...``. if isinstance(source, str) and ( - source.startswith(('http://', 'https://')) + _is_http_source(source) or _is_fsspec_uri(source)): return _read_geotiff_gpu_eager_via_cpu( source, dtype=dtype, window=window, @@ -1053,7 +1054,8 @@ def _gds_chunk_path_available(source, ifd, has_sparse_tile, orientation): """ if not isinstance(source, str): return False - if source.startswith(('http://', 'https://')): + from .._sources import _is_http_source + if _is_http_source(source): return False try: from .._reader import _is_fsspec_uri diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index f9a3003e..f6ebd604 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -93,7 +93,7 @@ _BytesIOSource, _CloudSource, _coerce_path, _FileSource, _get_http_pool, _get_pinned_conn_classes, _http_allow_private_hosts, _http_connect_timeout, _http_read_timeout, _http_timeout_from_env, _HTTPSource, _ip_is_private, - _is_file_like, _is_fsspec_uri, _make_pinned_pool, + _is_file_like, _is_fsspec_uri, _is_http_source, _make_pinned_pool, _max_coalesced_range_bytes_from_env, _max_tile_bytes_from_env, _mmap_cache, _mmap_cache_size_from_env, _MmapCache, _open_source, _resolve_max_cloud_bytes, _validate_http_url, coalesce_ranges, @@ -142,7 +142,7 @@ def _read_to_array(source, *, window=None, overview_level: int | None = None, (np.ndarray, GeoInfo) tuple """ source = _coerce_path(source) - if isinstance(source, str) and source.startswith(('http://', 'https://')): + if _is_http_source(source): return _read_cog_http(source, overview_level=overview_level, band=band, max_pixels=max_pixels, window=window, allow_rotated=allow_rotated) diff --git a/xrspatial/geotiff/_sidecar.py b/xrspatial/geotiff/_sidecar.py index 050900f0..76970f2c 100644 --- a/xrspatial/geotiff/_sidecar.py +++ b/xrspatial/geotiff/_sidecar.py @@ -43,7 +43,14 @@ class SidecarOverviews(NamedTuple): def _is_http_url(source: str) -> bool: - return source.startswith(("http://", "https://")) + """Case-insensitive HTTP(S) scheme test for sidecar routing. + + Delegates to :func:`xrspatial.geotiff._sources._is_http_source` so + the SSRF-relevant routing decision matches the rest of the package. + Issue #2332. + """ + from ._sources import _is_http_source + return _is_http_source(source) def find_sidecar(source) -> str | None: diff --git a/xrspatial/geotiff/_sources.py b/xrspatial/geotiff/_sources.py index c774f465..8484a836 100644 --- a/xrspatial/geotiff/_sources.py +++ b/xrspatial/geotiff/_sources.py @@ -1451,11 +1451,35 @@ def close(self): _CLOUD_SCHEMES = ('s3://', 'gs://', 'az://', 'abfs://') +def _is_http_source(source) -> bool: + """Return True if ``source`` is an HTTP(S) URL, case-insensitively. + + Centralized so every routing call site in ``xrspatial/geotiff/`` + classifies the scheme the same way. Before this helper existed, + each call site did ``source.startswith(('http://', 'https://'))``, + which is case-sensitive and let ``HTTP://example.internal/...`` + (uppercase) slip past :class:`_HTTPSource` and the SSRF allow-list + in :func:`_validate_http_url`. Per RFC 3986 section 3.1 URI schemes + are case-insensitive, so any uppercase / mixed-case variant has to + route through the same validator as ``http`` / ``https``. + + Non-string inputs (``None``, ``bytes``, ``os.PathLike``, file-like + objects) return ``False`` so callers can drop the surrounding + ``isinstance(_, str)`` check where they want to. Issue #2332. + """ + if not isinstance(source, str) or not source: + return False + # ``urlparse`` strips off the scheme cleanly even for unusual inputs + # (e.g. ``HTTP:`` with no ``//``) and avoids the prefix-tuple trap. + from urllib.parse import urlparse + return urlparse(source).scheme.lower() in ('http', 'https') + + def _is_fsspec_uri(path: str) -> bool: """Check if a path is a fsspec-compatible URI (not http/https/local).""" if not isinstance(path, str): return False - if path.startswith(('http://', 'https://')): + if _is_http_source(path): return False return '://' in path @@ -1636,7 +1660,7 @@ def _open_source(source): raise TypeError( f"source must be a str path/URL or a binary file-like object " f"with read+seek methods, got {type(source).__name__}") - if source.startswith(('http://', 'https://')): + if _is_http_source(source): return _HTTPSource(source) if _is_fsspec_uri(source): return _CloudSource(source) diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index 5230430b..2a2eb797 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -1238,10 +1238,19 @@ def _write_streaming(dask_data, path: str, *, def _is_fsspec_uri(path) -> bool: - """Check if a path is a fsspec-compatible URI (string only).""" + """Check if a path is a fsspec-compatible URI (string only). + + HTTP(S) URLs are deliberately excluded here so the writer can raise + a typed "writes not supported over HTTP" error instead of handing + the URL to fsspec. Uses :func:`_sources._is_http_source` so the + HTTP detection is case-insensitive (RFC 3986); without that, an + uppercase ``HTTP://...`` slipped past this check and into fsspec. + Issue #2332. + """ + from ._sources import _is_http_source if not isinstance(path, str): return False - if path.startswith(('http://', 'https://')): + if _is_http_source(path): return False return '://' in path diff --git a/xrspatial/geotiff/tests/test_http_scheme_case_2321.py b/xrspatial/geotiff/tests/test_http_scheme_case_2321.py new file mode 100644 index 00000000..a811e7e4 --- /dev/null +++ b/xrspatial/geotiff/tests/test_http_scheme_case_2321.py @@ -0,0 +1,270 @@ +"""Case-insensitive HTTP(S) scheme routing for SSRF protection (#2332). + +Issue #2321 sub-task 5. + +Background +---------- +Several routing call sites in ``xrspatial/geotiff/`` historically used +``startswith(('http://', 'https://'))`` to decide whether a string source +should be opened by ``_HTTPSource`` (which runs SSRF + DNS-pinning checks +via ``_validate_http_url``) or handed off to fsspec. That comparison is +case-sensitive, so a URL like ``HTTP://169.254.169.254/latest/meta-data`` +slipped past ``_HTTPSource`` entirely and fell through to fsspec, which +has no SSRF allow-list. Uppercase schemes are valid per RFC 3986 sect. 3.1 +(``scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )``, case-insensitive). + +The fix centralizes scheme detection on a single helper, ``_is_http_source``, +that does ``urlparse(url).scheme.lower() in ('http', 'https')``, and routes +every call site through it. + +These tests exercise: + +* The helper itself across mixed-case schemes. +* ``_open_source`` returning ``_HTTPSource`` for uppercase URLs. +* The dispatch boolean in every other call site (reader, writer, sidecar, + dask backend, gpu backend, fsspec classifier). +* The SSRF allow-list still rejecting uppercase URLs that resolve to + private / loopback / link-local addresses. + +All tests are offline: ``socket.getaddrinfo`` is monkeypatched so the +validator never opens a real connection. +""" +from __future__ import annotations + +import socket + +import pytest + +from xrspatial.geotiff import UnsafeURLError +from xrspatial.geotiff import _reader as _reader_mod +from xrspatial.geotiff import _sources as _sources_mod + + +# --------------------------------------------------------------------------- +# Helpers (mirrors test_ssrf_hardening_1664.py) +# --------------------------------------------------------------------------- + + +def _fake_getaddrinfo(ip: str): + def _resolver(host, port, *args, **kwargs): + if ':' in ip: + return [(socket.AF_INET6, socket.SOCK_STREAM, 0, '', + (ip, port or 0, 0, 0))] + return [(socket.AF_INET, socket.SOCK_STREAM, 0, '', + (ip, port or 0))] + return _resolver + + +# --------------------------------------------------------------------------- +# Helper unit tests +# --------------------------------------------------------------------------- + + +class TestIsHttpSourceHelper: + """``_is_http_source`` is the single source of truth for HTTP routing.""" + + @pytest.mark.parametrize("url", [ + 'http://example.com/x.tif', + 'https://example.com/x.tif', + 'HTTP://example.com/x.tif', + 'HTTPS://example.com/x.tif', + 'Http://example.com/x.tif', + 'hTTpS://example.com/x.tif', + 'http://EXAMPLE.COM/x.tif', # host case must not matter either + ]) + def test_http_schemes_match(self, url): + assert _sources_mod._is_http_source(url) is True + + @pytest.mark.parametrize("url", [ + 's3://bucket/key.tif', + 'gs://bucket/key.tif', + 'az://container/blob.tif', + 'abfs://container/blob.tif', + 'file:///etc/passwd', + 'ftp://example.com/x.tif', + 'gopher://example.com/', + 'memory://x.tif', + '/local/path/file.tif', + 'relative/path.tif', + 'C:\\windows\\file.tif', + ]) + def test_non_http_schemes_do_not_match(self, url): + assert _sources_mod._is_http_source(url) is False + + @pytest.mark.parametrize("value", [None, 42, b'http://x', object()]) + def test_non_string_does_not_match(self, value): + # Be defensive: routing call sites also gate on isinstance(_, str) + # in some places, but the helper itself must not raise on junk. + assert _sources_mod._is_http_source(value) is False + + def test_empty_string_does_not_match(self): + assert _sources_mod._is_http_source('') is False + + def test_scheme_only_prefix_does_not_match(self): + # ``urlparse('http')`` returns scheme=''; only ``http:`` or + # ``http://`` should classify as HTTP. + assert _sources_mod._is_http_source('http') is False + + +# --------------------------------------------------------------------------- +# Dispatch: ``_open_source`` must route uppercase URLs through ``_HTTPSource`` +# --------------------------------------------------------------------------- + + +class TestOpenSourceRoutesUppercase: + """``_open_source('HTTP://...')`` must build an ``_HTTPSource``. + + We intercept ``_HTTPSource.__init__`` so the test never opens a real + HTTP connection; getting the call at all is what we are verifying. + """ + + def test_uppercase_http_routes_to_http_source(self, monkeypatch): + calls = [] + + def _fake_init(self, url, *args, **kwargs): + calls.append(url) + # Skip the real validator / urllib3 pool setup. + self._url = url + + monkeypatch.setattr( + _sources_mod._HTTPSource, '__init__', _fake_init) + src = _sources_mod._open_source('HTTP://example.com/x.tif') + assert isinstance(src, _sources_mod._HTTPSource) + assert calls == ['HTTP://example.com/x.tif'] + + def test_uppercase_https_routes_to_http_source(self, monkeypatch): + calls = [] + + def _fake_init(self, url, *args, **kwargs): + calls.append(url) + self._url = url + + monkeypatch.setattr( + _sources_mod._HTTPSource, '__init__', _fake_init) + src = _sources_mod._open_source('HTTPS://example.com/x.tif') + assert isinstance(src, _sources_mod._HTTPSource) + assert calls == ['HTTPS://example.com/x.tif'] + + def test_mixed_case_routes_to_http_source(self, monkeypatch): + calls = [] + + def _fake_init(self, url, *args, **kwargs): + calls.append(url) + self._url = url + + monkeypatch.setattr( + _sources_mod._HTTPSource, '__init__', _fake_init) + src = _sources_mod._open_source('hTTpS://example.com/x.tif') + assert isinstance(src, _sources_mod._HTTPSource) + assert calls == ['hTTpS://example.com/x.tif'] + + +# --------------------------------------------------------------------------- +# Dispatch booleans elsewhere in the code base +# --------------------------------------------------------------------------- + + +class TestDispatchBooleansAreCaseInsensitive: + """Every routing site must use the centralized helper, not startswith. + + Each call site below historically read:: + + source.startswith(('http://', 'https://')) + + which is the bug. We assert ``_is_http_source`` returns True for the + uppercase forms; the implementation modules import and call the same + helper at the dispatch site. + """ + + @pytest.mark.parametrize("url", [ + 'HTTP://example.com/x.tif', + 'HTTPS://example.com/x.tif', + 'Http://example.com/x.tif', + ]) + def test_helper_recognizes_uppercase(self, url): + assert _sources_mod._is_http_source(url) is True + + def test_is_fsspec_uri_excludes_uppercase_http(self): + # ``_is_fsspec_uri`` is the partner classifier in both + # ``_sources.py`` and ``_writer.py``. If it returned True for + # ``HTTP://...`` the writer would hand the URL to fsspec instead + # of raising the typed "writes not supported over HTTP" error. + assert _sources_mod._is_fsspec_uri('HTTP://example.com/x.tif') is False + assert _sources_mod._is_fsspec_uri('HTTPS://example.com/x.tif') is False + # sanity: real fsspec URIs still classify as fsspec + assert _sources_mod._is_fsspec_uri('s3://b/k.tif') is True + + def test_writer_is_fsspec_uri_excludes_uppercase_http(self): + from xrspatial.geotiff import _writer as _writer_mod + assert _writer_mod._is_fsspec_uri('HTTP://example.com/x.tif') is False + assert _writer_mod._is_fsspec_uri('HTTPS://example.com/x.tif') is False + assert _writer_mod._is_fsspec_uri('s3://b/k.tif') is True + + def test_sidecar_helper_is_case_insensitive(self): + from xrspatial.geotiff import _sidecar as _sidecar_mod + assert _sidecar_mod._is_http_url('HTTP://example.com/x.tif') is True + assert _sidecar_mod._is_http_url('HTTPS://example.com/x.tif') is True + assert _sidecar_mod._is_http_url('http://example.com/x.tif') is True + assert _sidecar_mod._is_http_url('s3://b/k.tif') is False + + +# --------------------------------------------------------------------------- +# End-to-end: uppercase scheme + private host must still be rejected +# --------------------------------------------------------------------------- + + +class TestUppercaseSchemeStillRejectsPrivateHosts: + """The whole point of the fix: uppercase URLs go through the SSRF gate. + + Before the fix, ``HTTP://169.254.169.254/...`` would skip the validator + and try to open via fsspec. After the fix, it routes through + ``_HTTPSource``, which calls ``_validate_http_url``, which raises + ``UnsafeURLError``. + """ + + @pytest.mark.parametrize("scheme", ['HTTP', 'HTTPS', 'Http', 'hTTpS']) + @pytest.mark.parametrize("ip", [ + '127.0.0.1', + '169.254.169.254', + '10.0.0.1', + '192.168.1.1', + '0.0.0.0', + ]) + def test_private_host_rejected_regardless_of_scheme_case( + self, monkeypatch, scheme, ip): + monkeypatch.setattr(socket, 'getaddrinfo', _fake_getaddrinfo(ip)) + url = f'{scheme}://attacker.test/x.tif' + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(url) + + @pytest.mark.parametrize("scheme", ['HTTP', 'HTTPS', 'Http']) + def test_localhost_rejected_regardless_of_scheme_case( + self, monkeypatch, scheme): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(f'{scheme}://localhost:8080/x.tif') + + @pytest.mark.parametrize("scheme", ['HTTP', 'HTTPS', 'Http']) + def test_uppercase_scheme_to_127_literal_rejected( + self, monkeypatch, scheme): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(f'{scheme}://127.0.0.1/x.tif') + + def test_open_source_uppercase_private_host_raises(self, monkeypatch): + """End-to-end: ``_open_source`` -> ``_HTTPSource`` -> validator. + + Confirms the dispatch wiring actually drives the URL through the + validator (not just that the validator works in isolation). + """ + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('169.254.169.254')) + # Make sure the env override is not set; the validator skips + # resolution when ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS`` is on. + monkeypatch.delenv( + 'XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', raising=False) + with pytest.raises(UnsafeURLError): + _sources_mod._open_source( + 'HTTP://metadata.google.internal/computeMetadata/v1/') From 532d5bcd86109cc0bc757c5dee91031b60d5c5b6 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 22 May 2026 19:50:06 -0700 Subject: [PATCH 2/2] Address review: hoist imports, typed HTTP write error, lock broadening (#2332) Follow-ups on the /review-pr pass for PR #2337: * Hoist ``from urllib.parse import urlparse`` to ``_sources.py`` module top so ``_is_http_source`` no longer pays the import cost per call. * Hoist ``from ._sources import _is_http_source`` to module top in ``_writer.py`` and ``_sidecar.py`` (no cycle: neither is imported by ``_sources``). * Keep the function-scope imports in ``_backends/dask.py`` and ``_backends/gpu.py``: the backend modules deliberately defer the reader / sources import so the package can be imported without urllib3 in dask-only environments. Add a one-line comment so the next reader does not assume it is an oversight. * Reject HTTP(S) write targets in ``_write_bytes`` with a typed ``NotImplementedError`` before the local file write path interprets the URL as a filename. The error message lists the supported destinations. Covers both lowercase and uppercase scheme casing. * Lock the behavioural broadening: ``_is_http_source('http:foo')`` now returns True (urlparse-correct, broader than the old ``startswith``). Added a test asserting the classifier matches and that ``_open_source('http:foo')`` raises ``UnsafeURLError`` so the validator still rejects it as "no hostname". * Added 7 new test cases (helper + open_source + 5 writer-rejection parametrizations); full geotiff suite: 5324 passed, 68 skipped. --- xrspatial/geotiff/_backends/dask.py | 5 +- xrspatial/geotiff/_backends/gpu.py | 2 + xrspatial/geotiff/_sidecar.py | 2 +- xrspatial/geotiff/_sources.py | 2 +- xrspatial/geotiff/_writer.py | 12 ++++- .../tests/test_http_scheme_case_2321.py | 48 +++++++++++++++++++ 6 files changed, 67 insertions(+), 4 deletions(-) diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index 548b7f1e..26228d0b 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -194,8 +194,11 @@ def read_geotiff_dask(source: str, *, # and ``_CloudSource`` satisfies that contract. Going through it # bounds metadata reads to ``MAX_HTTP_HEADER_BYTES`` instead of # fetching the whole remote object up front. See PR #1755 review. - from .._sources import _is_http_source + # Local imports: backend modules avoid eager-importing the reader / + # sources layer at module load so the package can be imported without + # urllib3 in environments that only consume the dask path. Issue #2332. from .._reader import _is_fsspec_uri + from .._sources import _is_http_source is_http = _is_http_source(source) is_fsspec = isinstance(source, str) and _is_fsspec_uri(source) http_meta = None diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index 65ff6ed3..14d3abcf 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -1054,6 +1054,8 @@ def _gds_chunk_path_available(source, ifd, has_sparse_tile, orientation): """ if not isinstance(source, str): return False + # Local import: backend gate is called once per source and the GDS + # path is optional, so avoid eager top-level coupling. Issue #2332. from .._sources import _is_http_source if _is_http_source(source): return False diff --git a/xrspatial/geotiff/_sidecar.py b/xrspatial/geotiff/_sidecar.py index 76970f2c..e2f6bd1f 100644 --- a/xrspatial/geotiff/_sidecar.py +++ b/xrspatial/geotiff/_sidecar.py @@ -26,6 +26,7 @@ # ``_reader`` imports ``_sidecar`` lazily (inside functions), so this # top-level import does not form a cycle at module load time. from ._reader import _is_fsspec_uri +from ._sources import _is_http_source #: Type of the bytes-like buffer a sidecar carries: an mmap for local #: files, bytes for HTTP / fsspec downloads. Narrowed from ``object`` @@ -49,7 +50,6 @@ def _is_http_url(source: str) -> bool: the SSRF-relevant routing decision matches the rest of the package. Issue #2332. """ - from ._sources import _is_http_source return _is_http_source(source) diff --git a/xrspatial/geotiff/_sources.py b/xrspatial/geotiff/_sources.py index 8484a836..637172a6 100644 --- a/xrspatial/geotiff/_sources.py +++ b/xrspatial/geotiff/_sources.py @@ -31,6 +31,7 @@ import threading from collections import OrderedDict from concurrent.futures import ThreadPoolExecutor +from urllib.parse import urlparse import urllib3 @@ -1471,7 +1472,6 @@ def _is_http_source(source) -> bool: return False # ``urlparse`` strips off the scheme cleanly even for unusual inputs # (e.g. ``HTTP:`` with no ``//``) and avoids the prefix-tuple trap. - from urllib.parse import urlparse return urlparse(source).scheme.lower() in ('http', 'https') diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index 2a2eb797..df040d57 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -61,6 +61,7 @@ # (``_write``, ``_write_streaming``) and external importers (the # ``_writers`` subpackage, tests, ``_gpu_decode``) keep using the # ``xrspatial.geotiff._writer`` import path. +from ._sources import _is_http_source from ._write_layout import (BO, _assemble_cog_layout, _assemble_standard_layout, # noqa: F401 _assemble_tiff, _build_ifd, _compute_classic_ifd_overhead, _float_to_rational, _pack_tag_value, _promote_offsets_to_long8, @@ -1247,7 +1248,6 @@ def _is_fsspec_uri(path) -> bool: uppercase ``HTTP://...`` slipped past this check and into fsspec. Issue #2332. """ - from ._sources import _is_http_source if not isinstance(path, str): return False if _is_http_source(path): @@ -1282,6 +1282,16 @@ def _write_bytes(file_bytes: bytes | bytearray, path) -> None: path.write(file_bytes) return + # Reject HTTP(S) write targets with a typed error before the local + # file path tries to treat the URL as a filename. ``_is_http_source`` + # is case-insensitive so ``HTTP://...`` reports the same friendly + # error as ``http://...``. Issue #2332. + if isinstance(path, str) and _is_http_source(path): + raise NotImplementedError( + f"Writes are not supported over HTTP(S). Got {path!r}. " + "Write to a local path or an fsspec-supported cloud URL " + "(s3://, gs://, az://, ...) instead.") + if _is_fsspec_uri(path): try: import fsspec diff --git a/xrspatial/geotiff/tests/test_http_scheme_case_2321.py b/xrspatial/geotiff/tests/test_http_scheme_case_2321.py index a811e7e4..e2ed2c0f 100644 --- a/xrspatial/geotiff/tests/test_http_scheme_case_2321.py +++ b/xrspatial/geotiff/tests/test_http_scheme_case_2321.py @@ -105,6 +105,23 @@ def test_scheme_only_prefix_does_not_match(self): # ``http://`` should classify as HTTP. assert _sources_mod._is_http_source('http') is False + def test_scheme_colon_no_slashes_classifies_as_http(self): + # ``urlparse('http:foo').scheme == 'http'``: this is broader than + # the old ``startswith('http://')`` gate but is RFC-correct. The + # validator rejects these downstream as "no hostname", so the + # security posture is unchanged. Locking the broader classifier + # in here keeps any future tightening explicit. Issue #2332. + assert _sources_mod._is_http_source('http:foo') is True + assert _sources_mod._is_http_source('HTTP:foo') is True + + def test_open_source_http_colon_no_hostname_raises(self): + # End-to-end follow-up: ``_open_source('http:foo')`` now routes + # into ``_HTTPSource``, which calls ``_validate_http_url`` and + # raises ``UnsafeURLError('... has no hostname')``. The previous + # case-sensitive gate would have sent this to fsspec instead. + with pytest.raises(UnsafeURLError): + _sources_mod._open_source('http:foo') + # --------------------------------------------------------------------------- # Dispatch: ``_open_source`` must route uppercase URLs through ``_HTTPSource`` @@ -268,3 +285,34 @@ def test_open_source_uppercase_private_host_raises(self, monkeypatch): with pytest.raises(UnsafeURLError): _sources_mod._open_source( 'HTTP://metadata.google.internal/computeMetadata/v1/') + + +# --------------------------------------------------------------------------- +# Writer: HTTP(S) destinations must raise a typed error, not a raw OSError +# --------------------------------------------------------------------------- + + +class TestWriterRejectsHttpTargets: + """``_write_bytes(_, 'HTTP://...')`` must raise ``NotImplementedError``. + + Without the early gate the uppercase URL fell through ``_is_fsspec_uri`` + (correctly returns False) and into the local file write path, which + surfaced an OS-specific ``OSError`` for the colon-in-filename. The + typed error matches the lowercase-HTTP behaviour and points users at + the supported destinations. Follow-up to issue #2332 review. + """ + + @pytest.mark.parametrize("url", [ + 'http://example.com/x.tif', + 'https://example.com/x.tif', + 'HTTP://example.com/x.tif', + 'HTTPS://example.com/x.tif', + 'Http://example.com/x.tif', + ]) + def test_write_bytes_rejects_http(self, url): + from xrspatial.geotiff import _writer as _writer_mod + with pytest.raises(NotImplementedError) as excinfo: + _writer_mod._write_bytes(b'IIxxxx', url) + msg = str(excinfo.value) + assert 'HTTP' in msg + assert url in msg