From 28b8177b40034925eb493799bf7ec0a699940675 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 08:16:17 -0700 Subject: [PATCH 1/2] geotiff: golden corpus phase 3 PR 5, HTTP/COG backend (#1930) The HTTP path in ``xrspatial.geotiff`` is COG-only: ``open_geotiff('http://...')`` routes to ``_read_cog_http`` which uses range requests to fetch only metadata and the tiles a window needs. Plain stripped/tiled fixtures would require a full-object download, which is a different code path; this module focuses narrowly on the COG fixture. The in-process HTTP server reuses the range-supporting ``BaseHTTPRequestHandler`` pattern from ``test_http_meta_buffer_1718.py``. It binds an ephemeral 127.0.0.1 port, serves the COG fixture's bytes, and lets the test read through ``open_geotiff(url)`` with the SSRF guard opened for loopback via ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1``. The single corpus COG fixture (``cog_internal_overview_uint16``) passes the oracle through the HTTP path. A ``test_at_least_one_cog_fixture_exists`` guard surfaces the case where a future refactor accidentally drops the only COG entry. --- .../tests/test_golden_corpus_http_1930.py | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 xrspatial/geotiff/tests/test_golden_corpus_http_1930.py diff --git a/xrspatial/geotiff/tests/test_golden_corpus_http_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_http_1930.py new file mode 100644 index 000000000..cd852cb31 --- /dev/null +++ b/xrspatial/geotiff/tests/test_golden_corpus_http_1930.py @@ -0,0 +1,172 @@ +"""HTTP/COG backend cells against the golden-corpus oracle (issue #1930). + +Phase 3 PR 5 of the corpus plan. The HTTP path in ``xrspatial.geotiff`` +is COG-only: ``open_geotiff('http://...')`` routes to +``_reader._read_cog_http`` which uses range requests to fetch only +metadata and the tiles a window needs. 
+The plain stripped / tiled +fixtures in the corpus would require a full-object download to read, +which is a different code path; this module focuses narrowly on the +COG fixture. + +The in-process HTTP server is the same range-supporting +``BaseHTTPRequestHandler`` pattern used by +``test_http_meta_buffer_1718.py`` and friends. It serves the COG +fixture bytes on a random ``127.0.0.1`` port, and the test reads +``http://127.0.0.1:<port>/<fixture_id>.tif`` through ``open_geotiff``. The +server is bound to localhost, which the HTTPSource's SSRF guard +normally rejects; the ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1`` env +var (the same hatch every other in-process HTTP test uses) opens the +loopback path for tests. + +The shared parity gaps (``_PARITY_GAPS``) carry over from the eager / +dask / GPU modules: the codec and attrs layers are the same. The +current COG fixture is ``cog_internal_overview_uint16`` which is not +in any gap or skip list, so the HTTP path should pass cleanly. +""" +from __future__ import annotations + +import http.server +import pathlib +import socketserver +import threading + +import pytest + +pytest.importorskip("yaml") +pytest.importorskip("rasterio") + +from xrspatial.geotiff import open_geotiff # noqa: E402 +from xrspatial.geotiff.tests.golden_corpus import generate # noqa: E402 +from xrspatial.geotiff.tests.golden_corpus._oracle import ( # noqa: E402 + compare_to_oracle, +) + + +FIXTURES_DIR = ( + pathlib.Path(generate.__file__).resolve().parent / "fixtures" +) + + +def _cog_fixture_ids() -> list[str]: + """Return manifest ids for fixtures the COG HTTP reader can serve. + + Only ``cog: true`` entries qualify. Returns sorted ids for stable + parametrize output. + """ + manifest = generate.load_manifest() + entries = generate.validate(manifest) + ids = sorted(e["id"] for e in entries if e.get("cog")) + return ids + + +class _RangeHandler(http.server.BaseHTTPRequestHandler): + """Serve a single fixture's bytes with HTTP Range support. 
+ + The bound subclass (built per-test via ``type(...)`` to bind a + different payload) is what gets handed to ``socketserver.TCPServer``. + """ + + payload: bytes = b"" + + def do_GET(self): # noqa: N802 + rng = self.headers.get("Range") + if rng and rng.startswith("bytes="): + spec = rng[len("bytes="):] + start_s, _, end_s = spec.partition("-") + start = int(start_s) + end = int(end_s) if end_s else len(self.payload) - 1 + chunk = self.payload[start:end + 1] + self.send_response(206) + self.send_header("Content-Type", "application/octet-stream") + self.send_header( + "Content-Range", + f"bytes {start}-{start + len(chunk) - 1}/{len(self.payload)}", + ) + self.send_header("Content-Length", str(len(chunk))) + self.end_headers() + self.wfile.write(chunk) + return + self.send_response(200) + self.send_header("Content-Type", "application/octet-stream") + self.send_header("Content-Length", str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + def log_message(self, *_args, **_kwargs): + # Quiet the per-request stderr noise that BaseHTTPRequestHandler + # otherwise emits. + pass + + +def _serve(payload: bytes) -> tuple[socketserver.TCPServer, threading.Thread]: + handler_cls = type( + "_RangeHandlerBound", (_RangeHandler,), {"payload": payload} + ) + httpd = socketserver.TCPServer(("127.0.0.1", 0), handler_cls) + thread = threading.Thread(target=httpd.serve_forever, daemon=True) + thread.start() + return httpd, thread + + +@pytest.fixture +def allow_private_http(monkeypatch): + """Open the SSRF guard for loopback addresses for the duration of one test. + + ``_HTTPSource`` normally rejects ``127.0.0.1`` and other private + hosts. The same env var that every other in-process HTTP test in + the repo uses (``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS``) reopens + the loopback path. 
+ """ + monkeypatch.setenv("XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS", "1") + + +_COG_IDS = _cog_fixture_ids() + + +@pytest.mark.skipif( + not _COG_IDS, + reason="no COG fixtures in the manifest; nothing to serve over HTTP", +) +@pytest.mark.parametrize("fixture_id", _COG_IDS, ids=_COG_IDS) +def test_http_cog_parity(fixture_id: str, allow_private_http) -> None: + """``open_geotiff(http_url)`` for a COG fixture agrees with the oracle. + + Serves the fixture's bytes on an ephemeral 127.0.0.1 port, reads + through the HTTP COG path, and compares against the rasterio + reference read of the same on-disk file. The fixture and the + served bytes are byte-identical, so any mismatch points at the + HTTP plumbing rather than at the codec. + """ + path = FIXTURES_DIR / f"{fixture_id}.tif" + if not path.exists(): + pytest.skip( + f"COG fixture {fixture_id!r} has no .tif on disk; run " + f"`python -m xrspatial.geotiff.tests.golden_corpus.generate`" + ) + + with open(path, "rb") as f: + payload = f.read() + + httpd, _thread = _serve(payload) + try: + host, port = httpd.server_address + url = f"http://{host}:{port}/{fixture_id}.tif" + candidate = open_geotiff(url) + compare_to_oracle(path, candidate) + finally: + httpd.shutdown() + httpd.server_close() + + +def test_at_least_one_cog_fixture_exists() -> None: + """Sanity check: the corpus has something for this backend to exercise. + + Today there is exactly one COG fixture (``cog_internal_overview_uint16``, + phase 2 PR 7). If a future refactor accidentally drops the only + COG entry, this test surfaces it so the HTTP cells do not silently + cover nothing. 
+ """ + assert _COG_IDS, ( + "manifest has no COG fixtures; the HTTP backend has nothing to " + "test against" + ) From e9e66e1093097d6ac269cb7f166d03caca1d3489 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 08:57:39 -0700 Subject: [PATCH 2/2] geotiff: HTTP/COG review follow-ups (#1930) Address phase 3 PR 5 review: * Pull the ``lossy`` flag from the manifest entry via a new ``_is_lossy(fixture_id)`` helper. The hardcoded ``lossy=False`` in the oracle call would silently fail bit-exact comparison if a future lossy COG (e.g. JPEG-COG) landed in the corpus. * Add ``_resolved_fixtures()`` so the manifest is parsed once per helper. Mirrors the eager / dask / GPU modules. * Update the module docstring to reflect that the current corpus has no COG fixture in a parity gap; document the path forward if one is added. * Join the HTTP server thread with a 2 s timeout in the test's ``finally`` block so a hung ``httpd.shutdown()`` surfaces cleanly rather than waiting on process exit. --- .../tests/test_golden_corpus_http_1930.py | 40 ++++++++++++++----- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/xrspatial/geotiff/tests/test_golden_corpus_http_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_http_1930.py index cd852cb31..5f4bab8d5 100644 --- a/xrspatial/geotiff/tests/test_golden_corpus_http_1930.py +++ b/xrspatial/geotiff/tests/test_golden_corpus_http_1930.py @@ -18,10 +18,12 @@ var (the same hatch every other in-process HTTP test uses) opens the loopback path for tests. -The shared parity gaps (``_PARITY_GAPS``) carry over from the eager / -dask / GPU modules: the codec and attrs layers are the same. The -current COG fixture is ``cog_internal_overview_uint16`` which is not -in any gap or skip list, so the HTTP path should pass cleanly. 
+The current corpus has one COG fixture (``cog_internal_overview_uint16``) +which is not subject to any of the parity gaps the eager / dask / GPU +modules flag (no integer nodata, EPSG-coded CRS, single band), so this +module ships without a ``_PARITY_GAPS`` table. If a future COG fixture +hits one of the shared gaps, mirror the eager module's tables and the +``_build_param`` plumbing here. """ from __future__ import annotations @@ -47,16 +49,30 @@ ) +def _resolved_fixtures() -> list[dict]: + """Return manifest entries with defaults merged, sorted by id.""" + manifest = generate.load_manifest() + entries = generate.validate(manifest) + entries.sort(key=lambda e: e["id"]) + return entries + + def _cog_fixture_ids() -> list[str]: """Return manifest ids for fixtures the COG HTTP reader can serve. Only ``cog: true`` entries qualify. Returns sorted ids for stable parametrize output. """ - manifest = generate.load_manifest() - entries = generate.validate(manifest) - ids = sorted(e["id"] for e in entries if e.get("cog")) - return ids + return [e["id"] for e in _resolved_fixtures() if e.get("cog")] + + +def _is_lossy(fixture_id: str) -> bool: + """Look up the lossy flag for a fixture id from the manifest.""" + for e in _resolved_fixtures(): + if e["id"] == fixture_id: + tol = e.get("tolerance") or {} + return bool(tol.get("lossy", False)) + return False class _RangeHandler(http.server.BaseHTTPRequestHandler): @@ -147,15 +163,19 @@ def test_http_cog_parity(fixture_id: str, allow_private_http) -> None: with open(path, "rb") as f: payload = f.read() - httpd, _thread = _serve(payload) + httpd, thread = _serve(payload) try: host, port = httpd.server_address url = f"http://{host}:{port}/{fixture_id}.tif" candidate = open_geotiff(url) - compare_to_oracle(path, candidate) + compare_to_oracle(path, candidate, lossy=_is_lossy(fixture_id)) finally: httpd.shutdown() httpd.server_close() + # Daemon threads exit on process tear-down, but joining here + # makes test-by-test cleanup 
deterministic; the timeout only caps the + # wait, it does not raise if the thread is still alive. + thread.join(timeout=2.0) def test_at_least_one_cog_fixture_exists() -> None: