From cd70929ef1ea4c0bd9fbcc80f1f3d31fb5bc7752 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 09:56:44 -0500 Subject: [PATCH 01/17] feat(waterdata): Add multi-value GET-parameter chunker for OGC API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For multi-value waterdata queries (e.g. monitoring_location_id with ~300+ sites), the GET URL produced by PR #233 blows past the server's ~8 KB nginx buffer and the API returns HTTP 414. This PR adds a chunker that transparently splits long list params across sub-requests so each URL fits the byte budget. The chunker is a decorator applied to ``_fetch_once`` outside the existing ``@filters.chunked`` (CQL chunker), so list-chunking is the outer loop and filter-chunking is the inner loop: @chunking.multi_value_chunked(build_request=_construct_api_requests) @filters.chunked(build_request=_construct_api_requests) def _fetch_once(args): ... Key design points: - ``_plan_chunks`` greedy-halves the largest chunk across all dimensions until the worst-case sub-request fits ``url_limit`` (URL + body, via ``_request_bytes``, so POST routes are sized correctly). Cartesian product of per-dim partitions becomes the sub-request set; capped at ``max_chunks=1000``. - ``_filter_aware_probe_args`` coordinates with ``filters.chunked``: the planner probes URL length using a synthetic clause that matches the inner filter chunker's bail-floor size (longest single clause, scaled by worst-case URL encoding ratio). Without this coordination, the outer planner would raise ``RequestTooLarge`` on combinations the stacked chunkers can actually handle. 
- ``QuotaExhausted`` mid-call guard reads ``x-ratelimit-remaining`` after each sub-request; if it drops below ``quota_safety_floor=50``, the wrapper raises with the partial frame, completed-chunk offset, and last observed remaining quota — letting callers salvage or resume after the rate-limit window resets, rather than crash into a silent mid-pagination 429. - ``RequestTooLarge`` is raised when the smallest reducible plan still exceeds ``url_limit`` (every multi-value param at a singleton chunk and any chunkable filter at the inner chunker's bail floor) or when the cartesian product exceeds ``max_chunks``. - All defaults (``url_limit``, ``max_chunks``, ``quota_safety_floor``) resolve at call time, so monkey-patching ``filters._WATERDATA_URL_BYTE_LIMIT`` for tests / non-default quotas affects the decorator uniformly. Public additions: - ``dataretrieval.waterdata.chunking.multi_value_chunked`` - ``dataretrieval.waterdata.chunking.RequestTooLarge`` - ``dataretrieval.waterdata.chunking.QuotaExhausted`` (carries ``partial_frame``, ``partial_response``, ``completed_chunks``, ``total_chunks``, ``remaining``) Tests (30 new): - ``_filter_aware_probe_args`` worst-case-clause modelling - ``_plan_chunks`` greedy halving, RequestTooLarge floor, filter-chunker coordination, ``max_chunks`` cap, lazy-default reads - ``multi_value_chunked`` pass-through, cartesian-product shape, end-to-end with stacked filter chunker - ``QuotaExhausted`` header parsing, mid-call abort, last-chunk no-abort, zero-floor disable - ``RequestTooLarge`` message contents and triggering conditions End-to-end correctness verified against the live API: identical per-site cell-for-cell output between unchunked (single call) and chunked (forced fan-out via patched limit) paths. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 + dataretrieval/waterdata/api.py | 15 + dataretrieval/waterdata/chunking.py | 478 ++++++++++++++++++++++++ dataretrieval/waterdata/filters.py | 51 ++- dataretrieval/waterdata/utils.py | 61 ++- tests/waterdata_test.py | 561 ++++++++++++++++++++++++++++ 6 files changed, 1146 insertions(+), 22 deletions(-) create mode 100644 dataretrieval/waterdata/chunking.py diff --git a/NEWS.md b/NEWS.md index 7761e29b..3d9a5814 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/17/2026:** The OGC `waterdata` getters (`get_daily`, `get_continuous`, `get_field_measurements`, and the rest of the multi-value-capable functions) now transparently chunk requests whose URLs would otherwise exceed the server's ~8 KB byte limit. A common chained-query pattern — pull a long site list from `get_monitoring_locations`, then feed it into `get_daily` — previously failed with HTTP 414 once the resulting URL grew past the limit; it now fans out across multiple sub-requests under the hood and returns one combined DataFrame. The chunker coordinates with the existing CQL `filter` chunker (long top-level-`OR` filters still split correctly when used alongside long multi-value lists), caps cartesian-product plans at 1000 sub-requests (the default USGS hourly quota), and aborts mid-call with a structured `QuotaExhausted` exception — carrying the partial result and a resume offset — if `x-ratelimit-remaining` drops below a safety floor. Mirrors R `dataRetrieval`'s [#870](https://github.com/DOI-USGS/dataRetrieval/pull/870), generalized to N dimensions. Note one metadata-behavior change for paginated/chunked calls: `BaseMetadata.url` still reflects the user's original query (unchanged), but `BaseMetadata.header` now carries the *last* page's / sub-request's headers (so `x-ratelimit-remaining` is current) rather than the first, and `BaseMetadata.query_time` is now the cumulative wall-clock across pages instead of the first page's elapsed. 
+ **05/16/2026:** Fixed silent truncation in the paginated `waterdata` request loops (`_walk_pages` and `get_stats_data`). Mid-pagination failures (HTTP 429, 5xx, network error) were previously swallowed — pagination would quietly stop and the function would return whatever rows it had collected, leaving callers with truncated DataFrames they had no way to detect. The loops now status-check every page like the initial request and raise `RuntimeError` on any failure, with the upstream exception chained as `__cause__` and a short menu of recovery actions (wait and retry, reduce the request, or obtain an API token) in the message. **Behavior change**: callers that previously consumed partial DataFrames on transient upstream blips will now see an exception; retry the call (possibly with a smaller `limit` or narrower query). **05/07/2026:** Bumped the declared minimum Python version from **3.8** to **3.9** (`pyproject.toml`'s `requires-python` and the ruff target). This brings the manifest in line with what was already being tested — CI's matrix has long covered only 3.9, 3.13, and 3.14, the `waterdata` test module already skipped itself on Python < 3.10, and several modules already use 3.9-only stdlib (e.g. `zoneinfo`). Users on 3.8 will no longer be able to install the package; please upgrade. diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index ad268194..025aafcd 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -230,6 +230,21 @@ def get_daily( ... parameter_code="00060", ... last_modified="P7D", ... ) + + >>> # Chain queries: pull all stream sites in a state, then their + >>> # daily discharge for the last week. The site list can be hundreds + >>> # of values long — the request is transparently chunked across + >>> # multiple sub-requests so the URL stays under the server's byte + >>> # limit. Combined output looks like a single query. 
+ >>> sites_df, _ = dataretrieval.waterdata.get_monitoring_locations( + ... state_name="Ohio", + ... site_type="Stream", + ... ) + >>> df, md = dataretrieval.waterdata.get_daily( + ... monitoring_location_id=sites_df["monitoring_location_id"].tolist(), + ... parameter_code="00060", + ... time="P7D", + ... ) """ service = "daily" output_id = "daily_id" diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py new file mode 100644 index 00000000..bbdf5b12 --- /dev/null +++ b/dataretrieval/waterdata/chunking.py @@ -0,0 +1,478 @@ +"""Multi-value GET-parameter chunking for the Water Data OGC getters. + +PR 233 routes most services through GET with comma-separated values +(e.g. ``monitoring_location_id=USGS-A,USGS-B,...``). Long lists can blow +the server's ~8 KB URL byte limit. This module adds a decorator that +sits OUTSIDE ``filters.chunked`` and splits multi-value list params +across multiple sub-requests so each URL fits. See ``get_daily``'s +docstring for an end-to-end chained-query example. + +Design (orthogonal to filter chunking): + +- N-dimensional cartesian product: for each chunkable list param, the + values are partitioned into sub-lists; the planner emits the cartesian + product of those partitions. Sub-chunks of the same dim never overlap, + so frame concat needs no dedup across multi-value chunks. +- Greedy halving of the largest chunk in any dim until the worst-case + sub-request URL fits the limit. Minimises total request count. +- Date params, ``bbox``, and ``properties`` are not chunked: dates are + intervals not enumerable sets; bbox is a coord array; ``properties`` + determines output schema and chunking it would shard columns. + +Coordination with ``filters.chunked``: +The planner probes the URL with a synthetic clause sized to the inner +chunker's bail floor — ``len(longest_clause) * max(per-clause encoding +ratio)`` — when a chunkable filter is present. 
The inner chunker bails +(emits the full filter) when any single clause's URL-encoded length +exceeds its per-sub-request budget; mirroring +``filters._effective_filter_budget``, that floor already accounts for +the worst per-call encoding ratio, so a long alphanumeric clause +coexisting with a shorter heavily-encoded clause is sized correctly. +Without this coordination, a long OR-filter plus multi-value lists +would trigger a premature ``RequestTooLarge`` even though the combined +chunkers would have made things fit. +""" + +from __future__ import annotations + +import functools +import itertools +import math +from collections.abc import Callable +from typing import Any +from urllib.parse import quote_plus + +import pandas as pd +import requests + +from . import filters +from .filters import ( + _combine_chunk_frames, + _combine_chunk_responses, + _FetchOnce, + _is_chunkable, + _max_per_clause_encoding_ratio, + _split_top_level_or, +) + +# Params that look like lists but must NOT be chunked. ``properties`` is +# excluded because it defines the response schema; chunking it would +# return frames with different columns per sub-request. ``bbox`` is a +# fixed 4-element coord tuple. Date params are intervals not sets. The +# CQL ``filter`` (and its ``filter_lang``) is a string that has its own +# inner chunker (``filters.chunked``); if a caller passes ``filter`` as +# a list, treating it as a multi-value param would emit malformed CQL. +_NEVER_CHUNK = frozenset( + { + "properties", + "bbox", + "datetime", + "last_modified", + "begin", + "begin_utc", + "end", + "end_utc", + "time", + "filter", + "filter_lang", + } +) + +# Default cap on the number of sub-requests a single chunked call may +# emit. The USGS Water Data API rate-limits each HTTP request (including +# pagination), so the true budget is ``hourly_quota / avg_pages_per_chunk``. 
+# 1000 matches the default hourly quota and is a reasonable upper bound +# for single-page sub-requests; tune lower if your queries paginate. +# Override per-decorator via ``max_chunks=`` or by monkeypatching this +# module attribute — both the decorator wrapper and ``_plan_chunks`` +# read it lazily. +_DEFAULT_MAX_CHUNKS = 1000 + +# When ``x-ratelimit-remaining`` drops below this between sub-requests, +# the chunker bails with ``QuotaExhausted`` rather than risk a mid-call +# HTTP 429. Carries the partial result so callers can resume from a +# known offset instead of retrying the whole chunked call from scratch. +_DEFAULT_QUOTA_SAFETY_FLOOR = 50 + +# Response header USGS uses to advertise remaining hourly quota. +_QUOTA_HEADER = "x-ratelimit-remaining" + +# Sentinel returned by ``_read_remaining`` when the response has no +# parseable ``x-ratelimit-remaining`` header. Large enough to beat any +# plausible safety floor so a missing/malformed header doesn't trigger +# spurious ``QuotaExhausted`` aborts. +_QUOTA_UNKNOWN = 10**9 + + +class RequestTooLarge(ValueError): + """Raised when a chunked request cannot be issued. Two cases: + (1) URL exceeds the byte limit even with every multi-value param at + a singleton chunk and any chunkable filter at the inner chunker's + bail-floor size (the URL contribution of its longest single + OR-clause, after URL-encoding); (2) the cartesian-product plan + would issue more than ``max_chunks`` sub-requests.""" + + +class QuotaExhausted(RuntimeError): + """Raised mid-chunked-call when the API's reported remaining quota + (``x-ratelimit-remaining`` header) drops below the configured safety + floor. The chunker stops before issuing the next sub-request to + avoid a mid-call HTTP 429 that would silently truncate paginated + results (see PR #273 for the pagination side of that bug). 
+ + The exception carries everything needed to resume: the combined + partial frame from completed sub-requests, the metadata for the + last successful sub-request, the number of chunks completed out of + the plan total, and the last-observed ``remaining`` value. + + Attributes + ---------- + partial_frame : pd.DataFrame + Concatenated, deduplicated result of every sub-request that + completed before the floor was crossed. + partial_response : requests.Response + Aggregated response with the canonical URL restored to the + user's full original query (rebuilt from ``args`` rather than + copied from the first sub-request, which only carries the + first chunk's parameters). Headers come from the last + completed sub-request (so callers inspecting + ``x-ratelimit-remaining`` see current quota state), and + ``elapsed`` is summed across completed sub-requests. Wrap in + ``BaseMetadata`` to surface to the caller alongside the + partial frame. + completed_chunks : int + Number of sub-requests successfully completed. + total_chunks : int + Total sub-requests in the cartesian-product plan. + remaining : int + Last observed ``x-ratelimit-remaining`` value. + """ + + def __init__( + self, + *, + partial_frame: pd.DataFrame, + partial_response: requests.Response, + completed_chunks: int, + total_chunks: int, + remaining: int, + ) -> None: + super().__init__( + f"x-ratelimit-remaining dropped to {remaining} after " + f"{completed_chunks}/{total_chunks} chunks; aborting to avoid " + f"mid-call HTTP 429. Catch QuotaExhausted to access " + f".partial_frame and resume from chunk {completed_chunks}." 
+ ) + self.partial_frame = partial_frame + self.partial_response = partial_response + self.completed_chunks = completed_chunks + self.total_chunks = total_chunks + self.remaining = remaining + + +def _chunkable_params(args: dict[str, Any]) -> dict[str, list[Any]]: + """Return ``{name: list(values)}`` for every list/tuple kwarg with + >1 element that is allowed to chunk.""" + return { + k: list(v) + for k, v in args.items() + if k not in _NEVER_CHUNK and isinstance(v, (list, tuple)) and len(v) > 1 + } + + +def _filter_aware_probe_args(args: dict[str, Any]) -> dict[str, Any]: + """Substitute the filter with a synthetic ASCII clause sized to the + inner chunker's bail floor, so the planner's URL probe matches what + the inner chunker would emit. + + The inner ``filters.chunked`` bails (emits the full filter) when any + single OR-clause's URL-encoded length exceeds the per-sub-request + budget. Mirroring ``filters._effective_filter_budget``, that floor + is ``len(longest_clause) * max(per-clause encoding ratio)``. + Substituting an ASCII clause of that exact length makes + ``quote_plus`` a no-op, so the URL builder sees exactly the + bail-floor byte count. + """ + filter_expr = args.get("filter") + filter_lang = args.get("filter_lang") + if not _is_chunkable(filter_expr, filter_lang): + return args + parts = _split_top_level_or(filter_expr) + if len(parts) < 2: + return args # one-clause filter — inner chunker can't shrink it + longest_raw = max(len(p) for p in parts) + probe_size = math.ceil(longest_raw * _max_per_clause_encoding_ratio(parts)) + return {**args, "filter": "x" * probe_size} + + +def _chunk_bytes(chunk: list[Any]) -> int: + """URL-encoded byte length of ``chunk`` when comma-joined into a + URL parameter value. + + Used as the planner's biggest-chunk comparator in + ``_worst_case_args`` and the halving loop. 
``quote_plus`` (rather + than raw ``,``-join length) keeps the comparator faithful to what + the real URL builder produces, so values containing characters + that expand under URL encoding (``%``, ``+``, ``/``, ``&``, …) + can't be mis-ranked. For typical USGS multi-value workloads + (alphanumeric IDs and codes) raw and encoded lengths are equal, + but the encoded form is always correct. + """ + return len(quote_plus(",".join(map(str, chunk)))) + + +def _request_bytes(req: requests.PreparedRequest) -> int: + """Total bytes of a prepared request: URL + body. + + GET routes have ``body=None`` and reduce to URL length. POST routes + (CQL2 JSON body) need body bytes — the URL stays short regardless of + payload, so URL-only sizing would underestimate the request and skip + chunking when it's needed. + + Raises ``TypeError`` on non-sizable bodies (generators, file-like + streams). Size-based planning needs a deterministic byte count; + silently treating an unknown body as zero bytes would under-chunk + and let the request blow past the server's POST-body limit. + """ + url_len = len(req.url) + body = req.body + if body is None: + return url_len + if isinstance(body, (bytes, bytearray)): + return url_len + len(body) + if isinstance(body, str): + return url_len + len(body.encode("utf-8")) + raise TypeError( + f"multi_value_chunked cannot size a request body of type " + f"{type(body).__name__!r}; pass str, bytes, or None. Streaming " + f"bodies (generators, file-like) are not supported because the " + f"planner needs a deterministic byte count up front." + ) + + +def _plan_total(plan: dict[str, list[list[Any]]]) -> int: + """Sub-request count a plan will issue: the cartesian product of + per-dim chunk counts. 
Computed in two places (planner's max_chunks + early-bail and wrapper's QuotaExhausted payload) — centralized + here so the two can't drift.""" + return math.prod(len(chunks) for chunks in plan.values()) + + +def _worst_case_args( + probe_args: dict[str, Any], plan: dict[str, list[list[Any]]] +) -> dict[str, Any]: + """Args representing the worst-case sub-request the plan will issue: + each dim's largest chunk (by URL-encoded bytes), composed onto + the ``probe_args`` already returned by ``_filter_aware_probe_args`` + so any chunkable filter sits at the inner chunker's bail-floor + size. The planner feeds these args through ``_request_bytes`` to + decide whether the biggest sub-request fits the budget.""" + out = dict(probe_args) + for k, chunks in plan.items(): + out[k] = max(chunks, key=_chunk_bytes) + return out + + +def _plan_chunks( + args: dict[str, Any], + build_request: Callable[..., Any], + url_limit: int, + max_chunks: int | None = None, +) -> dict[str, list[list[Any]]] | None: + """Greedy halving until the worst-case sub-request fits ``url_limit``. + + Budget is total request bytes (URL + body, via ``_request_bytes``) + so POST routes size correctly — see ``multi_value_chunked`` for the + parameter-name caveat. + + Returns ``None`` when no chunking is needed (request as-is fits or + no chunkable lists). Raises ``RequestTooLarge`` when: + - the smallest reducible plan still exceeds ``url_limit`` (every + multi-value param at a singleton chunk and any chunkable filter + already at the inner chunker's bail-floor size), or + - the cartesian-product plan exceeds ``max_chunks`` sub-requests + (the hourly API budget); checked after each split so we bail + promptly once the cap is unreachable. + + ``max_chunks`` defaults to ``_DEFAULT_MAX_CHUNKS`` resolved at call + time, so monkeypatching the module constant takes effect for + direct callers too. 
+ """ + if max_chunks is None: + max_chunks = _DEFAULT_MAX_CHUNKS + if max_chunks < 1: + raise ValueError( + f"max_chunks must be >= 1; got {max_chunks}. Zero or negative " + f"values would silently bypass the cap on the no-chunking path." + ) + chunkable = _chunkable_params(args) + if not chunkable: + return None + probe_args = _filter_aware_probe_args(args) + if _request_bytes(build_request(**probe_args)) <= url_limit: + return None + + plan: dict[str, list[list[Any]]] = {k: [v] for k, v in chunkable.items()} + + while True: + worst = _worst_case_args(probe_args, plan) + if _request_bytes(build_request(**worst)) <= url_limit: + return plan + + # Largest splittable chunk across all dims, by URL-encoded bytes. + splittable = ( + (dim, idx, chunk) + for dim, dim_chunks in plan.items() + for idx, chunk in enumerate(dim_chunks) + if len(chunk) > 1 + ) + biggest = max(splittable, key=lambda t: _chunk_bytes(t[2]), default=None) + if biggest is None: + raise RequestTooLarge( + f"Request exceeds {url_limit} bytes (URL + body) at the " + f"smallest reducible plan: every multi-value parameter " + f"at a singleton chunk and any chunkable filter at the " + f"inner chunker's bail-floor size. Reduce the number " + f"of values, shorten the filter, or split the call " + f"manually." + ) + dim, idx, chunk = biggest + mid = len(chunk) // 2 + plan[dim] = plan[dim][:idx] + [chunk[:mid], chunk[mid:]] + plan[dim][idx + 1 :] + + # Each split only grows the cartesian product, so once we + # cross max_chunks we can never come back under. Bail now + # rather than keep splitting (the URL probe could still take + # many more iterations). + total = _plan_total(plan) + if total > max_chunks: + raise RequestTooLarge( + f"Chunked plan would issue {total} sub-requests, exceeding " + f"max_chunks={max_chunks} (USGS API's default hourly rate " + f"limit per key). Reduce input list sizes, narrow the time " + f"window, or raise max_chunks if you have a higher quota." 
+ ) + + +def _read_remaining(response: requests.Response) -> int: + """Parse ``x-ratelimit-remaining`` from a response. Missing or + malformed header → return ``_QUOTA_UNKNOWN`` so the safety check + treats it as 'plenty of quota' (don't abort on header glitches).""" + raw = response.headers.get(_QUOTA_HEADER) + if raw is None: + return _QUOTA_UNKNOWN + try: + return int(raw) + except (TypeError, ValueError): + return _QUOTA_UNKNOWN + + +def multi_value_chunked( + *, + build_request: Callable[..., Any], + url_limit: int | None = None, + max_chunks: int | None = None, + quota_safety_floor: int | None = None, +) -> Callable[[_FetchOnce], _FetchOnce]: + """Decorator that splits multi-value list params across sub-requests + so each sub-request fits ``url_limit`` bytes (defaults to + ``filters._WATERDATA_URL_BYTE_LIMIT``) and the cartesian-product + plan stays ≤ ``max_chunks`` sub-requests (defaults to + ``_DEFAULT_MAX_CHUNKS``). All defaults are resolved at call time so + tests/users that patch the module constants affect this decorator + uniformly. + + ``url_limit`` is enforced against total request bytes (URL + body, + via ``_request_bytes``); the name reflects the dominant GET case + where body is empty. POST routes (e.g. ``monitoring-locations`` via + CQL2 JSON) are conservatively sized — never under-chunks, but may + over-chunk at the body's true ceiling. + + Between sub-requests the wrapper reads ``x-ratelimit-remaining`` from + each response. If it drops below ``quota_safety_floor`` (default + ``_DEFAULT_QUOTA_SAFETY_FLOOR``), the wrapper raises ``QuotaExhausted`` + carrying the combined partial result and the chunk offset so callers + can resume after the hourly window resets, instead of crashing into + a mid-pagination HTTP 429 (which the upstream pagination loop in + ``_walk_pages`` historically truncated silently — see PR #273). + + Sits OUTSIDE ``@filters.chunked``: list-chunking is the outer loop, + filter-chunking is the inner loop. 
The wrapped function has the same + signature as ``filters.chunked`` expects — ``(args: dict) -> (frame, + response)`` — so the two decorators compose cleanly. The planner is + filter-aware so it doesn't raise prematurely when the inner filter + chunker would have shrunk the per-sub-request URL on its own. + + Sub-requests run sequentially with no per-call timeout enforced here. + A hung single sub-request will block the entire chunked call; the + caller is responsible for configuring an HTTP-layer timeout (e.g. + via a ``requests.Session`` wrapper) if bounded latency matters. + + Cartesian-product iteration order is deterministic for a given + ``args`` dict: the wrapper iterates ``plan.values()`` in insertion + order (Python 3.7+ guarantee), which equals the order in which + chunkable params appeared in ``args``. For the public waterdata + getters that order is the function-signature order, so + ``QuotaExhausted.completed_chunks`` maps to the same sub-requests + across repeated calls with the same arguments — resume is + well-defined. + """ + + def decorator(fetch_once: _FetchOnce) -> _FetchOnce: + @functools.wraps(fetch_once) + def wrapper( + args: dict[str, Any], + ) -> tuple[pd.DataFrame, requests.Response]: + limit = ( + url_limit + if url_limit is not None + else filters._WATERDATA_URL_BYTE_LIMIT + ) + floor = ( + quota_safety_floor + if quota_safety_floor is not None + else _DEFAULT_QUOTA_SAFETY_FLOOR + ) + plan = _plan_chunks(args, build_request, limit, max_chunks) + if plan is None: + return fetch_once(args) + + # Pre-build the canonical URL representing the user's full + # original query. The chunker sends sub-requests with sliced + # multi-value lists; without this restore, the aggregated + # response's ``.url`` would only show the first chunk and + # callers logging ``md.url`` for reproducibility would see a + # truncated view of their own query. 
+ canonical_url = build_request(**args).url + + keys = list(plan) + total = _plan_total(plan) + frames: list[pd.DataFrame] = [] + responses: list[requests.Response] = [] + for i, combo in enumerate(itertools.product(*(plan[k] for k in keys))): + sub_args = {**args, **dict(zip(keys, combo))} + frame, response = fetch_once(sub_args) + frames.append(frame) + responses.append(response) + # Skip the quota check after the last sub-request — + # nothing left to abort. + if i < total - 1: + remaining = _read_remaining(response) + if remaining < floor: + partial = _combine_chunk_responses(responses) + partial.url = canonical_url + raise QuotaExhausted( + partial_frame=_combine_chunk_frames(frames), + partial_response=partial, + completed_chunks=i + 1, + total_chunks=total, + remaining=remaining, + ) + + combined = _combine_chunk_responses(responses) + combined.url = canonical_url + return _combine_chunk_frames(frames), combined + + return wrapper # type: ignore[return-value] + + return decorator diff --git a/dataretrieval/waterdata/filters.py b/dataretrieval/waterdata/filters.py index 4c136b82..deac263e 100644 --- a/dataretrieval/waterdata/filters.py +++ b/dataretrieval/waterdata/filters.py @@ -152,6 +152,18 @@ def _chunk_cql_or(expr: str, max_len: int = _CQL_FILTER_CHUNK_LEN) -> list[str]: return chunks +def _max_per_clause_encoding_ratio(parts: list[str]) -> float: + """Worst per-clause ``len(quote_plus(p)) / len(p)`` across OR-clauses. + + Any sub-request chunk could end up containing only the heavier-encoding + clauses, so per-sub-request byte budgets must be sized against the + worst (not average) ratio to avoid overflow. Used by both this + module's filter chunker and the outer ``chunking._filter_aware_probe_args``; + pinning the formula here keeps the two from drifting. 
+ """ + return max(len(quote_plus(p)) / len(p) for p in parts) + + def _effective_filter_budget( args: dict[str, Any], filter_expr: str, @@ -163,8 +175,7 @@ def _effective_filter_budget( non-filter URL bytes by building the request with a 1-byte placeholder filter, subtract from the URL limit to get the bytes available for the encoded filter, then convert back to raw CQL bytes via the *maximum* - per-clause encoding ratio (a chunk could contain only the heavier-encoding - clauses, so budgeting by the average ratio could overflow). + per-clause encoding ratio. """ # Fast path: encoded filter clearly fits with room for any plausible # non-filter URL. Skips the PreparedRequest build and splitter scan. @@ -179,7 +190,7 @@ def _effective_filter_budget( # the caller sees one 414 instead of N parallel sub-request failures. return len(filter_expr) + 1 parts = _split_top_level_or(filter_expr) or [filter_expr] - encoding_ratio = max(len(quote_plus(p)) / len(p) for p in parts) + encoding_ratio = _max_per_clause_encoding_ratio(parts) return max(100, int(available_url_bytes / encoding_ratio)) @@ -268,13 +279,24 @@ def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame: def _combine_chunk_responses( responses: list[requests.Response], ) -> requests.Response: - """Return one response: first chunk's URL/headers + summed ``elapsed``. - - Mutates the first response in place (only ``elapsed``); downstream only - reads ``elapsed`` (in ``BaseMetadata.query_time``), URL, and headers. + """Return one response with the last chunk's headers (for current + rate-limit state) and summed ``elapsed`` (for total wall-clock). + + The returned response's ``.url`` is the *first chunk's* URL, which + only reflects the first slice of the user's query. 
Callers wanting + the canonical original-query URL on ``BaseMetadata`` must overwrite + ``.url`` themselves (using ``build_request(**original_args).url``); + the decorator wrappers in ``filters.chunked`` and + ``chunking.multi_value_chunked`` do this. + + Mutates the first response in place: ``.headers`` is replaced with + the last response's headers and ``.elapsed`` is accumulated across + all chunks. Downstream reads ``.url``, ``.headers``, and + ``.elapsed`` (via ``BaseMetadata``). """ head = responses[0] if len(responses) > 1: + head.headers = responses[-1].headers head.elapsed = sum((r.elapsed for r in responses[1:]), start=head.elapsed) return head @@ -295,8 +317,12 @@ def chunked(*, build_request: Callable[..., Any]) -> Callable[[_FetchOnce], _Fet - Chunkable cql-text filter: run the lexicographic-pitfall guard, split into URL-length-safe sub-expressions, call the wrapped function once per chunk, concatenate frames (drop empties, dedup by feature ``id``), - and return an aggregated response (first chunk's URL/headers, summed - ``elapsed``). + and return an aggregated response with ``.url`` restored to the + canonical full-filter URL (so ``BaseMetadata.url`` reflects the + user's original query rather than the first filter chunk), last + chunk's headers (so callers see current ``x-ratelimit-remaining``, + which the outer ``multi_value_chunked`` decorator's ``QuotaExhausted`` + guard depends on), and summed ``elapsed``. Either way the return shape matches the undecorated function's, so the caller wraps the response in ``BaseMetadata`` the same way in both paths. @@ -327,7 +353,12 @@ def wrapper( frames.append(frame) responses.append(response) - return _combine_chunk_frames(frames), _combine_chunk_responses(responses) + # Restore the canonical URL representing the user's full filter + # (the aggregated response otherwise carries only the first + # filter-chunk's URL, which misleads callers logging md.url). 
+ combined = _combine_chunk_responses(responses) + combined.url = build_request(**args).url + return _combine_chunk_frames(frames), combined return wrapper # type: ignore[return-value] diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 9245bb92..fe419100 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -5,7 +5,7 @@ import os import re from collections.abc import Iterable, Mapping -from datetime import datetime +from datetime import datetime, timedelta from typing import Any, get_args from zoneinfo import ZoneInfo @@ -14,7 +14,7 @@ from dataretrieval import __version__ from dataretrieval.utils import BaseMetadata -from dataretrieval.waterdata import filters +from dataretrieval.waterdata import chunking, filters from dataretrieval.waterdata.types import ( PROFILE_LOOKUP, PROFILES, @@ -644,6 +644,26 @@ def _get_resp_data(resp: requests.Response, geopd: bool) -> pd.DataFrame: return df +def _finalize_paginated_response( + initial: requests.Response, + last: requests.Response, + total_elapsed: timedelta, +) -> None: + """Carry the last page's headers + cumulative elapsed onto the initial + response in place. + + The initial response stays canonical for ``md.url`` (user's original + query), but its ``.headers`` and ``.elapsed`` are overwritten so the + multi-value chunker's ``QuotaExhausted`` guard sees current + ``x-ratelimit-remaining`` and ``md.query_time`` reflects total + wall-clock across pages. No-op when ``initial is last`` (single page). + """ + if last is initial: + return + initial.headers = last.headers + initial.elapsed = total_elapsed + + def _walk_pages( geopd: bool, req: requests.PreparedRequest, @@ -669,7 +689,11 @@ def _walk_pages( pd.DataFrame A DataFrame containing the aggregated results from all pages. requests.Response - The initial response object containing metadata about the first request. 
+ Aggregated response: the initial request's URL (for query + identity), the final page's headers (so downstream callers see + current rate-limit state, which the multi-value chunker's + ``QuotaExhausted`` guard relies on), and cumulative ``elapsed`` + summed across every page. Raises ------ @@ -700,9 +724,11 @@ def _walk_pages( try: resp = client.send(req) _raise_for_non_200(resp) - - # Store the initial response for metadata + # Keep the original-request response as the "canonical" one for + # ``md.url`` reproducibility; ``.headers`` and ``.elapsed`` get + # overwritten with latest/cumulative values below. initial_response = resp + total_elapsed = resp.elapsed # Grab some aspects of the original request: headers and the # request type (GET or POST) @@ -723,6 +749,7 @@ def _walk_pages( ) _raise_for_non_200(resp) dfs.append(_get_resp_data(resp, geopd=geopd)) + total_elapsed += resp.elapsed curr_url = _next_req_url(resp) except Exception as e: # noqa: BLE001 logger.warning( @@ -730,6 +757,8 @@ def _walk_pages( ) raise RuntimeError(_paginated_failure_message(len(dfs), e)) from e + _finalize_paginated_response(initial_response, resp, total_elapsed) + # Concatenate all pages at once for efficiency return pd.concat(dfs, ignore_index=True), initial_response finally: @@ -957,17 +986,20 @@ def get_ogc_data( return return_list, BaseMetadata(response) +@chunking.multi_value_chunked(build_request=_construct_api_requests) @filters.chunked(build_request=_construct_api_requests) def _fetch_once( args: dict[str, Any], ) -> tuple[pd.DataFrame, requests.Response]: """Send one prepared-args OGC request; return the frame + response. - Filter chunking is added orthogonally by the ``@filters.chunked`` - decorator: with no filter (or an un-chunkable one) the decorator - passes ``args`` through to this body; with a chunkable filter it - fans out and calls this body once per sub-filter, then combines. - Either way the return shape is ``(frame, response)``. 
+ Two orthogonal chunkers wrap this body. ``@chunking.multi_value_chunked`` + (outer) splits multi-value list params (e.g. ``monitoring_location_id``) + across sub-requests so each URL fits the server byte limit; the + cartesian product of per-dim chunks is iterated. ``@filters.chunked`` + (inner) splits long cql-text filters at top-level ``OR``. With no + chunkable inputs both pass through unchanged. Either way the return + shape is ``(frame, response)``. """ req = _construct_api_requests(**args) return _walk_pages(geopd=GEOPANDAS, req=req) @@ -1158,9 +1190,11 @@ def get_stats_data( try: resp = client.send(req) _raise_for_non_200(resp) - - # Store the initial response for metadata + # Keep the original-request response as the "canonical" one for + # ``md.url`` reproducibility; ``.headers`` and ``.elapsed`` get + # overwritten with latest/cumulative values below. initial_response = resp + total_elapsed = resp.elapsed # Grab some aspects of the original request: headers and the # request type (GET or POST) @@ -1186,6 +1220,7 @@ def get_stats_data( _raise_for_non_200(resp) body = resp.json() all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) + total_elapsed += resp.elapsed next_token = body["next"] except Exception as e: # noqa: BLE001 logger.warning( @@ -1196,6 +1231,8 @@ def get_stats_data( ) raise RuntimeError(_paginated_failure_message(len(all_dfs), e)) from e + _finalize_paginated_response(initial_response, resp, total_elapsed) + dfs = pd.concat(all_dfs, ignore_index=True) if len(all_dfs) > 1 else all_dfs[0] # . 
If expand percentiles is True, make each percentile diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 18e78594..30637a5e 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -1,7 +1,9 @@ import datetime import json +import math import sys from unittest import mock +from urllib.parse import quote_plus import pandas as pd import pytest @@ -10,6 +12,8 @@ if sys.version_info < (3, 10): pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) +from dataretrieval.waterdata import chunking as _chunking +from dataretrieval.waterdata import filters as _filters from dataretrieval.waterdata import ( get_channel, get_combined_metadata, @@ -28,6 +32,18 @@ get_stats_por, get_time_series_metadata, ) +from dataretrieval.waterdata.chunking import ( + _DEFAULT_MAX_CHUNKS, + _DEFAULT_QUOTA_SAFETY_FLOOR, + _QUOTA_HEADER, + QuotaExhausted, + RequestTooLarge, + _chunkable_params, + _filter_aware_probe_args, + _plan_chunks, + _read_remaining, + multi_value_chunked, +) from dataretrieval.waterdata.utils import ( _check_monitoring_location_id, _check_profiles, @@ -207,6 +223,551 @@ def test_construct_api_requests_two_element_date_list_becomes_interval(): assert "time=2024-01-01%2F2024-01-31" in req.url +# ----- Multi-value GET-parameter chunker (chunking.py) ---------------------- +# +# These tests exercise the planner with a fake ``build_request`` whose URL +# byte length is a deterministic function of its inputs. Tests below model: +# - non-chunkable args contribute ``base_bytes``, +# - every multi-value list contributes ``len(",".join(map(str, v)))``, +# - the ``filter`` kwarg contributes ``len(filter)``. +# This isolates planner behaviour from the real HTTP request builder. + + +class _FakeReq: + __slots__ = ("url", "body") + + def __init__(self, url, body=None): + self.url = url + self.body = body + + +def _fake_build(*, base=200, **kwargs): + """Fake build_request: URL length deterministic in its inputs. 
+ + Mirrors the GET-routed shape: payload goes in the URL, body is None. + List/string values are URL-encoded via ``quote_plus`` so the fake's + byte count matches what the real ``_construct_api_requests`` would + produce; otherwise an alphanumeric test could pass against the fake + but fail in production once values containing ``%``, ``+``, ``/``, + ``&`` etc. (which expand under encoding) reach the same code path. + """ + bytes_ = base + for v in kwargs.values(): + if isinstance(v, (list, tuple)): + bytes_ += len(quote_plus(",".join(map(str, v)))) + elif isinstance(v, str): + bytes_ += len(quote_plus(v)) + return _FakeReq("x" * bytes_) + + +def test_filter_aware_probe_args_passes_through_when_not_chunkable(): + """No filter, json-lang filter, single-clause filter — return unchanged.""" + assert _filter_aware_probe_args({"a": 1}) == {"a": 1} + assert _filter_aware_probe_args({"filter": "a='1'", "filter_lang": "cql-json"}) == { + "filter": "a='1'", + "filter_lang": "cql-json", + } + args = {"filter": "a='single clause with no OR'"} + assert _filter_aware_probe_args(args) == args + + +def test_filter_aware_probe_args_models_inner_chunker_bail_floor(): + """Chunkable filter → return args with filter replaced by a synthetic + ASCII clause whose URL byte count equals the inner chunker's bail + floor ``len(longest) * max(per_clause_encoding_ratio)``. 
Mirrors + ``filters._effective_filter_budget``'s worst-case model so the + planner doesn't approve plans the inner chunker would refuse.""" + args = {"filter": "a='1' OR a='22' OR a='333'", "x": 7} + probe = _filter_aware_probe_args(args) + parts = ["a='1'", "a='22'", "a='333'"] + expected = math.ceil( + max(len(p) for p in parts) * max(len(quote_plus(p)) / len(p) for p in parts) + ) + assert len(probe["filter"]) == expected + assert probe["filter"].isascii() and probe["filter"].isalnum() + assert probe["x"] == 7 + assert args["filter"] == "a='1' OR a='22' OR a='333'" # input not mutated + + +def test_plan_chunks_returns_none_when_request_fits(): + """URL under limit → planner returns None, decorator passes through.""" + args = {"monitoring_location_id": ["A", "B", "C"]} + plan = _plan_chunks(args, _fake_build, url_limit=8000) + assert plan is None + + +def test_plan_chunks_returns_none_when_no_chunkable_lists(): + """No multi-value lists, however over-limit → planner can't help, returns None + (decorator falls through; server may 414 but that's not chunker's job).""" + args = {"monitoring_location_id": "scalar-only"} + plan = _plan_chunks(args, _fake_build, url_limit=10) + assert plan is None + + +def test_plan_chunks_greedy_halving_targets_largest_dim(): + """Two dims with one much larger — the heavy dim halves first.""" + args = { + "monitoring_location_id": ["X" * 30, "Y" * 30, "Z" * 30, "W" * 30], + "parameter_code": ["00060", "00065"], + } + # full URL ≈ 200 + 123 + 12 = 335; force splitting heavy dim only. 
+ plan = _plan_chunks(args, _fake_build, url_limit=310) + assert len(plan["monitoring_location_id"]) > 1 + assert len(plan["parameter_code"]) == 1 # heavy-dim split was enough + + +def test_plan_chunks_raises_request_too_large_at_singleton_floor(): + """Limit below singleton-per-dim floor (with no chunkable filter to + fall back on) → RequestTooLarge with a clear message.""" + args = {"monitoring_location_id": ["A", "B"]} + # base=200 alone exceeds limit; no relief possible. + with pytest.raises(RequestTooLarge, match="multi-value parameter"): + _plan_chunks(args, _fake_build, url_limit=100) + + +def test_plan_chunks_coordinates_with_filter_chunker(monkeypatch): + """COORDINATION REGRESSION TEST. + + With the FULL filter in URL-length probes, singleton-per-dim URL still + exceeds the limit and the planner would raise RequestTooLarge. With + filter-aware probing, the planner models the per-sub-request URL as + ``worst-dim-chunk + longest-clause-after-encoding`` (the inner filter + chunker's bail floor — it returns the FULL filter if any single + clause exceeds the budget, so the longest clause is the smallest + floor it can guarantee). The probe fits, plan returns. + + Sanity-check the *negative*: with filter-aware probing disabled, the + same inputs would raise. + """ + clauses = [f"f='{i}'" for i in range(10)] + args = { + "monitoring_location_id": ["A" * 10, "B" * 10, "C" * 10, "D" * 10], + "filter": " OR ".join(clauses), + } + # singleton+full-filter ≈ 200 + 10 + 86 = 296 (over limit 240) — would raise. + # longest-clause probe model ≈ 200 + 10 + 5 = 215 (under limit) — plan succeeds. + # (Here all clauses are the same length, so longest == shortest; the + # encoding-ratio coordination matters for lopsided clauses.) 
+ plan = _plan_chunks(args, _fake_build, url_limit=240) + assert plan is not None # coordination prevented the premature raise + assert len(plan["monitoring_location_id"]) > 1 # planner did split + + # Negative control: patch the probe helper to be a no-op (model "no + # filter awareness") and confirm the same inputs raise. + monkeypatch.setattr(_chunking, "_filter_aware_probe_args", lambda a: a) + with pytest.raises(RequestTooLarge): + _plan_chunks(args, _fake_build, url_limit=240) + + +def test_plan_chunks_probes_with_max_clause_not_min(): + """Regression: with lopsided OR-clauses (one short, one long), probing + with min(parts) lets the planner falsely declare a plan feasible that + the inner filter chunker can't actually deliver — it bails when any + single clause exceeds the per-sub-request budget. Probing with the + longest clause is the safe lower bound on per-sub-request filter + size, so the planner correctly raises when no plan can fit.""" + args = { + "sites": ["A" * 10, "B" * 10], + "filter": "x='1' OR x='" + "a" * 28 + "'", # 5-char and 33-char clauses + } + # base 200 + singleton sites 10 + min-clause 5 = 215 (limit 230 -> fits) + # base 200 + singleton sites 10 + max-clause 33 = 243 (limit 230 -> exceeds) + # With min: planner succeeds, but real URL with full filter (42) exceeds + # 230 -> server 414. With max: planner raises early, as it should. + with pytest.raises(RequestTooLarge): + _plan_chunks(args, _fake_build, url_limit=230) + + +def test_plan_chunks_still_raises_when_even_min_clause_doesnt_fit(): + """If the limit is so tight that singleton + shortest-clause STILL + exceeds it, filter chunker can't save us either — raise.""" + args = { + "monitoring_location_id": ["A" * 10, "B" * 10], + "filter": "x='12345' OR x='67890'", # min clause is 9 chars + } + # Singleton + min-clause ≈ 200 + 10 + 9 = 219; limit below that → unrecoverable. 
+ with pytest.raises(RequestTooLarge): + _plan_chunks(args, _fake_build, url_limit=210) + + +def test_multi_value_chunked_passes_through_when_url_fits(): + """No planning needed → decorator calls underlying function exactly once + with the original args.""" + calls = [] + + @multi_value_chunked(build_request=_fake_build, url_limit=8000) + def fetch(args): + calls.append(args) + return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + + fetch({"monitoring_location_id": ["A", "B"]}) + assert len(calls) == 1 + assert calls[0]["monitoring_location_id"] == ["A", "B"] + + +def test_multi_value_chunked_emits_cartesian_product(): + """Two chunkable dims, each split into 2 chunks → exactly 4 sub-calls, + each pairing one chunk from each dim.""" + calls = [] + + @multi_value_chunked(build_request=_fake_build, url_limit=240) + def fetch(args): + calls.append({k: v for k, v in args.items() if k in ("sites", "pcodes")}) + return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + + fetch( + { + "sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10], + "pcodes": ["P1" * 10, "P2" * 10, "P3" * 10, "P4" * 10], + } + ) + # Both heavy → planner should split both dims. Confirm a cartesian shape: + # every unique site-chunk pairs with every unique pcode-chunk. + sites_seen = {tuple(c["sites"]) for c in calls} + pcodes_seen = {tuple(c["pcodes"]) for c in calls} + assert len(calls) == len(sites_seen) * len(pcodes_seen) + assert len(sites_seen) > 1 + assert len(pcodes_seen) > 1 + + +def test_multi_value_chunked_emits_3d_cartesian_product(): + """Three chunkable dims, each forced to split → exhaustive cartesian + product across all three. 
Verifies the planner's halving loop handles + N>2 dims uniformly and the wrapper's ``itertools.product`` enumerates + every combination exactly once.""" + calls = [] + + @multi_value_chunked(build_request=_fake_build, url_limit=240) + def fetch(args): + calls.append(tuple(tuple(args[k]) for k in ("sites", "pcodes", "stats"))) + return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + + fetch( + { + "sites": ["S" * 12 + str(i) for i in range(4)], + "pcodes": ["P" * 12 + str(i) for i in range(4)], + "stats": ["T" * 12 + str(i) for i in range(4)], + } + ) + + # Three independent axes — every (site_chunk, pcode_chunk, stat_chunk) + # triple must appear exactly once. Confirm: + sites_seen = {c[0] for c in calls} + pcodes_seen = {c[1] for c in calls} + stats_seen = {c[2] for c in calls} + + assert len(sites_seen) > 1, "sites dim was not split" + assert len(pcodes_seen) > 1, "pcodes dim was not split" + assert len(stats_seen) > 1, "stats dim was not split" + + # Cartesian shape: # sub-calls == product of unique chunks across dims + expected = len(sites_seen) * len(pcodes_seen) * len(stats_seen) + assert len(calls) == expected, ( + f"expected {expected} cartesian-product sub-calls, got {len(calls)}" + ) + # And no triple repeats (exhaustive enumeration, no duplicates). + assert len(set(calls)) == len(calls) + # The chunked values, when unioned across calls, recover the original list. 
+ assert {x for tup in sites_seen for x in tup} == { + "S" * 12 + str(i) for i in range(4) + } + assert {x for tup in pcodes_seen for x in tup} == { + "P" * 12 + str(i) for i in range(4) + } + assert {x for tup in stats_seen for x in tup} == { + "T" * 12 + str(i) for i in range(4) + } + + +def test_multi_value_chunked_lazy_url_limit(monkeypatch): + """``url_limit=None`` → resolve filters._WATERDATA_URL_BYTE_LIMIT at call + time, so tests that patch the constant affect this decorator too.""" + calls = [] + + @multi_value_chunked(build_request=_fake_build) # url_limit defaults to None + def fetch(args): + calls.append(args) + return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + + monkeypatch.setattr(_filters, "_WATERDATA_URL_BYTE_LIMIT", 240) + # 4 sites of 10 chars → exceeds 240 → planner splits. + fetch({"sites": ["S" * 10 + str(i) for i in range(4)]}) + assert len(calls) > 1, "patched constant should drive chunking" + + +def test_default_max_chunks_matches_hourly_api_quota(): + """The default cap mirrors the USGS Water Data API's documented + per-API-key hourly limit. Locking this in so future changes have to + explicitly acknowledge the quota.""" + assert _DEFAULT_MAX_CHUNKS == 1000 + + +def test_plan_chunks_raises_when_plan_exceeds_max_chunks(): + """A converged plan with more sub-requests than ``max_chunks`` must + raise rather than silently issue them and burn the user's API quota.""" + # 2 dims with long values, each needing many singleton-ish chunks. + # Pick chunk sizes that converge to a plan exceeding a tight cap. + args = { + "dim_a": [f"long-string-value-{i}" for i in range(50)], + "dim_b": [f"another-long-value-{i}" for i in range(50)], + } + # url_limit forces splitting; max_chunks=10 forces the cap to fire. 
+ with pytest.raises(RequestTooLarge, match="exceeding max_chunks=10"): + _plan_chunks(args, _fake_build, url_limit=250, max_chunks=10) + + +def test_plan_chunks_respects_default_cap_without_explicit_arg(): + """Default kwarg path: ``max_chunks`` defaults to _DEFAULT_MAX_CHUNKS + when not specified, so direct callers (e.g., other library code) get + the same safety net as the decorator wrapper.""" + args = { + "dim_a": [f"v{i:03d}" for i in range(60)], + "dim_b": [f"v{i:03d}" for i in range(60)], + "dim_c": [f"v{i:03d}" for i in range(60)], + } + # Without explicit max_chunks: defaults to 1000. The plan for these + # inputs would emit > 1000 sub-requests at a tight limit, so should + # raise on default cap alone. + with pytest.raises(RequestTooLarge, match=r"max_chunks=1000"): + _plan_chunks(args, _fake_build, url_limit=220) + + +def test_multi_value_chunked_cap_override(): + """A decorator-time ``max_chunks`` override lets callers with higher + quotas raise the ceiling without monkeypatching the module constant.""" + + @multi_value_chunked(build_request=_fake_build, url_limit=220, max_chunks=10) + def fetch(args): + return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + + with pytest.raises(RequestTooLarge, match="exceeding max_chunks=10"): + fetch( + { + "dim_a": [f"longer-v{i}" for i in range(30)], + "dim_b": [f"longer-v{i}" for i in range(30)], + } + ) + + +def _quota_response(remaining: int | str | None) -> mock.Mock: + """A mock requests.Response-like object whose ``x-ratelimit-remaining`` + header reflects the given value (None → header absent).""" + resp = mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + resp.headers = {} if remaining is None else {_QUOTA_HEADER: str(remaining)} + return resp + + +def test_read_remaining_parses_header(): + assert _read_remaining(_quota_response(42)) == 42 + + +def test_read_remaining_treats_missing_header_as_plenty(): + """Servers that don't echo a rate-limit header must not trigger + spurious 
QuotaExhausted aborts. Sentinel is a large integer so any + plausible safety floor compares cleanly.""" + assert _read_remaining(_quota_response(None)) >= 1_000_000 + + +def test_read_remaining_treats_malformed_header_as_plenty(): + """Defensive: non-integer header value → don't abort.""" + assert _read_remaining(_quota_response("not-a-number")) >= 1_000_000 + + +def test_default_quota_safety_floor(): + """Default floor lives at 50 — enough headroom for one final + chunked call's pagination spike without breaching the hourly cap.""" + assert _DEFAULT_QUOTA_SAFETY_FLOOR == 50 + + +def test_multi_value_chunked_aborts_when_quota_floor_breached(): + """Mid-call, when ``x-ratelimit-remaining`` drops below the floor, + the chunker must raise ``QuotaExhausted`` *before* issuing the next + sub-request — and the exception must carry the partial frame plus + the chunk offset so callers can resume.""" + # Build a fetch_once whose response 'remaining' header decrements + # through 200, 100, 40 (below floor=50), 10. + remaining_seq = iter([200, 100, 40, 10]) + page_idx = iter(range(10)) + + def fetch(args): + idx = next(page_idx) + return ( + pd.DataFrame( + {"site": list(args["sites"]), "page": [idx] * len(args["sites"])} + ), + _quota_response(next(remaining_seq)), + ) + + decorated = multi_value_chunked( + build_request=_fake_build, + url_limit=240, + quota_safety_floor=50, + )(fetch) + + # Plan forces 4 sub-requests (4 singleton site chunks). + with pytest.raises(QuotaExhausted) as excinfo: + decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) + + err = excinfo.value + # Aborted after the 3rd sub-request (remaining=40 < floor=50). + assert err.completed_chunks == 3 + assert err.total_chunks == 4 + assert err.remaining == 40 + # Partial frame combines rows from the first three completed sub-requests. 
+ assert err.partial_frame is not None + assert set(err.partial_frame["page"]) == {0, 1, 2} + + +def test_multi_value_chunked_does_not_abort_on_last_chunk(): + """Aborting on the final sub-request would be pointless — there's + no 'next' to protect. The check is skipped there. Earlier chunks + stay above the floor; only the last drops below, and we still + return cleanly because the check is skipped at i == total-1.""" + remaining_seq = iter([500, 5]) # only the LAST chunk dips below floor=50 + + def fetch(args): + return ( + pd.DataFrame({"site": list(args["sites"])}), + _quota_response(next(remaining_seq)), + ) + + decorated = multi_value_chunked( + build_request=_fake_build, + url_limit=240, + quota_safety_floor=50, + )(fetch) + + df, _ = decorated({"sites": ["S1" * 10, "S2" * 10]}) # forces 2 chunks + assert len(df) == 2 # no raise — both chunks ran + + +def test_multi_value_chunked_quota_check_disabled_with_zero_floor(): + """Setting the floor to 0 effectively disables the quota guard — + counter can go to 1 without aborting (since 1 > 0 = floor).""" + remaining_seq = iter([5, 1]) + + def fetch(args): + return ( + pd.DataFrame({"site": list(args["sites"])}), + _quota_response(next(remaining_seq)), + ) + + decorated = multi_value_chunked( + build_request=_fake_build, + url_limit=240, + quota_safety_floor=0, + )(fetch) + df, _ = decorated({"sites": ["S1" * 10, "S2" * 10]}) + assert len(df) == 2 # no raise + + +def test_quota_exhausted_message_includes_resume_offset(): + """The error message must point the user at the chunk offset to + resume from, otherwise the partial_frame attribute is a footgun + — the user has no way to know which chunks still need re-issuing.""" + e = QuotaExhausted( + partial_frame=pd.DataFrame(), + partial_response=mock.Mock(), + completed_chunks=7, + total_chunks=20, + remaining=12, + ) + msg = str(e) + assert "7/20" in msg + assert "12" in msg + assert "QuotaExhausted" in msg or "resume" in msg + + +def 
test_request_bytes_rejects_non_sizable_body(): + """``_request_bytes`` requires a deterministic byte count up front; + silently treating an unknown body as zero would under-chunk and let + the request blow past the server's POST-body limit. Generators, + iterables, and file-like objects must surface as ``TypeError``.""" + from dataretrieval.waterdata.chunking import _request_bytes + + class _FakeReqWithGenBody: + url = "https://example.com/foo" + body = (b"x" for _ in range(3)) + + with pytest.raises(TypeError, match="cannot size a request body"): + _request_bytes(_FakeReqWithGenBody()) + + +def test_request_bytes_handles_supported_body_types(): + """Sanity-check the supported body types: None (GET), bytes (raw + POST), str (JSON-as-string POST).""" + from dataretrieval.waterdata.chunking import _request_bytes + + class _Req: + def __init__(self, url, body): + self.url = url + self.body = body + + assert _request_bytes(_Req("ab", None)) == 2 + assert _request_bytes(_Req("ab", b"cd")) == 4 + assert _request_bytes(_Req("ab", "cd")) == 4 + assert _request_bytes(_Req("ab", bytearray(b"cd"))) == 4 + + +def test_plan_chunks_rejects_non_positive_max_chunks(): + """``max_chunks < 1`` is meaningless and would silently bypass the + cap on the no-chunking-needed path (initial plan total = 1 and the + in-loop check only runs after a split). Reject early.""" + args = {"monitoring_location_id": ["A", "B", "C", "D"]} + with pytest.raises(ValueError, match="max_chunks must be >= 1"): + _plan_chunks(args, _fake_build, url_limit=1000, max_chunks=0) + with pytest.raises(ValueError, match="max_chunks must be >= 1"): + _plan_chunks(args, _fake_build, url_limit=1000, max_chunks=-5) + + +def test_multi_value_chunked_restores_canonical_url(): + """When chunking fans out, the aggregated response's ``.url`` must + reflect the *user's original* query (rebuilt from the unchunked + args), not the first chunk's URL. 
Callers logging ``md.url`` for + reproducibility need the full query.""" + sites = ["S" * 10 + str(i) for i in range(4)] + sub_urls: list[str] = [] + + @multi_value_chunked(build_request=_fake_build, url_limit=240) + def fetch(args): + # Each sub-response carries the chunked sub_args's URL, so + # without canonical restoration the first chunk's URL would + # leak through to md.url. + sub_url = _fake_build(**args).url + sub_urls.append(sub_url) + resp = mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + resp.headers = {} + resp.url = sub_url + return pd.DataFrame(), resp + + _df, md = fetch({"sites": sites}) + + assert len(sub_urls) > 1, "test setup error: chunker didn't fan out" + # md.url must equal the URL the unchunked query would have produced. + assert md.url == _fake_build(sites=sites).url + # And differ from every sub-request's URL (each carries a smaller list). + assert all(md.url != u for u in sub_urls) + # The canonical URL is strictly bigger byte-wise than any sub-request. + assert all(len(md.url) > len(u) for u in sub_urls) + + +def test_chunkable_params_skips_filter_passed_as_list(): + """Defensive guard: ``filter`` is documented as a string. If a caller + mistakenly passes it as a list, the chunker must NOT treat it as a + multi-value dim — comma-joining CQL clauses inside the URL would + produce a malformed filter expression. 
The inner ``filters.chunked`` + is the only place that may shrink ``filter``.""" + args = { + "monitoring_location_id": ["USGS-A", "USGS-B"], + "filter": ["a='1'", "a='2'"], # malformed input + "filter_lang": ["cql-text", "cql-json"], # ditto + } + chunkable = _chunkable_params(args) + assert "monitoring_location_id" in chunkable + assert "filter" not in chunkable + assert "filter_lang" not in chunkable + + def test_samples_results(): """Test results call for proper columns""" df, _ = get_samples( From 46335b63962fbfce16bcbd8c03918135bca4410d Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 09:38:02 -0500 Subject: [PATCH 02/17] fix(waterdata): Reject smuggled lists for scalar-contract chunker inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two correctness gaps surfaced in review: 1. ``limit`` and ``skip_geometry`` are scalars by contract (``int | None`` and ``bool | None``) but a list smuggled through type erasure (e.g. ``limit=["100","200"]`` slipping past _normalize_str_iterable when elements happen to be strings) would be picked up by ``_chunkable_params`` and fanned into multiple sub-requests with conflicting per-request caps. Add both to ``_NEVER_CHUNK`` so the chunker leaves scalar-by-contract params alone. 2. ``quota_safety_floor=0`` is the documented "disable the guard" sentinel, but negative values were accepted silently and also disabled the guard — obscuring caller intent. Reject at decoration time, parallel to ``_plan_chunks``'s ``max_chunks < 1`` check. 
--- dataretrieval/waterdata/chunking.py | 14 ++++++++++++++ tests/waterdata_test.py | 30 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index bbdf5b12..7be74dfe 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -62,6 +62,11 @@ # CQL ``filter`` (and its ``filter_lang``) is a string that has its own # inner chunker (``filters.chunked``); if a caller passes ``filter`` as # a list, treating it as a multi-value param would emit malformed CQL. +# ``limit`` and ``skip_geometry`` are scalars-by-contract: their public +# signatures are ``int | None`` and ``bool | None``, so a list value +# reaching the chunker would be a type-erasure smuggle. Chunking it would +# change request semantics in confusing ways (e.g. ``limit=["100","200"]`` +# fanning out into separate paginated queries with different caps). _NEVER_CHUNK = frozenset( { "properties", @@ -75,6 +80,8 @@ "time", "filter", "filter_lang", + "limit", + "skip_geometry", } ) @@ -417,6 +424,13 @@ def multi_value_chunked( across repeated calls with the same arguments — resume is well-defined. """ + if quota_safety_floor is not None and quota_safety_floor < 0: + raise ValueError( + f"quota_safety_floor must be >= 0; got {quota_safety_floor}. " + f"A negative floor silently disables the guard the same way " + f"``0`` does, but obscures the intent — pass 0 explicitly to " + f"disable, or a positive integer to set a real safety margin." 
+ ) def decorator(fetch_once: _FetchOnce) -> _FetchOnce: @functools.wraps(fetch_once) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 30637a5e..92f312d8 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -768,6 +768,36 @@ def test_chunkable_params_skips_filter_passed_as_list(): assert "filter_lang" not in chunkable +def test_chunkable_params_skips_scalar_contract_params(): + """``limit`` and ``skip_geometry`` are scalars by contract + (``int | None`` and ``bool | None`` respectively). If a caller smuggles + a list through type erasure (e.g. ``limit=["100","200"]`` after a bad + cast), the chunker must NOT treat it as a multi-value dim. Chunking + ``limit`` would silently fan into separate paginated queries with + different per-request caps; chunking ``skip_geometry`` would emit + sub-requests with conflicting geometry-output settings.""" + args = { + "monitoring_location_id": ["USGS-A", "USGS-B"], + "limit": ["100", "200"], + "skip_geometry": ["true", "false"], + } + chunkable = _chunkable_params(args) + assert "monitoring_location_id" in chunkable + assert "limit" not in chunkable + assert "skip_geometry" not in chunkable + + +def test_multi_value_chunked_rejects_negative_quota_safety_floor(): + """``quota_safety_floor=0`` is the documented "disable the guard" + sentinel (tested elsewhere). 
A negative value disables it the same + way but obscures intent — reject at decoration time so the caller + notices, parallel to the ``max_chunks < 1`` check in ``_plan_chunks``.""" + with pytest.raises(ValueError, match="quota_safety_floor must be >= 0"): + multi_value_chunked(build_request=_fake_build, quota_safety_floor=-1) + with pytest.raises(ValueError, match="quota_safety_floor must be >= 0"): + multi_value_chunked(build_request=_fake_build, quota_safety_floor=-50) + + def test_samples_results(): """Test results call for proper columns""" df, _ = get_samples( From 9bc342b09d22c44ef5d4df4bc7ff6438773b3a0b Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 10:23:45 -0500 Subject: [PATCH 03/17] refactor(waterdata): Unify list and filter chunkers into one joint planner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the two-decorator stack (@multi_value_chunked outside @filters.chunked) with a single planner that allocates URL byte budget across list dims and filter clauses together. Same correctness guarantees, fewer sub-requests when the previous design forced the inner filter chunker to bail at its singleton-clause floor while the outer list chunker held the bulk of the URL budget. Algorithm: - Enumerate filter chunk counts k = 1, 2, 4, ..., n_clauses. - For each k, partition clauses into k balanced groups joined by OR and identify the worst (longest URL-encoded) group. - Substitute the worst group as the filter and plan the list dims with greedy halving against the remaining byte budget. - Pick the candidate whose list_count × k is smallest. Net code shrinks: -50 lines in filters.py (retired the chunked decorator and _effective_filter_budget), +30 in chunking.py for the joint planner (offset by removing _filter_aware_probe_args and the bail-floor coordination machinery), unstack the decorator pair on _fetch_once. 
Two existing cross-decorator coordination tests collapse into joint-planner tests (mismatched-clause-length probing was the hardest-to-explain artefact of the old design — gone now). New regression test: ``test_joint_planner_url_construction_long_filter_and_long_sites`` exercises the planner with 500 USGS site IDs + 20 datetime OR-clauses using the real ``_construct_api_requests`` builder. Confirms every sub-request URL stays under 8000 bytes, filter partitions cover every clause exactly once, list partitions cover every site exactly once, and the total sub-request count beats the naive bail-floor-style worst case by ≥4×. Live API verified: Ohio Stream sites (2888) → daily discharge (P7D) chunks into 12 sub-requests with canonical URL preserved and cumulative query_time accurate. --- dataretrieval/waterdata/chunking.py | 513 ++++++++++++++-------------- dataretrieval/waterdata/filters.py | 140 +------- dataretrieval/waterdata/utils.py | 16 +- tests/waterdata_test.py | 261 +++++++++----- 4 files changed, 459 insertions(+), 471 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index 7be74dfe..5d56d6fc 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -1,36 +1,32 @@ -"""Multi-value GET-parameter chunking for the Water Data OGC getters. +"""Joint URL-byte chunking for the Water Data OGC getters. PR 233 routes most services through GET with comma-separated values -(e.g. ``monitoring_location_id=USGS-A,USGS-B,...``). Long lists can blow -the server's ~8 KB URL byte limit. This module adds a decorator that -sits OUTSIDE ``filters.chunked`` and splits multi-value list params -across multiple sub-requests so each URL fits. See ``get_daily``'s -docstring for an end-to-end chained-query example. 
- -Design (orthogonal to filter chunking): - -- N-dimensional cartesian product: for each chunkable list param, the - values are partitioned into sub-lists; the planner emits the cartesian - product of those partitions. Sub-chunks of the same dim never overlap, - so frame concat needs no dedup across multi-value chunks. -- Greedy halving of the largest chunk in any dim until the worst-case - sub-request URL fits the limit. Minimises total request count. -- Date params, ``bbox``, and ``properties`` are not chunked: dates are - intervals not enumerable sets; bbox is a coord array; ``properties`` - determines output schema and chunking it would shard columns. - -Coordination with ``filters.chunked``: -The planner probes the URL with a synthetic clause sized to the inner -chunker's bail floor — ``len(longest_clause) * max(per-clause encoding -ratio)`` — when a chunkable filter is present. The inner chunker bails -(emits the full filter) when any single clause's URL-encoded length -exceeds its per-sub-request budget; mirroring -``filters._effective_filter_budget``, that floor already accounts for -the worst per-call encoding ratio, so a long alphanumeric clause -coexisting with a shorter heavily-encoded clause is sized correctly. -Without this coordination, a long OR-filter plus multi-value lists -would trigger a premature ``RequestTooLarge`` even though the combined -chunkers would have made things fit. +(e.g. ``monitoring_location_id=USGS-A,USGS-B,...``). Long lists and +long top-level-``OR`` CQL filters can independently blow the server's +~8 KB URL byte limit. This module adds a single decorator that plans +both chunking dimensions together and iterates the joint cartesian +product so each sub-request URL fits. + +Design: + +- Chunkable dims include multi-value list params (sites, parameter + codes, ...) and the cql-text ``filter`` (split at top-level ``OR`` + to keep each chunk valid CQL). 
+- The planner enumerates candidate filter chunk counts + (``k = 1, 2, 4, ..., total_clauses``); for each, partitions clauses + into ``k`` roughly-balanced groups joined by ``OR``, substitutes the + worst (longest, URL-encoded) group as the filter, then plans list + chunking by greedy halving. The candidate that minimizes + ``list_count × k`` wins. +- Sub-chunks of the same list dim never overlap, so frame concat needs + no dedup across list chunks. Filter sub-chunks can match overlapping + records (a row matching both ``a=1`` and ``b=2`` returns from both), + so the combiner dedupes by feature ``id``. +- Date params, ``bbox``, ``properties``, ``limit``, and + ``skip_geometry`` are never chunked: dates are intervals not sets; + bbox is a coord array; ``properties`` defines the response schema + (chunking would shard columns); ``limit`` and ``skip_geometry`` are + scalar by contract. """ from __future__ import annotations @@ -47,26 +43,21 @@ from . import filters from .filters import ( + _check_numeric_filter_pitfall, _combine_chunk_frames, _combine_chunk_responses, _FetchOnce, _is_chunkable, - _max_per_clause_encoding_ratio, _split_top_level_or, ) -# Params that look like lists but must NOT be chunked. ``properties`` is -# excluded because it defines the response schema; chunking it would -# return frames with different columns per sub-request. ``bbox`` is a -# fixed 4-element coord tuple. Date params are intervals not sets. The -# CQL ``filter`` (and its ``filter_lang``) is a string that has its own -# inner chunker (``filters.chunked``); if a caller passes ``filter`` as -# a list, treating it as a multi-value param would emit malformed CQL. -# ``limit`` and ``skip_geometry`` are scalars-by-contract: their public -# signatures are ``int | None`` and ``bool | None``, so a list value -# reaching the chunker would be a type-erasure smuggle. Chunking it would -# change request semantics in confusing ways (e.g. 
``limit=["100","200"]`` -# fanning out into separate paginated queries with different caps). +# Params that look like lists but must NOT be chunked. ``properties`` +# is excluded because it defines the response schema; chunking it +# would return frames with different columns per sub-request. ``bbox`` +# is a fixed 4-element coord tuple. Date params are intervals not sets. +# ``filter_lang`` is a scalar string. ``limit`` and ``skip_geometry`` +# are scalars by contract (smuggling a list through type erasure would +# silently fan into queries with conflicting per-request semantics). _NEVER_CHUNK = frozenset( { "properties", @@ -78,7 +69,6 @@ "end", "end_utc", "time", - "filter", "filter_lang", "limit", "skip_geometry", @@ -86,13 +76,10 @@ ) # Default cap on the number of sub-requests a single chunked call may -# emit. The USGS Water Data API rate-limits each HTTP request (including -# pagination), so the true budget is ``hourly_quota / avg_pages_per_chunk``. -# 1000 matches the default hourly quota and is a reasonable upper bound -# for single-page sub-requests; tune lower if your queries paginate. -# Override per-decorator via ``max_chunks=`` or by monkeypatching this -# module attribute — both the decorator wrapper and ``_plan_chunks`` -# read it lazily. +# emit. The USGS Water Data API rate-limits each HTTP request +# (including pagination), so the true budget is +# ``hourly_quota / avg_pages_per_chunk``. 1000 matches the default +# hourly quota. _DEFAULT_MAX_CHUNKS = 1000 # When ``x-ratelimit-remaining`` drops below this between sub-requests, @@ -105,19 +92,17 @@ _QUOTA_HEADER = "x-ratelimit-remaining" # Sentinel returned by ``_read_remaining`` when the response has no -# parseable ``x-ratelimit-remaining`` header. Large enough to beat any -# plausible safety floor so a missing/malformed header doesn't trigger -# spurious ``QuotaExhausted`` aborts. 
+# parseable header — large enough to beat any plausible safety floor +# so a missing/malformed header doesn't trigger spurious aborts. _QUOTA_UNKNOWN = 10**9 class RequestTooLarge(ValueError): - """Raised when a chunked request cannot be issued. Two cases: - (1) URL exceeds the byte limit even with every multi-value param at - a singleton chunk and any chunkable filter at the inner chunker's - bail-floor size (the URL contribution of its longest single - OR-clause, after URL-encoding); (2) the cartesian-product plan - would issue more than ``max_chunks`` sub-requests.""" + """Raised when a chunked request cannot be issued. Either the URL + exceeds the byte limit even at the smallest reducible plan (every + list dim at singleton chunks and the filter at one clause per + sub-request), or the joint cartesian-product plan would issue + more than ``max_chunks`` sub-requests.""" class QuotaExhausted(RuntimeError): @@ -125,12 +110,7 @@ class QuotaExhausted(RuntimeError): (``x-ratelimit-remaining`` header) drops below the configured safety floor. The chunker stops before issuing the next sub-request to avoid a mid-call HTTP 429 that would silently truncate paginated - results (see PR #273 for the pagination side of that bug). - - The exception carries everything needed to resume: the combined - partial frame from completed sub-requests, the metadata for the - last successful sub-request, the number of chunks completed out of - the plan total, and the last-observed ``remaining`` value. + results. Attributes ---------- @@ -139,18 +119,12 @@ class QuotaExhausted(RuntimeError): completed before the floor was crossed. partial_response : requests.Response Aggregated response with the canonical URL restored to the - user's full original query (rebuilt from ``args`` rather than - copied from the first sub-request, which only carries the - first chunk's parameters). 
Headers come from the last - completed sub-request (so callers inspecting - ``x-ratelimit-remaining`` see current quota state), and - ``elapsed`` is summed across completed sub-requests. Wrap in - ``BaseMetadata`` to surface to the caller alongside the - partial frame. + user's full original query. completed_chunks : int - Number of sub-requests successfully completed. + Number of sub-requests successfully completed (across both + list and filter chunking dims). total_chunks : int - Total sub-requests in the cartesian-product plan. + Total sub-requests in the joint cartesian-product plan. remaining : int Last observed ``x-ratelimit-remaining`` value. """ @@ -183,48 +157,16 @@ def _chunkable_params(args: dict[str, Any]) -> dict[str, list[Any]]: return { k: list(v) for k, v in args.items() - if k not in _NEVER_CHUNK and isinstance(v, (list, tuple)) and len(v) > 1 + if k != "filter" + and k not in _NEVER_CHUNK + and isinstance(v, (list, tuple)) + and len(v) > 1 } -def _filter_aware_probe_args(args: dict[str, Any]) -> dict[str, Any]: - """Substitute the filter with a synthetic ASCII clause sized to the - inner chunker's bail floor, so the planner's URL probe matches what - the inner chunker would emit. - - The inner ``filters.chunked`` bails (emits the full filter) when any - single OR-clause's URL-encoded length exceeds the per-sub-request - budget. Mirroring ``filters._effective_filter_budget``, that floor - is ``len(longest_clause) * max(per-clause encoding ratio)``. - Substituting an ASCII clause of that exact length makes - ``quote_plus`` a no-op, so the URL builder sees exactly the - bail-floor byte count. 
- """ - filter_expr = args.get("filter") - filter_lang = args.get("filter_lang") - if not _is_chunkable(filter_expr, filter_lang): - return args - parts = _split_top_level_or(filter_expr) - if len(parts) < 2: - return args # one-clause filter — inner chunker can't shrink it - longest_raw = max(len(p) for p in parts) - probe_size = math.ceil(longest_raw * _max_per_clause_encoding_ratio(parts)) - return {**args, "filter": "x" * probe_size} - - def _chunk_bytes(chunk: list[Any]) -> int: """URL-encoded byte length of ``chunk`` when comma-joined into a - URL parameter value. - - Used as the planner's biggest-chunk comparator in - ``_worst_case_args`` and the halving loop. ``quote_plus`` (rather - than raw ``,``-join length) keeps the comparator faithful to what - the real URL builder produces, so values containing characters - that expand under URL encoding (``%``, ``+``, ``/``, ``&``, …) - can't be mis-ranked. For typical USGS multi-value workloads - (alphanumeric IDs and codes) raw and encoded lengths are equal, - but the encoded form is always correct. - """ + URL parameter value.""" return len(quote_plus(",".join(map(str, chunk)))) @@ -232,14 +174,12 @@ def _request_bytes(req: requests.PreparedRequest) -> int: """Total bytes of a prepared request: URL + body. GET routes have ``body=None`` and reduce to URL length. POST routes - (CQL2 JSON body) need body bytes — the URL stays short regardless of - payload, so URL-only sizing would underestimate the request and skip - chunking when it's needed. + (CQL2 JSON body) need body bytes — the URL stays short regardless + of payload, so URL-only sizing would underestimate the request and + skip chunking when it's needed. Raises ``TypeError`` on non-sizable bodies (generators, file-like - streams). Size-based planning needs a deterministic byte count; - silently treating an unknown body as zero bytes would under-chunk - and let the request blow past the server's POST-body limit. + streams). 
Size-based planning needs a deterministic byte count. """ url_len = len(req.url) body = req.body @@ -251,82 +191,83 @@ def _request_bytes(req: requests.PreparedRequest) -> int: return url_len + len(body.encode("utf-8")) raise TypeError( f"multi_value_chunked cannot size a request body of type " - f"{type(body).__name__!r}; pass str, bytes, or None. Streaming " - f"bodies (generators, file-like) are not supported because the " - f"planner needs a deterministic byte count up front." + f"{type(body).__name__!r}; pass str, bytes, or None." ) -def _plan_total(plan: dict[str, list[list[Any]]]) -> int: - """Sub-request count a plan will issue: the cartesian product of - per-dim chunk counts. Computed in two places (planner's max_chunks - early-bail and wrapper's QuotaExhausted payload) — centralized - here so the two can't drift.""" - return math.prod(len(chunks) for chunks in plan.values()) +def _plan_total(list_plan: dict[str, list[list[Any]]], n_filter_chunks: int) -> int: + """Total sub-request count for a joint plan.""" + list_count = math.prod((len(c) for c in list_plan.values()), start=1) + return list_count * n_filter_chunks def _worst_case_args( - probe_args: dict[str, Any], plan: dict[str, list[list[Any]]] + base_args: dict[str, Any], list_plan: dict[str, list[list[Any]]] ) -> dict[str, Any]: - """Args representing the worst-case sub-request the plan will issue: - each dim's largest chunk (by URL-encoded bytes), composed onto - the ``probe_args`` already returned by ``_filter_aware_probe_args`` - so any chunkable filter sits at the inner chunker's bail-floor - size. 
The planner feeds these args through ``_request_bytes`` to - decide whether the biggest sub-request fits the budget.""" - out = dict(probe_args) - for k, chunks in plan.items(): + """Args representing the largest sub-request the list plan will issue: + each dim's longest chunk by URL-encoded bytes, layered onto + ``base_args`` (which already has the candidate filter substituted + in by the caller).""" + out = dict(base_args) + for k, chunks in list_plan.items(): out[k] = max(chunks, key=_chunk_bytes) return out -def _plan_chunks( +def _partition_clauses(clauses: list[str], k: int) -> list[str]: + """Split ``clauses`` into ``k`` roughly-balanced sub-filters joined + by ``" OR "``. Distributes the remainder across the first groups so + no group differs from another by more than one clause.""" + if k <= 0: + raise ValueError(f"k must be >= 1; got {k}") + if k >= len(clauses): + return list(clauses) + base, extra = divmod(len(clauses), k) + out: list[str] = [] + i = 0 + for g in range(k): + size = base + (1 if g < extra else 0) + out.append(" OR ".join(clauses[i : i + size])) + i += size + return out + + +def _plan_list_chunks( args: dict[str, Any], build_request: Callable[..., Any], url_limit: int, max_chunks: int | None = None, ) -> dict[str, list[list[Any]]] | None: - """Greedy halving until the worst-case sub-request fits ``url_limit``. - - Budget is total request bytes (URL + body, via ``_request_bytes``) - so POST routes size correctly — see ``multi_value_chunked`` for the - parameter-name caveat. - - Returns ``None`` when no chunking is needed (request as-is fits or - no chunkable lists). 
Raises ``RequestTooLarge`` when: - - the smallest reducible plan still exceeds ``url_limit`` (every - multi-value param at a singleton chunk and any chunkable filter - already at the inner chunker's bail-floor size), or - - the cartesian-product plan exceeds ``max_chunks`` sub-requests - (the hourly API budget); checked after each split so we bail - promptly once the cap is unreachable. - - ``max_chunks`` defaults to ``_DEFAULT_MAX_CHUNKS`` resolved at call - time, so monkeypatching the module constant takes effect for - direct callers too. + """Greedy halving of multi-value list dims until the worst-case + sub-request URL fits ``url_limit``. The filter (if any) in ``args`` + is treated as fixed — the caller must have already substituted the + candidate's worst filter chunk so the URL probe accounts for it. + + Returns ``None`` when no list chunking is needed (request as-is + fits or no chunkable list dims present). Raises ``RequestTooLarge`` + when the smallest reducible plan still doesn't fit or when the + plan would exceed ``max_chunks`` sub-requests. """ if max_chunks is None: max_chunks = _DEFAULT_MAX_CHUNKS if max_chunks < 1: raise ValueError( f"max_chunks must be >= 1; got {max_chunks}. Zero or negative " - f"values would silently bypass the cap on the no-chunking path." + f"values would silently bypass the cap." ) chunkable = _chunkable_params(args) if not chunkable: return None - probe_args = _filter_aware_probe_args(args) - if _request_bytes(build_request(**probe_args)) <= url_limit: + if _request_bytes(build_request(**args)) <= url_limit: return None plan: dict[str, list[list[Any]]] = {k: [v] for k, v in chunkable.items()} while True: - worst = _worst_case_args(probe_args, plan) + worst = _worst_case_args(args, plan) if _request_bytes(build_request(**worst)) <= url_limit: return plan - # Largest splittable chunk across all dims, by URL-encoded bytes. 
splittable = ( (dim, idx, chunk) for dim, dim_chunks in plan.items() @@ -337,34 +278,130 @@ def _plan_chunks( if biggest is None: raise RequestTooLarge( f"Request exceeds {url_limit} bytes (URL + body) at the " - f"smallest reducible plan: every multi-value parameter " - f"at a singleton chunk and any chunkable filter at the " - f"inner chunker's bail-floor size. Reduce the number " - f"of values, shorten the filter, or split the call " - f"manually." + f"smallest reducible list plan. Reduce list sizes, shorten " + f"or simplify the filter, or split the call manually." ) dim, idx, chunk = biggest mid = len(chunk) // 2 plan[dim] = plan[dim][:idx] + [chunk[:mid], chunk[mid:]] + plan[dim][idx + 1 :] - # Each split only grows the cartesian product, so once we - # cross max_chunks we can never come back under. Bail now - # rather than keep splitting (the URL probe could still take - # many more iterations). - total = _plan_total(plan) - if total > max_chunks: + if _plan_total(plan, 1) > max_chunks: raise RequestTooLarge( - f"Chunked plan would issue {total} sub-requests, exceeding " - f"max_chunks={max_chunks} (USGS API's default hourly rate " - f"limit per key). Reduce input list sizes, narrow the time " - f"window, or raise max_chunks if you have a higher quota." + f"List plan alone would issue >{max_chunks} sub-requests " + f"before factoring filter chunking. Reduce list sizes or " + f"raise max_chunks." + ) + + +def _filter_chunk_counts(n_clauses: int) -> list[int]: + """Candidate filter chunk counts to evaluate during joint planning. + + Powers of two from 1 up to (and including) ``n_clauses``. Covers + the trade-off curve coarsely enough to find a good plan without + evaluating every possible ``k``. 
Each doubling halves the per-chunk + filter byte cost, which is what matters for the budget trade-off.""" + if n_clauses < 1: + return [1] + counts = [] + k = 1 + while k < n_clauses: + counts.append(k) + k *= 2 + counts.append(n_clauses) + return counts + + +def _plan_joint( + args: dict[str, Any], + build_request: Callable[..., Any], + url_limit: int, + max_chunks: int | None = None, +) -> tuple[dict[str, list[list[Any]]], list[str | None]] | None: + """Joint planner over list dims and filter clauses. + + Returns ``(list_plan, filter_chunks)`` where ``list_plan`` is a + (possibly empty) dict of chunked list dims and ``filter_chunks`` is + the list of filter expressions to substitute (length 1 means no + filter chunking — either pass the original filter through or, if + no filter at all, the single element is ``None``). Returns ``None`` + when the request fits as-is or nothing can be chunked. + + Algorithm: enumerate filter chunk counts ``k`` (powers of two up + through ``n_clauses``); for each, partition clauses into ``k`` + balanced groups joined by ``OR`` and identify the worst (longest + URL-encoded) group; substitute that as the filter in ``args`` and + plan list chunking with greedy halving against the remaining + budget. Pick the candidate whose ``list_count × k`` is smallest. + + Raises ``RequestTooLarge`` when no candidate fits or the best plan + would exceed ``max_chunks``. + """ + if max_chunks is None: + max_chunks = _DEFAULT_MAX_CHUNKS + if max_chunks < 1: + raise ValueError( + f"max_chunks must be >= 1; got {max_chunks}. Zero or negative " + f"values would silently bypass the cap." 
+ ) + + filter_expr = args.get("filter") + filter_chunkable = _is_chunkable(filter_expr, args.get("filter_lang")) + clauses: list[str] = [] + if filter_chunkable: + _check_numeric_filter_pitfall(filter_expr) + clauses = _split_top_level_or(filter_expr) + if len(clauses) < 2: + filter_chunkable = False + + list_dims = _chunkable_params(args) + if not list_dims and not filter_chunkable: + return None + if _request_bytes(build_request(**args)) <= url_limit: + return None + + candidate_counts = _filter_chunk_counts(len(clauses)) if filter_chunkable else [1] + best: tuple[int, dict[str, list[list[Any]]], list[str | None]] | None = None + last_error: RequestTooLarge | None = None + + for k in candidate_counts: + if filter_chunkable: + filter_chunks: list[str | None] = list(_partition_clauses(clauses, k)) + worst_filter = max(filter_chunks, key=lambda c: len(quote_plus(c or ""))) + plan_args = {**args, "filter": worst_filter} + else: + filter_chunks = [filter_expr] if filter_expr is not None else [None] + plan_args = args + + per_filter_cap = max(1, max_chunks // k) + try: + list_plan = _plan_list_chunks( + plan_args, build_request, url_limit, per_filter_cap ) + except RequestTooLarge as exc: + last_error = exc + continue + if list_plan is None: + list_plan = {} + total = _plan_total(list_plan, k) + if total > max_chunks: + continue + if best is None or total < best[0]: + best = (total, list_plan, filter_chunks) + + if best is None: + raise last_error or RequestTooLarge( + f"Joint plan would exceed max_chunks={max_chunks} at every " + f"candidate filter chunking. Reduce list sizes, simplify the " + f"filter, or raise max_chunks." + ) + + return best[1], best[2] def _read_remaining(response: requests.Response) -> int: """Parse ``x-ratelimit-remaining`` from a response. 
Missing or - malformed header → return ``_QUOTA_UNKNOWN`` so the safety check - treats it as 'plenty of quota' (don't abort on header glitches).""" + malformed header → ``_QUOTA_UNKNOWN`` so the safety check treats it + as 'plenty of quota'.""" raw = response.headers.get(_QUOTA_HEADER) if raw is None: return _QUOTA_UNKNOWN @@ -381,55 +418,32 @@ def multi_value_chunked( max_chunks: int | None = None, quota_safety_floor: int | None = None, ) -> Callable[[_FetchOnce], _FetchOnce]: - """Decorator that splits multi-value list params across sub-requests - so each sub-request fits ``url_limit`` bytes (defaults to - ``filters._WATERDATA_URL_BYTE_LIMIT``) and the cartesian-product - plan stays ≤ ``max_chunks`` sub-requests (defaults to - ``_DEFAULT_MAX_CHUNKS``). All defaults are resolved at call time so - tests/users that patch the module constants affect this decorator - uniformly. + """Decorator that splits multi-value list params and cql-text + filters across sub-requests so each sub-request URL fits + ``url_limit`` bytes (defaults to ``filters._WATERDATA_URL_BYTE_LIMIT``) + and the joint cartesian-product plan stays ≤ ``max_chunks`` + sub-requests (defaults to ``_DEFAULT_MAX_CHUNKS``). All defaults are + resolved at call time so tests/users that patch the module constants + affect this decorator uniformly. ``url_limit`` is enforced against total request bytes (URL + body, - via ``_request_bytes``); the name reflects the dominant GET case - where body is empty. POST routes (e.g. ``monitoring-locations`` via - CQL2 JSON) are conservatively sized — never under-chunks, but may - over-chunk at the body's true ceiling. + via ``_request_bytes``). POST routes are conservatively sized. Between sub-requests the wrapper reads ``x-ratelimit-remaining`` from each response. 
If it drops below ``quota_safety_floor`` (default - ``_DEFAULT_QUOTA_SAFETY_FLOOR``), the wrapper raises ``QuotaExhausted`` - carrying the combined partial result and the chunk offset so callers - can resume after the hourly window resets, instead of crashing into - a mid-pagination HTTP 429 (which the upstream pagination loop in - ``_walk_pages`` historically truncated silently — see PR #273). - - Sits OUTSIDE ``@filters.chunked``: list-chunking is the outer loop, - filter-chunking is the inner loop. The wrapped function has the same - signature as ``filters.chunked`` expects — ``(args: dict) -> (frame, - response)`` — so the two decorators compose cleanly. The planner is - filter-aware so it doesn't raise prematurely when the inner filter - chunker would have shrunk the per-sub-request URL on its own. - - Sub-requests run sequentially with no per-call timeout enforced here. - A hung single sub-request will block the entire chunked call; the - caller is responsible for configuring an HTTP-layer timeout (e.g. - via a ``requests.Session`` wrapper) if bounded latency matters. - - Cartesian-product iteration order is deterministic for a given - ``args`` dict: the wrapper iterates ``plan.values()`` in insertion - order (Python 3.7+ guarantee), which equals the order in which - chunkable params appeared in ``args``. For the public waterdata - getters that order is the function-signature order, so - ``QuotaExhausted.completed_chunks`` maps to the same sub-requests - across repeated calls with the same arguments — resume is - well-defined. + ``_DEFAULT_QUOTA_SAFETY_FLOOR``), the wrapper raises + ``QuotaExhausted`` carrying the combined partial result and the + chunk offset so callers can resume after the hourly window resets. + + Joint cartesian-product iteration order is deterministic: list dims + in args-insertion order (Python 3.7+ guarantee), with the filter + dim as the innermost loop. Same ``args`` produces the same chunk + sequence — resume is well-defined. 
""" if quota_safety_floor is not None and quota_safety_floor < 0: raise ValueError( f"quota_safety_floor must be >= 0; got {quota_safety_floor}. " - f"A negative floor silently disables the guard the same way " - f"``0`` does, but obscures the intent — pass 0 explicitly to " - f"disable, or a positive integer to set a real safety margin." + f"Pass 0 explicitly to disable the guard." ) def decorator(fetch_once: _FetchOnce) -> _FetchOnce: @@ -447,41 +461,46 @@ def wrapper( if quota_safety_floor is not None else _DEFAULT_QUOTA_SAFETY_FLOOR ) - plan = _plan_chunks(args, build_request, limit, max_chunks) + plan = _plan_joint(args, build_request, limit, max_chunks) if plan is None: return fetch_once(args) - # Pre-build the canonical URL representing the user's full - # original query. The chunker sends sub-requests with sliced - # multi-value lists; without this restore, the aggregated - # response's ``.url`` would only show the first chunk and - # callers logging ``md.url`` for reproducibility would see a - # truncated view of their own query. + list_plan, filter_chunks = plan canonical_url = build_request(**args).url - keys = list(plan) - total = _plan_total(plan) + list_keys = list(list_plan) + list_combos = ( + list(itertools.product(*(list_plan[k] for k in list_keys))) + if list_plan + else [()] + ) + total = len(list_combos) * len(filter_chunks) + frames: list[pd.DataFrame] = [] responses: list[requests.Response] = [] - for i, combo in enumerate(itertools.product(*(plan[k] for k in keys))): - sub_args = {**args, **dict(zip(keys, combo))} - frame, response = fetch_once(sub_args) - frames.append(frame) - responses.append(response) - # Skip the quota check after the last sub-request — - # nothing left to abort. 
- if i < total - 1: - remaining = _read_remaining(response) - if remaining < floor: - partial = _combine_chunk_responses(responses) - partial.url = canonical_url - raise QuotaExhausted( - partial_frame=_combine_chunk_frames(frames), - partial_response=partial, - completed_chunks=i + 1, - total_chunks=total, - remaining=remaining, - ) + i = 0 + for combo in list_combos: + for filter_chunk in filter_chunks: + sub_args = dict(args) + sub_args.update(zip(list_keys, combo)) + if filter_chunk is not None: + sub_args["filter"] = filter_chunk + frame, response = fetch_once(sub_args) + frames.append(frame) + responses.append(response) + if i < total - 1: + remaining = _read_remaining(response) + if remaining < floor: + partial = _combine_chunk_responses(responses) + partial.url = canonical_url + raise QuotaExhausted( + partial_frame=_combine_chunk_frames(frames), + partial_response=partial, + completed_chunks=i + 1, + total_chunks=total, + remaining=remaining, + ) + i += 1 combined = _combine_chunk_responses(responses) combined.url = canonical_url diff --git a/dataretrieval/waterdata/filters.py b/dataretrieval/waterdata/filters.py index deac263e..65c489a0 100644 --- a/dataretrieval/waterdata/filters.py +++ b/dataretrieval/waterdata/filters.py @@ -1,25 +1,28 @@ """CQL ``filter`` support for the Water Data OGC getters. -Two names are public to the rest of the package: +Public names: - ``FILTER_LANG``: the type alias used for the ``filter_lang`` kwarg. -- ``chunked``: the decorator ``utils.py`` applies to its single-request - fetch function. It runs the lexicographic-comparison pitfall guard, - splits long cql-text filters at top-level ``OR`` so each sub-request - fits under the server's URL byte limit, and concatenates the results. - -Other CQL shapes (``AND``, ``NOT``, ``LIKE``, spatial/temporal predicates, -function calls) are forwarded verbatim — only top-level ``OR`` chunks -losslessly into independent sub-queries whose result sets can be union'd. 
+ +Internal helpers used by ``chunking.multi_value_chunked``'s joint +planner: ``_split_top_level_or`` (clause partitioning), ``_chunk_cql_or`` +(legacy single-dim chunker, kept for direct callers / tests), +``_is_chunkable`` (filter-language gate), ``_check_numeric_filter_pitfall`` +(the lexicographic-comparison guard), ``_combine_chunk_frames`` / +``_combine_chunk_responses`` (aggregation), and constants +``_WATERDATA_URL_BYTE_LIMIT`` / ``_CQL_FILTER_CHUNK_LEN``. + +Other CQL shapes (``AND``, ``NOT``, ``LIKE``, spatial/temporal +predicates, function calls) are forwarded verbatim — only top-level +``OR`` chunks losslessly into independent sub-queries whose result sets +can be union'd. """ from __future__ import annotations -import functools import re from collections.abc import Callable from typing import Any, Literal, TypeVar -from urllib.parse import quote_plus import pandas as pd import requests @@ -27,8 +30,8 @@ FILTER_LANG = Literal["cql-text", "cql-json"] # Conservative fallback budget when ``_chunk_cql_or`` is called without -# an explicit ``max_len``. The ``chunked`` decorator computes a tighter -# per-request budget from ``_WATERDATA_URL_BYTE_LIMIT``. +# an explicit ``max_len``. The joint planner in ``chunking.py`` computes +# a tighter per-request budget from ``_WATERDATA_URL_BYTE_LIMIT``. _CQL_FILTER_CHUNK_LEN = 5000 # Empirically the API replies HTTP 414 above ~8200 bytes of full URL — @@ -36,12 +39,6 @@ # leaves ~200 bytes for request-line framing and proxy variance. _WATERDATA_URL_BYTE_LIMIT = 8000 -# Conservative over-estimate of URL bytes used by everything *except* -# the filter value. Used only by the fast path in -# ``_effective_filter_budget`` to skip the probe when the encoded filter -# clearly already fits. -_NON_FILTER_URL_HEADROOM = 1000 - _NUM = r"-?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][+-]?\d+)?" 
_IDENT = r"[A-Za-z_]\w*" @@ -152,48 +149,6 @@ def _chunk_cql_or(expr: str, max_len: int = _CQL_FILTER_CHUNK_LEN) -> list[str]: return chunks -def _max_per_clause_encoding_ratio(parts: list[str]) -> float: - """Worst per-clause ``len(quote_plus(p)) / len(p)`` across OR-clauses. - - Any sub-request chunk could end up containing only the heavier-encoding - clauses, so per-sub-request byte budgets must be sized against the - worst (not average) ratio to avoid overflow. Used by both this - module's filter chunker and the outer ``chunking._filter_aware_probe_args``; - pinning the formula here keeps the two from drifting. - """ - return max(len(quote_plus(p)) / len(p) for p in parts) - - -def _effective_filter_budget( - args: dict[str, Any], - filter_expr: str, - build_request: Callable[..., Any], -) -> int: - """Raw-CQL byte budget that, after URL-encoding, fits the URL byte limit. - - The server caps total URL length, not raw CQL length. We probe the - non-filter URL bytes by building the request with a 1-byte placeholder - filter, subtract from the URL limit to get the bytes available for the - encoded filter, then convert back to raw CQL bytes via the *maximum* - per-clause encoding ratio. - """ - # Fast path: encoded filter clearly fits with room for any plausible - # non-filter URL. Skips the PreparedRequest build and splitter scan. - encoded_len = len(quote_plus(filter_expr)) - if encoded_len + _NON_FILTER_URL_HEADROOM <= _WATERDATA_URL_BYTE_LIMIT: - return len(filter_expr) + 1 - - probe = build_request(**{**args, "filter": "x"}) - available_url_bytes = _WATERDATA_URL_BYTE_LIMIT - (len(probe.url) - 1) - if available_url_bytes <= 0: - # Non-filter URL already over the limit. Pass through unchanged so - # the caller sees one 414 instead of N parallel sub-request failures. 
- return len(filter_expr) + 1 - parts = _split_top_level_or(filter_expr) or [filter_expr] - encoding_ratio = _max_per_clause_encoding_ratio(parts) - return max(100, int(available_url_bytes / encoding_ratio)) - - def _check_numeric_filter_pitfall(filter_expr: str) -> None: """Raise if the filter pairs a field with an unquoted numeric literal. @@ -285,9 +240,8 @@ def _combine_chunk_responses( The returned response's ``.url`` is the *first chunk's* URL, which only reflects the first slice of the user's query. Callers wanting the canonical original-query URL on ``BaseMetadata`` must overwrite - ``.url`` themselves (using ``build_request(**original_args).url``); - the decorator wrappers in ``filters.chunked`` and - ``chunking.multi_value_chunked`` do this. + ``.url`` themselves; ``chunking.multi_value_chunked``'s wrapper does + this via ``build_request(**original_args).url``. Mutates the first response in place: ``.headers`` is replaced with the last response's headers and ``.elapsed`` is accumulated across @@ -305,61 +259,3 @@ def _combine_chunk_responses( "_FetchOnce", bound=Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]], ) - - -def chunked(*, build_request: Callable[..., Any]) -> Callable[[_FetchOnce], _FetchOnce]: - """Decorator that adds CQL-filter chunking to a single-request fetch. - - The wrapped function has signature ``(args: dict) -> (frame, response)`` - and represents one HTTP round-trip. The decorator inspects ``args``: - - - No chunkable filter: pass through unchanged. 
- - Chunkable cql-text filter: run the lexicographic-pitfall guard, split - into URL-length-safe sub-expressions, call the wrapped function once - per chunk, concatenate frames (drop empties, dedup by feature ``id``), - and return an aggregated response with ``.url`` restored to the - canonical full-filter URL (so ``BaseMetadata.url`` reflects the - user's original query rather than the first filter chunk), last - chunk's headers (so callers see current ``x-ratelimit-remaining``, - which the outer ``multi_value_chunked`` decorator's ``QuotaExhausted`` - guard depends on), and summed ``elapsed``. - - Either way the return shape matches the undecorated function's, so the - caller wraps the response in ``BaseMetadata`` the same way in both paths. - - ``build_request`` is injected so the decorator can probe URL length - without importing any specific HTTP builder; it receives the same kwargs - the wrapped function's ``args`` would and returns a prepared-request-like - object with a ``.url`` attribute. - """ - - def decorator(fetch_once: _FetchOnce) -> _FetchOnce: - @functools.wraps(fetch_once) - def wrapper( - args: dict[str, Any], - ) -> tuple[pd.DataFrame, requests.Response]: - filter_expr = args.get("filter") - if not _is_chunkable(filter_expr, args.get("filter_lang")): - return fetch_once(args) - - _check_numeric_filter_pitfall(filter_expr) - budget = _effective_filter_budget(args, filter_expr, build_request) - chunks = _chunk_cql_or(filter_expr, max_len=budget) - - frames: list[pd.DataFrame] = [] - responses: list[requests.Response] = [] - for chunk in chunks: - frame, response = fetch_once({**args, "filter": chunk}) - frames.append(frame) - responses.append(response) - - # Restore the canonical URL representing the user's full filter - # (the aggregated response otherwise carries only the first - # filter-chunk's URL, which misleads callers logging md.url). 
- combined = _combine_chunk_responses(responses) - combined.url = build_request(**args).url - return _combine_chunk_frames(frames), combined - - return wrapper # type: ignore[return-value] - - return decorator diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index fe419100..cf9d554a 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -14,7 +14,7 @@ from dataretrieval import __version__ from dataretrieval.utils import BaseMetadata -from dataretrieval.waterdata import chunking, filters +from dataretrieval.waterdata import chunking from dataretrieval.waterdata.types import ( PROFILE_LOOKUP, PROFILES, @@ -987,19 +987,17 @@ def get_ogc_data( @chunking.multi_value_chunked(build_request=_construct_api_requests) -@filters.chunked(build_request=_construct_api_requests) def _fetch_once( args: dict[str, Any], ) -> tuple[pd.DataFrame, requests.Response]: """Send one prepared-args OGC request; return the frame + response. - Two orthogonal chunkers wrap this body. ``@chunking.multi_value_chunked`` - (outer) splits multi-value list params (e.g. ``monitoring_location_id``) - across sub-requests so each URL fits the server byte limit; the - cartesian product of per-dim chunks is iterated. ``@filters.chunked`` - (inner) splits long cql-text filters at top-level ``OR``. With no - chunkable inputs both pass through unchanged. Either way the return - shape is ``(frame, response)``. + ``@chunking.multi_value_chunked`` plans list-chunking and + filter-chunking jointly and iterates the cartesian product, picking + the allocation between list chunks and filter chunks that minimizes + total sub-requests. With no chunkable inputs the decorator passes + args through unchanged. Either way the return shape is + ``(frame, response)``. 
""" req = _construct_api_requests(**args) return _walk_pages(geopd=GEOPANDAS, req=req) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 92f312d8..12b2e3b5 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -1,4 +1,5 @@ import datetime +import itertools import json import math import sys @@ -12,7 +13,6 @@ if sys.version_info < (3, 10): pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) -from dataretrieval.waterdata import chunking as _chunking from dataretrieval.waterdata import filters as _filters from dataretrieval.waterdata import ( get_channel, @@ -39,8 +39,10 @@ QuotaExhausted, RequestTooLarge, _chunkable_params, - _filter_aware_probe_args, - _plan_chunks, + _filter_chunk_counts, + _partition_clauses, + _plan_joint, + _plan_list_chunks, _read_remaining, multi_value_chunked, ) @@ -260,39 +262,35 @@ def _fake_build(*, base=200, **kwargs): return _FakeReq("x" * bytes_) -def test_filter_aware_probe_args_passes_through_when_not_chunkable(): - """No filter, json-lang filter, single-clause filter — return unchanged.""" - assert _filter_aware_probe_args({"a": 1}) == {"a": 1} - assert _filter_aware_probe_args({"filter": "a='1'", "filter_lang": "cql-json"}) == { - "filter": "a='1'", - "filter_lang": "cql-json", - } - args = {"filter": "a='single clause with no OR'"} - assert _filter_aware_probe_args(args) == args - - -def test_filter_aware_probe_args_models_inner_chunker_bail_floor(): - """Chunkable filter → return args with filter replaced by a synthetic - ASCII clause whose URL byte count equals the inner chunker's bail - floor ``len(longest) * max(per_clause_encoding_ratio)``. 
Mirrors - ``filters._effective_filter_budget``'s worst-case model so the - planner doesn't approve plans the inner chunker would refuse.""" - args = {"filter": "a='1' OR a='22' OR a='333'", "x": 7} - probe = _filter_aware_probe_args(args) - parts = ["a='1'", "a='22'", "a='333'"] - expected = math.ceil( - max(len(p) for p in parts) * max(len(quote_plus(p)) / len(p) for p in parts) - ) - assert len(probe["filter"]) == expected - assert probe["filter"].isascii() and probe["filter"].isalnum() - assert probe["x"] == 7 - assert args["filter"] == "a='1' OR a='22' OR a='333'" # input not mutated +def test_partition_clauses_balanced(): + """k roughly-balanced groups, joined by ` OR `, distributing the + remainder across the first groups.""" + clauses = ["a='1'", "b='2'", "c='3'", "d='4'", "e='5'"] + assert _partition_clauses(clauses, 1) == [" OR ".join(clauses)] + # 5 clauses into 2 groups → sizes 3, 2 (remainder lands in the first). + assert _partition_clauses(clauses, 2) == [ + "a='1' OR b='2' OR c='3'", + "d='4' OR e='5'", + ] + assert _partition_clauses(clauses, 5) == clauses + # k > len(clauses) → singletons (don't synthesize empty groups). 
+ assert _partition_clauses(clauses, 99) == clauses + + +def test_filter_chunk_counts_powers_of_two_plus_n(): + """Candidate counts cover the trade-off at powers of two, and always + include ``n_clauses`` itself so the fully-singleton case is always + evaluated even when n isn't a power of two.""" + assert _filter_chunk_counts(0) == [1] + assert _filter_chunk_counts(1) == [1] + assert _filter_chunk_counts(5) == [1, 2, 4, 5] + assert _filter_chunk_counts(16) == [1, 2, 4, 8, 16] def test_plan_chunks_returns_none_when_request_fits(): """URL under limit → planner returns None, decorator passes through.""" args = {"monitoring_location_id": ["A", "B", "C"]} - plan = _plan_chunks(args, _fake_build, url_limit=8000) + plan = _plan_list_chunks(args, _fake_build, url_limit=8000) assert plan is None @@ -300,7 +298,7 @@ def test_plan_chunks_returns_none_when_no_chunkable_lists(): """No multi-value lists, however over-limit → planner can't help, returns None (decorator falls through; server may 414 but that's not chunker's job).""" args = {"monitoring_location_id": "scalar-only"} - plan = _plan_chunks(args, _fake_build, url_limit=10) + plan = _plan_list_chunks(args, _fake_build, url_limit=10) assert plan is None @@ -311,7 +309,7 @@ def test_plan_chunks_greedy_halving_targets_largest_dim(): "parameter_code": ["00060", "00065"], } # full URL ≈ 200 + 123 + 12 = 335; force splitting heavy dim only. - plan = _plan_chunks(args, _fake_build, url_limit=310) + plan = _plan_list_chunks(args, _fake_build, url_limit=310) assert len(plan["monitoring_location_id"]) > 1 assert len(plan["parameter_code"]) == 1 # heavy-dim split was enough @@ -321,73 +319,67 @@ def test_plan_chunks_raises_request_too_large_at_singleton_floor(): fall back on) → RequestTooLarge with a clear message.""" args = {"monitoring_location_id": ["A", "B"]} # base=200 alone exceeds limit; no relief possible. 
- with pytest.raises(RequestTooLarge, match="multi-value parameter"): - _plan_chunks(args, _fake_build, url_limit=100) - - -def test_plan_chunks_coordinates_with_filter_chunker(monkeypatch): - """COORDINATION REGRESSION TEST. + with pytest.raises(RequestTooLarge, match="smallest reducible"): + _plan_list_chunks(args, _fake_build, url_limit=100) - With the FULL filter in URL-length probes, singleton-per-dim URL still - exceeds the limit and the planner would raise RequestTooLarge. With - filter-aware probing, the planner models the per-sub-request URL as - ``worst-dim-chunk + longest-clause-after-encoding`` (the inner filter - chunker's bail floor — it returns the FULL filter if any single - clause exceeds the budget, so the longest clause is the smallest - floor it can guarantee). The probe fits, plan returns. - Sanity-check the *negative*: with filter-aware probing disabled, the - same inputs would raise. - """ +def test_plan_joint_fans_out_filter_when_list_alone_cannot_fit(): + """When the request can only fit by chunking BOTH the list and the + filter, the joint planner must produce a plan that touches both + dims. With the previous two-decorator design the inner filter + chunker would have shrunk the filter at runtime; the joint planner + surfaces the same outcome at plan time.""" clauses = [f"f='{i}'" for i in range(10)] args = { "monitoring_location_id": ["A" * 10, "B" * 10, "C" * 10, "D" * 10], "filter": " OR ".join(clauses), } - # singleton+full-filter ≈ 200 + 10 + 86 = 296 (over limit 240) — would raise. - # longest-clause probe model ≈ 200 + 10 + 5 = 215 (under limit) — plan succeeds. - # (Here all clauses are the same length, so longest == shortest; the - # encoding-ratio coordination matters for lopsided clauses.) 
- plan = _plan_chunks(args, _fake_build, url_limit=240) - assert plan is not None # coordination prevented the premature raise - assert len(plan["monitoring_location_id"]) > 1 # planner did split - - # Negative control: patch the probe helper to be a no-op (model "no - # filter awareness") and confirm the same inputs raise. - monkeypatch.setattr(_chunking, "_filter_aware_probe_args", lambda a: a) - with pytest.raises(RequestTooLarge): - _plan_chunks(args, _fake_build, url_limit=240) - - -def test_plan_chunks_probes_with_max_clause_not_min(): - """Regression: with lopsided OR-clauses (one short, one long), probing - with min(parts) lets the planner falsely declare a plan feasible that - the inner filter chunker can't actually deliver — it bails when any - single clause exceeds the per-sub-request budget. Probing with the - longest clause is the safe lower bound on per-sub-request filter - size, so the planner correctly raises when no plan can fit.""" + # Singleton list + full filter ≈ 200 + 10 + 86 = 296 (over limit 240). + # Joint planner must split the filter into k >= 2 groups. + plan = _plan_joint(args, _fake_build, url_limit=240) + assert plan is not None + list_plan, filter_chunks = plan + # Either the filter was chunked, the list was chunked, or both. + assert len(filter_chunks) > 1 or any(len(v) > 1 for v in list_plan.values()) + + +def test_plan_joint_minimizes_total_sub_requests(): + """When both dims need shrinking, picking smaller filter chunks + frees URL budget for larger list chunks, and vice versa. The + planner should pick the allocation with the *fewest* total + sub-requests, not just the first allocation that fits.""" + # 16 short clauses (no inflation under URL encoding so the math is + # tractable). Each clause = 5 bytes (e.g. "f='0'"); full filter ≈ + # 16*5 + 15*4 = 140 bytes raw. 
+ clauses = [f"f='{i}'" for i in range(16)] args = { - "sites": ["A" * 10, "B" * 10], - "filter": "x='1' OR x='" + "a" * 28 + "'", # 5-char and 33-char clauses + "sites": ["S" * 30 for _ in range(8)], # 8 sites @ 30 chars + "filter": " OR ".join(clauses), } - # base 200 + singleton sites 10 + min-clause 5 = 215 (limit 230 -> fits) - # base 200 + singleton sites 10 + max-clause 33 = 243 (limit 230 -> exceeds) - # With min: planner succeeds, but real URL with full filter (42) exceeds - # 230 -> server 414. With max: planner raises early, as it should. - with pytest.raises(RequestTooLarge): - _plan_chunks(args, _fake_build, url_limit=230) - - -def test_plan_chunks_still_raises_when_even_min_clause_doesnt_fit(): - """If the limit is so tight that singleton + shortest-clause STILL - exceeds it, filter chunker can't save us either — raise.""" + # Tight limit forces both dims to participate. + plan = _plan_joint(args, _fake_build, url_limit=380) + assert plan is not None + list_plan, filter_chunks = plan + total = max( + 1, math.prod(len(c) for c in list_plan.values()) if list_plan else 1 + ) * len(filter_chunks) + # Plan must beat the bail-floor-style worst case (8 singletons × 16 + # filter chunks = 128 sub-requests) by a healthy margin. + assert total < 128 + + +def test_plan_joint_raises_when_smallest_plan_doesnt_fit(): + """If even the most aggressive joint plan (singleton lists + + singleton filter clauses) still exceeds the limit, surface + RequestTooLarge — there's nothing left to shrink.""" args = { "monitoring_location_id": ["A" * 10, "B" * 10], "filter": "x='12345' OR x='67890'", # min clause is 9 chars } - # Singleton + min-clause ≈ 200 + 10 + 9 = 219; limit below that → unrecoverable. + # Base 200 + singleton site (10) + singleton clause (9) = 219; limit + # below 219 → no joint plan can fit. 
with pytest.raises(RequestTooLarge): - _plan_chunks(args, _fake_build, url_limit=210) + _plan_joint(args, _fake_build, url_limit=210) def test_multi_value_chunked_passes_through_when_url_fits(): @@ -512,8 +504,8 @@ def test_plan_chunks_raises_when_plan_exceeds_max_chunks(): "dim_b": [f"another-long-value-{i}" for i in range(50)], } # url_limit forces splitting; max_chunks=10 forces the cap to fire. - with pytest.raises(RequestTooLarge, match="exceeding max_chunks=10"): - _plan_chunks(args, _fake_build, url_limit=250, max_chunks=10) + with pytest.raises(RequestTooLarge, match=">10 sub-requests"): + _plan_list_chunks(args, _fake_build, url_limit=250, max_chunks=10) def test_plan_chunks_respects_default_cap_without_explicit_arg(): @@ -528,8 +520,8 @@ def test_plan_chunks_respects_default_cap_without_explicit_arg(): # Without explicit max_chunks: defaults to 1000. The plan for these # inputs would emit > 1000 sub-requests at a tight limit, so should # raise on default cap alone. - with pytest.raises(RequestTooLarge, match=r"max_chunks=1000"): - _plan_chunks(args, _fake_build, url_limit=220) + with pytest.raises(RequestTooLarge, match=r">1000 sub-requests"): + _plan_list_chunks(args, _fake_build, url_limit=220) def test_multi_value_chunked_cap_override(): @@ -540,7 +532,7 @@ def test_multi_value_chunked_cap_override(): def fetch(args): return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) - with pytest.raises(RequestTooLarge, match="exceeding max_chunks=10"): + with pytest.raises(RequestTooLarge, match=">10 sub-requests"): fetch( { "dim_a": [f"longer-v{i}" for i in range(30)], @@ -715,9 +707,9 @@ def test_plan_chunks_rejects_non_positive_max_chunks(): in-loop check only runs after a split). 
Reject early.""" args = {"monitoring_location_id": ["A", "B", "C", "D"]} with pytest.raises(ValueError, match="max_chunks must be >= 1"): - _plan_chunks(args, _fake_build, url_limit=1000, max_chunks=0) + _plan_list_chunks(args, _fake_build, url_limit=1000, max_chunks=0) with pytest.raises(ValueError, match="max_chunks must be >= 1"): - _plan_chunks(args, _fake_build, url_limit=1000, max_chunks=-5) + _plan_list_chunks(args, _fake_build, url_limit=1000, max_chunks=-5) def test_multi_value_chunked_restores_canonical_url(): @@ -798,6 +790,89 @@ def test_multi_value_chunked_rejects_negative_quota_safety_floor(): multi_value_chunked(build_request=_fake_build, quota_safety_floor=-50) +def test_joint_planner_url_construction_long_filter_and_long_sites(): + """Realistic stress: 20 datetime OR-clauses combined with 500 USGS + site IDs. Every sub-request URL built from the plan must fit the + 8000-byte limit, the joint planner must beat the naive "filter at + bail-floor, chunk lists" approach, and the partitioned filters + must union to the user's original filter expression. + + Uses the real ``_construct_api_requests`` builder so the test + catches URL-encoding surprises that a fake builder would miss. + """ + # Realistic AGENCY-ID site format: USGS-{8 digits}. 500 sites is + # enough to force the URL well past the 8000-byte server limit + # without any filter contribution. + sites = [f"USGS-{i:08d}" for i in range(500)] + # 20 datetime equality clauses; each ~30 bytes raw, more after URL + # encoding (the apostrophes and `:` characters expand). 
+ clauses = [ + f"time='2024-{m:02d}-{d:02d}T00:00:00Z'" + for m in range(1, 6) + for d in (1, 8, 15, 22) + ] + assert len(clauses) == 20 + filter_expr = " OR ".join(clauses) + + args = { + "service": "daily", + "monitoring_location_id": sites, + "filter": filter_expr, + } + url_limit = 8000 + + plan = _plan_joint(args, _construct_api_requests, url_limit) + assert plan is not None, "expected joint plan for over-limit request" + list_plan, filter_chunks = plan + + # Walk every sub-request and assert URL fits. + list_keys = list(list_plan) + list_combos = ( + list(itertools.product(*(list_plan[k] for k in list_keys))) + if list_plan + else [()] + ) + over_limit = [] + for combo in list_combos: + for filter_chunk in filter_chunks: + sub_args = dict(args) + sub_args.update(zip(list_keys, combo)) + if filter_chunk is not None: + sub_args["filter"] = filter_chunk + req = _construct_api_requests(**sub_args) + url_len = len(req.url) + (len(req.body) if req.body else 0) + if url_len > url_limit: + over_limit.append((url_len, combo, filter_chunk)) + assert not over_limit, ( + f"{len(over_limit)} sub-request(s) exceeded the URL limit; " + f"first: {over_limit[0]}" + ) + + # Filter partitions must union back to the original (modulo + # whitespace around `OR`). Each clause must appear exactly once. + union_clauses: list[str] = [] + for chunk in filter_chunks: + if chunk is None: + continue + union_clauses.extend(c.strip() for c in chunk.split(" OR ")) + assert union_clauses == clauses, ( + "filter partitioning must cover every original clause exactly once" + ) + + # List partitions: every original site appears in exactly one list + # chunk for each dim. Joined sites across all chunks recover the + # input set. 
+ if "monitoring_location_id" in list_plan: + seen = [s for chunk in list_plan["monitoring_location_id"] for s in chunk] + assert sorted(seen) == sorted(sites) + + # Joint plan must beat the bail-floor-style worst case (singleton + # sites × all filter clauses singleton = 500 * 20 = 10000) — joint + # planning of these inputs cuts that by at least a factor of 20. + total = len(list_combos) * len(filter_chunks) + assert total < 500, f"joint plan emitted {total} sub-requests (expected <500)" + + def test_samples_results(): """Test results call for proper columns""" df, _ = get_samples( From 10858e90f7b1bc16a5d6ea1a8a6d9291ce3dc800 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 12:37:27 -0500 Subject: [PATCH 04/17] refactor(waterdata): Share a single URL-byte sizing primitive across both chunking dims The two chunking dimensions (list values and CQL OR-clauses) shared an obvious primitive: "URL-encoded byte length of atoms joined by a separator." Extract _joined_url_bytes(atoms, sep); list dims call it with "," and filter dims call it with " OR ". _chunk_bytes collapses to a one-liner using the helper, and the inline len(quote_plus(c or "")) in the joint planner becomes _joined_url_bytes(group, " OR "). Partition shape also unifies: _partition_clauses now returns list[list[str]] (raw atom groups) instead of pre-joined strings. The joint planner sizes candidates by _joined_url_bytes on the raw groups and joins only the winning groups for the wrapper to iterate, so discarded partition candidates never pay the join cost. Side cleanups motivated by the /simplify review: - Add "filter" to _NEVER_CHUNK so _chunkable_params doesn't need a k != "filter" special case alongside the frozenset check. - Drop the redundant filter_chunkable variable in _plan_joint; derive from len(clauses) >= 2. 
- Bug fix in _plan_joint: when there are no list dims to shrink and the filter alone overflows the URL limit, the planner used to pick k=1 and emit one over-limit sub-request. Now it verifies the request fits with the chosen filter chunking before accepting that k. Dead code removal: - _chunk_cql_or and _CQL_FILTER_CHUNK_LEN in filters.py had zero production callers after the joint planner subsumed their role. Deleted, with their 4 unit tests. - 4 _effective_filter_budget tests (function already deleted in the unification commit) and their _build_request / _WATERDATA_URL_BYTE_LIMIT test scaffolding. Test rewrites: the three end-to-end tests that previously mocked _effective_filter_budget (long_filter fan-out, dedup, empty-chunk GeoDataFrame preservation) now exercise the joint planner directly via a filter-size-aware fake URL builder. Same invariants, no mock of removed internals. Net diff: -180 lines across 4 files (-72 production, -108 tests). --- dataretrieval/waterdata/chunking.py | 74 ++++---- dataretrieval/waterdata/filters.py | 46 +---- tests/waterdata_filters_test.py | 261 ++++++---------------------- tests/waterdata_test.py | 17 +- 4 files changed, 107 insertions(+), 291 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index 5d56d6fc..4294a1d6 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -51,10 +51,11 @@ _split_top_level_or, ) -# Params that look like lists but must NOT be chunked. ``properties`` -# is excluded because it defines the response schema; chunking it -# would return frames with different columns per sub-request. ``bbox`` -# is a fixed 4-element coord tuple. Date params are intervals not sets. +# Params the multi-value chunker must NOT split. ``properties`` defines +# the response schema; chunking it would shard columns. ``bbox`` is a +# fixed 4-element coord tuple. Date params are intervals not sets. 
+# ``filter`` has its own OR-clause partitioning inside ``_plan_joint``; +# treating it as a multi-value list would emit malformed CQL. # ``filter_lang`` is a scalar string. ``limit`` and ``skip_geometry`` # are scalars by contract (smuggling a list through type erasure would # silently fan into queries with conflicting per-request semantics). @@ -69,6 +70,7 @@ "end", "end_utc", "time", + "filter", "filter_lang", "limit", "skip_geometry", @@ -157,17 +159,19 @@ def _chunkable_params(args: dict[str, Any]) -> dict[str, list[Any]]: return { k: list(v) for k, v in args.items() - if k != "filter" - and k not in _NEVER_CHUNK - and isinstance(v, (list, tuple)) - and len(v) > 1 + if k not in _NEVER_CHUNK and isinstance(v, (list, tuple)) and len(v) > 1 } -def _chunk_bytes(chunk: list[Any]) -> int: - """URL-encoded byte length of ``chunk`` when comma-joined into a - URL parameter value.""" - return len(quote_plus(",".join(map(str, chunk)))) +def _joined_url_bytes(atoms: list[Any], separator: str) -> int: + """URL-encoded byte length of ``atoms`` joined by ``separator``. + + The shared sizing primitive for both list dims (``separator=","``) + and filter clauses (``separator=" OR "``). ``quote_plus`` is faithful + to what the real URL builder produces, so values containing + characters that expand under URL encoding (``%``, ``+``, ``/``, + ``&``, …) can't be mis-ranked.""" + return len(quote_plus(separator.join(map(str, atoms)))) def _request_bytes(req: requests.PreparedRequest) -> int: @@ -210,24 +214,27 @@ def _worst_case_args( in by the caller).""" out = dict(base_args) for k, chunks in list_plan.items(): - out[k] = max(chunks, key=_chunk_bytes) + out[k] = max(chunks, key=lambda c: _joined_url_bytes(c, ",")) return out -def _partition_clauses(clauses: list[str], k: int) -> list[str]: - """Split ``clauses`` into ``k`` roughly-balanced sub-filters joined - by ``" OR "``. 
Distributes the remainder across the first groups so - no group differs from another by more than one clause.""" +def _partition_clauses(clauses: list[str], k: int) -> list[list[str]]: + """Split ``clauses`` into ``k`` roughly-balanced groups. Distributes + the remainder across the first groups so no group differs from + another by more than one clause. Returns groups as raw atom lists; + callers join with ``" OR "`` only for groups they actually intend + to issue, so the planner can size candidates without joining + discarded partitions.""" if k <= 0: raise ValueError(f"k must be >= 1; got {k}") if k >= len(clauses): - return list(clauses) + return [[c] for c in clauses] base, extra = divmod(len(clauses), k) - out: list[str] = [] + out: list[list[str]] = [] i = 0 for g in range(k): size = base + (1 if g < extra else 0) - out.append(" OR ".join(clauses[i : i + size])) + out.append(clauses[i : i + size]) i += size return out @@ -274,7 +281,9 @@ def _plan_list_chunks( for idx, chunk in enumerate(dim_chunks) if len(chunk) > 1 ) - biggest = max(splittable, key=lambda t: _chunk_bytes(t[2]), default=None) + biggest = max( + splittable, key=lambda t: _joined_url_bytes(t[2], ","), default=None + ) if biggest is None: raise RequestTooLarge( f"Request exceeds {url_limit} bytes (URL + body) at the " @@ -345,16 +354,15 @@ def _plan_joint( ) filter_expr = args.get("filter") - filter_chunkable = _is_chunkable(filter_expr, args.get("filter_lang")) clauses: list[str] = [] - if filter_chunkable: + if _is_chunkable(filter_expr, args.get("filter_lang")): _check_numeric_filter_pitfall(filter_expr) clauses = _split_top_level_or(filter_expr) - if len(clauses) < 2: - filter_chunkable = False + # Filter is chunkable only when there are ≥2 top-level OR clauses; + # a single clause can't be split losslessly. 
+ filter_chunkable = len(clauses) >= 2 - list_dims = _chunkable_params(args) - if not list_dims and not filter_chunkable: + if not _chunkable_params(args) and not filter_chunkable: return None if _request_bytes(build_request(**args)) <= url_limit: return None @@ -365,9 +373,10 @@ def _plan_joint( for k in candidate_counts: if filter_chunkable: - filter_chunks: list[str | None] = list(_partition_clauses(clauses, k)) - worst_filter = max(filter_chunks, key=lambda c: len(quote_plus(c or ""))) - plan_args = {**args, "filter": worst_filter} + groups = _partition_clauses(clauses, k) + worst_group = max(groups, key=lambda g: _joined_url_bytes(g, " OR ")) + filter_chunks: list[str | None] = [" OR ".join(g) for g in groups] + plan_args = {**args, "filter": " OR ".join(worst_group)} else: filter_chunks = [filter_expr] if filter_expr is not None else [None] plan_args = args @@ -382,6 +391,11 @@ def _plan_joint( continue if list_plan is None: list_plan = {} + # When there are no list dims to shrink, ``_plan_list_chunks`` + # returns ``None`` regardless of whether the request actually + # fits. Filter chunking alone has to close the gap — verify it. + if not list_plan and _request_bytes(build_request(**plan_args)) > url_limit: + continue total = _plan_total(list_plan, k) if total > max_chunks: continue diff --git a/dataretrieval/waterdata/filters.py b/dataretrieval/waterdata/filters.py index 65c489a0..5ab00869 100644 --- a/dataretrieval/waterdata/filters.py +++ b/dataretrieval/waterdata/filters.py @@ -1,16 +1,15 @@ """CQL ``filter`` support for the Water Data OGC getters. -Public names: +Public: - ``FILTER_LANG``: the type alias used for the ``filter_lang`` kwarg. 
Internal helpers used by ``chunking.multi_value_chunked``'s joint -planner: ``_split_top_level_or`` (clause partitioning), ``_chunk_cql_or`` -(legacy single-dim chunker, kept for direct callers / tests), +planner: ``_split_top_level_or`` (clause partitioning), ``_is_chunkable`` (filter-language gate), ``_check_numeric_filter_pitfall`` (the lexicographic-comparison guard), ``_combine_chunk_frames`` / -``_combine_chunk_responses`` (aggregation), and constants -``_WATERDATA_URL_BYTE_LIMIT`` / ``_CQL_FILTER_CHUNK_LEN``. +``_combine_chunk_responses`` (aggregation), and the constant +``_WATERDATA_URL_BYTE_LIMIT``. Other CQL shapes (``AND``, ``NOT``, ``LIKE``, spatial/temporal predicates, function calls) are forwarded verbatim — only top-level @@ -29,11 +28,6 @@ FILTER_LANG = Literal["cql-text", "cql-json"] -# Conservative fallback budget when ``_chunk_cql_or`` is called without -# an explicit ``max_len``. The joint planner in ``chunking.py`` computes -# a tighter per-request budget from ``_WATERDATA_URL_BYTE_LIMIT``. -_CQL_FILTER_CHUNK_LEN = 5000 - # Empirically the API replies HTTP 414 above ~8200 bytes of full URL — # matches nginx's default ``large_client_header_buffers`` of 8 KB. 8000 # leaves ~200 bytes for request-line framing and proxy variance. @@ -117,38 +111,6 @@ def _split_top_level_or(expr: str) -> list[str]: return [p for p in parts if p] -def _chunk_cql_or(expr: str, max_len: int = _CQL_FILTER_CHUNK_LEN) -> list[str]: - """Split ``expr`` into OR-chunks each under ``max_len`` characters. - - Only top-level ``OR`` chains can be recombined losslessly as a disjunction - of independent sub-queries. Returns ``[expr]`` unchanged when the whole - expression already fits, when there is no top-level ``OR``, or when any - single clause exceeds ``max_len`` (sending it as-is and surfacing the - server's 414 is clearer than silently dropping data). 
- """ - if len(expr) <= max_len: - return [expr] - parts = _split_top_level_or(expr) - if len(parts) < 2 or any(len(p) > max_len for p in parts): - return [expr] - - chunks = [] - current: list[str] = [] - current_len = 0 - for part in parts: - join_cost = len(" OR ") if current else 0 - if current and current_len + join_cost + len(part) > max_len: - chunks.append(" OR ".join(current)) - current = [part] - current_len = len(part) - else: - current.append(part) - current_len += join_cost + len(part) - if current: - chunks.append(" OR ".join(current)) - return chunks - - def _check_numeric_filter_pitfall(filter_expr: str) -> None: """Raise if the filter pairs a field with an unquoted numeric literal. diff --git a/tests/waterdata_filters_test.py b/tests/waterdata_filters_test.py index 545f7039..9d9d183e 100644 --- a/tests/waterdata_filters_test.py +++ b/tests/waterdata_filters_test.py @@ -7,11 +7,7 @@ import pytest from dataretrieval.waterdata.filters import ( - _CQL_FILTER_CHUNK_LEN, - _WATERDATA_URL_BYTE_LIMIT, _check_numeric_filter_pitfall, - _chunk_cql_or, - _effective_filter_budget, _split_top_level_or, ) from dataretrieval.waterdata.utils import _construct_api_requests @@ -35,11 +31,6 @@ def _fake_response(url="https://example.test", elapsed_ms=1): ) -def _build_request(**kwargs): - """Wrapper that matches the ``build_request`` callable shape.""" - return _construct_api_requests(**kwargs) - - def test_construct_filter_passthrough(): """`filter` is forwarded verbatim as a query parameter.""" expr = ( @@ -113,35 +104,6 @@ def test_split_top_level_or_single_clause(): ] -def test_chunk_cql_or_short_passthrough(): - expr = "time >= '2023-01-01T00:00:00Z'" - assert _chunk_cql_or(expr, max_len=1000) == [expr] - - -def test_chunk_cql_or_splits_into_multiple(): - clause = "(time >= '2023-01-01T00:00:00Z' AND time <= '2023-01-01T00:30:00Z')" - expr = " OR ".join([clause] * 200) - chunks = _chunk_cql_or(expr, max_len=1000) - # each chunk must be under the budget - assert 
all(len(c) <= 1000 for c in chunks) - # rejoined chunks must cover every clause - rejoined_clauses = sum(len(c.split(" OR ")) for c in chunks) - assert rejoined_clauses == 200 - # and must be a valid OR chain (each chunk is itself a top-level OR of clauses) - assert len(chunks) > 1 - - -def test_chunk_cql_or_unsplittable_returns_input(): - big = "value > 0 AND " + ("A " * 4000) - assert _chunk_cql_or(big, max_len=1000) == [big] - - -def test_chunk_cql_or_single_clause_over_budget_returns_input(): - huge_clause = "(value > " + "9" * 6000 + ")" - expr = f"{huge_clause} OR (value > 0)" - assert _chunk_cql_or(expr, max_len=1000) == [expr] - - @pytest.mark.parametrize( "service", [ @@ -167,41 +129,47 @@ def test_construct_filter_on_all_ogc_services(service): assert qs["filter-lang"] == ["cql-text"] -def test_long_filter_fans_out_into_multiple_requests(): - """An oversized top-level OR filter triggers multiple HTTP requests - whose results are concatenated.""" - from dataretrieval.waterdata import get_continuous - +def _filter_chunking_clauses(n: int = 300) -> str: + """Stock long filter used by the end-to-end fan-out tests below.""" clause = ( "(time >= '2023-01-{day:02d}T00:00:00Z' " "AND time <= '2023-01-{day:02d}T00:30:00Z')" ) - expr = " OR ".join(clause.format(day=(i % 28) + 1) for i in range(300)) - assert len(expr) > _CQL_FILTER_CHUNK_LEN + return " OR ".join(clause.format(day=(i % 28) + 1) for i in range(n)) - sent_filters = [] - def fake_construct_api_requests(**kwargs): - sent_filters.append(kwargs.get("filter")) - return _fake_prepared_request() +def _filter_size_aware_build(**kwargs): + """Fake ``_construct_api_requests`` whose returned URL length scales + with the request's ``filter`` value, so the joint planner naturally + triggers chunking on long filters.""" + return _fake_prepared_request( + url=f"https://example.test/?filter={kwargs.get('filter', '')}", + ) - def fake_walk_pages(*_args, **_kwargs): + +def 
test_long_filter_fans_out_into_multiple_requests(): + """An oversized top-level OR filter triggers multiple HTTP + sub-requests via the joint planner; every original clause is + preserved across sub-requests; results concatenate to one row per + sub-request given the one-row-per-chunk mock.""" + from dataretrieval.waterdata import get_continuous + + expr = _filter_chunking_clauses() + sent_filters: list[str] = [] + + def fake_walk_pages(*, geopd, req): idx = len(sent_filters) - frame = pd.DataFrame({"id": [f"chunk-{idx}"], "value": [idx]}) - return frame, _fake_response() + sent_filters.append(_query_params(req).get("filter", [None])[0]) + return pd.DataFrame({"id": [f"chunk-{idx}"], "value": [idx]}), _fake_response() with ( mock.patch( "dataretrieval.waterdata.utils._construct_api_requests", - side_effect=fake_construct_api_requests, + side_effect=_filter_size_aware_build, ), mock.patch( "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages ), - mock.patch( - "dataretrieval.waterdata.filters._effective_filter_budget", - return_value=_CQL_FILTER_CHUNK_LEN, - ), ): df, _ = get_continuous( monitoring_location_id="USGS-07374525", @@ -210,51 +178,38 @@ def fake_walk_pages(*_args, **_kwargs): filter_lang="cql-text", ) - # Mocking _effective_filter_budget bypasses the URL-length probe, so - # sent_filters contains only real chunk requests. Assert invariants: - # chunking happened, every original clause is preserved exactly once - # in order, each chunk stays under the budget, and the mock's - # one-row-per-chunk responses concatenate to a row per chunk. 
expected_parts = _split_top_level_or(expr) assert len(sent_filters) > 1 - rejoined_parts = [] + rejoined_parts: list[str] = [] for chunk in sent_filters: rejoined_parts.extend(_split_top_level_or(chunk)) assert rejoined_parts == expected_parts assert len(df) == len(sent_filters) - assert all(len(chunk) <= _CQL_FILTER_CHUNK_LEN for chunk in sent_filters) def test_long_filter_deduplicates_cross_chunk_overlap(): - """Features returned by multiple chunks (same feature `id`) are - deduplicated in the concatenated result.""" + """Features returned by multiple sub-requests with the same ``id`` + are deduplicated in the concatenated result.""" from dataretrieval.waterdata import get_continuous - clause = ( - "(time >= '2023-01-{day:02d}T00:00:00Z' " - "AND time <= '2023-01-{day:02d}T00:30:00Z')" - ) - expr = " OR ".join(clause.format(day=(i % 28) + 1) for i in range(300)) - + expr = _filter_chunking_clauses() call_count = {"n": 0} def fake_walk_pages(*_args, **_kwargs): call_count["n"] += 1 - frame = pd.DataFrame({"id": ["shared-feature"], "value": [1]}) - return frame, _fake_response() + return ( + pd.DataFrame({"id": ["shared-feature"], "value": [1]}), + _fake_response(), + ) with ( mock.patch( "dataretrieval.waterdata.utils._construct_api_requests", - return_value=_fake_prepared_request(), + side_effect=_filter_size_aware_build, ), mock.patch( "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages ), - mock.patch( - "dataretrieval.waterdata.filters._effective_filter_budget", - return_value=_CQL_FILTER_CHUNK_LEN, - ), ): df, _ = get_continuous( monitoring_location_id="USGS-07374525", @@ -263,56 +218,46 @@ def fake_walk_pages(*_args, **_kwargs): filter_lang="cql-text", ) - # Chunking must have happened (otherwise dedup wouldn't be exercised). - assert call_count["n"] > 1 - # Even though each chunk returned a feature, dedup by id collapses them. 
- assert len(df) == 1 + assert call_count["n"] > 1 # chunking must have happened + assert len(df) == 1 # dedup by ``id`` collapses the duplicates def test_empty_chunks_do_not_downgrade_geodataframe(): - """A mix of empty and non-empty chunk responses must not downgrade a - GeoDataFrame-typed result to a plain DataFrame. ``_get_resp_data`` - returns ``pd.DataFrame()`` on empty responses, which would otherwise - strip geometry/CRS from the concatenated output.""" + """A mix of empty and non-empty sub-request responses must not + downgrade a GeoDataFrame-typed result to a plain DataFrame. + ``_get_resp_data`` returns ``pd.DataFrame()`` on empty responses, + which would otherwise strip geometry/CRS from the concatenated + output.""" pytest.importorskip("geopandas") import geopandas as gpd from shapely.geometry import Point from dataretrieval.waterdata import get_continuous - clause = ( - "(time >= '2023-01-{day:02d}T00:00:00Z' " - "AND time <= '2023-01-{day:02d}T00:30:00Z')" - ) - expr = " OR ".join(clause.format(day=(i % 28) + 1) for i in range(300)) - + expr = _filter_chunking_clauses() call_count = {"n": 0} def fake_walk_pages(*_args, **_kwargs): call_count["n"] += 1 - # Chunk 2 returns empty; chunks 1 and 3 return GeoDataFrames. 
if call_count["n"] == 2: - frame = pd.DataFrame() - else: - frame = gpd.GeoDataFrame( + return pd.DataFrame(), _fake_response() + return ( + gpd.GeoDataFrame( {"id": [f"feat-{call_count['n']}"], "value": [call_count["n"]]}, geometry=[Point(call_count["n"], call_count["n"])], crs="EPSG:4326", - ) - return frame, _fake_response() + ), + _fake_response(), + ) with ( mock.patch( "dataretrieval.waterdata.utils._construct_api_requests", - return_value=_fake_prepared_request(), + side_effect=_filter_size_aware_build, ), mock.patch( "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages ), - mock.patch( - "dataretrieval.waterdata.filters._effective_filter_budget", - return_value=_CQL_FILTER_CHUNK_LEN, - ), ): df, _ = get_continuous( monitoring_location_id="USGS-07374525", @@ -321,119 +266,11 @@ def fake_walk_pages(*_args, **_kwargs): filter_lang="cql-text", ) - # The empty chunk must not have stripped the GeoDataFrame type. assert isinstance(df, gpd.GeoDataFrame) assert "geometry" in df.columns assert df.crs is not None -def test_effective_filter_budget_respects_url_limit(): - """The computed budget, once encoded, fits within the URL byte limit - alongside the other query params.""" - from urllib.parse import quote_plus - - filter_expr = "(time >= '2023-01-15T00:00:00Z' AND time <= '2023-01-15T00:30:00Z')" - args = { - "service": "continuous", - "monitoring_location_id": "USGS-02238500", - "parameter_code": "00060", - "filter": filter_expr, - "filter_lang": "cql-text", - } - raw_budget = _effective_filter_budget(args, filter_expr, _build_request) - - # Build a chunk exactly at the raw budget (padded with the clause repeated) - # and confirm the full URL it produces stays under the URL byte limit. - padded = (" OR ".join([filter_expr] * 200))[:raw_budget] - req = _construct_api_requests(**{**args, "filter": padded}) - assert len(req.url) <= _WATERDATA_URL_BYTE_LIMIT - # And the budget scales inversely with encoding ratio (sanity). 
- assert raw_budget < _WATERDATA_URL_BYTE_LIMIT - # Quick sanity on the encoding math itself. - assert len(quote_plus(padded)) <= _WATERDATA_URL_BYTE_LIMIT - - -def test_effective_filter_budget_uses_max_clause_ratio(): - """Heavy clauses clustered in one part of the filter must not be able - to push any chunk over the URL limit. The budget is computed against - the max per-clause encoding ratio, not the whole-filter average, so - a chunk of only-heaviest-clauses still fits.""" - from urllib.parse import quote_plus - - heavy = ( - "(time >= '2023-01-15T00:00:00Z' AND time <= '2023-01-15T00:30:00Z' " - "AND approval_status IN ('Approved','Provisional','Revised'))" - ) - light = "(time >= '2023-01-15T00:00:00Z' AND time <= '2023-01-15T00:30:00Z')" - # Heavy ratio < light ratio for these shapes; cluster them at opposite - # ends so the chunker must produce at least one light-only chunk. - clauses = [heavy] * 100 + [light] * 400 - expr = " OR ".join(clauses) - args = { - "service": "continuous", - "monitoring_location_id": "USGS-02238500", - "filter": expr, - "filter_lang": "cql-text", - } - budget = _effective_filter_budget(args, expr, _build_request) - chunks = _chunk_cql_or(expr, max_len=budget) - assert len(chunks) > 1 - - # Every chunk, once built into a full request, fits under the URL byte - # limit — even the all-light chunks that have a higher-than-average ratio. - for chunk in chunks: - req = _construct_api_requests(**{**args, "filter": chunk}) - assert len(req.url) <= _WATERDATA_URL_BYTE_LIMIT, ( - f"chunk url {len(req.url)} exceeds {_WATERDATA_URL_BYTE_LIMIT}" - ) - - # Budget should be tight enough that a chunk of only-light clauses - # (the heavier-encoding shape here) still fits. - assert len(quote_plus(light)) * (budget // len(light)) < _WATERDATA_URL_BYTE_LIMIT - - -def test_effective_filter_budget_passes_through_when_no_url_space(): - """If the non-filter URL already exceeds the byte limit, chunking - cannot make the request succeed. 
The budget helper should signal - pass-through (return a budget larger than the filter) so - ``_chunk_cql_or`` emits one chunk — one 414 from the server is - clearer than a burst of N guaranteed-414 sub-requests.""" - expr = " OR ".join( - ["(time >= '2023-01-15T00:00:00Z' AND time <= '2023-01-15T00:30:00Z')"] * 50 - ) - fake_build = mock.Mock( - return_value=_fake_prepared_request(url="https://example.test/" + "A" * 9000) - ) - budget = _effective_filter_budget({"filter": expr}, expr, fake_build) - # Budget is large enough that _chunk_cql_or returns the expression - # unchanged (passthrough) rather than producing many small chunks. - assert budget > len(expr) - assert _chunk_cql_or(expr, max_len=budget) == [expr] - - -def test_effective_filter_budget_shrinks_with_more_url_params(): - """Adding more scalar query params consumes URL bytes and should - shrink the raw filter budget accordingly. Use a filter large enough - to skip the short-circuit fast path so the probe actually runs.""" - clause = "(time >= '2023-01-15T00:00:00Z' AND time <= '2023-01-15T00:30:00Z')" - expr = " OR ".join([clause] * 100) - sparse_args = { - "service": "continuous", - "monitoring_location_id": "USGS-02238500", - "filter": expr, - "filter_lang": "cql-text", - } - dense_args = { - **sparse_args, - "parameter_code": "00060", - "statistic_id": "00003", - "last_modified": "2023-01-01T00:00:00Z/2023-12-31T23:59:59Z", - } - sparse_budget = _effective_filter_budget(sparse_args, expr, _build_request) - dense_budget = _effective_filter_budget(dense_args, expr, _build_request) - assert dense_budget < sparse_budget - - def test_cql_json_filter_is_not_chunked(): """Chunking applies only to cql-text; cql-json is passed through unchanged.""" from dataretrieval.waterdata import get_continuous diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 12b2e3b5..4a01457b 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -263,18 +263,21 @@ def _fake_build(*, base=200, 
**kwargs): def test_partition_clauses_balanced(): - """k roughly-balanced groups, joined by ` OR `, distributing the - remainder across the first groups.""" + """k roughly-balanced groups of atoms, with the remainder distributed + across the first groups so no group differs from another by more + than one clause. Returns raw groups (not joined) so the planner can + size candidates without materializing discarded partitions.""" clauses = ["a='1'", "b='2'", "c='3'", "d='4'", "e='5'"] - assert _partition_clauses(clauses, 1) == [" OR ".join(clauses)] + assert _partition_clauses(clauses, 1) == [clauses] # 5 clauses into 2 groups → sizes 3, 2 (remainder lands in the first). assert _partition_clauses(clauses, 2) == [ - "a='1' OR b='2' OR c='3'", - "d='4' OR e='5'", + ["a='1'", "b='2'", "c='3'"], + ["d='4'", "e='5'"], ] - assert _partition_clauses(clauses, 5) == clauses + # Singletons. + assert _partition_clauses(clauses, 5) == [[c] for c in clauses] # k > len(clauses) → singletons (don't synthesize empty groups). - assert _partition_clauses(clauses, 99) == clauses + assert _partition_clauses(clauses, 99) == [[c] for c in clauses] def test_filter_chunk_counts_powers_of_two_plus_n(): From 4e82722e7f50eaa6210c3db2d9b49c246795ad12 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 12:49:07 -0500 Subject: [PATCH 05/17] refactor(waterdata): Tighten the joint chunker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three small extractions and one minor optimization. No behavior change; 130 chunker/filter tests stay green. _iter_sub_args generator yields per-sub-request args dicts; the wrapper's nested-loop-with-manual-counter collapses to ``for i, sub_args in enumerate(...)``. The "is this the last sub-request" branch in the quota-floor check flips to ``if i == total - 1: continue`` so the gate is a guard clause rather than the body of an inverse condition. 
_finalize_response folds the ``_combine_chunk_responses(responses); response.url = canonical_url`` pattern (used in both the success path and the QuotaExhausted partial-state payload) into one helper. _filter_candidates generator emits ``(filter_chunks, worst_filter)`` pairs for each candidate filter chunk count; ``_plan_joint`` then iterates candidates uniformly without the ``if filter_chunkable: ... else: ...`` fork. The redundant ``filter_chunkable`` flag is gone — ``len(clauses) >= 2`` is the single truth. Per-iteration optimization: ``{**args, **list_overrides}`` was being recomputed for every filter chunk; now built once per outer combo and reused (or shallow-overridden when a filter substitution applies). Module constants ``_LIST_SEP = ","`` and ``_OR_SEP = " OR "`` replace the scattered string literals — the two chunking dimensions are now self-documenting at every call site that sizes them. --- dataretrieval/waterdata/chunking.py | 156 +++++++++++++++++----------- 1 file changed, 98 insertions(+), 58 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index 4294a1d6..213ca092 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -34,7 +34,7 @@ import functools import itertools import math -from collections.abc import Callable +from collections.abc import Callable, Iterator from typing import Any from urllib.parse import quote_plus @@ -98,6 +98,14 @@ # so a missing/malformed header doesn't trigger spurious aborts. _QUOTA_UNKNOWN = 10**9 +# Separators the two chunking dimensions use to compose their atoms +# into URL-encoded blobs. List dims comma-join values +# (``site=USGS-A,USGS-B``); filter dims OR-join clauses +# (``filter=a='1' OR a='2'``). Pinned as constants so the URL-byte +# sizing helper and the partition logic agree on the join shape. +_LIST_SEP = "," +_OR_SEP = " OR " + class RequestTooLarge(ValueError): """Raised when a chunked request cannot be issued. 
Either the URL @@ -214,7 +222,7 @@ def _worst_case_args( in by the caller).""" out = dict(base_args) for k, chunks in list_plan.items(): - out[k] = max(chunks, key=lambda c: _joined_url_bytes(c, ",")) + out[k] = max(chunks, key=lambda c: _joined_url_bytes(c, _LIST_SEP)) return out @@ -282,7 +290,9 @@ def _plan_list_chunks( if len(chunk) > 1 ) biggest = max( - splittable, key=lambda t: _joined_url_bytes(t[2], ","), default=None + splittable, + key=lambda t: _joined_url_bytes(t[2], _LIST_SEP), + default=None, ) if biggest is None: raise RequestTooLarge( @@ -320,6 +330,32 @@ def _filter_chunk_counts(n_clauses: int) -> list[int]: return counts +def _filter_candidates( + clauses: list[str], original_filter: str | None +) -> Iterator[tuple[list[str | None], str | None]]: + """Yield ``(filter_chunks, worst_filter)`` for each candidate filter + chunk count. ``filter_chunks`` is the list of OR-joined sub-filters + the wrapper will substitute one at a time; ``worst_filter`` is the + longest URL-encoded chunk, used by the planner to size the list dims + against the most demanding sub-request. 
+ + Falls through to a single ``(filter_chunks=[original_filter], None)`` + candidate when the filter has no top-level OR splits (single clause, + cql-json, missing filter): the wrapper still iterates that one + "chunk" but the planner can skip substituting a filter into the + URL probe.""" + if len(clauses) < 2: + chunks: list[str | None] = ( + [original_filter] if original_filter is not None else [None] + ) + yield chunks, None + return + for k in _filter_chunk_counts(len(clauses)): + groups = _partition_clauses(clauses, k) + worst = max(groups, key=lambda g: _joined_url_bytes(g, _OR_SEP)) + yield [_OR_SEP.join(g) for g in groups], _OR_SEP.join(worst) + + def _plan_joint( args: dict[str, Any], build_request: Callable[..., Any], @@ -358,42 +394,30 @@ def _plan_joint( if _is_chunkable(filter_expr, args.get("filter_lang")): _check_numeric_filter_pitfall(filter_expr) clauses = _split_top_level_or(filter_expr) - # Filter is chunkable only when there are ≥2 top-level OR clauses; - # a single clause can't be split losslessly. 
- filter_chunkable = len(clauses) >= 2 - if not _chunkable_params(args) and not filter_chunkable: + if not _chunkable_params(args) and len(clauses) < 2: return None if _request_bytes(build_request(**args)) <= url_limit: return None - candidate_counts = _filter_chunk_counts(len(clauses)) if filter_chunkable else [1] best: tuple[int, dict[str, list[list[Any]]], list[str | None]] | None = None last_error: RequestTooLarge | None = None - for k in candidate_counts: - if filter_chunkable: - groups = _partition_clauses(clauses, k) - worst_group = max(groups, key=lambda g: _joined_url_bytes(g, " OR ")) - filter_chunks: list[str | None] = [" OR ".join(g) for g in groups] - plan_args = {**args, "filter": " OR ".join(worst_group)} - else: - filter_chunks = [filter_expr] if filter_expr is not None else [None] - plan_args = args - - per_filter_cap = max(1, max_chunks // k) + for filter_chunks, worst_filter in _filter_candidates(clauses, filter_expr): + k = len(filter_chunks) + plan_args = args if worst_filter is None else {**args, "filter": worst_filter} try: list_plan = _plan_list_chunks( - plan_args, build_request, url_limit, per_filter_cap + plan_args, build_request, url_limit, max(1, max_chunks // k) ) except RequestTooLarge as exc: last_error = exc continue if list_plan is None: list_plan = {} - # When there are no list dims to shrink, ``_plan_list_chunks`` - # returns ``None`` regardless of whether the request actually - # fits. Filter chunking alone has to close the gap — verify it. + # ``_plan_list_chunks`` returns ``None`` when no list dims are + # chunkable, regardless of whether the request actually fits. + # Filter chunking alone has to close the gap — verify it. 
if not list_plan and _request_bytes(build_request(**plan_args)) > url_limit: continue total = _plan_total(list_plan, k) @@ -425,6 +449,34 @@ def _read_remaining(response: requests.Response) -> int: return _QUOTA_UNKNOWN +def _iter_sub_args( + args: dict[str, Any], + list_plan: dict[str, list[list[Any]]], + filter_chunks: list[str | None], +) -> Iterator[dict[str, Any]]: + """Yield the substituted ``args`` for each sub-request in the joint + plan, in deterministic order: list-dim cartesian product (insertion + order, Python 3.7+ guarantee) crossed with filter chunks.""" + list_keys = list(list_plan) + list_combos = ( + itertools.product(*(list_plan[k] for k in list_keys)) if list_plan else [()] + ) + for combo in list_combos: + base = {**args, **dict(zip(list_keys, combo))} + for filter_chunk in filter_chunks: + yield base if filter_chunk is None else {**base, "filter": filter_chunk} + + +def _finalize_response( + responses: list[requests.Response], canonical_url: str +) -> requests.Response: + """Aggregate per-sub-request responses and restore the canonical + URL representing the user's full original query.""" + combined = _combine_chunk_responses(responses) + combined.url = canonical_url + return combined + + def multi_value_chunked( *, build_request: Callable[..., Any], @@ -481,44 +533,32 @@ def wrapper( list_plan, filter_chunks = plan canonical_url = build_request(**args).url - - list_keys = list(list_plan) - list_combos = ( - list(itertools.product(*(list_plan[k] for k in list_keys))) - if list_plan - else [()] - ) - total = len(list_combos) * len(filter_chunks) + total = _plan_total(list_plan, len(filter_chunks)) frames: list[pd.DataFrame] = [] responses: list[requests.Response] = [] - i = 0 - for combo in list_combos: - for filter_chunk in filter_chunks: - sub_args = dict(args) - sub_args.update(zip(list_keys, combo)) - if filter_chunk is not None: - sub_args["filter"] = filter_chunk - frame, response = fetch_once(sub_args) - frames.append(frame) - 
responses.append(response) - if i < total - 1: - remaining = _read_remaining(response) - if remaining < floor: - partial = _combine_chunk_responses(responses) - partial.url = canonical_url - raise QuotaExhausted( - partial_frame=_combine_chunk_frames(frames), - partial_response=partial, - completed_chunks=i + 1, - total_chunks=total, - remaining=remaining, - ) - i += 1 - - combined = _combine_chunk_responses(responses) - combined.url = canonical_url - return _combine_chunk_frames(frames), combined + for i, sub_args in enumerate( + _iter_sub_args(args, list_plan, filter_chunks) + ): + frame, response = fetch_once(sub_args) + frames.append(frame) + responses.append(response) + if i == total - 1: + continue # last chunk; no next sub-request to gate + remaining = _read_remaining(response) + if remaining < floor: + raise QuotaExhausted( + partial_frame=_combine_chunk_frames(frames), + partial_response=_finalize_response(responses, canonical_url), + completed_chunks=i + 1, + total_chunks=total, + remaining=remaining, + ) + + return ( + _combine_chunk_frames(frames), + _finalize_response(responses, canonical_url), + ) return wrapper # type: ignore[return-value] From ee550bec612a11da87774b529edaab7c09fe6174 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 12:54:18 -0500 Subject: [PATCH 06/17] =?UTF-8?q?refactor(waterdata):=20Polish=20=E2=80=94?= =?UTF-8?q?=20extract=20=5Fresolve=5Fmax=5Fchunks,=20tidy=20iter=5Fsub=5Fa?= =?UTF-8?q?rgs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three micro-refinements after the previous pass settles. No behavior change; 130 tests stay green. - Extract _resolve_max_chunks() so the default + validation rule for ``max_chunks`` lives in one place, called from both _plan_list_chunks and _plan_joint. The 5-line if-None/if-<1 block was duplicated verbatim. 
- _iter_sub_args drops its explicit ``list_keys = list(list_plan)`` cache; iterating ``list_plan`` directly gives the same insertion-order sequence (Python 3.7+ dict guarantee), and ``zip(list_plan, combo)`` reads as "pair each list-dim name with its chunk for this combo". - Tighten the wrapper's option resolution to the "default if None else passed" form so each line reads in argument order. - Categorize the _NEVER_CHUNK comment so future additions land in the right category instead of a flat narrative. --- dataretrieval/waterdata/chunking.py | 68 ++++++++++++++--------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index 213ca092..87ccb009 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -51,14 +51,16 @@ _split_top_level_or, ) -# Params the multi-value chunker must NOT split. ``properties`` defines -# the response schema; chunking it would shard columns. ``bbox`` is a -# fixed 4-element coord tuple. Date params are intervals not sets. -# ``filter`` has its own OR-clause partitioning inside ``_plan_joint``; -# treating it as a multi-value list would emit malformed CQL. -# ``filter_lang`` is a scalar string. ``limit`` and ``skip_geometry`` -# are scalars by contract (smuggling a list through type erasure would -# silently fan into queries with conflicting per-request semantics). +# Params the multi-value chunker must NOT comma-join across sub-requests. +# Categorized to make additions intentional: +# - response shape: ``properties`` defines columns; sharding would +# give different columns per chunk. +# - structured: ``bbox`` is a fixed 4-element coord tuple. +# - intervals: date/time ranges are not enumerable sets. +# - handled elsewhere: ``filter`` gets OR-clause partitioning in +# ``_plan_joint``; multi-value semantics would +# emit malformed CQL. +# - scalar by contract: ``limit``, ``skip_geometry``, ``filter_lang``. 
_NEVER_CHUNK = frozenset( { "properties", @@ -213,6 +215,19 @@ def _plan_total(list_plan: dict[str, list[list[Any]]], n_filter_chunks: int) -> return list_count * n_filter_chunks +def _resolve_max_chunks(max_chunks: int | None) -> int: + """Apply ``max_chunks`` default + validation. Both planner entry + points share this rule, so pinning it here keeps them in sync.""" + if max_chunks is None: + return _DEFAULT_MAX_CHUNKS + if max_chunks < 1: + raise ValueError( + f"max_chunks must be >= 1; got {max_chunks}. Zero or negative " + f"values would silently bypass the cap." + ) + return max_chunks + + def _worst_case_args( base_args: dict[str, Any], list_plan: dict[str, list[list[Any]]] ) -> dict[str, Any]: @@ -263,13 +278,7 @@ def _plan_list_chunks( when the smallest reducible plan still doesn't fit or when the plan would exceed ``max_chunks`` sub-requests. """ - if max_chunks is None: - max_chunks = _DEFAULT_MAX_CHUNKS - if max_chunks < 1: - raise ValueError( - f"max_chunks must be >= 1; got {max_chunks}. Zero or negative " - f"values would silently bypass the cap." - ) + max_chunks = _resolve_max_chunks(max_chunks) chunkable = _chunkable_params(args) if not chunkable: return None @@ -381,13 +390,7 @@ def _plan_joint( Raises ``RequestTooLarge`` when no candidate fits or the best plan would exceed ``max_chunks``. """ - if max_chunks is None: - max_chunks = _DEFAULT_MAX_CHUNKS - if max_chunks < 1: - raise ValueError( - f"max_chunks must be >= 1; got {max_chunks}. Zero or negative " - f"values would silently bypass the cap." 
- ) + max_chunks = _resolve_max_chunks(max_chunks) filter_expr = args.get("filter") clauses: list[str] = [] @@ -455,14 +458,11 @@ def _iter_sub_args( filter_chunks: list[str | None], ) -> Iterator[dict[str, Any]]: """Yield the substituted ``args`` for each sub-request in the joint - plan, in deterministic order: list-dim cartesian product (insertion - order, Python 3.7+ guarantee) crossed with filter chunks.""" - list_keys = list(list_plan) - list_combos = ( - itertools.product(*(list_plan[k] for k in list_keys)) if list_plan else [()] - ) + plan, in deterministic order: list-dim cartesian product (dict + insertion order — Python 3.7+ guarantee) crossed with filter chunks.""" + list_combos = itertools.product(*list_plan.values()) if list_plan else [()] for combo in list_combos: - base = {**args, **dict(zip(list_keys, combo))} + base = {**args, **dict(zip(list_plan, combo))} for filter_chunk in filter_chunks: yield base if filter_chunk is None else {**base, "filter": filter_chunk} @@ -518,14 +518,12 @@ def wrapper( args: dict[str, Any], ) -> tuple[pd.DataFrame, requests.Response]: limit = ( - url_limit - if url_limit is not None - else filters._WATERDATA_URL_BYTE_LIMIT + filters._WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit ) floor = ( - quota_safety_floor - if quota_safety_floor is not None - else _DEFAULT_QUOTA_SAFETY_FLOOR + _DEFAULT_QUOTA_SAFETY_FLOOR + if quota_safety_floor is None + else quota_safety_floor ) plan = _plan_joint(args, build_request, limit, max_chunks) if plan is None: From f1588ae6c8a1e58c8ae01bb68ea0c814ca1c939c Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 13:00:23 -0500 Subject: [PATCH 07/17] docs(waterdata): Frame _NEVER_CHUNK as exceptions to a default-chunk rule After investigating: the OGC getters expose ~94 list-shaped params, all chunkable. The current 13-entry denylist captures every exception. An allowlist would be ~7x longer and would need updating every time USGS adds a column. 
Reframe the comment to state the default rule first ("any list-shaped kwarg gets chunked"), then enumerate the exceptions by reason (response-shaping, structured, intervals, handled-elsewhere, scalar-by- contract). Reads as "here are the few cases the default-chunk rule doesn't apply" rather than "here is an arbitrary exclusion set." --- dataretrieval/waterdata/chunking.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index 87ccb009..9acc1f6c 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -51,16 +51,21 @@ _split_top_level_or, ) -# Params the multi-value chunker must NOT comma-join across sub-requests. -# Categorized to make additions intentional: -# - response shape: ``properties`` defines columns; sharding would -# give different columns per chunk. +# Default rule: any list-shaped kwarg with >1 element is chunked across +# sub-requests — each chunk becomes a comma-joined sub-list in the URL. +# The OGC getters expose ~90 such list-shaped params (IDs, codes, +# statuses, ...), all chunkable, so it's shorter to enumerate the +# exceptions than to maintain an allowlist that grows with the API. +# Exceptions, by reason: +# - response shape: ``properties`` defines the columns; sharding +# would yield different schemas per chunk. # - structured: ``bbox`` is a fixed 4-element coord tuple. # - intervals: date/time ranges are not enumerable sets. # - handled elsewhere: ``filter`` gets OR-clause partitioning in -# ``_plan_joint``; multi-value semantics would -# emit malformed CQL. -# - scalar by contract: ``limit``, ``skip_geometry``, ``filter_lang``. +# ``_plan_joint``; comma-joining CQL clauses +# would emit malformed expressions. +# - scalar by contract: ``limit``, ``skip_geometry``, ``filter_lang`` +# — a list value would be a type-erasure smuggle. 
_NEVER_CHUNK = frozenset( { "properties", From 493e4eb93a4db9e08da46352296e90be119a4913 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 13:13:18 -0500 Subject: [PATCH 08/17] test(waterdata): Add offline stress test for the joint chunker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standalone runner (``python3 tests/stress_chunker.py``) that exercises the chunker across eight scenarios with the URL byte limit lowered well below the live API's. No live HTTP — mocks fetch_once and uses the real _construct_api_requests for URL sizing. Per-scenario invariants verified: 1. Every sub-request URL ≤ url_limit (primary correctness). 2. List-dim coverage: the union of distinct chunks issued for each list dim equals the input with no overlap (no data dropped, no duplicate fetches of the same atom within its dim). 3. Filter-clause coverage: the distinct filter chunks split back into clauses, concatenated in iteration order, equal the original clauses (lossless OR-disjunction). 4. Speedup vs the bail-floor-singleton baseline that the old two- decorator design would have produced in pathological cases. Plus a greedy-search adaptation check: scanning ``url_limit`` across 1200 → 10000 confirms sub-request count is monotonically non-increasing as the budget grows (the planner adapts to the limit). Scenarios: A. Long sites only (pure list chunking) B. Long filter only (pure filter chunking) C. Long sites + long filter (joint trade-off — 1000× vs baseline) D. 3-D list cartesian product (3000× vs baseline) E. Lopsided clause sizes (worst-case sizing) F. URL-encoding-heavy clauses (quote_plus inflation) G. Very tight URL limit (singleton chunks) H. Generous URL limit (no chunking needed) I. url_limit sweep proving greedy adaptation All 15 chunked calls pass every invariant. 
--- tests/stress_chunker.py | 430 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 430 insertions(+) create mode 100644 tests/stress_chunker.py diff --git a/tests/stress_chunker.py b/tests/stress_chunker.py new file mode 100644 index 00000000..9efee974 --- /dev/null +++ b/tests/stress_chunker.py @@ -0,0 +1,430 @@ +"""Offline stress test for the joint chunker. + +Runs the multi_value_chunked decorator across a battery of scenarios +with the URL byte limit lowered well below the live API's, captures +every sub-request the decorator emits, and verifies four invariants +per scenario: + + 1. Every sub-request URL stays under the configured ``url_limit`` + (the chunker's primary correctness contract). + 2. Cartesian-product coverage: the union of values seen for each list + dim equals the user's original input list (no data dropped). + 3. Disjointness: no input value appears in more than one chunk for + its dim (no duplicates within a list dim's sub-requests). + 4. Filter-clause coverage: when the filter is OR-chunked, the union + of clauses across sub-requests equals the original clause set in + order (lossless OR-disjunction). + +Plus a greedy-quality check: for the "long filter + long lists" +scenario, the joint planner should comfortably beat the bail-floor-only +baseline (the worst case the old two-decorator design produced). + +Run: ``python3 tests/stress_chunker.py``. 
+""" + +from __future__ import annotations + +import sys +from collections.abc import Iterable +from datetime import timedelta +from types import SimpleNamespace +from urllib.parse import parse_qsl, urlsplit + +import pandas as pd + +from dataretrieval.waterdata import chunking +from dataretrieval.waterdata.chunking import ( + multi_value_chunked, +) +from dataretrieval.waterdata.utils import _construct_api_requests + +# ────────────────────────────────────────────────────────────────────── +# Harness +# ────────────────────────────────────────────────────────────────────── + + +def _mock_response() -> SimpleNamespace: + return SimpleNamespace( + elapsed=timedelta(milliseconds=1), + headers={}, + url="https://example.test", + body=None, + ) + + +def run_chunked( + args: dict, url_limit: int +) -> tuple[list[dict], pd.DataFrame, SimpleNamespace]: + """Execute the chunker end-to-end with a captured ``fetch_once`` and + return (sub_args_seen, combined_frame, combined_response).""" + sub_args_seen: list[dict] = [] + + @multi_value_chunked( + build_request=_construct_api_requests, + url_limit=url_limit, + ) + def fetch_once(args: dict) -> tuple[pd.DataFrame, SimpleNamespace]: + sub_args_seen.append(dict(args)) + # Stable per-sub-request id so dedup logic doesn't collapse rows. + idx = len(sub_args_seen) + frame = pd.DataFrame({"id": [f"row-{idx}"], "value": [idx]}) + return frame, _mock_response() + + df, response = fetch_once(args) + return sub_args_seen, df, response + + +def _query_params(url: str) -> dict[str, str]: + return dict(parse_qsl(urlsplit(url).query, keep_blank_values=True)) + + +def _bail_floor_baseline(args: dict, url_limit: int) -> int: + """Lower-bound on what a singleton-list + singleton-clause plan + would produce — the worst case the old two-decorator design could + have hit. Used as the optimization-quality bar for the joint + planner: it should beat this comfortably. 
+ + Counts ``product(per-dim length) × n_clauses`` for the inputs that + are eligible to chunk.""" + list_dims_card = 1 + for k, v in args.items(): + if k in chunking._NEVER_CHUNK: + continue + if isinstance(v, (list, tuple)) and len(v) > 1: + list_dims_card *= len(v) + n_clauses = 1 + if isinstance(args.get("filter"), str): + clauses = chunking._split_top_level_or(args["filter"]) + n_clauses = max(len(clauses), 1) + # baseline = worst-case singletons in every dim + return list_dims_card * n_clauses + + +# ────────────────────────────────────────────────────────────────────── +# Invariant checks +# ────────────────────────────────────────────────────────────────────── + + +def assert_urls_fit(sub_args_seen: list[dict], url_limit: int) -> None: + over = [] + for sa in sub_args_seen: + req = _construct_api_requests(**sa) + url_bytes = len(req.url) + (len(req.body) if req.body else 0) + if url_bytes > url_limit: + over.append((url_bytes, sa)) + if over: + raise AssertionError( + f"{len(over)}/{len(sub_args_seen)} sub-requests exceeded " + f"url_limit={url_limit}; first: {over[0][0]} bytes" + ) + + +def assert_list_coverage( + sub_args_seen: list[dict], + original: dict, + dim: str, +) -> None: + """For each list dim, the set of *distinct* chunks issued across + sub-requests must union to the original input with no overlap. 
+ Cartesian-product duplication of any single dim's chunk across the + OTHER dims' iterations is expected — the chunk identity (the tuple + of values) is what matters, not how many times it's seen.""" + expected = sorted(original[dim]) + distinct_chunks: set[tuple[str, ...]] = set() + for sa in sub_args_seen: + v = sa.get(dim) + if v is None: + continue + chunk = tuple(v) if isinstance(v, (list, tuple)) else (v,) + distinct_chunks.add(chunk) + seen = [val for chunk in distinct_chunks for val in chunk] + if sorted(seen) != expected: + missing = set(expected) - set(seen) + extra = set(seen) - set(expected) + dups = {x for x in seen if seen.count(x) > 1} + raise AssertionError( + f"list dim {dim!r} coverage failed across " + f"{len(distinct_chunks)} distinct chunk(s): " + f"missing={sorted(missing)[:3]} extra={sorted(extra)[:3]} " + f"dups={sorted(dups)[:3]}" + ) + + +def assert_filter_coverage( + sub_args_seen: list[dict], + original_filter: str, +) -> None: + """Each filter sub-chunk appears once per list-dim combination. 
The + *set* of distinct filter values seen, split back into clauses and + concatenated in iteration order, must equal the original clauses + (preserving the planner's order across partitions).""" + expected = chunking._split_top_level_or(original_filter) + distinct_filters: list[str] = [] + for sa in sub_args_seen: + f = sa.get("filter") + if isinstance(f, str) and f not in distinct_filters: + distinct_filters.append(f) + seen: list[str] = [] + for f in distinct_filters: + seen.extend(chunking._split_top_level_or(f)) + if seen != expected: + first_diff = next( + (i for i, (a, b) in enumerate(zip(seen, expected)) if a != b), + min(len(seen), len(expected)), + ) + raise AssertionError( + f"filter clause coverage failed: got {len(seen)} unique-chunk " + f"clauses in chunks, expected {len(expected)} in original order; " + f"first divergence at index {first_diff}" + ) + + +# ────────────────────────────────────────────────────────────────────── +# Scenarios +# ────────────────────────────────────────────────────────────────────── + + +def _ohio_sites(n: int) -> list[str]: + return [f"USGS-{i:08d}" for i in range(n)] + + +def _datetime_clauses(n: int) -> list[str]: + return [ + f"time='2024-{((i // 28) % 12) + 1:02d}-{(i % 28) + 1:02d}T00:00:00Z'" + for i in range(n) + ] + + +def _heavy_clauses(n: int) -> list[str]: + # Each clause contains URL-encoding-heavy characters (`%`, `&`, `+`). 
+ return [ + f"value='~~ baseline%2B{i} & next+{i} ~~'" # noqa: E501 + for i in range(n) + ] + + +def scenario( + name: str, + args: dict, + url_limit: int, + *, + list_dims_to_check: Iterable[str] = (), + expected_filter: str | None = None, + require_chunked: bool = True, +) -> dict: + sub_args_seen, df, _ = run_chunked(args, url_limit=url_limit) + n_sub = len(sub_args_seen) + baseline = _bail_floor_baseline(args, url_limit) + + assert_urls_fit(sub_args_seen, url_limit) + for dim in list_dims_to_check: + assert_list_coverage(sub_args_seen, args, dim) + if expected_filter is not None: + assert_filter_coverage(sub_args_seen, expected_filter) + if require_chunked and n_sub <= 1: + raise AssertionError( + f"{name}: expected fan-out but the chunker produced {n_sub} sub-request(s)" + ) + + return { + "name": name, + "url_limit": url_limit, + "sub_requests": n_sub, + "baseline_worst_case": baseline, + "speedup_vs_baseline": (baseline / n_sub) if n_sub else float("inf"), + "rows_returned": len(df), + } + + +def main() -> int: + print(f"{'=' * 78}\n Joint chunker stress test\n{'=' * 78}\n") + + results: list[dict] = [] + + # A. Long site list, no filter — pure list chunking. + sites = _ohio_sites(500) + results.append( + scenario( + "A. Long sites only", + {"service": "daily", "monitoring_location_id": sites}, + url_limit=2000, + list_dims_to_check=["monitoring_location_id"], + ) + ) + + # B. Long OR filter, no list dims — pure filter chunking. + clauses = _datetime_clauses(50) + filter_expr = " OR ".join(clauses) + results.append( + scenario( + "B. Long filter only", + { + "service": "continuous", + "monitoring_location_id": "USGS-00000001", + "filter": filter_expr, + "filter_lang": "cql-text", + }, + url_limit=1500, + expected_filter=filter_expr, + ) + ) + + # C. Long sites + long filter — joint planning trade-off. + sites = _ohio_sites(200) + clauses = _datetime_clauses(40) + filter_expr = " OR ".join(clauses) + results.append( + scenario( + "C. 
Long sites + long filter", + { + "service": "continuous", + "monitoring_location_id": sites, + "filter": filter_expr, + "filter_lang": "cql-text", + }, + url_limit=2000, + list_dims_to_check=["monitoring_location_id"], + expected_filter=filter_expr, + ) + ) + + # D. 3-D multi-value list chunking. Tight limit forces all three + # dims to participate in the cartesian product. + sites = _ohio_sites(60) + pcodes = [f"{i:05d}" for i in range(20)] + statistics = [f"{i:05d}" for i in range(10)] + results.append( + scenario( + "D. 3-D list cartesian product (sites × pcodes × statistics)", + { + "service": "daily", + "monitoring_location_id": sites, + "parameter_code": pcodes, + "statistic_id": statistics, + }, + url_limit=800, + list_dims_to_check=[ + "monitoring_location_id", + "parameter_code", + "statistic_id", + ], + ) + ) + + # E. Lopsided clause sizes — half short, half long. + short_clauses = [f"a='{i}'" for i in range(30)] + long_clauses = [ + f"description LIKE '%%state aquifer system region {i}%%'" for i in range(30) + ] + filter_expr = " OR ".join(short_clauses + long_clauses) + results.append( + scenario( + "E. Lopsided clause sizes", + { + "service": "continuous", + "monitoring_location_id": "USGS-00000001", + "filter": filter_expr, + "filter_lang": "cql-text", + }, + url_limit=1800, + expected_filter=filter_expr, + ) + ) + + # F. Heavily URL-encoded clauses (special chars expand under quote_plus). + # Tight limit forces the planner to account for encoding inflation. + clauses = _heavy_clauses(60) + filter_expr = " OR ".join(clauses) + results.append( + scenario( + "F. URL-encoding-heavy clauses", + { + "service": "continuous", + "monitoring_location_id": "USGS-00000001", + "filter": filter_expr, + "filter_lang": "cql-text", + }, + url_limit=1500, + expected_filter=filter_expr, + ) + ) + + # G. Very tight limit — singleton chunks expected. + sites = _ohio_sites(100) + results.append( + scenario( + "G. 
Very tight URL limit (singleton sites)", + {"service": "daily", "monitoring_location_id": sites}, + url_limit=420, + list_dims_to_check=["monitoring_location_id"], + ) + ) + + # H. Generous URL limit — no chunking needed. + sites = _ohio_sites(5) + results.append( + scenario( + "H. Generous URL limit (no chunking)", + {"service": "daily", "monitoring_location_id": sites}, + url_limit=10000, + list_dims_to_check=["monitoring_location_id"], + require_chunked=False, + ) + ) + + # I. Greedy-search adapts: same input, sweep url_limit and confirm + # sub-request count is monotonically non-increasing as the limit grows. + sites = _ohio_sites(150) + clauses = _datetime_clauses(30) + args = { + "service": "continuous", + "monitoring_location_id": sites, + "filter": " OR ".join(clauses), + "filter_lang": "cql-text", + } + counts_by_limit: list[tuple[int, int]] = [] + for limit in [1200, 1800, 2400, 3000, 4000, 6000, 10000]: + seen, _, _ = run_chunked(args, url_limit=limit) + assert_urls_fit(seen, limit) + assert_list_coverage(seen, args, "monitoring_location_id") + assert_filter_coverage(seen, args["filter"]) + counts_by_limit.append((limit, len(seen))) + + monotone = all( + counts_by_limit[i][1] >= counts_by_limit[i + 1][1] + for i in range(len(counts_by_limit) - 1) + ) + if not monotone: + raise AssertionError( + f"sub-request count is not monotonically non-increasing in " + f"url_limit: {counts_by_limit}" + ) + + # ────────────────────────────────────────────────────────────────── + # Report + # ────────────────────────────────────────────────────────────────── + print(f"{'Scenario':<60}{'limit':>8}{'sub-req':>10}{'baseline':>10}{'×min':>8}") + print("─" * 96) + for r in results: + print( + f" {r['name']:<58}" + f"{r['url_limit']:>8}" + f"{r['sub_requests']:>10}" + f"{r['baseline_worst_case']:>10}" + f"{r['speedup_vs_baseline']:>7.1f}×" + ) + print() + print("Greedy-search adaptation (scenario I — sub-req count vs url_limit):") + print(f" {'url_limit':>10} 
{'sub-req':>8}") + for limit, n in counts_by_limit: + print(f" {limit:>10} {n:>8}") + print() + print( + "✓ all invariants hold across " + f"{len(results) + len(counts_by_limit)} chunked calls" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 1348304999ae48c042cf09ea762e533ce1055800 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 13:24:23 -0500 Subject: [PATCH 09/17] =?UTF-8?q?perf(test):=20Cut=20stress=20test=20wall-?= =?UTF-8?q?clock=2055%=20=E2=80=94=20capture=20URL=20bytes=20inline,=20tri?= =?UTF-8?q?m=20sweep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profile showed `_construct_api_requests` (PreparedRequest building) dominated the stress test's runtime: 421 calls / ~152ms of the ~290ms profile time. ~75 of those calls came from ``assert_urls_fit`` re-walking every captured sub-request to rebuild its URL after the chunker had already built it during planning. Two simple changes: - ``run_chunked`` now returns a parallel ``url_bytes_seen`` list; the mock ``fetch_once`` captures the built URL's byte count once during execution. ``assert_urls_fit`` just compares ints instead of rebuilding PreparedRequests. - The url_limit sweep dropped from 7 points × (150 sites, 30 clauses) to 5 points × (100 sites, 20 clauses). Monotonicity reads just as clearly with the smaller grid — the curve (8 → 2 → 2 → 1 → 1) is unambiguous. Result: 118ms → 53ms per run. 13 chunked calls, every invariant still holds. 
--- tests/stress_chunker.py | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/tests/stress_chunker.py b/tests/stress_chunker.py index 9efee974..0a56c85a 100644 --- a/tests/stress_chunker.py +++ b/tests/stress_chunker.py @@ -54,10 +54,15 @@ def _mock_response() -> SimpleNamespace: def run_chunked( args: dict, url_limit: int -) -> tuple[list[dict], pd.DataFrame, SimpleNamespace]: +) -> tuple[list[dict], list[int], pd.DataFrame, SimpleNamespace]: """Execute the chunker end-to-end with a captured ``fetch_once`` and - return (sub_args_seen, combined_frame, combined_response).""" + return (sub_args_seen, url_bytes_seen, combined_frame, combined_response). + + Capturing the actual built-URL byte count inline avoids the test's + post-hoc URL-fit verifier having to rebuild every sub-request from + scratch (one less ``_construct_api_requests`` call per sub-request).""" sub_args_seen: list[dict] = [] + url_bytes_seen: list[int] = [] @multi_value_chunked( build_request=_construct_api_requests, @@ -65,13 +70,15 @@ def run_chunked( ) def fetch_once(args: dict) -> tuple[pd.DataFrame, SimpleNamespace]: sub_args_seen.append(dict(args)) + req = _construct_api_requests(**args) + url_bytes_seen.append(len(req.url) + (len(req.body) if req.body else 0)) # Stable per-sub-request id so dedup logic doesn't collapse rows. 
idx = len(sub_args_seen) frame = pd.DataFrame({"id": [f"row-{idx}"], "value": [idx]}) return frame, _mock_response() df, response = fetch_once(args) - return sub_args_seen, df, response + return sub_args_seen, url_bytes_seen, df, response def _query_params(url: str) -> dict[str, str]: @@ -105,17 +112,12 @@ def _bail_floor_baseline(args: dict, url_limit: int) -> int: # ────────────────────────────────────────────────────────────────────── -def assert_urls_fit(sub_args_seen: list[dict], url_limit: int) -> None: - over = [] - for sa in sub_args_seen: - req = _construct_api_requests(**sa) - url_bytes = len(req.url) + (len(req.body) if req.body else 0) - if url_bytes > url_limit: - over.append((url_bytes, sa)) +def assert_urls_fit(url_bytes_seen: list[int], url_limit: int) -> None: + over = [n for n in url_bytes_seen if n > url_limit] if over: raise AssertionError( - f"{len(over)}/{len(sub_args_seen)} sub-requests exceeded " - f"url_limit={url_limit}; first: {over[0][0]} bytes" + f"{len(over)}/{len(url_bytes_seen)} sub-requests exceeded " + f"url_limit={url_limit}; first overrun: {over[0]} bytes" ) @@ -212,11 +214,11 @@ def scenario( expected_filter: str | None = None, require_chunked: bool = True, ) -> dict: - sub_args_seen, df, _ = run_chunked(args, url_limit=url_limit) + sub_args_seen, url_bytes_seen, df, _ = run_chunked(args, url_limit=url_limit) n_sub = len(sub_args_seen) baseline = _bail_floor_baseline(args, url_limit) - assert_urls_fit(sub_args_seen, url_limit) + assert_urls_fit(url_bytes_seen, url_limit) for dim in list_dims_to_check: assert_list_coverage(sub_args_seen, args, dim) if expected_filter is not None: @@ -374,8 +376,9 @@ def main() -> int: # I. Greedy-search adapts: same input, sweep url_limit and confirm # sub-request count is monotonically non-increasing as the limit grows. - sites = _ohio_sites(150) - clauses = _datetime_clauses(30) + # Five points are enough to demonstrate the curve cleanly. 
+ sites = _ohio_sites(100) + clauses = _datetime_clauses(20) args = { "service": "continuous", "monitoring_location_id": sites, @@ -383,9 +386,9 @@ def main() -> int: "filter_lang": "cql-text", } counts_by_limit: list[tuple[int, int]] = [] - for limit in [1200, 1800, 2400, 3000, 4000, 6000, 10000]: - seen, _, _ = run_chunked(args, url_limit=limit) - assert_urls_fit(seen, limit) + for limit in [1200, 1800, 2400, 3500, 10000]: + seen, url_bytes_seen, _, _ = run_chunked(args, url_limit=limit) + assert_urls_fit(url_bytes_seen, limit) assert_list_coverage(seen, args, "monitoring_location_id") assert_filter_coverage(seen, args["filter"]) counts_by_limit.append((limit, len(seen))) From f16555d1ab615616a270bed4be73b745401562b8 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 14:17:13 -0500 Subject: [PATCH 10/17] =?UTF-8?q?refactor(waterdata):=20Address=20PR=20#28?= =?UTF-8?q?3=20review=20=E2=80=94=20relocate=20chunker=20helpers,=20clarif?= =?UTF-8?q?y=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three review responses bundled together: - chunking.py module docstring: define ``k`` as the candidate filter chunk count before using it in the planner description. - ``QuotaExhausted`` docstring: drop the stale "silently truncate" framing. PR #273 / #279 already raise on a mid-pagination 429, so this exception is the structured-recovery alternative (partial frames in hand) rather than a defense against silent truncation. - Move chunker-only orphans from filters.py to chunking.py: ``_WATERDATA_URL_BYTE_LIMIT`` (the URL byte ceiling), ``_FetchOnce`` TypeVar, ``_combine_chunk_frames``, and ``_combine_chunk_responses``. filters.py was a leftover home from the pre-unification two-decorator stack; these helpers have no callers outside the chunker. Test ``test_multi_value_chunked_lazy_url_limit`` now monkeypatches the constant on its new module. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/chunking.py | 89 +++++++++++++++++++++++------ dataretrieval/waterdata/filters.py | 67 +--------------------- tests/waterdata_test.py | 6 +- 3 files changed, 78 insertions(+), 84 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index 9acc1f6c..a0ed3db5 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -12,12 +12,14 @@ - Chunkable dims include multi-value list params (sites, parameter codes, ...) and the cql-text ``filter`` (split at top-level ``OR`` to keep each chunk valid CQL). -- The planner enumerates candidate filter chunk counts - (``k = 1, 2, 4, ..., total_clauses``); for each, partitions clauses - into ``k`` roughly-balanced groups joined by ``OR``, substitutes the - worst (longest, URL-encoded) group as the filter, then plans list +- For a filter with ``n_clauses`` top-level OR clauses, the planner + enumerates candidate filter chunk counts ``k`` (the number of + sub-filters to split into) at powers of two from 1 through + ``n_clauses``. For each ``k``, it partitions clauses into ``k`` + roughly-balanced groups joined by ``OR``, substitutes the worst + (longest, URL-encoded) group as the filter, then plans list chunking by greedy halving. The candidate that minimizes - ``list_count × k`` wins. + ``list_count × k`` (total sub-request count) wins. - Sub-chunks of the same list dim never overlap, so frame concat needs no dedup across list chunks. Filter sub-chunks can match overlapping records (a row matching both ``a=1`` and ``b=2`` returns from both), @@ -35,22 +37,23 @@ import itertools import math from collections.abc import Callable, Iterator -from typing import Any +from typing import Any, TypeVar from urllib.parse import quote_plus import pandas as pd import requests -from . 
import filters from .filters import ( _check_numeric_filter_pitfall, - _combine_chunk_frames, - _combine_chunk_responses, - _FetchOnce, _is_chunkable, _split_top_level_or, ) +# Empirically the API replies HTTP 414 above ~8200 bytes of full URL — +# matches nginx's default ``large_client_header_buffers`` of 8 KB. 8000 +# leaves ~200 bytes for request-line framing and proxy variance. +_WATERDATA_URL_BYTE_LIMIT = 8000 + # Default rule: any list-shaped kwarg with >1 element is chunked across # sub-requests — each chunk becomes a comma-joined sub-list in the URL. # The OGC getters expose ~90 such list-shaped params (IDs, codes, @@ -114,6 +117,12 @@ _OR_SEP = " OR " +_FetchOnce = TypeVar( + "_FetchOnce", + bound=Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]], +) + + class RequestTooLarge(ValueError): """Raised when a chunked request cannot be issued. Either the URL exceeds the byte limit even at the smallest reducible plan (every @@ -125,9 +134,14 @@ class RequestTooLarge(ValueError): class QuotaExhausted(RuntimeError): """Raised mid-chunked-call when the API's reported remaining quota (``x-ratelimit-remaining`` header) drops below the configured safety - floor. The chunker stops before issuing the next sub-request to - avoid a mid-call HTTP 429 that would silently truncate paginated - results. + floor. The chunker stops before issuing the next sub-request and + surfaces the partial result so callers can resume after the hourly + window resets. + + A bare 429 raised by ``_walk_pages`` would also abort the call but + discard the chunks completed so far; this exception is the + structured-recovery alternative, triggered pre-emptively while the + accumulated frames are still in hand. 
Attributes ---------- @@ -472,6 +486,49 @@ def _iter_sub_args( yield base if filter_chunk is None else {**base, "filter": filter_chunk} +def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame: + """Concatenate per-chunk frames, dropping empties and deduping by ``id``. + + ``_get_resp_data`` returns a plain ``pd.DataFrame()`` on empty responses; + concat'ing it with real GeoDataFrames downgrades the result to plain + DataFrame and strips geometry/CRS, so empties are dropped first. Dedup + on the pre-rename feature ``id`` keeps overlapping user OR-clauses from + producing duplicate rows across chunks. + """ + non_empty = [f for f in frames if not f.empty] + if not non_empty: + return pd.DataFrame() + if len(non_empty) == 1: + return non_empty[0] + combined = pd.concat(non_empty, ignore_index=True) + if "id" in combined.columns: + combined = combined.drop_duplicates(subset="id", ignore_index=True) + return combined + + +def _combine_chunk_responses( + responses: list[requests.Response], +) -> requests.Response: + """Return one response with the last chunk's headers (for current + rate-limit state) and summed ``elapsed`` (for total wall-clock). + + The returned response's ``.url`` is the *first chunk's* URL, which + only reflects the first slice of the user's query. ``_finalize_response`` + overwrites ``.url`` with the canonical original-query URL so + ``BaseMetadata`` reflects the user's request, not the first sub-chunk. + + Mutates the first response in place: ``.headers`` is replaced with + the last response's headers and ``.elapsed`` is accumulated across + all chunks. Downstream reads ``.url``, ``.headers``, and + ``.elapsed`` (via ``BaseMetadata``). 
+ """ + head = responses[0] + if len(responses) > 1: + head.headers = responses[-1].headers + head.elapsed = sum((r.elapsed for r in responses[1:]), start=head.elapsed) + return head + + def _finalize_response( responses: list[requests.Response], canonical_url: str ) -> requests.Response: @@ -491,7 +548,7 @@ def multi_value_chunked( ) -> Callable[[_FetchOnce], _FetchOnce]: """Decorator that splits multi-value list params and cql-text filters across sub-requests so each sub-request URL fits - ``url_limit`` bytes (defaults to ``filters._WATERDATA_URL_BYTE_LIMIT``) + ``url_limit`` bytes (defaults to ``_WATERDATA_URL_BYTE_LIMIT``) and the joint cartesian-product plan stays ≤ ``max_chunks`` sub-requests (defaults to ``_DEFAULT_MAX_CHUNKS``). All defaults are resolved at call time so tests/users that patch the module constants @@ -522,9 +579,7 @@ def decorator(fetch_once: _FetchOnce) -> _FetchOnce: def wrapper( args: dict[str, Any], ) -> tuple[pd.DataFrame, requests.Response]: - limit = ( - filters._WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit - ) + limit = _WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit floor = ( _DEFAULT_QUOTA_SAFETY_FLOOR if quota_safety_floor is None diff --git a/dataretrieval/waterdata/filters.py b/dataretrieval/waterdata/filters.py index 5ab00869..5e1c0a67 100644 --- a/dataretrieval/waterdata/filters.py +++ b/dataretrieval/waterdata/filters.py @@ -6,10 +6,8 @@ Internal helpers used by ``chunking.multi_value_chunked``'s joint planner: ``_split_top_level_or`` (clause partitioning), -``_is_chunkable`` (filter-language gate), ``_check_numeric_filter_pitfall`` -(the lexicographic-comparison guard), ``_combine_chunk_frames`` / -``_combine_chunk_responses`` (aggregation), and the constant -``_WATERDATA_URL_BYTE_LIMIT``. +``_is_chunkable`` (filter-language gate), and +``_check_numeric_filter_pitfall`` (the lexicographic-comparison guard). 
Other CQL shapes (``AND``, ``NOT``, ``LIKE``, spatial/temporal predicates, function calls) are forwarded verbatim — only top-level @@ -20,19 +18,10 @@ from __future__ import annotations import re -from collections.abc import Callable -from typing import Any, Literal, TypeVar - -import pandas as pd -import requests +from typing import Any, Literal FILTER_LANG = Literal["cql-text", "cql-json"] -# Empirically the API replies HTTP 414 above ~8200 bytes of full URL — -# matches nginx's default ``large_client_header_buffers`` of 8 KB. 8000 -# leaves ~200 bytes for request-line framing and proxy variance. -_WATERDATA_URL_BYTE_LIMIT = 8000 - _NUM = r"-?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][+-]?\d+)?" _IDENT = r"[A-Za-z_]\w*" @@ -171,53 +160,3 @@ def _is_chunkable(filter_expr: Any, filter_lang: Any) -> bool: and bool(filter_expr) and filter_lang in {None, "cql-text"} ) - - -def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame: - """Concatenate per-chunk frames, dropping empties and deduping by ``id``. - - ``_get_resp_data`` returns a plain ``pd.DataFrame()`` on empty responses; - concat'ing it with real GeoDataFrames downgrades the result to plain - DataFrame and strips geometry/CRS, so empties are dropped first. Dedup - on the pre-rename feature ``id`` keeps overlapping user OR-clauses from - producing duplicate rows across chunks. - """ - non_empty = [f for f in frames if not f.empty] - if not non_empty: - return pd.DataFrame() - if len(non_empty) == 1: - return non_empty[0] - combined = pd.concat(non_empty, ignore_index=True) - if "id" in combined.columns: - combined = combined.drop_duplicates(subset="id", ignore_index=True) - return combined - - -def _combine_chunk_responses( - responses: list[requests.Response], -) -> requests.Response: - """Return one response with the last chunk's headers (for current - rate-limit state) and summed ``elapsed`` (for total wall-clock). 
- - The returned response's ``.url`` is the *first chunk's* URL, which - only reflects the first slice of the user's query. Callers wanting - the canonical original-query URL on ``BaseMetadata`` must overwrite - ``.url`` themselves; ``chunking.multi_value_chunked``'s wrapper does - this via ``build_request(**original_args).url``. - - Mutates the first response in place: ``.headers`` is replaced with - the last response's headers and ``.elapsed`` is accumulated across - all chunks. Downstream reads ``.url``, ``.headers``, and - ``.elapsed`` (via ``BaseMetadata``). - """ - head = responses[0] - if len(responses) > 1: - head.headers = responses[-1].headers - head.elapsed = sum((r.elapsed for r in responses[1:]), start=head.elapsed) - return head - - -_FetchOnce = TypeVar( - "_FetchOnce", - bound=Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]], -) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 4a01457b..f0e6534e 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -13,7 +13,7 @@ if sys.version_info < (3, 10): pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) -from dataretrieval.waterdata import filters as _filters +from dataretrieval.waterdata import chunking as _chunking from dataretrieval.waterdata import ( get_channel, get_combined_metadata, @@ -475,7 +475,7 @@ def fetch(args): def test_multi_value_chunked_lazy_url_limit(monkeypatch): - """``url_limit=None`` → resolve filters._WATERDATA_URL_BYTE_LIMIT at call + """``url_limit=None`` → resolve chunking._WATERDATA_URL_BYTE_LIMIT at call time, so tests that patch the constant affect this decorator too.""" calls = [] @@ -484,7 +484,7 @@ def fetch(args): calls.append(args) return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) - monkeypatch.setattr(_filters, "_WATERDATA_URL_BYTE_LIMIT", 240) + monkeypatch.setattr(_chunking, "_WATERDATA_URL_BYTE_LIMIT", 240) # 4 sites of 10 chars → exceeds 240 → planner splits. 
fetch({"sites": ["S" * 10 + str(i) for i in range(4)]}) assert len(calls) > 1, "patched constant should drive chunking" From eeba277a38f0ee196c749f41e8903e032a3f5a8d Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 14:23:38 -0500 Subject: [PATCH 11/17] docs(tests): Drop stale "two-decorator design" references in test prose Three test docstrings/comments still framed their reasoning against the removed two-decorator stack (PR #283 unified them). Reword to describe the current joint-planner behavior on its own terms: - ``test_plan_joint_fans_out_filter_when_list_alone_cannot_fit``: drop the "previous two-decorator design" aside. - ``test_chunkable_params_skips_filter_passed_as_list``: rewrite the "inner filters.chunked is the only place that may shrink filter" line to point at ``_plan_joint``. - ``stress_chunker._bail_floor_baseline``: reframe the baseline as "degenerate singleton plan" rather than "worst case the old two-decorator design produced." No behavioral changes; prose only. Chunker tests + offline stress test still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/stress_chunker.py | 11 ++++++----- tests/waterdata_test.py | 9 ++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/stress_chunker.py b/tests/stress_chunker.py index 0a56c85a..728dc89d 100644 --- a/tests/stress_chunker.py +++ b/tests/stress_chunker.py @@ -17,7 +17,8 @@ Plus a greedy-quality check: for the "long filter + long lists" scenario, the joint planner should comfortably beat the bail-floor-only -baseline (the worst case the old two-decorator design produced). +baseline (singleton list chunks × one clause per filter chunk — the +worst sensible plan, used as the floor the greedy planner has to beat). Run: ``python3 tests/stress_chunker.py``. 
""" @@ -86,10 +87,10 @@ def _query_params(url: str) -> dict[str, str]: def _bail_floor_baseline(args: dict, url_limit: int) -> int: - """Lower-bound on what a singleton-list + singleton-clause plan - would produce — the worst case the old two-decorator design could - have hit. Used as the optimization-quality bar for the joint - planner: it should beat this comfortably. + """Upper bound: the sub-request count a degenerate plan that + singleton-chunks every list dim AND emits one clause per filter + sub-request would produce. The joint planner should beat this + comfortably — it's the quality bar. Counts ``product(per-dim length) × n_clauses`` for the inputs that are eligible to chunk.""" diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index f0e6534e..65581aec 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -329,9 +329,7 @@ def test_plan_chunks_raises_request_too_large_at_singleton_floor(): def test_plan_joint_fans_out_filter_when_list_alone_cannot_fit(): """When the request can only fit by chunking BOTH the list and the filter, the joint planner must produce a plan that touches both - dims. With the previous two-decorator design the inner filter - chunker would have shrunk the filter at runtime; the joint planner - surfaces the same outcome at plan time.""" + dims.""" clauses = [f"f='{i}'" for i in range(10)] args = { "monitoring_location_id": ["A" * 10, "B" * 10, "C" * 10, "D" * 10], @@ -750,8 +748,9 @@ def test_chunkable_params_skips_filter_passed_as_list(): """Defensive guard: ``filter`` is documented as a string. If a caller mistakenly passes it as a list, the chunker must NOT treat it as a multi-value dim — comma-joining CQL clauses inside the URL would - produce a malformed filter expression. The inner ``filters.chunked`` - is the only place that may shrink ``filter``.""" + produce a malformed filter expression. 
The joint planner partitions + ``filter`` via top-level ``OR`` splitting in ``_plan_joint``; it must + never be sliced as a list dim.""" args = { "monitoring_location_id": ["USGS-A", "USGS-B"], "filter": ["a='1'", "a='2'"], # malformed input From 01e579e21a075d61b8350c80bf5e13172d0e1fdb Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 19:42:53 -0500 Subject: [PATCH 12/17] refactor(waterdata): Replace static max_chunks/safety_floor with dynamic rate-limit gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR #283 review feedback. The static caps (``_DEFAULT_MAX_CHUNKS=1000``, ``_DEFAULT_QUOTA_SAFETY_FLOOR=50``) and the matching ``max_chunks`` / ``quota_safety_floor`` decorator parameters are replaced by a quota check that runs after the first sub-request, using the real ``x-ratelimit-remaining`` value rather than a guessed cap. Behavior: - After the first sub-request the wrapper reads ``x-ratelimit-remaining``. If the rest of the plan won't fit in the current rate-limit window, it raises a new ``RequestExceedsQuota(ValueError)`` carrying ``planned_chunks``, ``available``, and ``deficit`` so the message reports exactly how far over budget the call is. The first chunk has already been issued; the wrapper stops there rather than burn the rest of the quota on a call that will fail mid-way. - ``QuotaExhausted`` is now triggered only when an actual HTTP 429 propagates from a sub-request (detected by walking ``__cause__`` for ``RuntimeError("429: ...")``, the shape ``_raise_for_non_200`` produces and ``_walk_pages`` wraps). A single-process caller should not normally see this — ``RequestExceedsQuota`` short-circuits in chunk 1; arrival here implies a concurrent consumer drained the bucket faster than predicted. Carries the partial frame for resume. ``partial_response`` becomes ``None`` when the 429 hits chunk 0 (no banked responses). - A non-429 ``RuntimeError`` (e.g. 
500) propagates unchanged so the real cause surfaces to the caller. - When the server doesn't echo ``x-ratelimit-remaining``, ``_read_remaining`` returns ``_QUOTA_UNKNOWN``; the wrapper skips the post-first-chunk quota check (no signal → don't synthesize a block). Planner: ``_plan_list_chunks`` / ``_plan_joint`` no longer carry a ``max_chunks`` cap. ``RequestTooLarge`` fires only when nothing more can be split (the genuine URL-byte floor). The rate-limit gate replaces the static cap. Module docstring rewritten to summarize the current design (joint planning + dynamic quota gate); historical PR 233 / two-decorator references dropped. Tests: ten obsolete cap/floor tests removed; eight new tests added covering ``RequestExceedsQuota`` after chunk 0, deficit reporting, the no-header skip path, mid-call 429 → ``QuotaExhausted`` with partial frame, the first-chunk 429 (partial_response=None) edge case, and non-429 ``RuntimeError`` pass-through. ``_fetch_once`` in ``utils.py`` calls the decorator with defaults only, so no call-site changes are needed. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/chunking.py | 300 ++++++++++++++-------------- tests/waterdata_test.py | 278 ++++++++++++-------------- 2 files changed, 270 insertions(+), 308 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index a0ed3db5..9c10abd9 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -1,34 +1,31 @@ """Joint URL-byte chunking for the Water Data OGC getters. -PR 233 routes most services through GET with comma-separated values -(e.g. ``monitoring_location_id=USGS-A,USGS-B,...``). Long lists and -long top-level-``OR`` CQL filters can independently blow the server's -~8 KB URL byte limit. This module adds a single decorator that plans -both chunking dimensions together and iterates the joint cartesian -product so each sub-request URL fits. 
- -Design: - -- Chunkable dims include multi-value list params (sites, parameter - codes, ...) and the cql-text ``filter`` (split at top-level ``OR`` - to keep each chunk valid CQL). -- For a filter with ``n_clauses`` top-level OR clauses, the planner - enumerates candidate filter chunk counts ``k`` (the number of - sub-filters to split into) at powers of two from 1 through - ``n_clauses``. For each ``k``, it partitions clauses into ``k`` - roughly-balanced groups joined by ``OR``, substitutes the worst - (longest, URL-encoded) group as the filter, then plans list - chunking by greedy halving. The candidate that minimizes - ``list_count × k`` (total sub-request count) wins. -- Sub-chunks of the same list dim never overlap, so frame concat needs - no dedup across list chunks. Filter sub-chunks can match overlapping - records (a row matching both ``a=1`` and ``b=2`` returns from both), - so the combiner dedupes by feature ``id``. -- Date params, ``bbox``, ``properties``, ``limit``, and - ``skip_geometry`` are never chunked: dates are intervals not sets; - bbox is a coord array; ``properties`` defines the response schema - (chunking would shard columns); ``limit`` and ``skip_geometry`` are - scalar by contract. +Long multi-value list params (sites, parameter codes, ...) and long +top-level-``OR`` CQL filters independently risk overflowing the +server's ~8 KB URL byte limit. ``multi_value_chunked`` plans both +chunking dimensions together, picks the allocation that minimizes +total sub-requests, and iterates the joint cartesian product so every +sub-request URL fits. + +Planning: for a filter with ``n_clauses`` top-level OR clauses, try +candidate filter chunk counts ``k = 1, 2, 4, ..., n_clauses``. For +each, partition clauses into ``k`` count-balanced groups joined by +``OR``, take the longest (URL-encoded) group as the worst-case filter, +then plan list-dim chunking by greedy halving against the remaining +budget. 
Keep the candidate with the smallest ``list_count × k``. + +Quota: after the first sub-request the wrapper reads +``x-ratelimit-remaining``; if the rest of the plan won't fit, it +raises ``RequestExceedsQuota`` before burning more budget. A 429 +encountered mid-call (another caller drained the bucket faster than +predicted) surfaces as ``QuotaExhausted`` carrying the partial frame +so callers can resume after the hourly window resets. + +Dedup: list sub-chunks don't overlap; filter sub-chunks can, so the +combiner dedupes by feature ``id``. ``properties``, ``bbox``, date +intervals, ``limit``, ``skip_geometry``, and ``filter``/``filter_lang`` +themselves are never sliced as list dims (the filter is partitioned +along its top-level OR axis instead). """ from __future__ import annotations @@ -87,26 +84,14 @@ } ) -# Default cap on the number of sub-requests a single chunked call may -# emit. The USGS Water Data API rate-limits each HTTP request -# (including pagination), so the true budget is -# ``hourly_quota / avg_pages_per_chunk``. 1000 matches the default -# hourly quota. -_DEFAULT_MAX_CHUNKS = 1000 - -# When ``x-ratelimit-remaining`` drops below this between sub-requests, -# the chunker bails with ``QuotaExhausted`` rather than risk a mid-call -# HTTP 429. Carries the partial result so callers can resume from a -# known offset instead of retrying the whole chunked call from scratch. -_DEFAULT_QUOTA_SAFETY_FLOOR = 50 - # Response header USGS uses to advertise remaining hourly quota. _QUOTA_HEADER = "x-ratelimit-remaining" # Sentinel returned by ``_read_remaining`` when the response has no -# parseable header — large enough to beat any plausible safety floor -# so a missing/malformed header doesn't trigger spurious aborts. -_QUOTA_UNKNOWN = 10**9 +# parseable header. The wrapper treats this as "no quota signal" and +# skips the post-first-chunk plan-vs-quota check, so the value just +# needs to be distinct from any plausible real remaining count. 
+_QUOTA_UNKNOWN = -1 # Separators the two chunking dimensions use to compose their atoms # into URL-encoded blobs. List dims comma-join values @@ -124,62 +109,86 @@ class RequestTooLarge(ValueError): - """Raised when a chunked request cannot be issued. Either the URL - exceeds the byte limit even at the smallest reducible plan (every - list dim at singleton chunks and the filter at one clause per - sub-request), or the joint cartesian-product plan would issue - more than ``max_chunks`` sub-requests.""" + """Raised when the URL exceeds the byte limit even at the smallest + reducible plan (every list dim at singleton chunks and the filter + at one clause per sub-request). Shrink the input lists, simplify + the filter, or split the call manually.""" + + +class RequestExceedsQuota(ValueError): + """Raised after the first sub-request when ``x-ratelimit-remaining`` + in the response shows the rest of the plan can't fit in the + current per-key rate-limit window. The first chunk has already + been issued; the wrapper stops here rather than burn quota on a + call that will fail mid-way. + + Attributes + ---------- + planned_chunks : int + Total sub-requests the joint plan would issue. + available : int + Sub-requests this caller can still issue in the current window + (``x-ratelimit-remaining`` + 1, since the first sub-request + already cost one slot). + deficit : int + ``planned_chunks - available`` — how far over budget the call + would run if it continued. + """ + + def __init__(self, *, planned_chunks: int, available: int, deficit: int) -> None: + super().__init__( + f"Request would issue {planned_chunks} sub-requests but only " + f"{available} fit in the current rate-limit window (short by " + f"{deficit}). Wait for the window to reset, request a higher " + f"per-key quota, or narrow the query." 
+ ) + self.planned_chunks = planned_chunks + self.available = available + self.deficit = deficit class QuotaExhausted(RuntimeError): - """Raised mid-chunked-call when the API's reported remaining quota - (``x-ratelimit-remaining`` header) drops below the configured safety - floor. The chunker stops before issuing the next sub-request and - surfaces the partial result so callers can resume after the hourly - window resets. + """Raised when a sub-request returns HTTP 429 mid-call. - A bare 429 raised by ``_walk_pages`` would also abort the call but - discard the chunks completed so far; this exception is the - structured-recovery alternative, triggered pre-emptively while the - accumulated frames are still in hand. + A single-process caller should not normally see this — the post- + first-chunk ``RequestExceedsQuota`` check short-circuits before + burning quota on a plan that won't fit. Arrival here implies a + concurrent consumer drained the rate-limit bucket faster than the + first chunk's ``x-ratelimit-remaining`` predicted. Attributes ---------- partial_frame : pd.DataFrame Concatenated, deduplicated result of every sub-request that - completed before the floor was crossed. - partial_response : requests.Response + completed before the 429. + partial_response : requests.Response | None Aggregated response with the canonical URL restored to the - user's full original query. + user's full original query. ``None`` only if the 429 hit on + the very first sub-request (no responses banked). completed_chunks : int - Number of sub-requests successfully completed (across both - list and filter chunking dims). + Number of sub-requests successfully completed. total_chunks : int Total sub-requests in the joint cartesian-product plan. - remaining : int - Last observed ``x-ratelimit-remaining`` value. 
""" def __init__( self, *, partial_frame: pd.DataFrame, - partial_response: requests.Response, + partial_response: requests.Response | None, completed_chunks: int, total_chunks: int, - remaining: int, ) -> None: super().__init__( - f"x-ratelimit-remaining dropped to {remaining} after " - f"{completed_chunks}/{total_chunks} chunks; aborting to avoid " - f"mid-call HTTP 429. Catch QuotaExhausted to access " - f".partial_frame and resume from chunk {completed_chunks}." + f"HTTP 429 mid-call after {completed_chunks}/{total_chunks} " + f"chunks; another caller likely drained the rate-limit " + f"window. Catch QuotaExhausted to access .partial_frame and " + f"resume after the window resets." ) self.partial_frame = partial_frame self.partial_response = partial_response self.completed_chunks = completed_chunks self.total_chunks = total_chunks - self.remaining = remaining def _chunkable_params(args: dict[str, Any]) -> dict[str, list[Any]]: @@ -234,19 +243,6 @@ def _plan_total(list_plan: dict[str, list[list[Any]]], n_filter_chunks: int) -> return list_count * n_filter_chunks -def _resolve_max_chunks(max_chunks: int | None) -> int: - """Apply ``max_chunks`` default + validation. Both planner entry - points share this rule, so pinning it here keeps them in sync.""" - if max_chunks is None: - return _DEFAULT_MAX_CHUNKS - if max_chunks < 1: - raise ValueError( - f"max_chunks must be >= 1; got {max_chunks}. Zero or negative " - f"values would silently bypass the cap." - ) - return max_chunks - - def _worst_case_args( base_args: dict[str, Any], list_plan: dict[str, list[list[Any]]] ) -> dict[str, Any]: @@ -285,7 +281,6 @@ def _plan_list_chunks( args: dict[str, Any], build_request: Callable[..., Any], url_limit: int, - max_chunks: int | None = None, ) -> dict[str, list[list[Any]]] | None: """Greedy halving of multi-value list dims until the worst-case sub-request URL fits ``url_limit``. 
The filter (if any) in ``args`` @@ -294,10 +289,8 @@ def _plan_list_chunks( Returns ``None`` when no list chunking is needed (request as-is fits or no chunkable list dims present). Raises ``RequestTooLarge`` - when the smallest reducible plan still doesn't fit or when the - plan would exceed ``max_chunks`` sub-requests. + when the smallest reducible plan still doesn't fit. """ - max_chunks = _resolve_max_chunks(max_chunks) chunkable = _chunkable_params(args) if not chunkable: return None @@ -332,13 +325,6 @@ def _plan_list_chunks( mid = len(chunk) // 2 plan[dim] = plan[dim][:idx] + [chunk[:mid], chunk[mid:]] + plan[dim][idx + 1 :] - if _plan_total(plan, 1) > max_chunks: - raise RequestTooLarge( - f"List plan alone would issue >{max_chunks} sub-requests " - f"before factoring filter chunking. Reduce list sizes or " - f"raise max_chunks." - ) - def _filter_chunk_counts(n_clauses: int) -> list[int]: """Candidate filter chunk counts to evaluate during joint planning. @@ -388,7 +374,6 @@ def _plan_joint( args: dict[str, Any], build_request: Callable[..., Any], url_limit: int, - max_chunks: int | None = None, ) -> tuple[dict[str, list[list[Any]]], list[str | None]] | None: """Joint planner over list dims and filter clauses. @@ -401,16 +386,15 @@ def _plan_joint( Algorithm: enumerate filter chunk counts ``k`` (powers of two up through ``n_clauses``); for each, partition clauses into ``k`` - balanced groups joined by ``OR`` and identify the worst (longest - URL-encoded) group; substitute that as the filter in ``args`` and - plan list chunking with greedy halving against the remaining - budget. Pick the candidate whose ``list_count × k`` is smallest. - - Raises ``RequestTooLarge`` when no candidate fits or the best plan - would exceed ``max_chunks``. + count-balanced groups joined by ``OR`` and identify the worst + (longest URL-encoded) group; substitute that as the filter in + ``args`` and plan list chunking with greedy halving against the + remaining budget. 
Pick the candidate whose ``list_count × k`` is + smallest. + + Raises ``RequestTooLarge`` when no candidate fits (URL still + over-budget at every reducible plan). """ - max_chunks = _resolve_max_chunks(max_chunks) - filter_expr = args.get("filter") clauses: list[str] = [] if _is_chunkable(filter_expr, args.get("filter_lang")): @@ -429,9 +413,7 @@ def _plan_joint( k = len(filter_chunks) plan_args = args if worst_filter is None else {**args, "filter": worst_filter} try: - list_plan = _plan_list_chunks( - plan_args, build_request, url_limit, max(1, max_chunks // k) - ) + list_plan = _plan_list_chunks(plan_args, build_request, url_limit) except RequestTooLarge as exc: last_error = exc continue @@ -443,16 +425,13 @@ def _plan_joint( if not list_plan and _request_bytes(build_request(**plan_args)) > url_limit: continue total = _plan_total(list_plan, k) - if total > max_chunks: - continue if best is None or total < best[0]: best = (total, list_plan, filter_chunks) if best is None: raise last_error or RequestTooLarge( - f"Joint plan would exceed max_chunks={max_chunks} at every " - f"candidate filter chunking. Reduce list sizes, simplify the " - f"filter, or raise max_chunks." + "No filter-chunking candidate produces a fitting plan. " + "Reduce list sizes or simplify the filter." ) return best[1], best[2] @@ -460,8 +439,8 @@ def _plan_joint( def _read_remaining(response: requests.Response) -> int: """Parse ``x-ratelimit-remaining`` from a response. 
Missing or - malformed header → ``_QUOTA_UNKNOWN`` so the safety check treats it - as 'plenty of quota'.""" + malformed header → ``_QUOTA_UNKNOWN`` (treated as "no signal"; the + post-first-chunk quota check skips when this is returned).""" raw = response.headers.get(_QUOTA_HEADER) if raw is None: return _QUOTA_UNKNOWN @@ -471,6 +450,19 @@ def _read_remaining(response: requests.Response) -> int: return _QUOTA_UNKNOWN +def _is_429(exc: BaseException) -> bool: + """True iff ``exc`` or any link in its ``__cause__`` chain is a + ``RuntimeError`` whose message starts with ``429`` — the shape + ``_raise_for_non_200`` produces and ``_walk_pages`` re-wraps for + mid-pagination failures.""" + cur: BaseException | None = exc + while cur is not None: + if isinstance(cur, RuntimeError) and str(cur).startswith("429"): + return True + cur = cur.__cause__ + return False + + def _iter_sub_args( args: dict[str, Any], list_plan: dict[str, list[list[Any]]], @@ -543,36 +535,28 @@ def multi_value_chunked( *, build_request: Callable[..., Any], url_limit: int | None = None, - max_chunks: int | None = None, - quota_safety_floor: int | None = None, ) -> Callable[[_FetchOnce], _FetchOnce]: """Decorator that splits multi-value list params and cql-text filters across sub-requests so each sub-request URL fits - ``url_limit`` bytes (defaults to ``_WATERDATA_URL_BYTE_LIMIT``) - and the joint cartesian-product plan stays ≤ ``max_chunks`` - sub-requests (defaults to ``_DEFAULT_MAX_CHUNKS``). All defaults are - resolved at call time so tests/users that patch the module constants - affect this decorator uniformly. - - ``url_limit`` is enforced against total request bytes (URL + body, - via ``_request_bytes``). POST routes are conservatively sized. - - Between sub-requests the wrapper reads ``x-ratelimit-remaining`` from - each response. 
If it drops below ``quota_safety_floor`` (default - ``_DEFAULT_QUOTA_SAFETY_FLOOR``), the wrapper raises - ``QuotaExhausted`` carrying the combined partial result and the - chunk offset so callers can resume after the hourly window resets. + ``url_limit`` bytes (defaults to ``_WATERDATA_URL_BYTE_LIMIT``, + resolved at call time so tests/users that patch the constant affect + this decorator uniformly). ``url_limit`` is enforced against total + request bytes (URL + body); POST routes are conservatively sized. + + After the first sub-request the wrapper reads + ``x-ratelimit-remaining`` from the response. If the remaining plan + won't fit in the current rate-limit window, it raises + ``RequestExceedsQuota`` rather than burn the rest of the quota on a + call that will fail mid-way. A 429 received mid-call (because + another caller drained the bucket faster than predicted) is + re-raised as ``QuotaExhausted`` carrying the partial frame so + callers can resume after the window resets. Joint cartesian-product iteration order is deterministic: list dims in args-insertion order (Python 3.7+ guarantee), with the filter dim as the innermost loop. Same ``args`` produces the same chunk sequence — resume is well-defined. """ - if quota_safety_floor is not None and quota_safety_floor < 0: - raise ValueError( - f"quota_safety_floor must be >= 0; got {quota_safety_floor}. " - f"Pass 0 explicitly to disable the guard." 
- ) def decorator(fetch_once: _FetchOnce) -> _FetchOnce: @functools.wraps(fetch_once) @@ -580,12 +564,7 @@ def wrapper( args: dict[str, Any], ) -> tuple[pd.DataFrame, requests.Response]: limit = _WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit - floor = ( - _DEFAULT_QUOTA_SAFETY_FLOOR - if quota_safety_floor is None - else quota_safety_floor - ) - plan = _plan_joint(args, build_request, limit, max_chunks) + plan = _plan_joint(args, build_request, limit) if plan is None: return fetch_once(args) @@ -598,20 +577,31 @@ def wrapper( for i, sub_args in enumerate( _iter_sub_args(args, list_plan, filter_chunks) ): - frame, response = fetch_once(sub_args) - frames.append(frame) - responses.append(response) - if i == total - 1: - continue # last chunk; no next sub-request to gate - remaining = _read_remaining(response) - if remaining < floor: + try: + frame, response = fetch_once(sub_args) + except RuntimeError as exc: + if not _is_429(exc): + raise raise QuotaExhausted( partial_frame=_combine_chunk_frames(frames), - partial_response=_finalize_response(responses, canonical_url), - completed_chunks=i + 1, + partial_response=( + _finalize_response(responses, canonical_url) + if responses + else None + ), + completed_chunks=i, total_chunks=total, - remaining=remaining, - ) + ) from exc + frames.append(frame) + responses.append(response) + if i == 0 and total > 1: + remaining = _read_remaining(response) + if remaining != _QUOTA_UNKNOWN and remaining < total - 1: + raise RequestExceedsQuota( + planned_chunks=total, + available=remaining + 1, + deficit=total - (remaining + 1), + ) return ( _combine_chunk_frames(frames), diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 65581aec..44e82b6a 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -33,10 +33,9 @@ get_time_series_metadata, ) from dataretrieval.waterdata.chunking import ( - _DEFAULT_MAX_CHUNKS, - _DEFAULT_QUOTA_SAFETY_FLOOR, _QUOTA_HEADER, QuotaExhausted, + 
RequestExceedsQuota, RequestTooLarge, _chunkable_params, _filter_chunk_counts, @@ -488,187 +487,182 @@ def fetch(args): assert len(calls) > 1, "patched constant should drive chunking" -def test_default_max_chunks_matches_hourly_api_quota(): - """The default cap mirrors the USGS Water Data API's documented - per-API-key hourly limit. Locking this in so future changes have to - explicitly acknowledge the quota.""" - assert _DEFAULT_MAX_CHUNKS == 1000 +def _quota_response(remaining: int | str | None) -> mock.Mock: + """A mock requests.Response-like object whose ``x-ratelimit-remaining`` + header reflects the given value (None → header absent).""" + resp = mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + resp.headers = {} if remaining is None else {_QUOTA_HEADER: str(remaining)} + return resp -def test_plan_chunks_raises_when_plan_exceeds_max_chunks(): - """A converged plan with more sub-requests than ``max_chunks`` must - raise rather than silently issue them and burn the user's API quota.""" - # 2 dims with long values, each needing many singleton-ish chunks. - # Pick chunk sizes that converge to a plan exceeding a tight cap. - args = { - "dim_a": [f"long-string-value-{i}" for i in range(50)], - "dim_b": [f"another-long-value-{i}" for i in range(50)], - } - # url_limit forces splitting; max_chunks=10 forces the cap to fire. 
- with pytest.raises(RequestTooLarge, match=">10 sub-requests"): - _plan_list_chunks(args, _fake_build, url_limit=250, max_chunks=10) +def test_read_remaining_parses_header(): + assert _read_remaining(_quota_response(42)) == 42 -def test_plan_chunks_respects_default_cap_without_explicit_arg(): - """Default kwarg path: ``max_chunks`` defaults to _DEFAULT_MAX_CHUNKS - when not specified, so direct callers (e.g., other library code) get - the same safety net as the decorator wrapper.""" - args = { - "dim_a": [f"v{i:03d}" for i in range(60)], - "dim_b": [f"v{i:03d}" for i in range(60)], - "dim_c": [f"v{i:03d}" for i in range(60)], - } - # Without explicit max_chunks: defaults to 1000. The plan for these - # inputs would emit > 1000 sub-requests at a tight limit, so should - # raise on default cap alone. - with pytest.raises(RequestTooLarge, match=r">1000 sub-requests"): - _plan_list_chunks(args, _fake_build, url_limit=220) +def test_read_remaining_treats_missing_header_as_unknown(): + """Servers that don't echo a rate-limit header must not synthesize + a remaining count — the wrapper checks for the unknown sentinel + explicitly and skips the post-first-chunk plan-vs-quota check.""" + from dataretrieval.waterdata.chunking import _QUOTA_UNKNOWN + + assert _read_remaining(_quota_response(None)) == _QUOTA_UNKNOWN -def test_multi_value_chunked_cap_override(): - """A decorator-time ``max_chunks`` override lets callers with higher - quotas raise the ceiling without monkeypatching the module constant.""" +def test_read_remaining_treats_malformed_header_as_unknown(): + """Defensive: non-integer header value → unknown sentinel, so the + quota check is skipped rather than tripping on a parse failure.""" + from dataretrieval.waterdata.chunking import _QUOTA_UNKNOWN + + assert _read_remaining(_quota_response("not-a-number")) == _QUOTA_UNKNOWN + + +def test_request_exceeds_quota_after_first_chunk(): + """Plan totals 4 sub-requests. 
The first response reports + ``x-ratelimit-remaining=1`` — only 2 sub-requests fit total + (the one just issued + 1 more). The wrapper must raise + ``RequestExceedsQuota`` *before* issuing chunk 2.""" + calls: list[dict] = [] - @multi_value_chunked(build_request=_fake_build, url_limit=220, max_chunks=10) def fetch(args): - return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + calls.append(args) + return pd.DataFrame({"sites": list(args["sites"])}), _quota_response(1) - with pytest.raises(RequestTooLarge, match=">10 sub-requests"): - fetch( - { - "dim_a": [f"longer-v{i}" for i in range(30)], - "dim_b": [f"longer-v{i}" for i in range(30)], - } - ) + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + with pytest.raises(RequestExceedsQuota) as excinfo: + decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) -def _quota_response(remaining: int | str | None) -> mock.Mock: - """A mock requests.Response-like object whose ``x-ratelimit-remaining`` - header reflects the given value (None → header absent).""" - resp = mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) - resp.headers = {} if remaining is None else {_QUOTA_HEADER: str(remaining)} - return resp + err = excinfo.value + assert err.planned_chunks == 4 + assert err.available == 2 # remaining=1 + the chunk we just spent + assert err.deficit == 2 + assert len(calls) == 1, "only the first chunk should have been issued" -def test_read_remaining_parses_header(): - assert _read_remaining(_quota_response(42)) == 42 +def test_request_exceeds_quota_message_reports_deficit(): + """The error must surface planned / available / deficit so callers + know precisely how far over budget the call is.""" + e = RequestExceedsQuota(planned_chunks=10, available=4, deficit=6) + msg = str(e) + assert "10" in msg + assert "4" in msg + assert "6" in msg -def test_read_remaining_treats_missing_header_as_plenty(): - """Servers that don't echo a rate-limit header must not 
trigger - spurious QuotaExhausted aborts. Sentinel is a large integer so any - plausible safety floor compares cleanly.""" - assert _read_remaining(_quota_response(None)) >= 1_000_000 +def test_request_exceeds_quota_not_raised_when_plan_fits(): + """If ``x-ratelimit-remaining`` is large enough to cover the rest + of the plan, the wrapper proceeds normally.""" + remaining_seq = iter([100, 99, 98, 97]) + + def fetch(args): + return ( + pd.DataFrame({"sites": list(args["sites"])}), + _quota_response(next(remaining_seq)), + ) + + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + df, _ = decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) + assert len(df) == 4 -def test_read_remaining_treats_malformed_header_as_plenty(): - """Defensive: non-integer header value → don't abort.""" - assert _read_remaining(_quota_response("not-a-number")) >= 1_000_000 +def test_no_quota_check_when_header_absent(): + """Without an ``x-ratelimit-remaining`` header the wrapper has no + quota signal and must NOT synthesize a ``RequestExceedsQuota``; + every planned sub-request runs.""" + def fetch(args): + return pd.DataFrame({"sites": list(args["sites"])}), _quota_response(None) -def test_default_quota_safety_floor(): - """Default floor lives at 50 — enough headroom for one final - chunked call's pagination spike without breaching the hourly cap.""" - assert _DEFAULT_QUOTA_SAFETY_FLOOR == 50 + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + df, _ = decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) + assert len(df) == 4 -def test_multi_value_chunked_aborts_when_quota_floor_breached(): - """Mid-call, when ``x-ratelimit-remaining`` drops below the floor, - the chunker must raise ``QuotaExhausted`` *before* issuing the next - sub-request — and the exception must carry the partial frame plus - the chunk offset so callers can resume.""" - # Build a fetch_once whose response 'remaining' header 
decrements - # through 200, 100, 40 (below floor=50), 10. - remaining_seq = iter([200, 100, 40, 10]) - page_idx = iter(range(10)) +def test_quota_exhausted_on_mid_call_429(): + """Mid-call 429 (a concurrent caller drained the bucket) surfaces + as ``QuotaExhausted`` carrying the partial frame plus the chunk + offset so callers can resume after the window resets.""" + state = {"i": 0} def fetch(args): - idx = next(page_idx) + i = state["i"] + state["i"] += 1 + if i == 2: + inner = RuntimeError("429: Too many requests made.") + try: + raise inner + except RuntimeError as cause: + raise RuntimeError( + "Paginated request failed after collecting 0 page(s): " + "429: Too many requests made." + ) from cause return ( - pd.DataFrame( - {"site": list(args["sites"]), "page": [idx] * len(args["sites"])} - ), - _quota_response(next(remaining_seq)), + pd.DataFrame({"i": [i], "sites": list(args["sites"])}), + _quota_response(500), ) - decorated = multi_value_chunked( - build_request=_fake_build, - url_limit=240, - quota_safety_floor=50, - )(fetch) - - # Plan forces 4 sub-requests (4 singleton site chunks). + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) with pytest.raises(QuotaExhausted) as excinfo: - decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) + decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10, "S5" * 10]}) err = excinfo.value - # Aborted after the 3rd sub-request (remaining=40 < floor=50). - assert err.completed_chunks == 3 - assert err.total_chunks == 4 - assert err.remaining == 40 - # Partial frame combines rows from the first three completed sub-requests. 
+ assert err.completed_chunks == 2 # chunks 0 and 1 banked; 429 hit on i=2 + assert err.total_chunks == 5 assert err.partial_frame is not None - assert set(err.partial_frame["page"]) == {0, 1, 2} + assert set(err.partial_frame["i"]) == {0, 1} -def test_multi_value_chunked_does_not_abort_on_last_chunk(): - """Aborting on the final sub-request would be pointless — there's - no 'next' to protect. The check is skipped there. Earlier chunks - stay above the floor; only the last drops below, and we still - return cleanly because the check is skipped at i == total-1.""" - remaining_seq = iter([500, 5]) # only the LAST chunk dips below floor=50 +def test_quota_exhausted_on_first_chunk_429_has_no_partial_response(): + """A 429 on the very first sub-request means no responses are + banked; ``partial_response`` is ``None`` (and ``partial_frame`` is + empty) so callers can branch on that to distinguish "abort before + any data arrived" from "abort after partial collection".""" def fetch(args): - return ( - pd.DataFrame({"site": list(args["sites"])}), - _quota_response(next(remaining_seq)), - ) - - decorated = multi_value_chunked( - build_request=_fake_build, - url_limit=240, - quota_safety_floor=50, - )(fetch) + raise RuntimeError("429: Too many requests made.") - df, _ = decorated({"sites": ["S1" * 10, "S2" * 10]}) # forces 2 chunks - assert len(df) == 2 # no raise — both chunks ran + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + with pytest.raises(QuotaExhausted) as excinfo: + decorated({"sites": ["S1" * 10, "S2" * 10]}) + err = excinfo.value + assert err.completed_chunks == 0 + assert err.partial_response is None + assert err.partial_frame.empty -def test_multi_value_chunked_quota_check_disabled_with_zero_floor(): - """Setting the floor to 0 effectively disables the quota guard — - counter can go to 1 without aborting (since 1 > 0 = floor).""" - remaining_seq = iter([5, 1]) +def test_chunker_passes_through_non_429_runtime_error(): + """A 
non-429 ``RuntimeError`` (e.g. a 500) is not a quota signal; + it must propagate unchanged so callers see the real cause.""" + state = {"i": 0} def fetch(args): + i = state["i"] + state["i"] += 1 + if i == 2: + raise RuntimeError("500: Internal server error.") return ( - pd.DataFrame({"site": list(args["sites"])}), - _quota_response(next(remaining_seq)), + pd.DataFrame({"sites": list(args["sites"])}), + _quota_response(500), ) - decorated = multi_value_chunked( - build_request=_fake_build, - url_limit=240, - quota_safety_floor=0, - )(fetch) - df, _ = decorated({"sites": ["S1" * 10, "S2" * 10]}) - assert len(df) == 2 # no raise + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + with pytest.raises(RuntimeError, match=r"^500:"): + decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10, "S5" * 10]}) -def test_quota_exhausted_message_includes_resume_offset(): - """The error message must point the user at the chunk offset to - resume from, otherwise the partial_frame attribute is a footgun - — the user has no way to know which chunks still need re-issuing.""" +def test_quota_exhausted_message_points_at_resume(): + """The error message must surface the chunk offset and the resume + affordance — ``partial_frame`` is a footgun without it.""" e = QuotaExhausted( partial_frame=pd.DataFrame(), partial_response=mock.Mock(), completed_chunks=7, total_chunks=20, - remaining=12, ) msg = str(e) assert "7/20" in msg - assert "12" in msg - assert "QuotaExhausted" in msg or "resume" in msg + assert "429" in msg + assert "resume" in msg def test_request_bytes_rejects_non_sizable_body(): @@ -702,17 +696,6 @@ def __init__(self, url, body): assert _request_bytes(_Req("ab", bytearray(b"cd"))) == 4 -def test_plan_chunks_rejects_non_positive_max_chunks(): - """``max_chunks < 1`` is meaningless and would silently bypass the - cap on the no-chunking-needed path (initial plan total = 1 and the - in-loop check only runs after a split). 
Reject early.""" - args = {"monitoring_location_id": ["A", "B", "C", "D"]} - with pytest.raises(ValueError, match="max_chunks must be >= 1"): - _plan_list_chunks(args, _fake_build, url_limit=1000, max_chunks=0) - with pytest.raises(ValueError, match="max_chunks must be >= 1"): - _plan_list_chunks(args, _fake_build, url_limit=1000, max_chunks=-5) - - def test_multi_value_chunked_restores_canonical_url(): """When chunking fans out, the aggregated response's ``.url`` must reflect the *user's original* query (rebuilt from the unchunked @@ -781,17 +764,6 @@ def test_chunkable_params_skips_scalar_contract_params(): assert "skip_geometry" not in chunkable -def test_multi_value_chunked_rejects_negative_quota_safety_floor(): - """``quota_safety_floor=0`` is the documented "disable the guard" - sentinel (tested elsewhere). A negative value disables it the same - way but obscures intent — reject at decoration time so the caller - notices, parallel to the ``max_chunks < 1`` check in ``_plan_chunks``.""" - with pytest.raises(ValueError, match="quota_safety_floor must be >= 0"): - multi_value_chunked(build_request=_fake_build, quota_safety_floor=-1) - with pytest.raises(ValueError, match="quota_safety_floor must be >= 0"): - multi_value_chunked(build_request=_fake_build, quota_safety_floor=-50) - - def test_joint_planner_url_construction_long_filter_and_long_sites(): """Realistic stress: 20 datetime OR-clauses combined with 100 USGS site IDs. 
Every sub-request URL built from the plan must fit the From 592c2078f03b673eb6611893d40e93091dcdfe3b Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 19:53:33 -0500 Subject: [PATCH 13/17] test(waterdata): Split chunker tests into tests/waterdata_chunking_test.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chunker section had grown to 30 tests + a fake-builder harness and shared ``_quota_response`` helper — a self-contained subsystem sharing a file with live-API getter tests that have nothing in common with it. The live-API file carries a ``pytest.mark.flaky`` rerun marker tuned for transient upstream errors; the chunker tests are pure unit tests against a fake builder and shouldn't be subject to that retry logic at all. - Move the 30 chunker tests, ``_FakeReq`` / ``_fake_build`` / ``_quota_response`` helpers, and the section comment that introduces the fake-builder model into the new file. - The new file imports only what it needs (``_chunking``, the chunking exports, and ``_construct_api_requests`` for the one real-builder URL-construction test). - Drop now-unused imports from ``waterdata_test.py`` (``itertools``, ``math``, ``quote_plus``). - No content changes to the tests themselves; this is a relocation. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_chunking_test.py | 658 +++++++++++++++++++++++++++++++ tests/waterdata_test.py | 640 ------------------------------ 2 files changed, 658 insertions(+), 640 deletions(-) create mode 100644 tests/waterdata_chunking_test.py diff --git a/tests/waterdata_chunking_test.py b/tests/waterdata_chunking_test.py new file mode 100644 index 00000000..bfcc491f --- /dev/null +++ b/tests/waterdata_chunking_test.py @@ -0,0 +1,658 @@ +"""Tests for ``dataretrieval.waterdata.chunking``. 
+ +These tests exercise the joint planner with a fake ``build_request`` +whose URL byte length is a deterministic function of its inputs: + +- non-chunkable args contribute ``base_bytes``, +- every multi-value list contributes ``len(",".join(map(str, v)))``, +- the ``filter`` kwarg contributes ``len(filter)``. + +That isolates planner behaviour from the real HTTP request builder. +The one exception is +``test_joint_planner_url_construction_long_filter_and_long_sites``, +which uses the real ``_construct_api_requests`` so URL-encoding +surprises (``%``, ``+``, ``/``, ``&``, …) can't pass against a fake +and then fail in production. +""" + +import datetime +import itertools +import math +import sys +from unittest import mock +from urllib.parse import quote_plus + +import pandas as pd +import pytest + +if sys.version_info < (3, 10): + pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) + +from dataretrieval.waterdata import chunking as _chunking +from dataretrieval.waterdata.chunking import ( + _QUOTA_HEADER, + QuotaExhausted, + RequestExceedsQuota, + RequestTooLarge, + _chunkable_params, + _filter_chunk_counts, + _partition_clauses, + _plan_joint, + _plan_list_chunks, + _read_remaining, + multi_value_chunked, +) +from dataretrieval.waterdata.utils import _construct_api_requests + + +class _FakeReq: + __slots__ = ("url", "body") + + def __init__(self, url, body=None): + self.url = url + self.body = body + + +def _fake_build(*, base=200, **kwargs): + """Fake build_request: URL length deterministic in its inputs. + + Mirrors the GET-routed shape: payload goes in the URL, body is None. + List/string values are URL-encoded via ``quote_plus`` so the fake's + byte count matches what the real ``_construct_api_requests`` would + produce; otherwise an alphanumeric test could pass against the fake + but fail in production once values containing ``%``, ``+``, ``/``, + ``&`` etc. (which expand under encoding) reach the same code path. 
+ """ + bytes_ = base + for v in kwargs.values(): + if isinstance(v, (list, tuple)): + bytes_ += len(quote_plus(",".join(map(str, v)))) + elif isinstance(v, str): + bytes_ += len(quote_plus(v)) + return _FakeReq("x" * bytes_) + + +def test_partition_clauses_balanced(): + """k roughly-balanced groups of atoms, with the remainder distributed + across the first groups so no group differs from another by more + than one clause. Returns raw groups (not joined) so the planner can + size candidates without materializing discarded partitions.""" + clauses = ["a='1'", "b='2'", "c='3'", "d='4'", "e='5'"] + assert _partition_clauses(clauses, 1) == [clauses] + # 5 clauses into 2 groups → sizes 3, 2 (remainder lands in the first). + assert _partition_clauses(clauses, 2) == [ + ["a='1'", "b='2'", "c='3'"], + ["d='4'", "e='5'"], + ] + # Singletons. + assert _partition_clauses(clauses, 5) == [[c] for c in clauses] + # k > len(clauses) → singletons (don't synthesize empty groups). + assert _partition_clauses(clauses, 99) == [[c] for c in clauses] + + +def test_filter_chunk_counts_powers_of_two_plus_n(): + """Candidate counts cover the trade-off at powers of two, and always + include ``n_clauses`` itself so the fully-singleton case is always + evaluated even when n isn't a power of two.""" + assert _filter_chunk_counts(0) == [1] + assert _filter_chunk_counts(1) == [1] + assert _filter_chunk_counts(5) == [1, 2, 4, 5] + assert _filter_chunk_counts(16) == [1, 2, 4, 8, 16] + + +def test_plan_chunks_returns_none_when_request_fits(): + """URL under limit → planner returns None, decorator passes through.""" + args = {"monitoring_location_id": ["A", "B", "C"]} + plan = _plan_list_chunks(args, _fake_build, url_limit=8000) + assert plan is None + + +def test_plan_chunks_returns_none_when_no_chunkable_lists(): + """No multi-value lists, however over-limit → planner can't help, returns None + (decorator falls through; server may 414 but that's not chunker's job).""" + args = 
{"monitoring_location_id": "scalar-only"} + plan = _plan_list_chunks(args, _fake_build, url_limit=10) + assert plan is None + + +def test_plan_chunks_greedy_halving_targets_largest_dim(): + """Two dims with one much larger — the heavy dim halves first.""" + args = { + "monitoring_location_id": ["X" * 30, "Y" * 30, "Z" * 30, "W" * 30], + "parameter_code": ["00060", "00065"], + } + # full URL ≈ 200 + 123 + 12 = 335; force splitting heavy dim only. + plan = _plan_list_chunks(args, _fake_build, url_limit=310) + assert len(plan["monitoring_location_id"]) > 1 + assert len(plan["parameter_code"]) == 1 # heavy-dim split was enough + + +def test_plan_chunks_raises_request_too_large_at_singleton_floor(): + """Limit below singleton-per-dim floor (with no chunkable filter to + fall back on) → RequestTooLarge with a clear message.""" + args = {"monitoring_location_id": ["A", "B"]} + # base=200 alone exceeds limit; no relief possible. + with pytest.raises(RequestTooLarge, match="smallest reducible"): + _plan_list_chunks(args, _fake_build, url_limit=100) + + +def test_plan_joint_fans_out_filter_when_list_alone_cannot_fit(): + """When the request can only fit by chunking BOTH the list and the + filter, the joint planner must produce a plan that touches both + dims.""" + clauses = [f"f='{i}'" for i in range(10)] + args = { + "monitoring_location_id": ["A" * 10, "B" * 10, "C" * 10, "D" * 10], + "filter": " OR ".join(clauses), + } + # Singleton list + full filter ≈ 200 + 10 + 86 = 296 (over limit 240). + # Joint planner must split the filter into k >= 2 groups. + plan = _plan_joint(args, _fake_build, url_limit=240) + assert plan is not None + list_plan, filter_chunks = plan + # Either the filter was chunked, the list was chunked, or both. 
+ assert len(filter_chunks) > 1 or any(len(v) > 1 for v in list_plan.values()) + + +def test_plan_joint_minimizes_total_sub_requests(): + """When both dims need shrinking, picking smaller filter chunks + frees URL budget for larger list chunks, and vice versa. The + planner should pick the allocation with the *fewest* total + sub-requests, not just the first allocation that fits.""" + # 16 short clauses (no inflation under URL encoding so the math is + # tractable). Each clause = 5 bytes (e.g. "f='0'"); full filter ≈ + # 16*5 + 15*4 = 140 bytes raw. + clauses = [f"f='{i}'" for i in range(16)] + args = { + "sites": ["S" * 30 for _ in range(8)], # 8 sites @ 30 chars + "filter": " OR ".join(clauses), + } + # Tight limit forces both dims to participate. + plan = _plan_joint(args, _fake_build, url_limit=380) + assert plan is not None + list_plan, filter_chunks = plan + total = max( + 1, math.prod(len(c) for c in list_plan.values()) if list_plan else 1 + ) * len(filter_chunks) + # Plan must beat the bail-floor-style worst case (8 singletons × 16 + # filter chunks = 128 sub-requests) by a healthy margin. + assert total < 128 + + +def test_plan_joint_raises_when_smallest_plan_doesnt_fit(): + """If even the most aggressive joint plan (singleton lists + + singleton filter clauses) still exceeds the limit, surface + RequestTooLarge — there's nothing left to shrink.""" + args = { + "monitoring_location_id": ["A" * 10, "B" * 10], + "filter": "x='12345' OR x='67890'", # min clause is 9 chars + } + # Base 200 + singleton site (10) + singleton clause (9) = 219; limit + # below 219 → no joint plan can fit. 
+ with pytest.raises(RequestTooLarge): + _plan_joint(args, _fake_build, url_limit=210) + + +def test_multi_value_chunked_passes_through_when_url_fits(): + """No planning needed → decorator calls underlying function exactly once + with the original args.""" + calls = [] + + @multi_value_chunked(build_request=_fake_build, url_limit=8000) + def fetch(args): + calls.append(args) + return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + + fetch({"monitoring_location_id": ["A", "B"]}) + assert len(calls) == 1 + assert calls[0]["monitoring_location_id"] == ["A", "B"] + + +def test_multi_value_chunked_emits_cartesian_product(): + """Two chunkable dims, each split into 2 chunks → exactly 4 sub-calls, + each pairing one chunk from each dim.""" + calls = [] + + @multi_value_chunked(build_request=_fake_build, url_limit=240) + def fetch(args): + calls.append({k: v for k, v in args.items() if k in ("sites", "pcodes")}) + return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + + fetch( + { + "sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10], + "pcodes": ["P1" * 10, "P2" * 10, "P3" * 10, "P4" * 10], + } + ) + # Both heavy → planner should split both dims. Confirm a cartesian shape: + # every unique site-chunk pairs with every unique pcode-chunk. + sites_seen = {tuple(c["sites"]) for c in calls} + pcodes_seen = {tuple(c["pcodes"]) for c in calls} + assert len(calls) == len(sites_seen) * len(pcodes_seen) + assert len(sites_seen) > 1 + assert len(pcodes_seen) > 1 + + +def test_multi_value_chunked_emits_3d_cartesian_product(): + """Three chunkable dims, each forced to split → exhaustive cartesian + product across all three. 
Verifies the planner's halving loop handles + N>2 dims uniformly and the wrapper's ``itertools.product`` enumerates + every combination exactly once.""" + calls = [] + + @multi_value_chunked(build_request=_fake_build, url_limit=240) + def fetch(args): + calls.append(tuple(tuple(args[k]) for k in ("sites", "pcodes", "stats"))) + return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + + fetch( + { + "sites": ["S" * 12 + str(i) for i in range(4)], + "pcodes": ["P" * 12 + str(i) for i in range(4)], + "stats": ["T" * 12 + str(i) for i in range(4)], + } + ) + + # Three independent axes — every (site_chunk, pcode_chunk, stat_chunk) + # triple must appear exactly once. Confirm: + sites_seen = {c[0] for c in calls} + pcodes_seen = {c[1] for c in calls} + stats_seen = {c[2] for c in calls} + + assert len(sites_seen) > 1, "sites dim was not split" + assert len(pcodes_seen) > 1, "pcodes dim was not split" + assert len(stats_seen) > 1, "stats dim was not split" + + # Cartesian shape: # sub-calls == product of unique chunks across dims + expected = len(sites_seen) * len(pcodes_seen) * len(stats_seen) + assert len(calls) == expected, ( + f"expected {expected} cartesian-product sub-calls, got {len(calls)}" + ) + # And no triple repeats (exhaustive enumeration, no duplicates). + assert len(set(calls)) == len(calls) + # The chunked values, when unioned across calls, recover the original list. 
+ assert {x for tup in sites_seen for x in tup} == { + "S" * 12 + str(i) for i in range(4) + } + assert {x for tup in pcodes_seen for x in tup} == { + "P" * 12 + str(i) for i in range(4) + } + assert {x for tup in stats_seen for x in tup} == { + "T" * 12 + str(i) for i in range(4) + } + + +def test_multi_value_chunked_lazy_url_limit(monkeypatch): + """``url_limit=None`` → resolve chunking._WATERDATA_URL_BYTE_LIMIT at call + time, so tests that patch the constant affect this decorator too.""" + calls = [] + + @multi_value_chunked(build_request=_fake_build) # url_limit defaults to None + def fetch(args): + calls.append(args) + return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + + monkeypatch.setattr(_chunking, "_WATERDATA_URL_BYTE_LIMIT", 240) + # 4 sites of 10 chars → exceeds 240 → planner splits. + fetch({"sites": ["S" * 10 + str(i) for i in range(4)]}) + assert len(calls) > 1, "patched constant should drive chunking" + + +def _quota_response(remaining: int | str | None) -> mock.Mock: + """A mock requests.Response-like object whose ``x-ratelimit-remaining`` + header reflects the given value (None → header absent).""" + resp = mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + resp.headers = {} if remaining is None else {_QUOTA_HEADER: str(remaining)} + return resp + + +def test_read_remaining_parses_header(): + assert _read_remaining(_quota_response(42)) == 42 + + +def test_read_remaining_treats_missing_header_as_unknown(): + """Servers that don't echo a rate-limit header must not synthesize + a remaining count — the wrapper checks for the unknown sentinel + explicitly and skips the post-first-chunk plan-vs-quota check.""" + from dataretrieval.waterdata.chunking import _QUOTA_UNKNOWN + + assert _read_remaining(_quota_response(None)) == _QUOTA_UNKNOWN + + +def test_read_remaining_treats_malformed_header_as_unknown(): + """Defensive: non-integer header value → unknown sentinel, so the + quota check is skipped rather than tripping on a parse 
failure.""" + from dataretrieval.waterdata.chunking import _QUOTA_UNKNOWN + + assert _read_remaining(_quota_response("not-a-number")) == _QUOTA_UNKNOWN + + +def test_request_exceeds_quota_after_first_chunk(): + """Plan totals 4 sub-requests. The first response reports + ``x-ratelimit-remaining=1`` — only 2 sub-requests fit total + (the one just issued + 1 more). The wrapper must raise + ``RequestExceedsQuota`` *before* issuing chunk 2.""" + calls: list[dict] = [] + + def fetch(args): + calls.append(args) + return pd.DataFrame({"sites": list(args["sites"])}), _quota_response(1) + + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + + with pytest.raises(RequestExceedsQuota) as excinfo: + decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) + + err = excinfo.value + assert err.planned_chunks == 4 + assert err.available == 2 # remaining=1 + the chunk we just spent + assert err.deficit == 2 + assert len(calls) == 1, "only the first chunk should have been issued" + + +def test_request_exceeds_quota_message_reports_deficit(): + """The error must surface planned / available / deficit so callers + know precisely how far over budget the call is.""" + e = RequestExceedsQuota(planned_chunks=10, available=4, deficit=6) + msg = str(e) + assert "10" in msg + assert "4" in msg + assert "6" in msg + + +def test_request_exceeds_quota_not_raised_when_plan_fits(): + """If ``x-ratelimit-remaining`` is large enough to cover the rest + of the plan, the wrapper proceeds normally.""" + remaining_seq = iter([100, 99, 98, 97]) + + def fetch(args): + return ( + pd.DataFrame({"sites": list(args["sites"])}), + _quota_response(next(remaining_seq)), + ) + + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + df, _ = decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) + assert len(df) == 4 + + +def test_no_quota_check_when_header_absent(): + """Without an ``x-ratelimit-remaining`` header the wrapper has no 
+ quota signal and must NOT synthesize a ``RequestExceedsQuota``; + every planned sub-request runs.""" + + def fetch(args): + return pd.DataFrame({"sites": list(args["sites"])}), _quota_response(None) + + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + df, _ = decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) + assert len(df) == 4 + + +def test_quota_exhausted_on_mid_call_429(): + """Mid-call 429 (a concurrent caller drained the bucket) surfaces + as ``QuotaExhausted`` carrying the partial frame plus the chunk + offset so callers can resume after the window resets.""" + state = {"i": 0} + + def fetch(args): + i = state["i"] + state["i"] += 1 + if i == 2: + inner = RuntimeError("429: Too many requests made.") + try: + raise inner + except RuntimeError as cause: + raise RuntimeError( + "Paginated request failed after collecting 0 page(s): " + "429: Too many requests made." + ) from cause + return ( + pd.DataFrame({"i": [i], "sites": list(args["sites"])}), + _quota_response(500), + ) + + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + with pytest.raises(QuotaExhausted) as excinfo: + decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10, "S5" * 10]}) + + err = excinfo.value + assert err.completed_chunks == 2 # chunks 0 and 1 banked; 429 hit on i=2 + assert err.total_chunks == 5 + assert err.partial_frame is not None + assert set(err.partial_frame["i"]) == {0, 1} + + +def test_quota_exhausted_on_first_chunk_429_has_no_partial_response(): + """A 429 on the very first sub-request means no responses are + banked; ``partial_response`` is ``None`` (and ``partial_frame`` is + empty) so callers can branch on that to distinguish "abort before + any data arrived" from "abort after partial collection".""" + + def fetch(args): + raise RuntimeError("429: Too many requests made.") + + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + with 
pytest.raises(QuotaExhausted) as excinfo: + decorated({"sites": ["S1" * 10, "S2" * 10]}) + err = excinfo.value + assert err.completed_chunks == 0 + assert err.partial_response is None + assert err.partial_frame.empty + + +def test_chunker_passes_through_non_429_runtime_error(): + """A non-429 ``RuntimeError`` (e.g. a 500) is not a quota signal; + it must propagate unchanged so callers see the real cause.""" + state = {"i": 0} + + def fetch(args): + i = state["i"] + state["i"] += 1 + if i == 2: + raise RuntimeError("500: Internal server error.") + return ( + pd.DataFrame({"sites": list(args["sites"])}), + _quota_response(500), + ) + + decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) + with pytest.raises(RuntimeError, match=r"^500:"): + decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10, "S5" * 10]}) + + +def test_quota_exhausted_message_points_at_resume(): + """The error message must surface the chunk offset and the resume + affordance — ``partial_frame`` is a footgun without it.""" + e = QuotaExhausted( + partial_frame=pd.DataFrame(), + partial_response=mock.Mock(), + completed_chunks=7, + total_chunks=20, + ) + msg = str(e) + assert "7/20" in msg + assert "429" in msg + assert "resume" in msg + + +def test_request_bytes_rejects_non_sizable_body(): + """``_request_bytes`` requires a deterministic byte count up front; + silently treating an unknown body as zero would under-chunk and let + the request blow past the server's POST-body limit. 
Generators, + iterables, and file-like objects must surface as ``TypeError``.""" + from dataretrieval.waterdata.chunking import _request_bytes + + class _FakeReqWithGenBody: + url = "https://example.com/foo" + body = (b"x" for _ in range(3)) + + with pytest.raises(TypeError, match="cannot size a request body"): + _request_bytes(_FakeReqWithGenBody()) + + +def test_request_bytes_handles_supported_body_types(): + """Sanity-check the supported body types: None (GET), bytes (raw + POST), str (JSON-as-string POST).""" + from dataretrieval.waterdata.chunking import _request_bytes + + class _Req: + def __init__(self, url, body): + self.url = url + self.body = body + + assert _request_bytes(_Req("ab", None)) == 2 + assert _request_bytes(_Req("ab", b"cd")) == 4 + assert _request_bytes(_Req("ab", "cd")) == 4 + assert _request_bytes(_Req("ab", bytearray(b"cd"))) == 4 + + +def test_multi_value_chunked_restores_canonical_url(): + """When chunking fans out, the aggregated response's ``.url`` must + reflect the *user's original* query (rebuilt from the unchunked + args), not the first chunk's URL. Callers logging ``md.url`` for + reproducibility need the full query.""" + sites = ["S" * 10 + str(i) for i in range(4)] + sub_urls: list[str] = [] + + @multi_value_chunked(build_request=_fake_build, url_limit=240) + def fetch(args): + # Each sub-response carries the chunked sub_args's URL, so + # without canonical restoration the first chunk's URL would + # leak through to md.url. + sub_url = _fake_build(**args).url + sub_urls.append(sub_url) + resp = mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) + resp.headers = {} + resp.url = sub_url + return pd.DataFrame(), resp + + _df, md = fetch({"sites": sites}) + + assert len(sub_urls) > 1, "test setup error: chunker didn't fan out" + # md.url must equal the URL the unchunked query would have produced. + assert md.url == _fake_build(sites=sites).url + # And differ from every sub-request's URL (each carries a smaller list). 
+ assert all(md.url != u for u in sub_urls) + # The canonical URL is strictly bigger byte-wise than any sub-request. + assert all(len(md.url) > len(u) for u in sub_urls) + + +def test_chunkable_params_skips_filter_passed_as_list(): + """Defensive guard: ``filter`` is documented as a string. If a caller + mistakenly passes it as a list, the chunker must NOT treat it as a + multi-value dim — comma-joining CQL clauses inside the URL would + produce a malformed filter expression. The joint planner partitions + ``filter`` via top-level ``OR`` splitting in ``_plan_joint``; it must + never be sliced as a list dim.""" + args = { + "monitoring_location_id": ["USGS-A", "USGS-B"], + "filter": ["a='1'", "a='2'"], # malformed input + "filter_lang": ["cql-text", "cql-json"], # ditto + } + chunkable = _chunkable_params(args) + assert "monitoring_location_id" in chunkable + assert "filter" not in chunkable + assert "filter_lang" not in chunkable + + +def test_chunkable_params_skips_scalar_contract_params(): + """``limit`` and ``skip_geometry`` are scalars by contract + (``int | None`` and ``bool | None`` respectively). If a caller smuggles + a list through type erasure (e.g. ``limit=["100","200"]`` after a bad + cast), the chunker must NOT treat it as a multi-value dim. Chunking + ``limit`` would silently fan into separate paginated queries with + different per-request caps; chunking ``skip_geometry`` would emit + sub-requests with conflicting geometry-output settings.""" + args = { + "monitoring_location_id": ["USGS-A", "USGS-B"], + "limit": ["100", "200"], + "skip_geometry": ["true", "false"], + } + chunkable = _chunkable_params(args) + assert "monitoring_location_id" in chunkable + assert "limit" not in chunkable + assert "skip_geometry" not in chunkable + + +def test_joint_planner_url_construction_long_filter_and_long_sites(): + """Realistic stress: 20 datetime OR-clauses combined with 500 USGS + site IDs.
Every sub-request URL built from the plan must fit the + 8000-byte limit, the joint planner must beat the naive "filter at + bail-floor, chunk lists" approach, and the partitioned filters + must union to the user's original filter expression. + + Uses the real ``_construct_api_requests`` builder so the test + catches URL-encoding surprises that a fake builder would miss. + """ + # Realistic AGENCY-ID site format: USGS-{8 digits}. 500 sites is + # enough to force the URL well past the 8000-byte server limit + # without any filter contribution. + sites = [f"USGS-{i:08d}" for i in range(500)] + # 20 datetime equality clauses; each ~30 bytes raw, more after URL + # encoding (the apostrophes and `:` characters expand). + clauses = [ + f"time='2024-{m:02d}-{d:02d}T00:00:00Z'" + for m in range(1, 6) + for d in (1, 8, 15, 22) + ] + assert len(clauses) == 20 + filter_expr = " OR ".join(clauses) + + args = { + "service": "daily", + "monitoring_location_id": sites, + "filter": filter_expr, + } + url_limit = 8000 + + plan = _plan_joint(args, _construct_api_requests, url_limit) + assert plan is not None, "expected joint plan for over-limit request" + list_plan, filter_chunks = plan + + # Walk every sub-request and assert URL fits. + list_keys = list(list_plan) + list_combos = ( + list(itertools.product(*(list_plan[k] for k in list_keys))) + if list_plan + else [()] + ) + over_limit = [] + for combo in list_combos: + for filter_chunk in filter_chunks: + sub_args = dict(args) + sub_args.update(zip(list_keys, combo)) + if filter_chunk is not None: + sub_args["filter"] = filter_chunk + req = _construct_api_requests(**sub_args) + url_len = len(req.url) + (len(req.body) if req.body else 0) + if url_len > url_limit: + over_limit.append((url_len, combo, filter_chunk)) + assert not over_limit, ( + f"{len(over_limit)} sub-request(s) exceeded the URL limit; " + f"first: {over_limit[0]}" + ) + + # Filter partitions must union back to the original (modulo + # whitespace around `OR`). 
Each clause must appear exactly once. + union_clauses: list[str] = [] + for chunk in filter_chunks: + if chunk is None: + continue + union_clauses.extend(c.strip() for c in chunk.split(" OR ")) + assert union_clauses == clauses, ( + "filter partitioning must cover every original clause exactly once" + ) + + # List partitions: every original site appears in exactly one list + # chunk for each dim. Joined sites across all chunks recover the + # input set. + if "monitoring_location_id" in list_plan: + seen = [s for chunk in list_plan["monitoring_location_id"] for s in chunk] + assert sorted(seen) == sorted(sites) + + # Joint plan must beat the bail-floor-style worst case (singleton + # sites × all filter clauses singleton = 500 * 20 = 10000) — joint + # planning of these inputs cuts that by at least a factor of 20. + total = len(list_combos) * len(filter_chunks) + assert total < 500, f"joint plan emitted {total} sub-requests (expected <500)" diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 44e82b6a..18e78594 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -1,10 +1,7 @@ import datetime -import itertools import json -import math import sys from unittest import mock -from urllib.parse import quote_plus import pandas as pd import pytest @@ -13,7 +10,6 @@ if sys.version_info < (3, 10): pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) -from dataretrieval.waterdata import chunking as _chunking from dataretrieval.waterdata import ( get_channel, get_combined_metadata, @@ -32,19 +28,6 @@ get_stats_por, get_time_series_metadata, ) -from dataretrieval.waterdata.chunking import ( - _QUOTA_HEADER, - QuotaExhausted, - RequestExceedsQuota, - RequestTooLarge, - _chunkable_params, - _filter_chunk_counts, - _partition_clauses, - _plan_joint, - _plan_list_chunks, - _read_remaining, - multi_value_chunked, -) from dataretrieval.waterdata.utils import ( _check_monitoring_location_id, _check_profiles, @@ -224,629 +207,6 @@
def test_construct_api_requests_two_element_date_list_becomes_interval(): assert "time=2024-01-01%2F2024-01-31" in req.url -# ----- Multi-value GET-parameter chunker (chunking.py) ---------------------- -# -# These tests exercise the planner with a fake ``build_request`` whose URL -# byte length is a deterministic function of its inputs. Tests below model: -# - non-chunkable args contribute ``base_bytes``, -# - every multi-value list contributes ``len(",".join(map(str, v)))``, -# - the ``filter`` kwarg contributes ``len(filter)``. -# This isolates planner behaviour from the real HTTP request builder. - - -class _FakeReq: - __slots__ = ("url", "body") - - def __init__(self, url, body=None): - self.url = url - self.body = body - - -def _fake_build(*, base=200, **kwargs): - """Fake build_request: URL length deterministic in its inputs. - - Mirrors the GET-routed shape: payload goes in the URL, body is None. - List/string values are URL-encoded via ``quote_plus`` so the fake's - byte count matches what the real ``_construct_api_requests`` would - produce; otherwise an alphanumeric test could pass against the fake - but fail in production once values containing ``%``, ``+``, ``/``, - ``&`` etc. (which expand under encoding) reach the same code path. - """ - bytes_ = base - for v in kwargs.values(): - if isinstance(v, (list, tuple)): - bytes_ += len(quote_plus(",".join(map(str, v)))) - elif isinstance(v, str): - bytes_ += len(quote_plus(v)) - return _FakeReq("x" * bytes_) - - -def test_partition_clauses_balanced(): - """k roughly-balanced groups of atoms, with the remainder distributed - across the first groups so no group differs from another by more - than one clause. Returns raw groups (not joined) so the planner can - size candidates without materializing discarded partitions.""" - clauses = ["a='1'", "b='2'", "c='3'", "d='4'", "e='5'"] - assert _partition_clauses(clauses, 1) == [clauses] - # 5 clauses into 2 groups → sizes 3, 2 (remainder lands in the first). 
- assert _partition_clauses(clauses, 2) == [ - ["a='1'", "b='2'", "c='3'"], - ["d='4'", "e='5'"], - ] - # Singletons. - assert _partition_clauses(clauses, 5) == [[c] for c in clauses] - # k > len(clauses) → singletons (don't synthesize empty groups). - assert _partition_clauses(clauses, 99) == [[c] for c in clauses] - - -def test_filter_chunk_counts_powers_of_two_plus_n(): - """Candidate counts cover the trade-off at powers of two, and always - include ``n_clauses`` itself so the fully-singleton case is always - evaluated even when n isn't a power of two.""" - assert _filter_chunk_counts(0) == [1] - assert _filter_chunk_counts(1) == [1] - assert _filter_chunk_counts(5) == [1, 2, 4, 5] - assert _filter_chunk_counts(16) == [1, 2, 4, 8, 16] - - -def test_plan_chunks_returns_none_when_request_fits(): - """URL under limit → planner returns None, decorator passes through.""" - args = {"monitoring_location_id": ["A", "B", "C"]} - plan = _plan_list_chunks(args, _fake_build, url_limit=8000) - assert plan is None - - -def test_plan_chunks_returns_none_when_no_chunkable_lists(): - """No multi-value lists, however over-limit → planner can't help, returns None - (decorator falls through; server may 414 but that's not chunker's job).""" - args = {"monitoring_location_id": "scalar-only"} - plan = _plan_list_chunks(args, _fake_build, url_limit=10) - assert plan is None - - -def test_plan_chunks_greedy_halving_targets_largest_dim(): - """Two dims with one much larger — the heavy dim halves first.""" - args = { - "monitoring_location_id": ["X" * 30, "Y" * 30, "Z" * 30, "W" * 30], - "parameter_code": ["00060", "00065"], - } - # full URL ≈ 200 + 123 + 12 = 335; force splitting heavy dim only. 
- plan = _plan_list_chunks(args, _fake_build, url_limit=310) - assert len(plan["monitoring_location_id"]) > 1 - assert len(plan["parameter_code"]) == 1 # heavy-dim split was enough - - -def test_plan_chunks_raises_request_too_large_at_singleton_floor(): - """Limit below singleton-per-dim floor (with no chunkable filter to - fall back on) → RequestTooLarge with a clear message.""" - args = {"monitoring_location_id": ["A", "B"]} - # base=200 alone exceeds limit; no relief possible. - with pytest.raises(RequestTooLarge, match="smallest reducible"): - _plan_list_chunks(args, _fake_build, url_limit=100) - - -def test_plan_joint_fans_out_filter_when_list_alone_cannot_fit(): - """When the request can only fit by chunking BOTH the list and the - filter, the joint planner must produce a plan that touches both - dims.""" - clauses = [f"f='{i}'" for i in range(10)] - args = { - "monitoring_location_id": ["A" * 10, "B" * 10, "C" * 10, "D" * 10], - "filter": " OR ".join(clauses), - } - # Singleton list + full filter ≈ 200 + 10 + 86 = 296 (over limit 240). - # Joint planner must split the filter into k >= 2 groups. - plan = _plan_joint(args, _fake_build, url_limit=240) - assert plan is not None - list_plan, filter_chunks = plan - # Either the filter was chunked, the list was chunked, or both. - assert len(filter_chunks) > 1 or any(len(v) > 1 for v in list_plan.values()) - - -def test_plan_joint_minimizes_total_sub_requests(): - """When both dims need shrinking, picking smaller filter chunks - frees URL budget for larger list chunks, and vice versa. The - planner should pick the allocation with the *fewest* total - sub-requests, not just the first allocation that fits.""" - # 16 short clauses (no inflation under URL encoding so the math is - # tractable). Each clause = 5 bytes (e.g. "f='0'"); full filter ≈ - # 16*5 + 15*4 = 140 bytes raw. 
- clauses = [f"f='{i}'" for i in range(16)] - args = { - "sites": ["S" * 30 for _ in range(8)], # 8 sites @ 30 chars - "filter": " OR ".join(clauses), - } - # Tight limit forces both dims to participate. - plan = _plan_joint(args, _fake_build, url_limit=380) - assert plan is not None - list_plan, filter_chunks = plan - total = max( - 1, math.prod(len(c) for c in list_plan.values()) if list_plan else 1 - ) * len(filter_chunks) - # Plan must beat the bail-floor-style worst case (8 singletons × 16 - # filter chunks = 128 sub-requests) by a healthy margin. - assert total < 128 - - -def test_plan_joint_raises_when_smallest_plan_doesnt_fit(): - """If even the most aggressive joint plan (singleton lists + - singleton filter clauses) still exceeds the limit, surface - RequestTooLarge — there's nothing left to shrink.""" - args = { - "monitoring_location_id": ["A" * 10, "B" * 10], - "filter": "x='12345' OR x='67890'", # min clause is 9 chars - } - # Base 200 + singleton site (10) + singleton clause (9) = 219; limit - # below 219 → no joint plan can fit. 
- with pytest.raises(RequestTooLarge): - _plan_joint(args, _fake_build, url_limit=210) - - -def test_multi_value_chunked_passes_through_when_url_fits(): - """No planning needed → decorator calls underlying function exactly once - with the original args.""" - calls = [] - - @multi_value_chunked(build_request=_fake_build, url_limit=8000) - def fetch(args): - calls.append(args) - return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) - - fetch({"monitoring_location_id": ["A", "B"]}) - assert len(calls) == 1 - assert calls[0]["monitoring_location_id"] == ["A", "B"] - - -def test_multi_value_chunked_emits_cartesian_product(): - """Two chunkable dims, each split into 2 chunks → exactly 4 sub-calls, - each pairing one chunk from each dim.""" - calls = [] - - @multi_value_chunked(build_request=_fake_build, url_limit=240) - def fetch(args): - calls.append({k: v for k, v in args.items() if k in ("sites", "pcodes")}) - return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) - - fetch( - { - "sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10], - "pcodes": ["P1" * 10, "P2" * 10, "P3" * 10, "P4" * 10], - } - ) - # Both heavy → planner should split both dims. Confirm a cartesian shape: - # every unique site-chunk pairs with every unique pcode-chunk. - sites_seen = {tuple(c["sites"]) for c in calls} - pcodes_seen = {tuple(c["pcodes"]) for c in calls} - assert len(calls) == len(sites_seen) * len(pcodes_seen) - assert len(sites_seen) > 1 - assert len(pcodes_seen) > 1 - - -def test_multi_value_chunked_emits_3d_cartesian_product(): - """Three chunkable dims, each forced to split → exhaustive cartesian - product across all three. 
Verifies the planner's halving loop handles - N>2 dims uniformly and the wrapper's ``itertools.product`` enumerates - every combination exactly once.""" - calls = [] - - @multi_value_chunked(build_request=_fake_build, url_limit=240) - def fetch(args): - calls.append(tuple(tuple(args[k]) for k in ("sites", "pcodes", "stats"))) - return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) - - fetch( - { - "sites": ["S" * 12 + str(i) for i in range(4)], - "pcodes": ["P" * 12 + str(i) for i in range(4)], - "stats": ["T" * 12 + str(i) for i in range(4)], - } - ) - - # Three independent axes — every (site_chunk, pcode_chunk, stat_chunk) - # triple must appear exactly once. Confirm: - sites_seen = {c[0] for c in calls} - pcodes_seen = {c[1] for c in calls} - stats_seen = {c[2] for c in calls} - - assert len(sites_seen) > 1, "sites dim was not split" - assert len(pcodes_seen) > 1, "pcodes dim was not split" - assert len(stats_seen) > 1, "stats dim was not split" - - # Cartesian shape: # sub-calls == product of unique chunks across dims - expected = len(sites_seen) * len(pcodes_seen) * len(stats_seen) - assert len(calls) == expected, ( - f"expected {expected} cartesian-product sub-calls, got {len(calls)}" - ) - # And no triple repeats (exhaustive enumeration, no duplicates). - assert len(set(calls)) == len(calls) - # The chunked values, when unioned across calls, recover the original list. 
- assert {x for tup in sites_seen for x in tup} == { - "S" * 12 + str(i) for i in range(4) - } - assert {x for tup in pcodes_seen for x in tup} == { - "P" * 12 + str(i) for i in range(4) - } - assert {x for tup in stats_seen for x in tup} == { - "T" * 12 + str(i) for i in range(4) - } - - -def test_multi_value_chunked_lazy_url_limit(monkeypatch): - """``url_limit=None`` → resolve chunking._WATERDATA_URL_BYTE_LIMIT at call - time, so tests that patch the constant affect this decorator too.""" - calls = [] - - @multi_value_chunked(build_request=_fake_build) # url_limit defaults to None - def fetch(args): - calls.append(args) - return pd.DataFrame(), mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) - - monkeypatch.setattr(_chunking, "_WATERDATA_URL_BYTE_LIMIT", 240) - # 4 sites of 10 chars → exceeds 240 → planner splits. - fetch({"sites": ["S" * 10 + str(i) for i in range(4)]}) - assert len(calls) > 1, "patched constant should drive chunking" - - -def _quota_response(remaining: int | str | None) -> mock.Mock: - """A mock requests.Response-like object whose ``x-ratelimit-remaining`` - header reflects the given value (None → header absent).""" - resp = mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) - resp.headers = {} if remaining is None else {_QUOTA_HEADER: str(remaining)} - return resp - - -def test_read_remaining_parses_header(): - assert _read_remaining(_quota_response(42)) == 42 - - -def test_read_remaining_treats_missing_header_as_unknown(): - """Servers that don't echo a rate-limit header must not synthesize - a remaining count — the wrapper checks for the unknown sentinel - explicitly and skips the post-first-chunk plan-vs-quota check.""" - from dataretrieval.waterdata.chunking import _QUOTA_UNKNOWN - - assert _read_remaining(_quota_response(None)) == _QUOTA_UNKNOWN - - -def test_read_remaining_treats_malformed_header_as_unknown(): - """Defensive: non-integer header value → unknown sentinel, so the - quota check is skipped rather than tripping on a parse 
failure.""" - from dataretrieval.waterdata.chunking import _QUOTA_UNKNOWN - - assert _read_remaining(_quota_response("not-a-number")) == _QUOTA_UNKNOWN - - -def test_request_exceeds_quota_after_first_chunk(): - """Plan totals 4 sub-requests. The first response reports - ``x-ratelimit-remaining=1`` — only 2 sub-requests fit total - (the one just issued + 1 more). The wrapper must raise - ``RequestExceedsQuota`` *before* issuing chunk 2.""" - calls: list[dict] = [] - - def fetch(args): - calls.append(args) - return pd.DataFrame({"sites": list(args["sites"])}), _quota_response(1) - - decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) - - with pytest.raises(RequestExceedsQuota) as excinfo: - decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) - - err = excinfo.value - assert err.planned_chunks == 4 - assert err.available == 2 # remaining=1 + the chunk we just spent - assert err.deficit == 2 - assert len(calls) == 1, "only the first chunk should have been issued" - - -def test_request_exceeds_quota_message_reports_deficit(): - """The error must surface planned / available / deficit so callers - know precisely how far over budget the call is.""" - e = RequestExceedsQuota(planned_chunks=10, available=4, deficit=6) - msg = str(e) - assert "10" in msg - assert "4" in msg - assert "6" in msg - - -def test_request_exceeds_quota_not_raised_when_plan_fits(): - """If ``x-ratelimit-remaining`` is large enough to cover the rest - of the plan, the wrapper proceeds normally.""" - remaining_seq = iter([100, 99, 98, 97]) - - def fetch(args): - return ( - pd.DataFrame({"sites": list(args["sites"])}), - _quota_response(next(remaining_seq)), - ) - - decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) - df, _ = decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) - assert len(df) == 4 - - -def test_no_quota_check_when_header_absent(): - """Without an ``x-ratelimit-remaining`` header the wrapper has no 
- quota signal and must NOT synthesize a ``RequestExceedsQuota``; - every planned sub-request runs.""" - - def fetch(args): - return pd.DataFrame({"sites": list(args["sites"])}), _quota_response(None) - - decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) - df, _ = decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10]}) - assert len(df) == 4 - - -def test_quota_exhausted_on_mid_call_429(): - """Mid-call 429 (a concurrent caller drained the bucket) surfaces - as ``QuotaExhausted`` carrying the partial frame plus the chunk - offset so callers can resume after the window resets.""" - state = {"i": 0} - - def fetch(args): - i = state["i"] - state["i"] += 1 - if i == 2: - inner = RuntimeError("429: Too many requests made.") - try: - raise inner - except RuntimeError as cause: - raise RuntimeError( - "Paginated request failed after collecting 0 page(s): " - "429: Too many requests made." - ) from cause - return ( - pd.DataFrame({"i": [i], "sites": list(args["sites"])}), - _quota_response(500), - ) - - decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) - with pytest.raises(QuotaExhausted) as excinfo: - decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10, "S5" * 10]}) - - err = excinfo.value - assert err.completed_chunks == 2 # chunks 0 and 1 banked; 429 hit on i=2 - assert err.total_chunks == 5 - assert err.partial_frame is not None - assert set(err.partial_frame["i"]) == {0, 1} - - -def test_quota_exhausted_on_first_chunk_429_has_no_partial_response(): - """A 429 on the very first sub-request means no responses are - banked; ``partial_response`` is ``None`` (and ``partial_frame`` is - empty) so callers can branch on that to distinguish "abort before - any data arrived" from "abort after partial collection".""" - - def fetch(args): - raise RuntimeError("429: Too many requests made.") - - decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) - with 
pytest.raises(QuotaExhausted) as excinfo: - decorated({"sites": ["S1" * 10, "S2" * 10]}) - err = excinfo.value - assert err.completed_chunks == 0 - assert err.partial_response is None - assert err.partial_frame.empty - - -def test_chunker_passes_through_non_429_runtime_error(): - """A non-429 ``RuntimeError`` (e.g. a 500) is not a quota signal; - it must propagate unchanged so callers see the real cause.""" - state = {"i": 0} - - def fetch(args): - i = state["i"] - state["i"] += 1 - if i == 2: - raise RuntimeError("500: Internal server error.") - return ( - pd.DataFrame({"sites": list(args["sites"])}), - _quota_response(500), - ) - - decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) - with pytest.raises(RuntimeError, match=r"^500:"): - decorated({"sites": ["S1" * 10, "S2" * 10, "S3" * 10, "S4" * 10, "S5" * 10]}) - - -def test_quota_exhausted_message_points_at_resume(): - """The error message must surface the chunk offset and the resume - affordance — ``partial_frame`` is a footgun without it.""" - e = QuotaExhausted( - partial_frame=pd.DataFrame(), - partial_response=mock.Mock(), - completed_chunks=7, - total_chunks=20, - ) - msg = str(e) - assert "7/20" in msg - assert "429" in msg - assert "resume" in msg - - -def test_request_bytes_rejects_non_sizable_body(): - """``_request_bytes`` requires a deterministic byte count up front; - silently treating an unknown body as zero would under-chunk and let - the request blow past the server's POST-body limit. 
Generators, - iterables, and file-like objects must surface as ``TypeError``.""" - from dataretrieval.waterdata.chunking import _request_bytes - - class _FakeReqWithGenBody: - url = "https://example.com/foo" - body = (b"x" for _ in range(3)) - - with pytest.raises(TypeError, match="cannot size a request body"): - _request_bytes(_FakeReqWithGenBody()) - - -def test_request_bytes_handles_supported_body_types(): - """Sanity-check the supported body types: None (GET), bytes (raw - POST), str (JSON-as-string POST).""" - from dataretrieval.waterdata.chunking import _request_bytes - - class _Req: - def __init__(self, url, body): - self.url = url - self.body = body - - assert _request_bytes(_Req("ab", None)) == 2 - assert _request_bytes(_Req("ab", b"cd")) == 4 - assert _request_bytes(_Req("ab", "cd")) == 4 - assert _request_bytes(_Req("ab", bytearray(b"cd"))) == 4 - - -def test_multi_value_chunked_restores_canonical_url(): - """When chunking fans out, the aggregated response's ``.url`` must - reflect the *user's original* query (rebuilt from the unchunked - args), not the first chunk's URL. Callers logging ``md.url`` for - reproducibility need the full query.""" - sites = ["S" * 10 + str(i) for i in range(4)] - sub_urls: list[str] = [] - - @multi_value_chunked(build_request=_fake_build, url_limit=240) - def fetch(args): - # Each sub-response carries the chunked sub_args's URL, so - # without canonical restoration the first chunk's URL would - # leak through to md.url. - sub_url = _fake_build(**args).url - sub_urls.append(sub_url) - resp = mock.Mock(elapsed=datetime.timedelta(seconds=0.1)) - resp.headers = {} - resp.url = sub_url - return pd.DataFrame(), resp - - _df, md = fetch({"sites": sites}) - - assert len(sub_urls) > 1, "test setup error: chunker didn't fan out" - # md.url must equal the URL the unchunked query would have produced. - assert md.url == _fake_build(sites=sites).url - # And differ from every sub-request's URL (each carries a smaller list). 
- assert all(md.url != u for u in sub_urls) - # The canonical URL is strictly bigger byte-wise than any sub-request. - assert all(len(md.url) > len(u) for u in sub_urls) - - -def test_chunkable_params_skips_filter_passed_as_list(): - """Defensive guard: ``filter`` is documented as a string. If a caller - mistakenly passes it as a list, the chunker must NOT treat it as a - multi-value dim — comma-joining CQL clauses inside the URL would - produce a malformed filter expression. The joint planner partitions - ``filter`` via top-level ``OR`` splitting in ``_plan_joint``; it must - never be sliced as a list dim.""" - args = { - "monitoring_location_id": ["USGS-A", "USGS-B"], - "filter": ["a='1'", "a='2'"], # malformed input - "filter_lang": ["cql-text", "cql-json"], # ditto - } - chunkable = _chunkable_params(args) - assert "monitoring_location_id" in chunkable - assert "filter" not in chunkable - assert "filter_lang" not in chunkable - - -def test_chunkable_params_skips_scalar_contract_params(): - """``limit`` and ``skip_geometry`` are scalars by contract - (``int | None`` and ``bool | None`` respectively). If a caller smuggles - a list through type erasure (e.g. ``limit=["100","200"]`` after a bad - cast), the chunker must NOT treat it as a multi-value dim. Chunking - ``limit`` would silently fan into separate paginated queries with - different per-request caps; chunking ``skip_geometry`` would emit - sub-requests with conflicting geometry-output settings.""" - args = { - "monitoring_location_id": ["USGS-A", "USGS-B"], - "limit": ["100", "200"], - "skip_geometry": ["true", "false"], - } - chunkable = _chunkable_params(args) - assert "monitoring_location_id" in chunkable - assert "limit" not in chunkable - assert "skip_geometry" not in chunkable - - -def test_joint_planner_url_construction_long_filter_and_long_sites(): - """Realistic stress: 20 datetime OR-clauses combined with 100 USGS - site IDs. 
Every sub-request URL built from the plan must fit the - 8000-byte limit, the joint planner must beat the naive "filter at - bail-floor, chunk lists" approach, and the partitioned filters - must union to the user's original filter expression. - - Uses the real ``_construct_api_requests`` builder so the test - catches URL-encoding surprises that a fake builder would miss. - """ - # Realistic AGENCY-ID site format: USGS-{8 digits}. 500 sites is - # enough to force the URL well past the 8000-byte server limit - # without any filter contribution. - sites = [f"USGS-{i:08d}" for i in range(500)] - # 20 datetime equality clauses; each ~30 bytes raw, more after URL - # encoding (the apostrophes and `:` characters expand). - clauses = [ - f"time='2024-{m:02d}-{d:02d}T00:00:00Z'" - for m in range(1, 6) - for d in (1, 8, 15, 22) - ] - assert len(clauses) == 20 - filter_expr = " OR ".join(clauses) - - args = { - "service": "daily", - "monitoring_location_id": sites, - "filter": filter_expr, - } - url_limit = 8000 - - plan = _plan_joint(args, _construct_api_requests, url_limit) - assert plan is not None, "expected joint plan for over-limit request" - list_plan, filter_chunks = plan - - # Walk every sub-request and assert URL fits. - list_keys = list(list_plan) - list_combos = ( - list(itertools.product(*(list_plan[k] for k in list_keys))) - if list_plan - else [()] - ) - over_limit = [] - for combo in list_combos: - for filter_chunk in filter_chunks: - sub_args = dict(args) - sub_args.update(zip(list_keys, combo)) - if filter_chunk is not None: - sub_args["filter"] = filter_chunk - req = _construct_api_requests(**sub_args) - url_len = len(req.url) + (len(req.body) if req.body else 0) - if url_len > url_limit: - over_limit.append((url_len, combo, filter_chunk)) - assert not over_limit, ( - f"{len(over_limit)} sub-request(s) exceeded the URL limit; " - f"first: {over_limit[0]}" - ) - - # Filter partitions must union back to the original (modulo - # whitespace around `OR`). 
Each clause must appear exactly once. - union_clauses: list[str] = [] - for chunk in filter_chunks: - if chunk is None: - continue - union_clauses.extend(c.strip() for c in chunk.split(" OR ")) - assert union_clauses == clauses, ( - "filter partitioning must cover every original clause exactly once" - ) - - # List partitions: every original site appears in exactly one list - # chunk for each dim. Joined sites across all chunks recover the - # input set. - if "monitoring_location_id" in list_plan: - seen = [s for chunk in list_plan["monitoring_location_id"] for s in chunk] - assert sorted(seen) == sorted(sites) - - # Joint plan must beat the bail-floor-style worst case (singleton - # sites × all filter clauses singleton = 100 * 20 = 2000) — joint - # planning of these inputs cuts that by at least a factor of 4. - total = len(list_combos) * len(filter_chunks) - assert total < 500, f"joint plan emitted {total} sub-requests (expected <500)" - - def test_samples_results(): """Test results call for proper columns""" df, _ = get_samples( From f615db85dd32c3ff9b84457184c5bd2093605b83 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 20:04:21 -0500 Subject: [PATCH 14/17] =?UTF-8?q?test(waterdata):=20Drop=20tests/stress=5F?= =?UTF-8?q?chunker.py=20=E2=80=94=20invariants=20now=20covered=20by=20unit?= =?UTF-8?q?=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stress test exercised the joint planner across eight scenarios against a fake URL builder, checking URL-bytes-within-limit, exhaustive cartesian-product coverage, and a beats-the-bail-floor quality bar. Every one of those properties is now asserted by ``tests/waterdata_chunking_test.py``, including ``test_joint_planner_url_construction_long_filter_and_long_sites`` which runs the real ``_construct_api_requests`` builder against a 500-site × 20-clause query and verifies each invariant explicitly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/stress_chunker.py | 434 ---------------------------------------- 1 file changed, 434 deletions(-) delete mode 100644 tests/stress_chunker.py diff --git a/tests/stress_chunker.py b/tests/stress_chunker.py deleted file mode 100644 index 728dc89d..00000000 --- a/tests/stress_chunker.py +++ /dev/null @@ -1,434 +0,0 @@ -"""Offline stress test for the joint chunker. - -Runs the multi_value_chunked decorator across a battery of scenarios -with the URL byte limit lowered well below the live API's, captures -every sub-request the decorator emits, and verifies four invariants -per scenario: - - 1. Every sub-request URL stays under the configured ``url_limit`` - (the chunker's primary correctness contract). - 2. Cartesian-product coverage: the union of values seen for each list - dim equals the user's original input list (no data dropped). - 3. Disjointness: no input value appears in more than one chunk for - its dim (no duplicates within a list dim's sub-requests). - 4. Filter-clause coverage: when the filter is OR-chunked, the union - of clauses across sub-requests equals the original clause set in - order (lossless OR-disjunction). - -Plus a greedy-quality check: for the "long filter + long lists" -scenario, the joint planner should comfortably beat the bail-floor-only -baseline (singleton list chunks × one clause per filter chunk — the -worst sensible plan, used as the floor the greedy planner has to beat). - -Run: ``python3 tests/stress_chunker.py``. 
-""" - -from __future__ import annotations - -import sys -from collections.abc import Iterable -from datetime import timedelta -from types import SimpleNamespace -from urllib.parse import parse_qsl, urlsplit - -import pandas as pd - -from dataretrieval.waterdata import chunking -from dataretrieval.waterdata.chunking import ( - multi_value_chunked, -) -from dataretrieval.waterdata.utils import _construct_api_requests - -# ────────────────────────────────────────────────────────────────────── -# Harness -# ────────────────────────────────────────────────────────────────────── - - -def _mock_response() -> SimpleNamespace: - return SimpleNamespace( - elapsed=timedelta(milliseconds=1), - headers={}, - url="https://example.test", - body=None, - ) - - -def run_chunked( - args: dict, url_limit: int -) -> tuple[list[dict], list[int], pd.DataFrame, SimpleNamespace]: - """Execute the chunker end-to-end with a captured ``fetch_once`` and - return (sub_args_seen, url_bytes_seen, combined_frame, combined_response). - - Capturing the actual built-URL byte count inline avoids the test's - post-hoc URL-fit verifier having to rebuild every sub-request from - scratch (one less ``_construct_api_requests`` call per sub-request).""" - sub_args_seen: list[dict] = [] - url_bytes_seen: list[int] = [] - - @multi_value_chunked( - build_request=_construct_api_requests, - url_limit=url_limit, - ) - def fetch_once(args: dict) -> tuple[pd.DataFrame, SimpleNamespace]: - sub_args_seen.append(dict(args)) - req = _construct_api_requests(**args) - url_bytes_seen.append(len(req.url) + (len(req.body) if req.body else 0)) - # Stable per-sub-request id so dedup logic doesn't collapse rows. 
- idx = len(sub_args_seen) - frame = pd.DataFrame({"id": [f"row-{idx}"], "value": [idx]}) - return frame, _mock_response() - - df, response = fetch_once(args) - return sub_args_seen, url_bytes_seen, df, response - - -def _query_params(url: str) -> dict[str, str]: - return dict(parse_qsl(urlsplit(url).query, keep_blank_values=True)) - - -def _bail_floor_baseline(args: dict, url_limit: int) -> int: - """Upper bound: the sub-request count a degenerate plan that - singleton-chunks every list dim AND emits one clause per filter - sub-request would produce. The joint planner should beat this - comfortably — it's the quality bar. - - Counts ``product(per-dim length) × n_clauses`` for the inputs that - are eligible to chunk.""" - list_dims_card = 1 - for k, v in args.items(): - if k in chunking._NEVER_CHUNK: - continue - if isinstance(v, (list, tuple)) and len(v) > 1: - list_dims_card *= len(v) - n_clauses = 1 - if isinstance(args.get("filter"), str): - clauses = chunking._split_top_level_or(args["filter"]) - n_clauses = max(len(clauses), 1) - # baseline = worst-case singletons in every dim - return list_dims_card * n_clauses - - -# ────────────────────────────────────────────────────────────────────── -# Invariant checks -# ────────────────────────────────────────────────────────────────────── - - -def assert_urls_fit(url_bytes_seen: list[int], url_limit: int) -> None: - over = [n for n in url_bytes_seen if n > url_limit] - if over: - raise AssertionError( - f"{len(over)}/{len(url_bytes_seen)} sub-requests exceeded " - f"url_limit={url_limit}; first overrun: {over[0]} bytes" - ) - - -def assert_list_coverage( - sub_args_seen: list[dict], - original: dict, - dim: str, -) -> None: - """For each list dim, the set of *distinct* chunks issued across - sub-requests must union to the original input with no overlap. 
- Cartesian-product duplication of any single dim's chunk across the - OTHER dims' iterations is expected — the chunk identity (the tuple - of values) is what matters, not how many times it's seen.""" - expected = sorted(original[dim]) - distinct_chunks: set[tuple[str, ...]] = set() - for sa in sub_args_seen: - v = sa.get(dim) - if v is None: - continue - chunk = tuple(v) if isinstance(v, (list, tuple)) else (v,) - distinct_chunks.add(chunk) - seen = [val for chunk in distinct_chunks for val in chunk] - if sorted(seen) != expected: - missing = set(expected) - set(seen) - extra = set(seen) - set(expected) - dups = {x for x in seen if seen.count(x) > 1} - raise AssertionError( - f"list dim {dim!r} coverage failed across " - f"{len(distinct_chunks)} distinct chunk(s): " - f"missing={sorted(missing)[:3]} extra={sorted(extra)[:3]} " - f"dups={sorted(dups)[:3]}" - ) - - -def assert_filter_coverage( - sub_args_seen: list[dict], - original_filter: str, -) -> None: - """Each filter sub-chunk appears once per list-dim combination. 
The - *set* of distinct filter values seen, split back into clauses and - concatenated in iteration order, must equal the original clauses - (preserving the planner's order across partitions).""" - expected = chunking._split_top_level_or(original_filter) - distinct_filters: list[str] = [] - for sa in sub_args_seen: - f = sa.get("filter") - if isinstance(f, str) and f not in distinct_filters: - distinct_filters.append(f) - seen: list[str] = [] - for f in distinct_filters: - seen.extend(chunking._split_top_level_or(f)) - if seen != expected: - first_diff = next( - (i for i, (a, b) in enumerate(zip(seen, expected)) if a != b), - min(len(seen), len(expected)), - ) - raise AssertionError( - f"filter clause coverage failed: got {len(seen)} unique-chunk " - f"clauses in chunks, expected {len(expected)} in original order; " - f"first divergence at index {first_diff}" - ) - - -# ────────────────────────────────────────────────────────────────────── -# Scenarios -# ────────────────────────────────────────────────────────────────────── - - -def _ohio_sites(n: int) -> list[str]: - return [f"USGS-{i:08d}" for i in range(n)] - - -def _datetime_clauses(n: int) -> list[str]: - return [ - f"time='2024-{((i // 28) % 12) + 1:02d}-{(i % 28) + 1:02d}T00:00:00Z'" - for i in range(n) - ] - - -def _heavy_clauses(n: int) -> list[str]: - # Each clause contains URL-encoding-heavy characters (`%`, `&`, `+`). 
- return [ - f"value='~~ baseline%2B{i} & next+{i} ~~'" # noqa: E501 - for i in range(n) - ] - - -def scenario( - name: str, - args: dict, - url_limit: int, - *, - list_dims_to_check: Iterable[str] = (), - expected_filter: str | None = None, - require_chunked: bool = True, -) -> dict: - sub_args_seen, url_bytes_seen, df, _ = run_chunked(args, url_limit=url_limit) - n_sub = len(sub_args_seen) - baseline = _bail_floor_baseline(args, url_limit) - - assert_urls_fit(url_bytes_seen, url_limit) - for dim in list_dims_to_check: - assert_list_coverage(sub_args_seen, args, dim) - if expected_filter is not None: - assert_filter_coverage(sub_args_seen, expected_filter) - if require_chunked and n_sub <= 1: - raise AssertionError( - f"{name}: expected fan-out but the chunker produced {n_sub} sub-request(s)" - ) - - return { - "name": name, - "url_limit": url_limit, - "sub_requests": n_sub, - "baseline_worst_case": baseline, - "speedup_vs_baseline": (baseline / n_sub) if n_sub else float("inf"), - "rows_returned": len(df), - } - - -def main() -> int: - print(f"{'=' * 78}\n Joint chunker stress test\n{'=' * 78}\n") - - results: list[dict] = [] - - # A. Long site list, no filter — pure list chunking. - sites = _ohio_sites(500) - results.append( - scenario( - "A. Long sites only", - {"service": "daily", "monitoring_location_id": sites}, - url_limit=2000, - list_dims_to_check=["monitoring_location_id"], - ) - ) - - # B. Long OR filter, no list dims — pure filter chunking. - clauses = _datetime_clauses(50) - filter_expr = " OR ".join(clauses) - results.append( - scenario( - "B. Long filter only", - { - "service": "continuous", - "monitoring_location_id": "USGS-00000001", - "filter": filter_expr, - "filter_lang": "cql-text", - }, - url_limit=1500, - expected_filter=filter_expr, - ) - ) - - # C. Long sites + long filter — joint planning trade-off. - sites = _ohio_sites(200) - clauses = _datetime_clauses(40) - filter_expr = " OR ".join(clauses) - results.append( - scenario( - "C. 
Long sites + long filter", - { - "service": "continuous", - "monitoring_location_id": sites, - "filter": filter_expr, - "filter_lang": "cql-text", - }, - url_limit=2000, - list_dims_to_check=["monitoring_location_id"], - expected_filter=filter_expr, - ) - ) - - # D. 3-D multi-value list chunking. Tight limit forces all three - # dims to participate in the cartesian product. - sites = _ohio_sites(60) - pcodes = [f"{i:05d}" for i in range(20)] - statistics = [f"{i:05d}" for i in range(10)] - results.append( - scenario( - "D. 3-D list cartesian product (sites × pcodes × statistics)", - { - "service": "daily", - "monitoring_location_id": sites, - "parameter_code": pcodes, - "statistic_id": statistics, - }, - url_limit=800, - list_dims_to_check=[ - "monitoring_location_id", - "parameter_code", - "statistic_id", - ], - ) - ) - - # E. Lopsided clause sizes — half short, half long. - short_clauses = [f"a='{i}'" for i in range(30)] - long_clauses = [ - f"description LIKE '%%state aquifer system region {i}%%'" for i in range(30) - ] - filter_expr = " OR ".join(short_clauses + long_clauses) - results.append( - scenario( - "E. Lopsided clause sizes", - { - "service": "continuous", - "monitoring_location_id": "USGS-00000001", - "filter": filter_expr, - "filter_lang": "cql-text", - }, - url_limit=1800, - expected_filter=filter_expr, - ) - ) - - # F. Heavily URL-encoded clauses (special chars expand under quote_plus). - # Tight limit forces the planner to account for encoding inflation. - clauses = _heavy_clauses(60) - filter_expr = " OR ".join(clauses) - results.append( - scenario( - "F. URL-encoding-heavy clauses", - { - "service": "continuous", - "monitoring_location_id": "USGS-00000001", - "filter": filter_expr, - "filter_lang": "cql-text", - }, - url_limit=1500, - expected_filter=filter_expr, - ) - ) - - # G. Very tight limit — singleton chunks expected. - sites = _ohio_sites(100) - results.append( - scenario( - "G. 
Very tight URL limit (singleton sites)", - {"service": "daily", "monitoring_location_id": sites}, - url_limit=420, - list_dims_to_check=["monitoring_location_id"], - ) - ) - - # H. Generous URL limit — no chunking needed. - sites = _ohio_sites(5) - results.append( - scenario( - "H. Generous URL limit (no chunking)", - {"service": "daily", "monitoring_location_id": sites}, - url_limit=10000, - list_dims_to_check=["monitoring_location_id"], - require_chunked=False, - ) - ) - - # I. Greedy-search adapts: same input, sweep url_limit and confirm - # sub-request count is monotonically non-increasing as the limit grows. - # Five points are enough to demonstrate the curve cleanly. - sites = _ohio_sites(100) - clauses = _datetime_clauses(20) - args = { - "service": "continuous", - "monitoring_location_id": sites, - "filter": " OR ".join(clauses), - "filter_lang": "cql-text", - } - counts_by_limit: list[tuple[int, int]] = [] - for limit in [1200, 1800, 2400, 3500, 10000]: - seen, url_bytes_seen, _, _ = run_chunked(args, url_limit=limit) - assert_urls_fit(url_bytes_seen, limit) - assert_list_coverage(seen, args, "monitoring_location_id") - assert_filter_coverage(seen, args["filter"]) - counts_by_limit.append((limit, len(seen))) - - monotone = all( - counts_by_limit[i][1] >= counts_by_limit[i + 1][1] - for i in range(len(counts_by_limit) - 1) - ) - if not monotone: - raise AssertionError( - f"sub-request count is not monotonically non-increasing in " - f"url_limit: {counts_by_limit}" - ) - - # ────────────────────────────────────────────────────────────────── - # Report - # ────────────────────────────────────────────────────────────────── - print(f"{'Scenario':<60}{'limit':>8}{'sub-req':>10}{'baseline':>10}{'×min':>8}") - print("─" * 96) - for r in results: - print( - f" {r['name']:<58}" - f"{r['url_limit']:>8}" - f"{r['sub_requests']:>10}" - f"{r['baseline_worst_case']:>10}" - f"{r['speedup_vs_baseline']:>7.1f}×" - ) - print() - print("Greedy-search adaptation (scenario I — 
sub-req count vs url_limit):") - print(f" {'url_limit':>10} {'sub-req':>8}") - for limit, n in counts_by_limit: - print(f" {limit:>10} {n:>8}") - print() - print( - "✓ all invariants hold across " - f"{len(results) + len(counts_by_limit)} chunked calls" - ) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) From c475452f85a9452dba5fc061d1e93e12f5007d84 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 20:14:54 -0500 Subject: [PATCH 15/17] =?UTF-8?q?refactor(waterdata):=20/simplify=20pass?= =?UTF-8?q?=20=E2=80=94=20typed=20RateLimited=20exception,=20drop=20sentin?= =?UTF-8?q?els?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pulls cleanup items out of /simplify's review: - **Replace string-sniffing 429 detection with a typed exception.** ``_raise_for_non_200`` now raises ``utils.RateLimited`` (a ``RuntimeError`` subclass) for HTTP 429 specifically, plain ``RuntimeError`` for everything else. The chunker's ``_is_429`` walks ``__cause__`` looking for ``isinstance(_, RateLimited)`` instead of ``str(cur).startswith("429")``; ``_paginated_failure_message`` similarly switches to ``isinstance``. Any future reformatting of the 429 message can no longer silently break quota handling. - **Drop the ``_QUOTA_UNKNOWN = -1`` sentinel.** ``_read_remaining`` now returns ``int | None``; the wrapper branches on ``remaining is not None`` instead of comparing against a magic number. - **Collapse ``_finalize_response`` into ``_combine_chunk_responses``.** The two were a 3-line wrapper around a 6-line helper; merging them removes one call and the indirection. The combiner now takes ``canonical_url`` and sets ``.url`` directly. - **Simplify ``_FetchOnce``** from a bound ``TypeVar`` to a plain ``Callable`` alias. The TypeVar required ``# type: ignore`` at the return site anyway and bought no callsite type safety. 
- Update the ``waterdata_test.py`` flaky-rerun regex to match the new ``RateLimited:`` prefix as well as ``RuntimeError:``. Items considered and skipped: - Planning-phase efficiency findings (redundant ``build_request`` probes, ``_filter_candidates`` joining discarded groups, dict-copy cost in ``_iter_sub_args``) — all cold-path next to the actual HTTP round-trips that follow. Premature optimization. - Unifying ``_combine_chunk_responses`` with ``_finalize_paginated_response`` — they take different inputs (one accumulates wall-clock externally, the other builds it from the response list); unification would be churn. - Test docstring trim — separate pass. All 142 chunker / filter / utils unit tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/chunking.py | 83 +++++++++++------------------ dataretrieval/waterdata/utils.py | 22 ++++++-- tests/waterdata_chunking_test.py | 32 +++++------ tests/waterdata_test.py | 2 +- 4 files changed, 65 insertions(+), 74 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index 9c10abd9..123ba269 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -34,7 +34,7 @@ import itertools import math from collections.abc import Callable, Iterator -from typing import Any, TypeVar +from typing import Any from urllib.parse import quote_plus import pandas as pd @@ -87,12 +87,6 @@ # Response header USGS uses to advertise remaining hourly quota. _QUOTA_HEADER = "x-ratelimit-remaining" -# Sentinel returned by ``_read_remaining`` when the response has no -# parseable header. The wrapper treats this as "no quota signal" and -# skips the post-first-chunk plan-vs-quota check, so the value just -# needs to be distinct from any plausible real remaining count. -_QUOTA_UNKNOWN = -1 - # Separators the two chunking dimensions use to compose their atoms # into URL-encoded blobs. 
List dims comma-join values # (``site=USGS-A,USGS-B``); filter dims OR-join clauses @@ -101,11 +95,7 @@ _LIST_SEP = "," _OR_SEP = " OR " - -_FetchOnce = TypeVar( - "_FetchOnce", - bound=Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]], -) +_FetchOnce = Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]] class RequestTooLarge(ValueError): @@ -437,27 +427,34 @@ def _plan_joint( return best[1], best[2] -def _read_remaining(response: requests.Response) -> int: - """Parse ``x-ratelimit-remaining`` from a response. Missing or - malformed header → ``_QUOTA_UNKNOWN`` (treated as "no signal"; the - post-first-chunk quota check skips when this is returned).""" +def _read_remaining(response: requests.Response) -> int | None: + """Parse ``x-ratelimit-remaining`` from a response. Returns ``None`` + when the header is missing or unparseable; the wrapper treats that + as "no quota signal" and skips the post-first-chunk plan check.""" raw = response.headers.get(_QUOTA_HEADER) if raw is None: - return _QUOTA_UNKNOWN + return None try: return int(raw) except (TypeError, ValueError): - return _QUOTA_UNKNOWN + return None def _is_429(exc: BaseException) -> bool: - """True iff ``exc`` or any link in its ``__cause__`` chain is a - ``RuntimeError`` whose message starts with ``429`` — the shape - ``_raise_for_non_200`` produces and ``_walk_pages`` re-wraps for - mid-pagination failures.""" + """True iff ``exc`` or anywhere along its ``__cause__`` chain is a + ``utils.RateLimited``. ``_walk_pages`` re-wraps mid-pagination + failures as ``RuntimeError`` with the original ``RateLimited`` + linked as ``__cause__``, so the chunker has to follow the chain + rather than just ``isinstance(exc, RateLimited)``. + + Lazy import: ``utils`` imports this module to decorate + ``_fetch_once``, so a top-level import would be circular. 
+ """ + from .utils import RateLimited + cur: BaseException | None = exc while cur is not None: - if isinstance(cur, RuntimeError) and str(cur).startswith("429"): + if isinstance(cur, RateLimited): return True cur = cur.__cause__ return False @@ -499,38 +496,22 @@ def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame: def _combine_chunk_responses( - responses: list[requests.Response], + responses: list[requests.Response], canonical_url: str ) -> requests.Response: - """Return one response with the last chunk's headers (for current - rate-limit state) and summed ``elapsed`` (for total wall-clock). - - The returned response's ``.url`` is the *first chunk's* URL, which - only reflects the first slice of the user's query. ``_finalize_response`` - overwrites ``.url`` with the canonical original-query URL so - ``BaseMetadata`` reflects the user's request, not the first sub-chunk. - - Mutates the first response in place: ``.headers`` is replaced with - the last response's headers and ``.elapsed`` is accumulated across - all chunks. Downstream reads ``.url``, ``.headers``, and - ``.elapsed`` (via ``BaseMetadata``). - """ + """Fold per-sub-request responses into one. 
The first response is + mutated in place: ``.headers`` becomes the last response's (so + ``x-ratelimit-remaining`` reflects current state), ``.elapsed`` + accumulates total wall-clock, and ``.url`` is set to the canonical + original-query URL so ``BaseMetadata`` reflects the user's full + request rather than the first sub-chunk.""" head = responses[0] if len(responses) > 1: head.headers = responses[-1].headers head.elapsed = sum((r.elapsed for r in responses[1:]), start=head.elapsed) + head.url = canonical_url return head -def _finalize_response( - responses: list[requests.Response], canonical_url: str -) -> requests.Response: - """Aggregate per-sub-request responses and restore the canonical - URL representing the user's full original query.""" - combined = _combine_chunk_responses(responses) - combined.url = canonical_url - return combined - - def multi_value_chunked( *, build_request: Callable[..., Any], @@ -585,7 +566,7 @@ def wrapper( raise QuotaExhausted( partial_frame=_combine_chunk_frames(frames), partial_response=( - _finalize_response(responses, canonical_url) + _combine_chunk_responses(responses, canonical_url) if responses else None ), @@ -596,7 +577,7 @@ def wrapper( responses.append(response) if i == 0 and total > 1: remaining = _read_remaining(response) - if remaining != _QUOTA_UNKNOWN and remaining < total - 1: + if remaining is not None and remaining < total - 1: raise RequestExceedsQuota( planned_chunks=total, available=remaining + 1, @@ -605,9 +586,9 @@ def wrapper( return ( _combine_chunk_frames(frames), - _finalize_response(responses, canonical_url), + _combine_chunk_responses(responses, canonical_url), ) - return wrapper # type: ignore[return-value] + return wrapper return decorator diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index cf9d554a..67a71b14 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -410,16 +410,30 @@ def _error_body(resp: requests.Response): ) +class 
RateLimited(RuntimeError): + """A USGS Water Data API request was rejected with HTTP 429. Exposed + as a typed exception so callers (notably the multi-value chunker) + can detect rate-limit failures via ``isinstance`` instead of + string-matching error messages.""" + + def _raise_for_non_200(resp: requests.Response) -> None: - """Raise ``RuntimeError(_error_body(resp))`` if ``resp`` is not 200. + """Raise on a non-200 response. ``RateLimited`` for 429 (so the + chunker can branch on it without parsing the message); plain + ``RuntimeError`` for every other failure. Routes through ``_error_body`` (USGS-API-aware: handles 429/403 specially, extracts ``code``/``description`` from JSON error bodies) rather than ``Response.raise_for_status``, which raises ``HTTPError`` with a generic message. """ - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + status = resp.status_code + if status == 200: + return + body = _error_body(resp) + if status == 429: + raise RateLimited(body) + raise RuntimeError(body) def _paginated_failure_message(pages_collected: int, cause: BaseException) -> str: @@ -436,7 +450,7 @@ def _paginated_failure_message(pages_collected: int, cause: BaseException) -> st # message is always informative. 
if not cause_str.strip(): cause_str = type(cause).__name__ - if cause_str.startswith("429"): + if isinstance(cause, RateLimited): action = "wait for the rate-limit window to reset and retry" else: action = "retry the request (possibly after a short backoff)" diff --git a/tests/waterdata_chunking_test.py b/tests/waterdata_chunking_test.py index bfcc491f..6e42d97a 100644 --- a/tests/waterdata_chunking_test.py +++ b/tests/waterdata_chunking_test.py @@ -42,7 +42,7 @@ _read_remaining, multi_value_chunked, ) -from dataretrieval.waterdata.utils import _construct_api_requests +from dataretrieval.waterdata.utils import RateLimited, _construct_api_requests class _FakeReq: @@ -310,21 +310,16 @@ def test_read_remaining_parses_header(): assert _read_remaining(_quota_response(42)) == 42 -def test_read_remaining_treats_missing_header_as_unknown(): - """Servers that don't echo a rate-limit header must not synthesize - a remaining count — the wrapper checks for the unknown sentinel - explicitly and skips the post-first-chunk plan-vs-quota check.""" - from dataretrieval.waterdata.chunking import _QUOTA_UNKNOWN +def test_read_remaining_returns_none_when_header_missing(): + """No rate-limit header → ``None`` so the wrapper can branch on + ``is None`` instead of comparing against a magic sentinel.""" + assert _read_remaining(_quota_response(None)) is None - assert _read_remaining(_quota_response(None)) == _QUOTA_UNKNOWN - -def test_read_remaining_treats_malformed_header_as_unknown(): - """Defensive: non-integer header value → unknown sentinel, so the - quota check is skipped rather than tripping on a parse failure.""" - from dataretrieval.waterdata.chunking import _QUOTA_UNKNOWN - - assert _read_remaining(_quota_response("not-a-number")) == _QUOTA_UNKNOWN +def test_read_remaining_returns_none_on_malformed_header(): + """Non-integer header value → ``None`` so a parse failure doesn't + trip the quota check.""" + assert _read_remaining(_quota_response("not-a-number")) is None def 
test_request_exceeds_quota_after_first_chunk(): @@ -399,10 +394,11 @@ def fetch(args): i = state["i"] state["i"] += 1 if i == 2: - inner = RuntimeError("429: Too many requests made.") + # Match _walk_pages's wrapping: a generic mid-pagination + # RuntimeError with the typed RateLimited as __cause__. try: - raise inner - except RuntimeError as cause: + raise RateLimited("429: Too many requests made.") + except RateLimited as cause: raise RuntimeError( "Paginated request failed after collecting 0 page(s): " "429: Too many requests made." @@ -430,7 +426,7 @@ def test_quota_exhausted_on_first_chunk_429_has_no_partial_response(): any data arrived" from "abort after partial collection".""" def fetch(args): - raise RuntimeError("429: Too many requests made.") + raise RateLimited("429: Too many requests made.") decorated = multi_value_chunked(build_request=_fake_build, url_limit=240)(fetch) with pytest.raises(QuotaExhausted) as excinfo: diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 18e78594..39a9a4ee 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -49,7 +49,7 @@ reruns=2, reruns_delay=5, only_rerun=[ - r"RuntimeError:\s*(?:429|5\d\d):", # _raise_for_non_200 output + r"(?:RateLimited|RuntimeError):\s*(?:429|5\d\d):", # _raise_for_non_200 output r"ConnectionError", r"ReadTimeout|ConnectTimeout|Timeout", ], From 24fd1585976207330bf3e000951ca4e69d2e5509 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 21:10:57 -0500 Subject: [PATCH 16/17] refactor(waterdata): Extract ChunkPlan + _ChunkExecution; unify passthrough into the loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The decorator wrapper was carrying three concerns: bookkeeping for the planning result tuple, the mid-loop 429 → ``QuotaExhausted`` translation, and the post-first-chunk ``RequestExceedsQuota`` check. 
That spilled out as four locals (``list_plan``, ``filter_chunks``, ``canonical_url``, ``total``) and ~30 lines of orchestration, with an early-return branch that handled "no chunking needed" differently from "chunking needed." Extract two classes: - ``ChunkPlan`` (frozen dataclass) — the precomputed strategy. ``ChunkPlan.from_args(args, build_request, url_limit)`` *always* returns a plan. Passthrough requests (no chunking needed) are represented as a trivial plan with ``list_chunks={}``, ``filter_chunks=[None]``, ``total=1``; ``iter_sub_args`` yields the original args unchanged. Owns ``.total`` (property), ``.iter_sub_args()``, ``.execute(fetch_once)``, and a ``from_args`` classmethod that subsumes the old ``_plan_joint``. - ``_ChunkExecution`` — in-flight execution state. Holds the plan, accumulates frames and responses, owns the 429 translation (``issue`` catches and re-raises as ``QuotaExhausted``) and the post-first-chunk quota check. Exposes ``run(fetch_once)`` which drives the whole loop. The wrapper collapses to two lines of meat: limit = _WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit return ChunkPlan.from_args(args, build_request, limit).execute(fetch_once) No more early-return branch — the loop is the same shape whether chunking was needed or not. The trade-off: passthrough requests that hit 429 now surface as ``QuotaExhausted(completed_chunks=0, total_chunks=1, partial_response=None)`` rather than the bare ``RateLimited``. The original ``RateLimited`` is on ``__cause__``; the ``QuotaExhausted`` docstring and message are updated to call out both the multi-chunk and single-shot cases honestly. Removed: ``_plan_joint``, ``_plan_total``, ``_iter_sub_args`` (their roles are now methods on ``ChunkPlan``). 
Tests: ``_plan_joint_*`` tests renamed to ``chunk_plan_*`` and updated to call ``ChunkPlan.from_args`` and check ``.list_chunks`` / ``.filter_chunks`` / ``.total`` attributes; three new passthrough tests added covering the trivial-plan shape and ``iter_sub_args`` yielding original args. All 145 unit tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/chunking.py | 407 ++++++++++++++++------------ tests/waterdata_chunking_test.py | 100 ++++--- 2 files changed, 299 insertions(+), 208 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index 123ba269..cca5f4ed 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -2,10 +2,12 @@ Long multi-value list params (sites, parameter codes, ...) and long top-level-``OR`` CQL filters independently risk overflowing the -server's ~8 KB URL byte limit. ``multi_value_chunked`` plans both -chunking dimensions together, picks the allocation that minimizes -total sub-requests, and iterates the joint cartesian product so every -sub-request URL fits. +server's ~8 KB URL byte limit. ``multi_value_chunked`` builds a +``ChunkPlan`` that plans both chunking dimensions together, picks the +allocation that minimizes total sub-requests, and iterates the joint +cartesian product so every sub-request URL fits. Requests that +already fit get a trivial single-step plan — the wrapper has one code +path either way. Planning: for a filter with ``n_clauses`` top-level OR clauses, try candidate filter chunk counts ``k = 1, 2, 4, ..., n_clauses``. For @@ -14,12 +16,12 @@ then plan list-dim chunking by greedy halving against the remaining budget. Keep the candidate with the smallest ``list_count × k``. -Quota: after the first sub-request the wrapper reads +Quota: after the first sub-request the execution reads ``x-ratelimit-remaining``; if the rest of the plan won't fit, it raises ``RequestExceedsQuota`` before burning more budget. 
A 429 -encountered mid-call (another caller drained the bucket faster than -predicted) surfaces as ``QuotaExhausted`` carrying the partial frame -so callers can resume after the hourly window resets. +on any sub-request surfaces as ``QuotaExhausted`` carrying whatever +chunks completed first, so callers can resume after the hourly window +resets. Dedup: list sub-chunks don't overlap; filter sub-chunks can, so the combiner dedupes by feature ``id``. ``properties``, ``bbox``, date @@ -34,6 +36,7 @@ import itertools import math from collections.abc import Callable, Iterator +from dataclasses import dataclass, field from typing import Any from urllib.parse import quote_plus @@ -138,27 +141,31 @@ def __init__(self, *, planned_chunks: int, available: int, deficit: int) -> None class QuotaExhausted(RuntimeError): - """Raised when a sub-request returns HTTP 429 mid-call. + """Raised when a sub-request returns HTTP 429. - A single-process caller should not normally see this — the post- - first-chunk ``RequestExceedsQuota`` check short-circuits before - burning quota on a plan that won't fit. Arrival here implies a - concurrent consumer drained the rate-limit bucket faster than the - first chunk's ``x-ratelimit-remaining`` predicted. + For a chunked call (``total_chunks > 1``) reached past chunk 0, + the post-first-chunk ``RequestExceedsQuota`` check normally + short-circuits before burning quota on a plan that won't fit; + arrival here typically means a concurrent caller drained the + bucket faster than predicted, and ``partial_frame`` holds what + completed first. + + For a single-shot call (``total_chunks == 1``) or a 429 on the very + first chunk, ``partial_frame`` is empty and ``partial_response`` is + ``None``; the original ``RateLimited`` is on ``__cause__``. Attributes ---------- partial_frame : pd.DataFrame Concatenated, deduplicated result of every sub-request that - completed before the 429. + completed before the 429. Empty when ``completed_chunks == 0``. 
partial_response : requests.Response | None Aggregated response with the canonical URL restored to the - user's full original query. ``None`` only if the 429 hit on - the very first sub-request (no responses banked). + user's full original query. ``None`` when ``completed_chunks == 0``. completed_chunks : int Number of sub-requests successfully completed. total_chunks : int - Total sub-requests in the joint cartesian-product plan. + Total sub-requests in the plan. """ def __init__( @@ -170,10 +177,9 @@ def __init__( total_chunks: int, ) -> None: super().__init__( - f"HTTP 429 mid-call after {completed_chunks}/{total_chunks} " - f"chunks; another caller likely drained the rate-limit " - f"window. Catch QuotaExhausted to access .partial_frame and " - f"resume after the window resets." + f"HTTP 429 after {completed_chunks}/{total_chunks} " + f"sub-requests; catch QuotaExhausted to access .partial_frame " + f"and resume after the rate-limit window resets." ) self.partial_frame = partial_frame self.partial_response = partial_response @@ -227,12 +233,6 @@ def _request_bytes(req: requests.PreparedRequest) -> int: ) -def _plan_total(list_plan: dict[str, list[list[Any]]], n_filter_chunks: int) -> int: - """Total sub-request count for a joint plan.""" - list_count = math.prod((len(c) for c in list_plan.values()), start=1) - return list_count * n_filter_chunks - - def _worst_case_args( base_args: dict[str, Any], list_plan: dict[str, list[list[Any]]] ) -> dict[str, Any]: @@ -360,71 +360,150 @@ def _filter_candidates( yield [_OR_SEP.join(g) for g in groups], _OR_SEP.join(worst) -def _plan_joint( - args: dict[str, Any], - build_request: Callable[..., Any], - url_limit: int, -) -> tuple[dict[str, list[list[Any]]], list[str | None]] | None: - """Joint planner over list dims and filter clauses. 
- - Returns ``(list_plan, filter_chunks)`` where ``list_plan`` is a - (possibly empty) dict of chunked list dims and ``filter_chunks`` is - the list of filter expressions to substitute (length 1 means no - filter chunking — either pass the original filter through or, if - no filter at all, the single element is ``None``). Returns ``None`` - when the request fits as-is or nothing can be chunked. - - Algorithm: enumerate filter chunk counts ``k`` (powers of two up - through ``n_clauses``); for each, partition clauses into ``k`` - count-balanced groups joined by ``OR`` and identify the worst - (longest URL-encoded) group; substitute that as the filter in - ``args`` and plan list chunking with greedy halving against the - remaining budget. Pick the candidate whose ``list_count × k`` is - smallest. - - Raises ``RequestTooLarge`` when no candidate fits (URL still - over-budget at every reducible plan). - """ - filter_expr = args.get("filter") - clauses: list[str] = [] - if _is_chunkable(filter_expr, args.get("filter_lang")): - _check_numeric_filter_pitfall(filter_expr) - clauses = _split_top_level_or(filter_expr) +@dataclass(frozen=True) +class ChunkPlan: + """A precomputed strategy for issuing one user-level request as a + sequence of sub-requests whose URLs each fit ``url_limit``. - if not _chunkable_params(args) and len(clauses) < 2: - return None - if _request_bytes(build_request(**args)) <= url_limit: - return None + ``ChunkPlan.from_args`` always returns a plan, even when no + chunking is needed: the passthrough case is represented by empty + ``list_chunks`` and a single-element ``filter_chunks=[None]`` so + ``total == 1`` and ``iter_sub_args`` yields the original args + unchanged. The wrapper's loop is therefore the same shape whether + chunking was needed or not. 
- best: tuple[int, dict[str, list[list[Any]]], list[str | None]] | None = None - last_error: RequestTooLarge | None = None + Attributes + ---------- + args : dict + The original user-level args this plan was built for. Bound to + the plan so ``iter_sub_args`` is self-contained. + list_chunks : dict[str, list[list]] + Per-param chunkings of multi-value list inputs. Empty in the + passthrough case. + filter_chunks : list[str | None] + Filter sub-expressions to substitute one per sub-request. + ``[None]`` means "leave ``args['filter']`` as-is" (passthrough + and single-clause cases). + canonical_url : str + URL of the full original request, used to overwrite the first + chunk's ``response.url`` so ``BaseMetadata`` reflects the + user's full query. + """ - for filter_chunks, worst_filter in _filter_candidates(clauses, filter_expr): - k = len(filter_chunks) - plan_args = args if worst_filter is None else {**args, "filter": worst_filter} - try: - list_plan = _plan_list_chunks(plan_args, build_request, url_limit) - except RequestTooLarge as exc: - last_error = exc - continue - if list_plan is None: - list_plan = {} - # ``_plan_list_chunks`` returns ``None`` when no list dims are - # chunkable, regardless of whether the request actually fits. - # Filter chunking alone has to close the gap — verify it. - if not list_plan and _request_bytes(build_request(**plan_args)) > url_limit: - continue - total = _plan_total(list_plan, k) - if best is None or total < best[0]: - best = (total, list_plan, filter_chunks) - - if best is None: - raise last_error or RequestTooLarge( - "No filter-chunking candidate produces a fitting plan. " - "Reduce list sizes or simplify the filter." 
+ args: dict[str, Any] + list_chunks: dict[str, list[list[Any]]] + filter_chunks: list[str | None] + canonical_url: str + + @property + def total(self) -> int: + """Sub-request count: product of list-dim chunk counts times + ``len(filter_chunks)``.""" + list_count = math.prod((len(c) for c in self.list_chunks.values()), start=1) + return list_count * len(self.filter_chunks) + + def iter_sub_args(self) -> Iterator[dict[str, Any]]: + """Yield substituted args for each sub-request, in deterministic + order: list-dim cartesian product (dict insertion order) crossed + with filter chunks. Same plan → same sequence — resume is + well-defined.""" + list_combos = ( + itertools.product(*self.list_chunks.values()) if self.list_chunks else [()] ) + for combo in list_combos: + base = {**self.args, **dict(zip(self.list_chunks, combo))} + for filter_chunk in self.filter_chunks: + if filter_chunk is None: + yield base + else: + yield {**base, "filter": filter_chunk} + + def execute(self, fetch_once: _FetchOnce) -> tuple[pd.DataFrame, requests.Response]: + """Run the plan and return the combined result. See + ``_ChunkExecution`` for the per-sub-request semantics.""" + return _ChunkExecution(self).run(fetch_once) + + @classmethod + def from_args( + cls, + args: dict[str, Any], + build_request: Callable[..., Any], + url_limit: int, + ) -> ChunkPlan: + """Compute the cheapest joint plan for ``args``. Returns a + passthrough plan when the request already fits or nothing's + chunkable; raises ``RequestTooLarge`` only when chunking *is* + needed but no candidate plan fits ``url_limit``. + + Algorithm: enumerate filter chunk counts ``k = 1, 2, 4, ..., + n_clauses``; for each, partition clauses into ``k`` + count-balanced groups joined by ``OR`` and pick the worst + (longest URL-encoded) group; substitute that as the filter + and plan list chunking with greedy halving. Keep the candidate + whose ``list_count × k`` is smallest. 
+ """ + initial_request = build_request(**args) + canonical_url = initial_request.url + + filter_expr = args.get("filter") + clauses: list[str] = [] + if _is_chunkable(filter_expr, args.get("filter_lang")): + _check_numeric_filter_pitfall(filter_expr) + clauses = _split_top_level_or(filter_expr) + + # Passthrough: either nothing's chunkable, or the request + # already fits as-is. Trivial plan, single sub-request, original + # args flow through unchanged. + if (not _chunkable_params(args) and len(clauses) < 2) or ( + _request_bytes(initial_request) <= url_limit + ): + return cls( + args=args, + list_chunks={}, + filter_chunks=[None], + canonical_url=canonical_url, + ) + + best: tuple[int, dict[str, list[list[Any]]], list[str | None]] | None = None + last_error: RequestTooLarge | None = None + + for filter_chunks, worst_filter in _filter_candidates(clauses, filter_expr): + plan_args = ( + args if worst_filter is None else {**args, "filter": worst_filter} + ) + try: + list_chunks = _plan_list_chunks(plan_args, build_request, url_limit) + except RequestTooLarge as exc: + last_error = exc + continue + if list_chunks is None: + list_chunks = {} + # ``_plan_list_chunks`` returns ``None`` both when no list + # dims are chunkable AND when the request fits. Filter + # chunking alone has to close the gap — verify before + # committing to a list-empty candidate. + if ( + not list_chunks + and _request_bytes(build_request(**plan_args)) > url_limit + ): + continue + list_count = math.prod((len(c) for c in list_chunks.values()), start=1) + total = list_count * len(filter_chunks) + if best is None or total < best[0]: + best = (total, list_chunks, filter_chunks) + + if best is None: + raise last_error or RequestTooLarge( + "No filter-chunking candidate produces a fitting plan. " + "Reduce list sizes or simplify the filter." 
+ ) - return best[1], best[2] + return cls( + args=args, + list_chunks=best[1], + filter_chunks=best[2], + canonical_url=canonical_url, + ) def _read_remaining(response: requests.Response) -> int | None: @@ -460,21 +539,6 @@ def _is_429(exc: BaseException) -> bool: return False -def _iter_sub_args( - args: dict[str, Any], - list_plan: dict[str, list[list[Any]]], - filter_chunks: list[str | None], -) -> Iterator[dict[str, Any]]: - """Yield the substituted ``args`` for each sub-request in the joint - plan, in deterministic order: list-dim cartesian product (dict - insertion order — Python 3.7+ guarantee) crossed with filter chunks.""" - list_combos = itertools.product(*list_plan.values()) if list_plan else [()] - for combo in list_combos: - base = {**args, **dict(zip(list_plan, combo))} - for filter_chunk in filter_chunks: - yield base if filter_chunk is None else {**base, "filter": filter_chunk} - - def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame: """Concatenate per-chunk frames, dropping empties and deduping by ``id``. @@ -512,31 +576,84 @@ def _combine_chunk_responses( return head +@dataclass +class _ChunkExecution: + """In-flight execution of a ``ChunkPlan``. 
Issues each sub-request, + accumulates frames and responses, translates 429s into + ``QuotaExhausted`` with the partial state captured so far, and + raises ``RequestExceedsQuota`` after the first sub-request if the + rest of the plan won't fit the current rate-limit window.""" + + plan: ChunkPlan + frames: list[pd.DataFrame] = field(default_factory=list) + responses: list[requests.Response] = field(default_factory=list) + + @property + def completed(self) -> int: + return len(self.responses) + + def run(self, fetch_once: _FetchOnce) -> tuple[pd.DataFrame, requests.Response]: + for sub_args in self.plan.iter_sub_args(): + self.issue(fetch_once, sub_args) + return self.finalize() + + def issue(self, fetch_once: _FetchOnce, sub_args: dict[str, Any]) -> None: + try: + frame, response = fetch_once(sub_args) + except RuntimeError as exc: + if not _is_429(exc): + raise + raise self._quota_exhausted() from exc + self.frames.append(frame) + self.responses.append(response) + if self.completed == 1 and self.plan.total > 1: + self._check_quota_after_first() + + def finalize(self) -> tuple[pd.DataFrame, requests.Response]: + return ( + _combine_chunk_frames(self.frames), + _combine_chunk_responses(self.responses, self.plan.canonical_url), + ) + + def _check_quota_after_first(self) -> None: + remaining = _read_remaining(self.responses[0]) + if remaining is None or remaining >= self.plan.total - 1: + return + raise RequestExceedsQuota( + planned_chunks=self.plan.total, + available=remaining + 1, + deficit=self.plan.total - remaining - 1, + ) + + def _quota_exhausted(self) -> QuotaExhausted: + return QuotaExhausted( + partial_frame=_combine_chunk_frames(self.frames), + partial_response=( + _combine_chunk_responses(self.responses, self.plan.canonical_url) + if self.responses + else None + ), + completed_chunks=self.completed, + total_chunks=self.plan.total, + ) + + def multi_value_chunked( *, build_request: Callable[..., Any], url_limit: int | None = None, ) -> 
Callable[[_FetchOnce], _FetchOnce]: - """Decorator that splits multi-value list params and cql-text - filters across sub-requests so each sub-request URL fits - ``url_limit`` bytes (defaults to ``_WATERDATA_URL_BYTE_LIMIT``, - resolved at call time so tests/users that patch the constant affect - this decorator uniformly). ``url_limit`` is enforced against total - request bytes (URL + body); POST routes are conservatively sized. - - After the first sub-request the wrapper reads - ``x-ratelimit-remaining`` from the response. If the remaining plan - won't fit in the current rate-limit window, it raises - ``RequestExceedsQuota`` rather than burn the rest of the quota on a - call that will fail mid-way. A 429 received mid-call (because - another caller drained the bucket faster than predicted) is - re-raised as ``QuotaExhausted`` carrying the partial frame so - callers can resume after the window resets. - - Joint cartesian-product iteration order is deterministic: list dims - in args-insertion order (Python 3.7+ guarantee), with the filter - dim as the innermost loop. Same ``args`` produces the same chunk - sequence — resume is well-defined. + """Decorator that splits multi-value list params and cql-text filters + across sub-requests so each fits ``url_limit`` bytes (default + ``_WATERDATA_URL_BYTE_LIMIT``, resolved at call time so test patches + take effect). Builds a ``ChunkPlan`` and runs it: passthrough + requests are a trivial single-step plan, so there's one code path + either way. + + See ``ChunkPlan`` and ``_ChunkExecution`` for planning and + rate-limit semantics. Exceptions: ``RequestTooLarge`` if no plan + fits, ``RequestExceedsQuota`` if the remaining plan can't fit the + current rate-limit window, ``QuotaExhausted`` on a 429 mid-execution. 
""" def decorator(fetch_once: _FetchOnce) -> _FetchOnce: @@ -545,49 +662,7 @@ def wrapper( args: dict[str, Any], ) -> tuple[pd.DataFrame, requests.Response]: limit = _WATERDATA_URL_BYTE_LIMIT if url_limit is None else url_limit - plan = _plan_joint(args, build_request, limit) - if plan is None: - return fetch_once(args) - - list_plan, filter_chunks = plan - canonical_url = build_request(**args).url - total = _plan_total(list_plan, len(filter_chunks)) - - frames: list[pd.DataFrame] = [] - responses: list[requests.Response] = [] - for i, sub_args in enumerate( - _iter_sub_args(args, list_plan, filter_chunks) - ): - try: - frame, response = fetch_once(sub_args) - except RuntimeError as exc: - if not _is_429(exc): - raise - raise QuotaExhausted( - partial_frame=_combine_chunk_frames(frames), - partial_response=( - _combine_chunk_responses(responses, canonical_url) - if responses - else None - ), - completed_chunks=i, - total_chunks=total, - ) from exc - frames.append(frame) - responses.append(response) - if i == 0 and total > 1: - remaining = _read_remaining(response) - if remaining is not None and remaining < total - 1: - raise RequestExceedsQuota( - planned_chunks=total, - available=remaining + 1, - deficit=total - (remaining + 1), - ) - - return ( - _combine_chunk_frames(frames), - _combine_chunk_responses(responses, canonical_url), - ) + return ChunkPlan.from_args(args, build_request, limit).execute(fetch_once) return wrapper diff --git a/tests/waterdata_chunking_test.py b/tests/waterdata_chunking_test.py index 6e42d97a..220b5643 100644 --- a/tests/waterdata_chunking_test.py +++ b/tests/waterdata_chunking_test.py @@ -16,8 +16,6 @@ """ import datetime -import itertools -import math import sys from unittest import mock from urllib.parse import quote_plus @@ -31,13 +29,13 @@ from dataretrieval.waterdata import chunking as _chunking from dataretrieval.waterdata.chunking import ( _QUOTA_HEADER, + ChunkPlan, QuotaExhausted, RequestExceedsQuota, RequestTooLarge, 
_chunkable_params, _filter_chunk_counts, _partition_clauses, - _plan_joint, _plan_list_chunks, _read_remaining, multi_value_chunked, @@ -136,10 +134,9 @@ def test_plan_chunks_raises_request_too_large_at_singleton_floor(): _plan_list_chunks(args, _fake_build, url_limit=100) -def test_plan_joint_fans_out_filter_when_list_alone_cannot_fit(): +def test_chunk_plan_fans_out_filter_when_list_alone_cannot_fit(): """When the request can only fit by chunking BOTH the list and the - filter, the joint planner must produce a plan that touches both - dims.""" + filter, the plan must touch both dims.""" clauses = [f"f='{i}'" for i in range(10)] args = { "monitoring_location_id": ["A" * 10, "B" * 10, "C" * 10, "D" * 10], @@ -147,14 +144,14 @@ def test_plan_joint_fans_out_filter_when_list_alone_cannot_fit(): } # Singleton list + full filter ≈ 200 + 10 + 86 = 296 (over limit 240). # Joint planner must split the filter into k >= 2 groups. - plan = _plan_joint(args, _fake_build, url_limit=240) - assert plan is not None - list_plan, filter_chunks = plan + plan = ChunkPlan.from_args(args, _fake_build, url_limit=240) # Either the filter was chunked, the list was chunked, or both. - assert len(filter_chunks) > 1 or any(len(v) > 1 for v in list_plan.values()) + assert len(plan.filter_chunks) > 1 or any( + len(v) > 1 for v in plan.list_chunks.values() + ) -def test_plan_joint_minimizes_total_sub_requests(): +def test_chunk_plan_minimizes_total_sub_requests(): """When both dims need shrinking, picking smaller filter chunks frees URL budget for larger list chunks, and vice versa. The planner should pick the allocation with the *fewest* total @@ -168,18 +165,13 @@ def test_plan_joint_minimizes_total_sub_requests(): "filter": " OR ".join(clauses), } # Tight limit forces both dims to participate. 
- plan = _plan_joint(args, _fake_build, url_limit=380) - assert plan is not None - list_plan, filter_chunks = plan - total = max( - 1, math.prod(len(c) for c in list_plan.values()) if list_plan else 1 - ) * len(filter_chunks) + plan = ChunkPlan.from_args(args, _fake_build, url_limit=380) # Plan must beat the bail-floor-style worst case (8 singletons × 16 # filter chunks = 128 sub-requests) by a healthy margin. - assert total < 128 + assert plan.total < 128 -def test_plan_joint_raises_when_smallest_plan_doesnt_fit(): +def test_chunk_plan_raises_when_smallest_plan_doesnt_fit(): """If even the most aggressive joint plan (singleton lists + singleton filter clauses) still exceeds the limit, surface RequestTooLarge — there's nothing left to shrink.""" @@ -190,7 +182,40 @@ def test_plan_joint_raises_when_smallest_plan_doesnt_fit(): # Base 200 + singleton site (10) + singleton clause (9) = 219; limit # below 219 → no joint plan can fit. with pytest.raises(RequestTooLarge): - _plan_joint(args, _fake_build, url_limit=210) + ChunkPlan.from_args(args, _fake_build, url_limit=210) + + +def test_chunk_plan_passthrough_when_request_fits(): + """A request that already fits gets a trivial single-step plan: + no list chunks, ``filter_chunks=[None]``, ``total == 1``. 
The + wrapper still iterates it through one fetch_once call.""" + args = {"monitoring_location_id": ["A", "B", "C"]} + plan = ChunkPlan.from_args(args, _fake_build, url_limit=8000) + assert plan.list_chunks == {} + assert plan.filter_chunks == [None] + assert plan.total == 1 + + +def test_chunk_plan_passthrough_when_nothing_chunkable(): + """A request with no multi-value lists and no top-level-OR filter + is also a passthrough plan, even if the URL is technically over + the limit (the server may 414, but the chunker has nothing to + split).""" + args = {"monitoring_location_id": "scalar-only"} + plan = ChunkPlan.from_args(args, _fake_build, url_limit=10) + assert plan.list_chunks == {} + assert plan.filter_chunks == [None] + assert plan.total == 1 + + +def test_chunk_plan_iter_sub_args_passthrough_yields_original_args_once(): + """The passthrough plan's ``iter_sub_args`` yields exactly one + sub-args dict equal to the original args (modulo dict identity).""" + args = {"monitoring_location_id": ["A", "B", "C"], "limit": 100} + plan = ChunkPlan.from_args(args, _fake_build, url_limit=8000) + subs = list(plan.iter_sub_args()) + assert len(subs) == 1 + assert subs[0] == args def test_multi_value_chunked_passes_through_when_url_fits(): @@ -602,28 +627,18 @@ def test_joint_planner_url_construction_long_filter_and_long_sites(): } url_limit = 8000 - plan = _plan_joint(args, _construct_api_requests, url_limit) - assert plan is not None, "expected joint plan for over-limit request" - list_plan, filter_chunks = plan + plan = ChunkPlan.from_args(args, _construct_api_requests, url_limit) + assert plan.total > 1, "expected non-trivial plan for over-limit request" + list_plan = plan.list_chunks + filter_chunks = plan.filter_chunks - # Walk every sub-request and assert URL fits. 
- list_keys = list(list_plan) - list_combos = ( - list(itertools.product(*(list_plan[k] for k in list_keys))) - if list_plan - else [()] - ) + # Walk every sub-request the plan would issue and assert URL fits. over_limit = [] - for combo in list_combos: - for filter_chunk in filter_chunks: - sub_args = dict(args) - sub_args.update(zip(list_keys, combo)) - if filter_chunk is not None: - sub_args["filter"] = filter_chunk - req = _construct_api_requests(**sub_args) - url_len = len(req.url) + (len(req.body) if req.body else 0) - if url_len > url_limit: - over_limit.append((url_len, combo, filter_chunk)) + for sub_args in plan.iter_sub_args(): + req = _construct_api_requests(**sub_args) + url_len = len(req.url) + (len(req.body) if req.body else 0) + if url_len > url_limit: + over_limit.append((url_len, sub_args)) assert not over_limit, ( f"{len(over_limit)} sub-request(s) exceeded the URL limit; " f"first: {over_limit[0]}" @@ -650,5 +665,6 @@ def test_joint_planner_url_construction_long_filter_and_long_sites(): # Joint plan must beat the bail-floor-style worst case (singleton # sites × all filter clauses singleton = 100 * 20 = 2000) — joint # planning of these inputs cuts that by at least a factor of 4. 
- total = len(list_combos) * len(filter_chunks) - assert total < 500, f"joint plan emitted {total} sub-requests (expected <500)" + assert plan.total < 500, ( + f"joint plan emitted {plan.total} sub-requests (expected <500)" + ) From 5d931fa3413176bfde13867adf20784c064f2592 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 18 May 2026 21:18:06 -0500 Subject: [PATCH 17/17] =?UTF-8?q?refactor(waterdata):=20/simplify=20pass?= =?UTF-8?q?=20on=20ChunkPlan=20=E2=80=94=20skip=20work=20on=20the=20passth?= =?UTF-8?q?rough=20hot=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aggregated and applied the meaningful items from the review: - **Trivial-passthrough skips ``build_request`` entirely.** Previously ``ChunkPlan.from_args`` called ``build_request(**args)`` up front to capture ``canonical_url`` and to size the request, even when there was nothing to chunk. Reorder so the "no multi-value lists, no top-level-OR filter" check runs first; on that path the plan is built with ``canonical_url=None`` and no request preparation. The ~20-80 µs ``Request.prepare()`` overhead is removed from the dominant Water Data call shape. ``_combine_chunk_responses`` now treats ``canonical_url=None`` as "skip the override" — fine because ``_walk_pages`` already pinned the response's ``.url`` to the canonical request URL. - **``iter_sub_args`` short-circuits the trivial-passthrough case** — yields ``self.args`` directly instead of allocating a dict copy and spinning through an empty cartesian product. - **``_ChunkExecution`` now owns ``fetch_once``** instead of receiving it per-call on ``issue()``. ``fetch_once`` is constant across the loop, so threading it through every call was needless. ``issue(sub_args)`` and ``run()`` are now one- and zero-arg respectively. Converted from ``@dataclass`` to a plain class (the auto-generated repr/eq weren't earning their keep).
The ``completed`` property was inlined to its one remaining caller as ``len(self.responses)``. - **Hoist ``_FILTER_KEY = "filter"``** so the planner and ``iter_sub_args`` substitute on the same constant, matching the existing ``_LIST_SEP``/``_OR_SEP``/``_QUOTA_HEADER`` convention. - **``utils._next_req_url``** now references ``chunking._QUOTA_HEADER`` instead of repeating the ``"x-ratelimit-remaining"`` literal. - Stale ``_NEVER_CHUNK`` comment that pointed at the removed ``_plan_joint`` now points at ``ChunkPlan.from_args``. Items considered and skipped: - ``ChunkPlan.canonical_url`` derivable from ``args`` — keeping it avoids the extra ``build_request`` call on every ``finalize``. - ``_plan_list_chunks`` dual-meaning ``None`` return — fixing it would touch unrelated callers; the current ``continue`` guard is clearly commented. - ``args: dict`` mutability on the frozen dataclass — internal use only; ``MappingProxyType`` adds churn without value. - ``ChunkPlan.from_args`` length / search-loop extraction — the search loop reads well in place; pulling it out would only push state through a helper signature. - ``_count_subrequests`` helper to DRY the ``list_count * len(...)`` math — used in two adjacent places; not worth a helper. All 145 unit tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/chunking.py | 95 ++++++++++++++++++----------- dataretrieval/waterdata/utils.py | 2 +- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/dataretrieval/waterdata/chunking.py b/dataretrieval/waterdata/chunking.py index cca5f4ed..b33a95ed 100644 --- a/dataretrieval/waterdata/chunking.py +++ b/dataretrieval/waterdata/chunking.py @@ -36,7 +36,7 @@ import itertools import math from collections.abc import Callable, Iterator -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any from urllib.parse import quote_plus @@ -65,8 +65,8 @@ # - structured: ``bbox`` is a fixed 4-element coord tuple. 
# - intervals: date/time ranges are not enumerable sets. # - handled elsewhere: ``filter`` gets OR-clause partitioning in -# ``_plan_joint``; comma-joining CQL clauses -# would emit malformed expressions. +# ``ChunkPlan.from_args``; comma-joining CQL +# clauses would emit malformed expressions. # - scalar by contract: ``limit``, ``skip_geometry``, ``filter_lang`` # — a list value would be a type-erasure smuggle. _NEVER_CHUNK = frozenset( @@ -98,6 +98,10 @@ _LIST_SEP = "," _OR_SEP = " OR " +# Args-dict key for the CQL filter. Hoisted so the planner and the +# wrapper substitute on the same key. +_FILTER_KEY = "filter" + _FetchOnce = Callable[[dict[str, Any]], tuple[pd.DataFrame, requests.Response]] @@ -384,16 +388,19 @@ class ChunkPlan: Filter sub-expressions to substitute one per sub-request. ``[None]`` means "leave ``args['filter']`` as-is" (passthrough and single-clause cases). - canonical_url : str + canonical_url : str | None URL of the full original request, used to overwrite the first chunk's ``response.url`` so ``BaseMetadata`` reflects the - user's full query. + user's full query. ``None`` on the nothing-to-chunk passthrough + path: ``fetch_once``'s response already carries the canonical + URL, so the override is skipped to avoid an extra + ``build_request`` call on the hot path. """ args: dict[str, Any] list_chunks: dict[str, list[list[Any]]] filter_chunks: list[str | None] - canonical_url: str + canonical_url: str | None @property def total(self) -> int: @@ -407,6 +414,12 @@ def iter_sub_args(self) -> Iterator[dict[str, Any]]: order: list-dim cartesian product (dict insertion order) crossed with filter chunks. Same plan → same sequence — resume is well-defined.""" + # Trivial-passthrough fast path: nothing to substitute, just + # yield the original args. Skips a wasted dict copy on the + # most common Water Data call shape. 
+ if not self.list_chunks and self.filter_chunks == [None]: + yield self.args + return list_combos = ( itertools.product(*self.list_chunks.values()) if self.list_chunks else [()] ) @@ -416,12 +429,12 @@ def iter_sub_args(self) -> Iterator[dict[str, Any]]: if filter_chunk is None: yield base else: - yield {**base, "filter": filter_chunk} + yield {**base, _FILTER_KEY: filter_chunk} def execute(self, fetch_once: _FetchOnce) -> tuple[pd.DataFrame, requests.Response]: """Run the plan and return the combined result. See ``_ChunkExecution`` for the per-sub-request semantics.""" - return _ChunkExecution(self).run(fetch_once) + return _ChunkExecution(self, fetch_once).run() @classmethod def from_args( @@ -442,21 +455,29 @@ def from_args( and plan list chunking with greedy halving. Keep the candidate whose ``list_count × k`` is smallest. """ - initial_request = build_request(**args) - canonical_url = initial_request.url - - filter_expr = args.get("filter") + filter_expr = args.get(_FILTER_KEY) clauses: list[str] = [] if _is_chunkable(filter_expr, args.get("filter_lang")): _check_numeric_filter_pitfall(filter_expr) clauses = _split_top_level_or(filter_expr) - # Passthrough: either nothing's chunkable, or the request - # already fits as-is. Trivial plan, single sub-request, original - # args flow through unchanged. - if (not _chunkable_params(args) and len(clauses) < 2) or ( - _request_bytes(initial_request) <= url_limit - ): + # Trivial passthrough: no multi-value lists and no top-level-OR + # filter to split, so chunking has no leverage. Skip the + # ``build_request`` call entirely — ``fetch_once``'s response + # will carry the canonical URL already (set by + # ``_finalize_paginated_response``), so the wrapper can elide + # the override. This is the common Water Data call shape, so + # the saved request prep is worth a small branch here. 
+ if not _chunkable_params(args) and len(clauses) < 2: + return cls( + args=args, list_chunks={}, filter_chunks=[None], canonical_url=None + ) + + initial_request = build_request(**args) + canonical_url = initial_request.url + + # Already-fits passthrough: chunking is possible but unnecessary. + if _request_bytes(initial_request) <= url_limit: return cls( args=args, list_chunks={}, @@ -469,7 +490,7 @@ def from_args( for filter_chunks, worst_filter in _filter_candidates(clauses, filter_expr): plan_args = ( - args if worst_filter is None else {**args, "filter": worst_filter} + args if worst_filter is None else {**args, _FILTER_KEY: worst_filter} ) try: list_chunks = _plan_list_chunks(plan_args, build_request, url_limit) @@ -560,23 +581,27 @@ def _combine_chunk_frames(frames: list[pd.DataFrame]) -> pd.DataFrame: def _combine_chunk_responses( - responses: list[requests.Response], canonical_url: str + responses: list[requests.Response], canonical_url: str | None ) -> requests.Response: """Fold per-sub-request responses into one. The first response is mutated in place: ``.headers`` becomes the last response's (so ``x-ratelimit-remaining`` reflects current state), ``.elapsed`` accumulates total wall-clock, and ``.url`` is set to the canonical original-query URL so ``BaseMetadata`` reflects the user's full - request rather than the first sub-chunk.""" + request rather than the first sub-chunk. + + ``canonical_url=None`` skips the URL override — used by the + trivial-passthrough path where ``fetch_once`` already returns a + response whose ``.url`` is the original-query URL.""" head = responses[0] if len(responses) > 1: head.headers = responses[-1].headers head.elapsed = sum((r.elapsed for r in responses[1:]), start=head.elapsed) - head.url = canonical_url + if canonical_url is not None: + head.url = canonical_url return head -@dataclass class _ChunkExecution: """In-flight execution of a ``ChunkPlan``. 
Issues each sub-request, accumulates frames and responses, translates 429s into @@ -584,29 +609,27 @@ class _ChunkExecution: raises ``RequestExceedsQuota`` after the first sub-request if the rest of the plan won't fit the current rate-limit window.""" - plan: ChunkPlan - frames: list[pd.DataFrame] = field(default_factory=list) - responses: list[requests.Response] = field(default_factory=list) - - @property - def completed(self) -> int: - return len(self.responses) + def __init__(self, plan: ChunkPlan, fetch_once: _FetchOnce) -> None: + self.plan = plan + self.fetch_once = fetch_once + self.frames: list[pd.DataFrame] = [] + self.responses: list[requests.Response] = [] - def run(self, fetch_once: _FetchOnce) -> tuple[pd.DataFrame, requests.Response]: + def run(self) -> tuple[pd.DataFrame, requests.Response]: for sub_args in self.plan.iter_sub_args(): - self.issue(fetch_once, sub_args) + self.issue(sub_args) return self.finalize() - def issue(self, fetch_once: _FetchOnce, sub_args: dict[str, Any]) -> None: + def issue(self, sub_args: dict[str, Any]) -> None: try: - frame, response = fetch_once(sub_args) + frame, response = self.fetch_once(sub_args) except RuntimeError as exc: if not _is_429(exc): raise raise self._quota_exhausted() from exc self.frames.append(frame) self.responses.append(response) - if self.completed == 1 and self.plan.total > 1: + if len(self.responses) == 1 and self.plan.total > 1: self._check_quota_after_first() def finalize(self) -> tuple[pd.DataFrame, requests.Response]: @@ -633,7 +656,7 @@ def _quota_exhausted(self) -> QuotaExhausted: if self.responses else None ), - completed_chunks=self.completed, + completed_chunks=len(self.responses), total_chunks=self.plan.total, ) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 67a71b14..9af20d66 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -599,7 +599,7 @@ def _next_req_url(resp: requests.Response) -> str | None: if 
os.getenv("API_USGS_PAT", ""): logger.info( "Remaining requests this hour: %s", - header_info.get("x-ratelimit-remaining", ""), + header_info.get(chunking._QUOTA_HEADER, ""), ) for link in body.get("links", []): if link.get("rel") == "next":