From 21b95eb2364d9866e6a8c2801b71fa0bc2b06038 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 16:53:00 -0500 Subject: [PATCH 1/6] feat(waterdata): drop hash-valued ID columns by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The waterdata OGC services previously returned per-record UUID columns (``daily_id``, ``continuous_id``, ``peak_id``, …) plus secondary hash columns (``time_series_id``, ``parent_time_series_id``, ``field_visit_id``, ``field_measurements_series_id``) in every response. These IDs are unstable across record refreshes and not human-meaningful — stable identifiers like ``monitoring_location_id`` (AGENCY-ID format), ``parameter_code``, ``statistic_id`` and ``time`` are sufficient to pin a row. Drop the hash columns by default and add ``include_hash_ids: bool = False`` to every OGC ``get_*`` function for opt-in. Implementation trims server-side via the OGC ``properties=`` query parameter (cached per service from one queryables fetch) so the payload itself is smaller, with a client-side drop as a safety net. ``monitoring_location_id`` and other AGENCY-ID / code-style identifiers are unaffected. Offline benchmark on a synthetic 30,000-row payload (mirrors the on-wire shape and per-row size of a 1-year ``get_continuous`` query): - Server payload: 14,310,081 → 10,230,081 bytes (28.5% smaller) - DataFrame memory: 14.2 MB → 9.4 MB (33.5% smaller) - Peak traced memory: 94.1 MB → 73.9 MB (21.5% smaller) - Local parse + DataFrame construction: 1.05s → 0.94s (10.8% faster) Network savings stack on top of the local speedup. For very small queries (≲1k rows) the one-time queryables fetch overhead can dominate the savings; large queries are the target. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/_fixtures/.gitignore | 4 + benchmarks/_fixtures/synthesize.py | 123 +++++++++ benchmarks/bench_include_hash_ids.py | 356 +++++++++++++++++++++++++++ benchmarks/results_offline.txt | 22 ++ dataretrieval/waterdata/api.py | 154 ++++++++++++ dataretrieval/waterdata/utils.py | 170 ++++++++++++- tests/waterdata_test.py | 71 +++++- tests/waterdata_utils_test.py | 75 ++++++ 8 files changed, 952 insertions(+), 23 deletions(-) create mode 100644 benchmarks/_fixtures/.gitignore create mode 100644 benchmarks/_fixtures/synthesize.py create mode 100644 benchmarks/bench_include_hash_ids.py create mode 100644 benchmarks/results_offline.txt diff --git a/benchmarks/_fixtures/.gitignore b/benchmarks/_fixtures/.gitignore new file mode 100644 index 00000000..5efdd79b --- /dev/null +++ b/benchmarks/_fixtures/.gitignore @@ -0,0 +1,4 @@ +# Synthesized or captured response payloads are large (≈10–15 MB each) +# and trivially regenerated. Keep them out of the repo; the script that +# produces them is committed. +*.json diff --git a/benchmarks/_fixtures/synthesize.py b/benchmarks/_fixtures/synthesize.py new file mode 100644 index 00000000..72e07ffb --- /dev/null +++ b/benchmarks/_fixtures/synthesize.py @@ -0,0 +1,123 @@ +"""Generate synthetic OGC API payloads for the offline benchmark. + +We can't always reach the live USGS API (rate limits, no token), but +the local cost of an ``include_hash_ids=False`` vs ``True`` call is +dominated by: + - JSON parsing (``response.json()``) + - ``pandas.json_normalize`` over the features list + - DataFrame column allocation + +All three scale with payload bytes and feature count. A synthetic +payload that mirrors the real wire format and the real per-row column +shape is sufficient to measure them. +""" + +from __future__ import annotations + +import json +import uuid +from datetime import datetime, timedelta, timezone +from pathlib import Path + +ROWS = 30000 # ~1 year of 15-minute continuous data +HERE = Path(__file__).parent + + +def _row(i: int) -> dict: + ts = datetime(2024, 1, 1, tzinfo=timezone.utc) + timedelta(minutes=15 * i) + return { + "id": str(uuid.uuid4()), + "time_series_id": uuid.uuid4().hex, + "monitoring_location_id": "USGS-02238500", + "parameter_code": "00060", + "statistic_id": "00011", + "time": ts.strftime("%Y-%m-%dT%H:%M:%S+00:00"), + "value": f"{100.0 + 0.01 * (i % 1000):.2f}", + "unit_of_measure": "ft^3/s", + "approval_status": "Approved", + "qualifier": None, + "last_modified": "2026-05-01T00:00:00+00:00", + } + + +def _feature(props: dict) -> dict: + return { + "type": "Feature", + "properties": props, + "id": props.get("id", ""), + "geometry": None, + } + + +HASH_COLS = {"id", "time_series_id"} + + +def build_full() -> dict: + features = [_feature(_row(i)) for i in range(ROWS)] + return { + "type": "FeatureCollection", + "features": features, + "numberReturned": ROWS, + "links": [], + } + + +def build_trimmed() -> dict: + features = [] + for i in range(ROWS): + props = {k: v for k, v in _row(i).items() if k not in HASH_COLS} + features.append( + { + "type": "Feature", + "properties": props, + "id": "", + "geometry": None, + } + ) + return { + "type": "FeatureCollection", + "features": features, + "numberReturned": ROWS, + "links": [], + } + + +def build_queryables() -> dict: + return { + "properties": { + "geometry": {}, + "id": {}, + "time_series_id": {}, + "monitoring_location_id": {}, + "parameter_code": {}, + "statistic_id": {}, + "time": {}, + "value": {}, + "unit_of_measure": {}, + "approval_status": {}, + "qualifier": {}, + "last_modified": {}, + } + } + + +def main() -> None: + HERE.mkdir(exist_ok=True) + full = build_full() + trimmed = build_trimmed() + queryables = build_queryables() + + (HERE / "continuous_full.json").write_text(json.dumps(full)) + (HERE / "continuous_trimmed.json").write_text(json.dumps(trimmed)) + (HERE / "continuous_queryables.json").write_text(json.dumps(queryables)) + + full_size = (HERE / "continuous_full.json").stat().st_size + trim_size = (HERE / "continuous_trimmed.json").stat().st_size + pct = 100 * (full_size - trim_size) / full_size + print(f"rows: {ROWS:,}") + print(f"full: {full_size:>12,} bytes") + print(f"trimmed: {trim_size:>12,} bytes ({pct:.1f}% smaller)") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_include_hash_ids.py b/benchmarks/bench_include_hash_ids.py new file mode 100644 index 00000000..9dff5c7b --- /dev/null +++ b/benchmarks/bench_include_hash_ids.py @@ -0,0 +1,356 @@ +"""Benchmark: default (``include_hash_ids=False``) vs legacy +(``include_hash_ids=True``) on a large ``get_daily`` query. + +The two settings are functionally equivalent except for the presence of +UUID/hex-hash ID columns in the response. The hash columns are: + - ``daily_id`` — 36-char UUID, one per record + - ``time_series_id`` — 32-char hex hash, one per record + +For a 50,000-row response that's 68 bytes/row of hash, plus JSON +overhead — roughly 4 MB of payload that we now neither transfer nor +parse. The expected wins: + - smaller HTTP payload (server-side ``properties=`` trim) + - fewer columns to ``json_normalize`` in pandas + - smaller DataFrame footprint + +Run with:: + + API_USGS_PAT= python benchmarks/bench_include_hash_ids.py + +Without a token, USGS rate-limits to ~120 requests/hour which is enough +for a single comparison run but not for retrying. The script clears the +queryables cache between runs so the schema-fetch cost is amortized +across both configurations. +""" + +from __future__ import annotations + +import argparse +import gc +import json +import sys +import time +import tracemalloc +from dataclasses import dataclass +from pathlib import Path +from unittest import mock + +import requests + +from dataretrieval.waterdata import get_continuous +from dataretrieval.waterdata import utils as wd_utils + +# A long-running, high-frequency gage. Continuous (sub-hourly) records +# yield O(10⁴) rows per year per parameter — the time window below is +# tuned so a single query returns ~one page worth of data, large enough +# that JSON parsing/DataFrame construction dominates over network +# round-trip variability. +SITE = "USGS-02238500" +PARAMETER = "00060" +# ~1 year of 15-min flow data ≈ 35,000 rows, just under one page. +TIME_RANGE = "2023-01-01/2024-01-01" + +# Where to stash the captured payload for the offline benchmark mode. +FIXTURE_DIR = Path(__file__).parent / "_fixtures" +FIXTURE_FULL = FIXTURE_DIR / "continuous_full.json" +FIXTURE_TRIMMED = FIXTURE_DIR / "continuous_trimmed.json" +FIXTURE_QUERYABLES = FIXTURE_DIR / "continuous_queryables.json" + + +@dataclass +class RunResult: + label: str + wall_seconds: float + rows: int + cols: int + mem_peak_bytes: int + memory_usage_bytes: int + + def __str__(self) -> str: + return ( + f" {self.label:>32}: " + f"{self.wall_seconds:6.2f}s " + f"rows={self.rows:>7} " + f"cols={self.cols:>2} " + f"peak_mem={self.mem_peak_bytes / 1024 / 1024:6.1f} MB " + f"df_mem={self.memory_usage_bytes / 1024 / 1024:6.1f} MB" + ) + + +def time_call(label: str, **kwargs) -> RunResult: + """One end-to-end ``get_continuous`` call, with wall time, peak RSS + and final DataFrame memory captured.""" + # Reset the queryables cache so each configuration pays the same + # one-time schema-fetch cost (when it applies). + wd_utils._queryables_cache.clear() + gc.collect() + + tracemalloc.start() + start = time.perf_counter() + df, _md = get_continuous( + monitoring_location_id=SITE, + parameter_code=PARAMETER, + time=TIME_RANGE, + **kwargs, + ) + wall = time.perf_counter() - start + _current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + return RunResult( + label=label, + wall_seconds=wall, + rows=len(df), + cols=df.shape[1], + mem_peak_bytes=peak, + memory_usage_bytes=int(df.memory_usage(deep=True).sum()), + ) + + +def _make_mock_response(body_bytes: bytes, status: int = 200) -> mock.Mock: + """Build a ``requests.Response``-shaped mock backed by ``body_bytes``. + + Only the attributes ``_walk_pages`` and ``_get_resp_data`` touch + need to be real; ``elapsed``, ``status_code``, ``headers``, ``json()``, + and ``raise_for_status()`` cover those callers. Pagination is + forced single-page by stripping any "next" link from the body + before the test patches it in. + """ + resp = mock.Mock(spec=requests.Response) + resp.status_code = status + resp.headers = {"x-ratelimit-remaining": "1000"} + resp.elapsed = __import__("datetime").timedelta(milliseconds=1) + resp.url = "https://test/mock" + resp.text = body_bytes.decode("utf-8") + resp.json = lambda: json.loads(body_bytes) + resp.raise_for_status = lambda: None + return resp + + +def capture_fixtures() -> None: + """Snapshot the two response payloads (with and without hash IDs) + plus the queryables response, so the ``--offline`` benchmark can + parse them locally with no network calls.""" + FIXTURE_DIR.mkdir(exist_ok=True) + session = requests.Session() + headers = wd_utils._default_headers() + + # Queryables (used only by the default code path). + q_url = f"{wd_utils.OGC_API_URL}/collections/continuous/queryables" + print(f"Fetching {q_url}") + r = session.get(q_url, headers=headers) + r.raise_for_status() + FIXTURE_QUERYABLES.write_bytes(r.content) + + body = json.loads(r.content) + all_props = list(body.get("properties", {}).keys()) + non_hash = [ + p + for p in all_props + if p not in wd_utils._HASH_ID_COLUMNS and p != "geometry" and p != "id" + ] + + base = f"{wd_utils.OGC_API_URL}/collections/continuous/items" + common = { + "monitoring_location_id": SITE, + "parameter_code": PARAMETER, + "time": TIME_RANGE, + "skipGeometry": True, + "limit": 50000, + } + + # Full payload (legacy behavior — every column). + print("Fetching full payload …") + r = session.get(base, headers=headers, params=common) + r.raise_for_status() + FIXTURE_FULL.write_bytes(r.content) + print(f" → {FIXTURE_FULL.name}: {len(r.content):,} bytes") + + # Trimmed payload (new default — non-hash columns only). + print("Fetching trimmed payload …") + params = dict(common, properties=",".join(non_hash)) + r = session.get(base, headers=headers, params=params) + r.raise_for_status() + FIXTURE_TRIMMED.write_bytes(r.content) + print(f" → {FIXTURE_TRIMMED.name}: {len(r.content):,} bytes") + + full_size = FIXTURE_FULL.stat().st_size + trim_size = FIXTURE_TRIMMED.stat().st_size + pct = 100 * (full_size - trim_size) / full_size + print() + print( + f"Server payload size: {full_size:,} → {trim_size:,} bytes ({pct:.1f}% smaller)" + ) + + +def time_offline(label: str, payload_path: Path, include_hash_ids: bool) -> RunResult: + """Measure parsing + DataFrame construction time on a captured + payload. ``client.send`` is patched to return the recorded + response, so this isolates the local-CPU portion of a call from + network variability and rate-limit pressure.""" + body = payload_path.read_bytes() + queryables_body = FIXTURE_QUERYABLES.read_bytes() + wd_utils._queryables_cache.clear() + gc.collect() + + def _send(req, *args, **kwargs): + return _make_mock_response(body) + + def _get(url, *args, **kwargs): + return _make_mock_response(queryables_body) + + with ( + mock.patch.object(requests.Session, "send", _send), + mock.patch.object(requests, "get", _get), + ): + tracemalloc.start() + start = time.perf_counter() + df, _md = get_continuous( + monitoring_location_id=SITE, + parameter_code=PARAMETER, + time=TIME_RANGE, + include_hash_ids=include_hash_ids, + ) + wall = time.perf_counter() - start + _current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + return RunResult( + label=label, + wall_seconds=wall, + rows=len(df), + cols=df.shape[1], + mem_peak_bytes=peak, + memory_usage_bytes=int(df.memory_usage(deep=True).sum()), + ) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--capture", + action="store_true", + help="Fetch and save the two response payloads as fixtures, then exit.", + ) + parser.add_argument( + "--offline", + action="store_true", + help="Use captured fixtures instead of hitting the live API. " + "Isolates parsing/DataFrame cost from network variability.", + ) + parser.add_argument( + "--rounds", + type=int, + default=3, + help="Number of measured rounds per configuration (default: 3).", + ) + args = parser.parse_args() + + if args.capture: + capture_fixtures() + return 0 + + if args.offline: + if not ( + FIXTURE_FULL.exists() + and FIXTURE_TRIMMED.exists() + and FIXTURE_QUERYABLES.exists() + ): + print( + "Missing fixtures. Run with --capture first to record them.", + file=sys.stderr, + ) + return 1 + + full_size = FIXTURE_FULL.stat().st_size + trim_size = FIXTURE_TRIMMED.stat().st_size + print( + f"Offline benchmark on fixtures " + f"(full: {full_size:,} B, trimmed: {trim_size:,} B, " + f"server-side savings: {100 * (full_size - trim_size) / full_size:.1f}%)" + ) + print() + + # Warm up to load pandas/geopandas/numpy code paths. + time_offline("warmup_default", FIXTURE_TRIMMED, include_hash_ids=False) + time_offline("warmup_legacy", FIXTURE_FULL, include_hash_ids=True) + + runs = [] + for _ in range(args.rounds): + runs.append( + time_offline( + "default (hash IDs dropped)", + FIXTURE_TRIMMED, + include_hash_ids=False, + ) + ) + runs.append( + time_offline( + "include_hash_ids=True", + FIXTURE_FULL, + include_hash_ids=True, + ) + ) + else: + print( + f"Benchmarking get_continuous(site={SITE!r}, parameter={PARAMETER!r}, " + f"time={TIME_RANGE!r})" + ) + print() + + # Warmup once with each configuration to absorb DNS/TLS/cache + # cold-start effects, then run measured rounds. + print("Warming up …") + time_call("warmup_default") + time_call("warmup_legacy", include_hash_ids=True) + + runs = [] + for _ in range(args.rounds): + runs.append(time_call("default (hash IDs dropped)")) + runs.append(time_call("include_hash_ids=True", include_hash_ids=True)) + + print("All runs:") + for r in runs: + print(r) + print() + + best_default = min( + (r for r in runs if r.label.startswith("default")), + key=lambda r: r.wall_seconds, + ) + best_legacy = min( + (r for r in runs if r.label.startswith("include_hash_ids")), + key=lambda r: r.wall_seconds, + ) + + print("Best of each:") + print(best_default) + print(best_legacy) + print() + + wall_delta = best_legacy.wall_seconds - best_default.wall_seconds + wall_pct = ( + 100 * wall_delta / best_legacy.wall_seconds if best_legacy.wall_seconds else 0.0 + ) + mem_delta = best_legacy.memory_usage_bytes - best_default.memory_usage_bytes + mem_pct = ( + 100 * mem_delta / best_legacy.memory_usage_bytes + if best_legacy.memory_usage_bytes + else 0.0 + ) + peak_delta = best_legacy.mem_peak_bytes - best_default.mem_peak_bytes + peak_pct = ( + 100 * peak_delta / best_legacy.mem_peak_bytes + if best_legacy.mem_peak_bytes + else 0.0 + ) + + print(f"Wall-clock speedup: {wall_delta:+.2f}s ({wall_pct:+.1f}%)") + print(f"DataFrame memory: {mem_delta / 1024 / 1024:+.1f} MB ({mem_pct:+.1f}%)") + print(f"Peak traced memory: {peak_delta / 1024 / 1024:+.1f} MB ({peak_pct:+.1f}%)") + print(f"Columns dropped: {best_legacy.cols - best_default.cols}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/benchmarks/results_offline.txt b/benchmarks/results_offline.txt new file mode 100644 index 00000000..e7e10919 --- /dev/null +++ b/benchmarks/results_offline.txt @@ -0,0 +1,22 @@ +Offline benchmark on fixtures (full: 14,310,081 B, trimmed: 10,230,081 B, server-side savings: 28.5%) + +All runs: + default (hash IDs dropped): 0.96s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.09s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.08s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.06s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + default (hash IDs dropped): 0.97s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.09s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + default (hash IDs dropped): 0.97s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.05s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + +Best of each: + default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.05s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + +Wall-clock speedup: +0.11s (+10.8%) +DataFrame memory: +4.7 MB (+33.5%) +Peak traced memory: +20.2 MB (+21.5%) +Columns dropped: 2 diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index ad268194..42d84476 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -57,6 +57,7 @@ def get_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -189,6 +190,19 @@ def get_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -257,6 +271,7 @@ def get_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Continuous data provide instantanous water conditions. @@ -384,6 +399,19 @@ def get_continuous( convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -477,6 +505,7 @@ def get_monitoring_locations( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Location information is basic information about the monitoring location including the name, identifier, agency responsible for data collection, and @@ -692,6 +721,19 @@ def get_monitoring_locations( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -755,6 +797,7 @@ def get_time_series_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data and continuous measurements are grouped into time series, which represent a collection of observations of a single parameter, @@ -915,6 +958,19 @@ def get_time_series_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1012,6 +1068,7 @@ def get_combined_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get combined monitoring-location and time-series metadata. @@ -1112,6 +1169,19 @@ def get_combined_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1200,6 +1270,7 @@ def get_latest_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """This endpoint provides the most recent observation for each time series of continuous data. Continuous data are collected via automated sensors @@ -1329,6 +1400,19 @@ def get_latest_continuous( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1395,6 +1479,7 @@ def get_latest_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -1526,6 +1611,19 @@ def get_latest_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1593,6 +1691,7 @@ def get_field_measurements( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Field measurements are physically measured values collected during a visit to the monitoring location. Field measurements consist of measurements @@ -1714,6 +1813,19 @@ def get_field_measurements( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1777,6 +1889,7 @@ def get_field_measurements_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get field-measurement metadata: one row per (location, parameter) series. @@ -1832,6 +1945,19 @@ def get_field_measurements_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1898,6 +2024,7 @@ def get_peaks( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get the annual peak streamflow / stage record for a monitoring location. @@ -1956,6 +2083,19 @@ def get_peaks( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -2695,6 +2835,7 @@ def get_channel( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Channel measurements taken as part of streamflow field measurements. @@ -2808,6 +2949,19 @@ def get_channel( convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 91228357..a3b43d5d 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -157,6 +157,91 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # parameters and require POST with CQL2 JSON instead. _CQL2_REQUIRED_SERVICES = frozenset({"monitoring-locations"}) +# Column names whose values are server-generated hashes (UUIDs or hex +# digests). These are unstable across record refreshes — joining or +# diffing on them produces spurious churn — and they bloat the payload +# of large queries. Dropped by default; opt in with +# ``include_hash_ids=True``. Includes both: +# - The per-record version UUIDs that are aliased to a service's +# ``output_id`` (``daily_id``, ``continuous_id``, …). These get +# mapped to/from ``"id"`` on the wire; both names are listed so the +# filter works on either side of ``_switch_properties_id``. +# - Secondary hash columns embedded in record payloads +# (``time_series_id``, ``field_visit_id``, ``parent_time_series_id``, +# ``field_measurements_series_id``). +# ``monitoring_location_id`` (AGENCY-ID format, e.g. ``USGS-01646500``) +# and other code columns (``parameter_code``, ``statistic_id``, …) are +# intentionally absent — they're stable, human-meaningful identifiers. +_HASH_ID_COLUMNS = frozenset( + { + "daily_id", + "continuous_id", + "latest_continuous_id", + "latest_daily_id", + "field_measurement_id", + "field_series_id", + "peak_id", + "channel_measurements_id", + "combined_meta_id", + "time_series_id", + "parent_time_series_id", + "field_visit_id", + "field_measurements_series_id", + } +) + +# Cache of per-service queryables column lists, populated on first call +# from each service when computing the default ``properties=`` for +# ``include_hash_ids=False``. Keyed by service name; value is the full +# list of property names the server exposes for that collection. +_queryables_cache: dict[str, list[str]] = {} + + +def _service_queryables(service: str) -> list[str]: + """Return the cached queryables property list for ``service``. + + One HTTP GET per service per process; the list is reused for every + subsequent call. Raises ``requests.HTTPError`` on a non-200 — the + caller's ``include_hash_ids=False`` request can't be satisfied + without it, so failing loudly is preferable to silently dropping + the server-side trim. + """ + cached = _queryables_cache.get(service) + if cached is not None: + return cached + body = _check_ogc_requests(endpoint=service, req_type="queryables") + props = list(body.get("properties", {}).keys()) + _queryables_cache[service] = props + return props + + +def _default_non_hash_properties(service: str, output_id: str) -> list[str]: + """Build the ``properties=`` whitelist sent to the server when the + caller didn't supply one and ``include_hash_ids=False``. + + Returns the service's queryables minus: + - any column whose wire-format name is in :data:`_HASH_ID_COLUMNS` + (the secondary hashes like ``time_series_id``, + ``parent_time_series_id``, ``field_visit_id``); + - the wire-format ``"id"`` column, but only when the service's + ``output_id`` (its post-rename name) is itself a hash column — + i.e., for ``daily``/``continuous``/``peaks``/etc., where ``id`` + becomes ``daily_id``/``continuous_id``/``peak_id``. For + ``monitoring-locations`` (where ``id`` becomes the AGENCY-ID + ``monitoring_location_id``) the ``id`` column is kept; + - ``"geometry"`` (the OGC server returns geometry via the feature + envelope, not as a property — listing it would be redundant and + some collections reject it). + """ + drop_wire_id = output_id in _HASH_ID_COLUMNS + return [ + p + for p in _service_queryables(service) + if p not in _HASH_ID_COLUMNS + and p != "geometry" + and not (drop_wire_id and p == "id") + ] + def _parse_datetime(value: str) -> datetime | None: """Parse a single datetime string against the supported formats. @@ -738,7 +823,10 @@ def _deal_with_empty( def _arrange_cols( - df: pd.DataFrame, properties: list[str] | None, output_id: str + df: pd.DataFrame, + properties: list[str] | None, + output_id: str, + include_hash_ids: bool = False, ) -> pd.DataFrame: """ Rearranges and renames columns in a DataFrame based on provided @@ -753,6 +841,13 @@ def _arrange_cols( only NaN, the function renames 'id' to output_id. output_id : str The name to which the 'id' column should be renamed if applicable. + include_hash_ids : bool, optional + If False (default), hash-valued ID columns (see + :data:`_HASH_ID_COLUMNS`) are dropped from the result unless the + caller explicitly named them in ``properties``. If True, the + legacy behavior is preserved: hash columns are kept and the + per-record output_id columns are moved to the end of the + DataFrame when ``properties`` is unspecified. Returns ------- @@ -775,8 +870,38 @@ def _arrange_cols( local_properties[local_properties.index("id")] = output_id df = df.loc[:, [col for col in local_properties if col in df.columns]] - # Move meaningless-to-user, extra id columns to the end - # of the dataframe, if they exist + # Default: drop hash-valued ID columns the caller didn't ask for. + # This is the client-side counterpart to the server-side + # ``properties=`` trim done in ``get_ogc_data``; it's a no-op on + # the happy path (the server already omitted them) but catches the + # fallback case where the queryables fetch failed and we + # round-tripped a full payload. Drops apply uniformly even when the + # hash column is the service's renamed ``output_id`` (e.g., + # ``daily_id``) — the user-meaningful identifiers + # (``monitoring_location_id`` + ``time`` + ``parameter_code`` + + # ``statistic_id``) are sufficient to pin a row. + if not include_hash_ids: + requested = ( + set(properties) if properties and not all(pd.isna(properties)) else set() + ) + # ``"id"`` in ``properties`` resolves to the renamed ``output_id`` + # (matching the rename done above and in ``_switch_properties_id``), + # so treat the user as having asked for that output_id too. + if "id" in requested: + requested.add(output_id) + drop_cols = [ + col + for col in df.columns + if col in _HASH_ID_COLUMNS and col not in requested + ] + if drop_cols: + df = df.drop(columns=drop_cols) + + # Legacy ordering: when ``include_hash_ids=True`` and ``properties`` + # is unspecified, move the per-record version IDs to the end of the + # DataFrame so they don't crowd the front. With + # ``include_hash_ids=False`` those columns are gone above, so this + # branch becomes a no-op. extra_id_col = set(df.columns).intersection( { "latest_continuous_id", @@ -787,9 +912,6 @@ def _arrange_cols( } ) - # If the arbitrary id column is returned (either due to properties - # being none or NaN), then move it to the end of the dataframe, but - # if part of properties, keep in requested order if extra_id_col and (properties is None or all(pd.isna(properties))): id_col_order = [col for col in df.columns if col not in extra_id_col] + list( extra_id_col @@ -907,17 +1029,45 @@ def get_ogc_data( # Capture `properties` before the id-switch so post-processing sees # the user-facing names, not the wire-format ones. properties = args.get("properties") - args["properties"] = _switch_properties_id( - properties, id_name=output_id, service=service - ) convert_type = args.pop("convert_type", False) + include_hash_ids = args.pop("include_hash_ids", False) + + # When the caller didn't pin ``properties`` and isn't opting into + # hash IDs, send a server-side whitelist of the non-hash columns so + # the server (a) skips serializing UUID/hex fields and (b) returns + # a smaller payload for us to parse. ``_arrange_cols`` still sees + # ``properties=None`` (the original user input), so columns retain + # the schema's natural order rather than being subset to whatever + # whitelist we synthesized here. + if not include_hash_ids and (properties is None or all(pd.isna(properties))): + try: + args["properties"] = _default_non_hash_properties(service, output_id) + except (requests.HTTPError, requests.RequestException, ValueError) as exc: + # Server-side trim is an optimization, not a correctness + # requirement — fall back to a full payload and rely on the + # ``_arrange_cols`` post-processing drop below. + logger.warning( + "Could not fetch queryables for %s (%s); " + "falling back to client-side hash-ID drop.", + service, + exc, + ) + args["properties"] = _switch_properties_id( + properties, id_name=output_id, service=service + ) + else: + args["properties"] = _switch_properties_id( + properties, id_name=output_id, service=service + ) args = {k: v for k, v in args.items() if v is not None} return_list, response = _fetch_once(args) return_list = _deal_with_empty(return_list, properties, service) if convert_type: return_list = _type_cols(return_list) - return_list = _arrange_cols(return_list, properties, output_id) + return_list = _arrange_cols( + return_list, properties, output_id, include_hash_ids=include_hash_ids + ) return_list = _sort_rows(return_list) return return_list, BaseMetadata(response) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 18e78594..6da344c5 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -277,10 +277,14 @@ def test_get_daily(): parameter_code="00060", time="2025-01-01/..", ) - assert "daily_id" in df.columns + # Default: hash-valued ID columns (daily_id, time_series_id) are + # dropped. Stable identifiers (monitoring_location_id, + # parameter_code, statistic_id, time) are preserved. + assert "daily_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "geometry" in df.columns - assert df.columns[-1] == "daily_id" - assert df.shape[1] == 12 + assert df.shape[1] == 10 assert df.parameter_code.unique().tolist() == ["00060"] assert df.monitoring_location_id.unique().tolist() == ["USGS-05427718"] assert df["time"].apply(lambda x: isinstance(x, datetime.date)).all() @@ -290,6 +294,22 @@ def test_get_daily(): assert df["value"].dtype == "float64" +def test_get_daily_include_hash_ids(): + """``include_hash_ids=True`` restores the legacy behavior: the + per-record UUID (``daily_id``) and secondary hashes + (``time_series_id``) are present.""" + df, _ = get_daily( + monitoring_location_id="USGS-05427718", + parameter_code="00060", + time="2025-01-01/..", + include_hash_ids=True, + ) + assert "daily_id" in df.columns + assert "time_series_id" in df.columns + assert df.columns[-1] == "daily_id" + assert df.shape[1] == 12 + + def test_get_daily_properties(): df, _ = get_daily( monitoring_location_id="USGS-05427718", @@ -335,7 +355,8 @@ def test_get_daily_no_geometry(): skip_geometry=True, ) assert "geometry" not in df.columns - assert df.shape[1] == 11 + # 10 default cols minus geometry, with hash IDs dropped by default. + assert df.shape[1] == 9 assert isinstance(df, DataFrame) @@ -351,7 +372,11 @@ def test_get_continuous(): df["time"].dtype.name.startswith("datetime64[") and "UTC" in df["time"].dtype.name ) - assert "continuous_id" in df.columns + # Default: continuous_id (UUID) and time_series_id (hex hash) are + # dropped. Set ``include_hash_ids=True`` to keep them. + assert "continuous_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns def test_get_monitoring_locations(): @@ -376,7 +401,10 @@ def test_get_latest_continuous(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"], ) - assert df.columns[-1] == "latest_continuous_id" + # Default: latest_continuous_id (UUID) and time_series_id are dropped. + assert "latest_continuous_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert df.shape[0] <= 4 assert df.statistic_id.unique().tolist() == ["00011"] assert hasattr(md, "url") @@ -391,8 +419,11 @@ def test_get_latest_daily(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"], ) - assert "latest_daily_id" in df.columns - assert df.shape[1] == 12 + # Default: latest_daily_id (UUID) and time_series_id are dropped. + assert "latest_daily_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns + assert df.shape[1] == 10 assert hasattr(md, "url") assert hasattr(md, "query_time") @@ -420,7 +451,12 @@ def test_get_field_measurements(): time="2025-01-01/2025-10-01", skip_geometry=True, ) - assert "field_measurement_id" in df.columns + # Default: field_measurement_id (UUID), field_measurements_series_id + # (UUID), and field_visit_id (UUID) are dropped. + assert "field_measurement_id" not in df.columns + assert "field_measurements_series_id" not in df.columns + assert "field_visit_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "geometry" not in df.columns assert df.unit_of_measure.unique().tolist() == ["ft^3/s"] assert hasattr(md, "url") @@ -478,7 +514,9 @@ def test_get_field_measurements_metadata(): df, md = get_field_measurements_metadata( monitoring_location_id="USGS-02238500", skip_geometry=True ) - assert "field_series_id" in df.columns + # Default: field_series_id (UUID) is dropped. + assert "field_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "begin" in df.columns assert "end" in df.columns assert (df["monitoring_location_id"] == "USGS-02238500").all() @@ -506,7 +544,10 @@ def test_get_field_measurements_metadata_multi_site(): def test_get_peaks(): df, md = get_peaks(monitoring_location_id="USGS-02238500", skip_geometry=True) - assert "peak_id" in df.columns + # Default: peak_id (UUID) and time_series_id are dropped. + assert "peak_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "value" in df.columns assert "water_year" in df.columns assert (df["monitoring_location_id"] == "USGS-02238500").all() @@ -604,8 +645,12 @@ def test_get_channel(): df, _ = get_channel(monitoring_location_id="USGS-02238500") assert df.shape[0] > 470 - assert df.shape[1] == 27 # if geopandas installed, 21 columns if not - assert "channel_measurements_id" in df.columns + # Default: channel_measurements_id (UUID) and field_visit_id (UUID) + # are dropped. 27 → 25 cols. + assert df.shape[1] == 25 # if geopandas installed, fewer if not + assert "channel_measurements_id" not in df.columns + assert "field_visit_id" not in df.columns + assert "monitoring_location_id" in df.columns class TestCheckMonitoringLocationId: diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index b05587e2..42a2ec89 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -330,6 +330,81 @@ def test_arrange_cols_keeps_geometry_when_present(): assert "geometry" in result.columns +def test_arrange_cols_drops_hash_ids_by_default(): + """Default ``include_hash_ids=False`` drops the per-record UUID + (renamed to ``daily_id``) and secondary hash columns + (``time_series_id``), keeping stable identifiers.""" + df = pd.DataFrame( + { + "id": ["uuid-a"], + "time_series_id": ["hex-1"], + "monitoring_location_id": ["USGS-01"], + "value": [1.0], + } + ) + result = _arrange_cols(df, properties=None, output_id="daily_id") + assert "daily_id" not in result.columns + assert "time_series_id" not in result.columns + assert "monitoring_location_id" in result.columns + assert "value" in result.columns + + +def test_arrange_cols_include_hash_ids_keeps_them(): + """``include_hash_ids=True`` preserves the legacy behavior — hash + columns are kept and the per-record UUID lands at the end of the + column order.""" + df = pd.DataFrame( + { + "id": ["uuid-a"], + "time_series_id": ["hex-1"], + "monitoring_location_id": ["USGS-01"], + "value": [1.0], + } + ) + result = _arrange_cols( + df, properties=None, output_id="daily_id", include_hash_ids=True + ) + assert "daily_id" in result.columns + assert "time_series_id" in result.columns + # Legacy ordering: ``daily_id`` moves to the end. + assert result.columns[-1] == "daily_id" + + +def test_arrange_cols_explicit_properties_keep_hash_ids(): + """A user who lists a hash column in ``properties`` gets it back even + with the default ``include_hash_ids=False`` — explicit beats default.""" + df = pd.DataFrame( + { + "id": ["uuid-a"], + "time_series_id": ["hex-1"], + "monitoring_location_id": ["USGS-01"], + "value": [1.0], + } + ) + result = _arrange_cols( + df, + properties=["daily_id", "time_series_id", "value"], + output_id="daily_id", + ) + assert "daily_id" in result.columns + assert "time_series_id" in result.columns + + +def test_arrange_cols_non_hash_output_id_kept(): + """``monitoring_location_id`` (the output_id for monitoring-locations) + is NOT a hash — the AGENCY-ID format is stable and human-meaningful — + so it must stay even under the default.""" + df = pd.DataFrame( + { + "id": ["USGS-01"], + "agency_code": ["USGS"], + } + ) + result = _arrange_cols(df, properties=None, output_id="monitoring_location_id") + assert "monitoring_location_id" in result.columns + assert result.loc[0, "monitoring_location_id"] == "USGS-01" + + # --- _format_api_dates ------------------------------------------------------- From 5f95a1d10da6ac173d989a8b228c3124841d25f7 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 17:19:44 -0500 Subject: [PATCH 2/6] feat(waterdata): extend hash-ID drop to get_stats_por / get_stats_date_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OGC ``get_*`` functions in the prior commit drop hash columns through ``get_ogc_data``. The statistics services (which return JSON through ``get_stats_data`` rather than OGC features) bypassed that path, so ``get_stats_por`` and ``get_stats_date_range`` were still returning ``computation_id`` (UUID) and ``parent_time_series_id`` (hex hash) by default. This commit: - Adds ``computation_id`` to ``_HASH_ID_COLUMNS`` (``parent_time_series_id`` was already there). - Plumbs ``include_hash_ids: bool = False`` through ``get_stats_data``, ``get_stats_por``, and ``get_stats_date_range``. - Drops the hash columns at the end of ``get_stats_data``, after ``_expand_percentiles`` (which still needs ``computation_id`` as a join key while it explodes the percentile lists into rows). - Updates ``test_get_stats_por_expanded_false`` / ``test_get_stats_date_range`` to reflect the new column count and adds ``test_get_stats_por_include_hash_ids`` documenting the opt-in. Discovered while running a live-API sweep across every public waterdata ``get_*`` function — the OGC services now pass, the stats ones used to leak, and this commit closes that gap. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 26 +++++++++++++++++++++++--- dataretrieval/waterdata/utils.py | 21 +++++++++++++++++++++ tests/waterdata_test.py | 25 +++++++++++++++++++++++-- 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 42d84476..0ee89c5f 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -2563,6 +2563,7 @@ def get_stats_por( site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, expand_percentiles: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get day-of-year and month-of-year water data statistics from the USGS Water Data API. @@ -2641,6 +2642,13 @@ def get_stats_por( argument will return both the "values" column, containing the list of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. + include_hash_ids : boolean, optional + If False (default), the per-computation UUID (``computation_id``) + and the upstream time-series hex hash (``parent_time_series_id``) + are dropped from the returned DataFrame. Stable identifiers + (``monitoring_location_id``, ``parameter_code``, the time keys) + are kept. Set to True to restore the legacy behavior of + including every column. Examples -------- @@ -2665,10 +2673,13 @@ def get_stats_por( ... ) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles"}) + params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"}) return get_stats_data( - args=params, service="observationNormals", expand_percentiles=expand_percentiles + args=params, + service="observationNormals", + expand_percentiles=expand_percentiles, + include_hash_ids=include_hash_ids, ) @@ -2687,6 +2698,7 @@ def get_stats_date_range( site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, expand_percentiles: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get monthly and annual water data statistics from the USGS Water Data API. This service (called the "observationIntervals" endpoint on api.waterdata.usgs.gov) @@ -2769,6 +2781,13 @@ def get_stats_date_range( argument will return both the "values" column, containing the list of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. + include_hash_ids : boolean, optional + If False (default), the per-computation UUID (``computation_id``) + and the upstream time-series hex hash (``parent_time_series_id``) + are dropped from the returned DataFrame. Stable identifiers + (``monitoring_location_id``, ``parameter_code``, the time keys) + are kept. Set to True to restore the legacy behavior of + including every column. Examples -------- @@ -2794,12 +2813,13 @@ def get_stats_date_range( ... ) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles"}) + params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"}) return get_stats_data( args=params, service="observationIntervals", expand_percentiles=expand_percentiles, + include_hash_ids=include_hash_ids, ) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index a3b43d5d..8b5c2f72 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -187,6 +187,9 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s "parent_time_series_id", "field_visit_id", "field_measurements_series_id", + # ``get_stats_*`` (statistics service) output — per-computation + # UUID; ``parent_time_series_id`` is already listed above. + "computation_id", } ) @@ -1223,6 +1226,7 @@ def get_stats_data( service: str, expand_percentiles: bool, client: requests.Session | None = None, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Retrieves statistical data from a specified endpoint and returns it @@ -1244,6 +1248,13 @@ def get_stats_data( each percentile gets its own row in the returned dataframe. If True and user requests a computation_type other than percentiles, a percentile column is still returned. + include_hash_ids : bool, optional + If False (default), the per-computation UUID (``computation_id``) + and the upstream time-series hex hash (``parent_time_series_id``) + are dropped from the returned DataFrame. These IDs are not + stable across record refreshes; ``computation_id`` is used as a + join key internally during percentile expansion and only + removed after that step completes. Returns ------- @@ -1320,6 +1331,16 @@ def get_stats_data( if expand_percentiles: dfs = _expand_percentiles(dfs) + # Drop hash-valued ID columns at the end (after + # ``_expand_percentiles``, which still needs ``computation_id`` + # as a merge key while it explodes the percentile lists into + # rows). Stable identifiers (``monitoring_location_id``, + # ``parameter_code``, ``time_of_year``, …) are kept. + if not include_hash_ids: + drop_cols = [col for col in dfs.columns if col in _HASH_ID_COLUMNS] + if drop_cols: + dfs = dfs.drop(columns=drop_cols) + return dfs, BaseMetadata(initial_response) finally: if close_client: diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 6da344c5..aff58a9f 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -618,13 +618,31 @@ def test_get_stats_por_expanded_false(): computation_type=["minimum", "percentile"], ) assert df.shape[0] == 4 - assert df.shape[1] == 20 # if geopandas installed, 21 columns if not + # Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols. + assert df.shape[1] == 18 + assert "computation_id" not in df.columns + assert "parent_time_series_id" not in df.columns assert "percentile" not in df.columns assert "percentiles" in df.columns assert type(df["percentiles"][2]) is list assert df.loc[~df["percentiles"].isna(), "value"].isnull().all() +def test_get_stats_por_include_hash_ids(): + """``include_hash_ids=True`` preserves the per-computation UUID + and the upstream time-series hex hash that ``get_stats_*`` used + to return unconditionally.""" + df, _ = get_stats_por( + monitoring_location_id="USGS-12451000", + parameter_code="00060", + start_date="01-01", + end_date="01-01", + include_hash_ids=True, + ) + assert "computation_id" in df.columns + assert "parent_time_series_id" in df.columns + + def test_get_stats_date_range(): df, _ = get_stats_date_range( monitoring_location_id="USGS-12451000", @@ -635,7 +653,10 @@ def test_get_stats_date_range(): ) assert df.shape[0] == 3 - assert df.shape[1] == 20 # if geopandas installed, 21 columns if not + # Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols. + assert df.shape[1] == 18 + assert "computation_id" not in df.columns + assert "parent_time_series_id" not in df.columns assert "interval_type" in df.columns assert "percentile" in df.columns assert df["interval_type"].isin(["month", "calendar_year", "water_year"]).all() From e7da0bb3cd5698d1d009b0036701ddebff32ad9f Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 17:22:37 -0500 Subject: [PATCH 3/6] test(waterdata): add stats hash-drop unit tests Two mocked-response tests for ``get_stats_data``: - ``test_get_stats_data_drops_hash_ids_by_default`` asserts ``computation_id`` and ``parent_time_series_id`` are removed when ``include_hash_ids=False`` (the new default). - ``test_get_stats_data_keeps_hash_ids_when_opted_in`` asserts the opt-in path preserves them, matching the legacy behavior. Both use ``monkeypatch`` to stub ``_handle_stats_nesting`` so the fake response only needs to carry the column shape, not the full nested-percentile body. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_utils_test.py | 82 +++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 42a2ec89..6e8e5777 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -256,6 +256,88 @@ def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): assert any("tok2" in m for m in warnings_), warnings_ +def test_get_stats_data_drops_hash_ids_by_default(monkeypatch): + """``get_stats_data`` drops ``computation_id`` and + ``parent_time_series_id`` from the result by default — the + ``include_hash_ids=False`` counterpart for the stats path.""" + from dataretrieval.waterdata.utils import get_stats_data + + monkeypatch.setattr( + _utils_module, + "_handle_stats_nesting", + mock.MagicMock( + return_value=pd.DataFrame( + { + "monitoring_location_id": ["USGS-1"], + "parameter_code": ["00060"], + "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], + "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], + "value": [1.0], + } + ) + ), + ) + + page1 = mock.MagicMock() + page1.status_code = 200 + page1.json.return_value = {"next": None, "features": []} + page1.elapsed = __import__("datetime").timedelta(milliseconds=1) + page1.headers = {} + page1.url = "https://example/stats" + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = page1 + + df, _ = get_stats_data( + args={"monitoring_location_id": "USGS-1"}, + service="observationNormals", + expand_percentiles=False, + client=client, + ) + assert "computation_id" not in df.columns + assert "parent_time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns + assert "parameter_code" in df.columns + assert "value" in df.columns + + +def test_get_stats_data_keeps_hash_ids_when_opted_in(monkeypatch): + """``include_hash_ids=True`` preserves the legacy stats columns.""" + from dataretrieval.waterdata.utils import get_stats_data + + monkeypatch.setattr( + _utils_module, + "_handle_stats_nesting", + mock.MagicMock( + return_value=pd.DataFrame( + { + "monitoring_location_id": ["USGS-1"], + "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], + "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], + } + ) + ), + ) + + page1 = mock.MagicMock() + page1.status_code = 200 + page1.json.return_value = {"next": None, "features": []} + page1.elapsed = __import__("datetime").timedelta(milliseconds=1) + page1.headers = {} + page1.url = "https://example/stats" + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = page1 + + df, _ = get_stats_data( + args={"monitoring_location_id": "USGS-1"}, + service="observationNormals", + expand_percentiles=False, + client=client, + include_hash_ids=True, + ) + assert "computation_id" in df.columns + assert "parent_time_series_id" in df.columns + + def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of the columns we try to drop ("type", "properties.data") is absent, the From dbbb0a491da853f7db134266241ad9874a9b765b Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 17:25:57 -0500 Subject: [PATCH 4/6] feat(waterdata): extend hash-ID drop to get_samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A live-API column-content scan found that the Samples CSV service returns two UUID-valued columns by default: - ``Activity_ActivityIdentifier`` (per-activity UUID) - ``Result_MeasureIdentifier`` (per-measurement UUID) These weren't covered by the prior OGC / stats commits because ``get_samples`` parses CSV directly without going through ``get_ogc_data`` or ``get_stats_data``. This commit: - Adds the two CamelCase column names to ``_HASH_ID_COLUMNS``. - Plumbs ``include_hash_ids: bool = False`` through ``get_samples`` and drops the named columns from the parsed CSV before returning. - Updates ``test_mock_get_samples`` to reflect the new column count (187 → 185) and adds ``test_mock_get_samples_include_hash_ids`` for the opt-in path. - Updates ``test_samples_results`` and ``test_samples_activity`` similarly. Stable identifiers (``Org_Identifier``, ``Location_Identifier``, ``Project_Identifier``, ``USGSpcode``, …) are kept unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 21 +++++++++++++- dataretrieval/waterdata/utils.py | 6 ++++ tests/waterdata_test.py | 47 +++++++++++++++++++++++++++----- 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 0ee89c5f..3789bfd4 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -26,6 +26,7 @@ SERVICES, ) from dataretrieval.waterdata.utils import ( + _HASH_ID_COLUMNS, SAMPLES_URL, _check_profiles, _default_headers, @@ -2273,6 +2274,7 @@ def get_samples( pointLocationWithinMiles: float | None = None, projectIdentifier: str | Iterable[str] | None = None, recordIdentifierUserSupplied: str | Iterable[str] | None = None, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Search Samples database for USGS water quality data. This is a wrapper function for the Samples database API. All potential @@ -2403,6 +2405,14 @@ def get_samples( recordIdentifierUserSupplied : string or iterable of strings, optional Internal AQS record identifier that returns 1 entry. Only available for the "results" service. + include_hash_ids : boolean, optional + If False (default), the per-activity UUID + (``Activity_ActivityIdentifier``) and per-result UUID + (``Result_MeasureIdentifier``) are dropped from the returned + DataFrame. Stable identifiers (``Org_Identifier``, + ``Location_Identifier``, ``Project_Identifier``, + ``USGSpcode``, …) are kept. Set to True to restore the legacy + behavior of including every column. Returns ------- @@ -2452,7 +2462,7 @@ def get_samples( _check_profiles(service, profile) # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"ssl_check", "profile"}) + params = _get_args(locals(), exclude={"ssl_check", "profile", "include_hash_ids"}) params.update({"mimeType": "text/csv"}) @@ -2474,6 +2484,15 @@ def get_samples( df = pd.read_csv(StringIO(response.text), delimiter=",") df = _attach_datetime_columns(df) + # Drop hash-valued ID columns (``Activity_ActivityIdentifier``, + # ``Result_MeasureIdentifier`` — both UUIDs) by default. + # Stable identifiers like ``Org_Identifier``, + # ``Location_Identifier``, ``Project_Identifier`` are kept. + if not include_hash_ids: + drop_cols = [c for c in df.columns if c in _HASH_ID_COLUMNS] + if drop_cols: + df = df.drop(columns=drop_cols) + return df, BaseMetadata(response) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 8b5c2f72..19de9684 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -190,6 +190,12 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # ``get_stats_*`` (statistics service) output — per-computation # UUID; ``parent_time_series_id`` is already listed above. "computation_id", + # ``get_samples`` (Samples database CSV) — per-activity and + # per-result UUIDs. The Samples service uses CamelCase column + # names rather than snake_case, but the drop logic only needs + # exact name matches so they share this set. + "Activity_ActivityIdentifier", + "Result_MeasureIdentifier", } ) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index aff58a9f..da045f60 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -82,8 +82,14 @@ def test_mock_get_samples(requests_mock): monitoringLocationIdentifier="USGS-05406500", ) assert type(df) is DataFrame - # 181 source columns + 6 derived DateTime columns - assert df.shape == (67, 187) + # 181 source columns + 6 derived DateTime columns − 2 hash IDs + # (Activity_ActivityIdentifier, Result_MeasureIdentifier) dropped by default. + assert df.shape == (67, 185) + assert "Activity_ActivityIdentifier" not in df.columns + assert "Result_MeasureIdentifier" not in df.columns + # Stable identifiers are preserved. + assert "Location_Identifier" in df.columns + assert "Org_Identifier" in df.columns assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} @@ -91,6 +97,29 @@ def test_mock_get_samples(requests_mock): assert df["Activity_StartDateTime"].notna().any() +def test_mock_get_samples_include_hash_ids(requests_mock): + """``include_hash_ids=True`` restores the legacy column set.""" + request_url = ( + "https://api.waterdata.usgs.gov/samples-data/results/fullphyschem?" + "activityMediaName=Water&activityStartDateLower=2020-01-01" + "&activityStartDateUpper=2024-12-31&monitoringLocationIdentifier=USGS-05406500&mimeType=text%2Fcsv" + ) + response_file_path = "tests/data/samples_results.txt" + mock_request(requests_mock, request_url, response_file_path) + df, _md = get_samples( + service="results", + profile="fullphyschem", + activityMediaName="Water", + activityStartDateLower="2020-01-01", + activityStartDateUpper="2024-12-31", + monitoringLocationIdentifier="USGS-05406500", + include_hash_ids=True, + ) + assert df.shape == (67, 187) + assert "Activity_ActivityIdentifier" in df.columns + assert "Result_MeasureIdentifier" in df.columns + + def test_mock_get_samples_summary(requests_mock): """Tests USGS Samples summary query""" request_url = ( @@ -216,10 +245,11 @@ def test_samples_results(): activityStartDateLower="2024-10-01", activityStartDateUpper="2025-04-24", ) - assert all( - col in df.columns - for col in ["Location_Identifier", "Activity_ActivityIdentifier"] - ) + # Stable identifiers are kept; hash IDs (Activity_ActivityIdentifier, + # Result_MeasureIdentifier) are dropped by default. + assert "Location_Identifier" in df.columns + assert "Activity_ActivityIdentifier" not in df.columns + assert "Result_MeasureIdentifier" not in df.columns assert len(df) > 0 @@ -231,7 +261,10 @@ def test_samples_activity(): monitoringLocationIdentifier="USGS-06719505", ) assert len(df) > 0 - assert len(df.columns) == 97 + # 97 → 96 cols after dropping Activity_ActivityIdentifier + # (Result_MeasureIdentifier is not in the ``activities`` profile). + assert len(df.columns) == 96 + assert "Activity_ActivityIdentifier" not in df.columns assert "Location_HUCTwelveDigitCode" in df.columns From 6201a65ba3fca5c0d87cd70d96424882365714af Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 18:16:54 -0500 Subject: [PATCH 5/6] refactor(waterdata): unify hash-drop helper and tighten internals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Post-PR-281 cleanup from a code review pass. No behavior change for users; the three code paths that drop hash columns (OGC, stats, samples) now share one helper, and a few smaller wins. - New ``_drop_hash_columns(df, include_hash_ids, keep=None)`` helper replaces three near-identical drop blocks across ``_arrange_cols``, ``get_stats_data``, and ``get_samples``. Uses ``set(df.columns) & _HASH_ID_COLUMNS - keep`` (one Index intersection) in place of the per-call list-comprehension + ``if drop_cols:`` truthiness guard. ``df.drop`` accepts an empty Index, so the guard was unnecessary. - ``_HASH_ID_COLUMNS`` no longer leaks into ``api.py`` — the samples branch now calls the helper instead of touching the constant. - New ``_properties_unspecified(properties)`` extracts the ``properties is None or all(pd.isna(properties))`` predicate so the five call sites that need it stop drifting between ``not properties`` and ``properties is None`` variants. - ``_default_non_hash_properties`` memoizes its result by ``(service, output_id)`` via ``_default_props_cache``. The previous version rebuilt a ~30–100-item list on every OGC call after the queryables cache was warm; this saves the per-call list rebuild on the hot path. - ``get_ogc_data`` flattens its nested ``try/except + if/else`` branches into a single ``use_server_trim`` flag, removing the duplicated ``_switch_properties_id`` assignment. - The benchmark no longer clears ``_queryables_cache`` between measured rounds. Clearing per round only penalizes the default path (the legacy path doesn't consult the cache), so the previous comparison was pessimistic against the default. Real-world callers pay the queryables fetch once per process. - The benchmark's ``capture_fixtures`` now calls ``_default_non_hash_properties`` directly instead of reimplementing the filter, so the "trimmed" fixture matches what the runtime actually sends bit-for-bit. All 157 unit and mock tests pass after the refactor. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/bench_include_hash_ids.py | 43 +++---- dataretrieval/waterdata/api.py | 12 +- dataretrieval/waterdata/utils.py | 163 ++++++++++++++------------- 3 files changed, 110 insertions(+), 108 deletions(-) diff --git a/benchmarks/bench_include_hash_ids.py b/benchmarks/bench_include_hash_ids.py index 9dff5c7b..6609e3c3 100644 --- a/benchmarks/bench_include_hash_ids.py +++ b/benchmarks/bench_include_hash_ids.py @@ -79,10 +79,15 @@ def __str__(self) -> str: def time_call(label: str, **kwargs) -> RunResult: """One end-to-end ``get_continuous`` call, with wall time, peak RSS - and final DataFrame memory captured.""" - # Reset the queryables cache so each configuration pays the same - # one-time schema-fetch cost (when it applies). - wd_utils._queryables_cache.clear() + and final DataFrame memory captured. + + Note: ``_queryables_cache`` is left warm across measured rounds — + clearing it each round would charge the default-path with an extra + HTTP request per round while the legacy path (which doesn't use + the cache) gets a free pass, inflating the default-path wall time. + Real-world callers issue many queries per process and pay the + queryables fetch only once. + """ gc.collect() tracemalloc.start() @@ -135,20 +140,15 @@ def capture_fixtures() -> None: session = requests.Session() headers = wd_utils._default_headers() - # Queryables (used only by the default code path). - q_url = f"{wd_utils.OGC_API_URL}/collections/continuous/queryables" - print(f"Fetching {q_url}") - r = session.get(q_url, headers=headers) - r.raise_for_status() - FIXTURE_QUERYABLES.write_bytes(r.content) - - body = json.loads(r.content) - all_props = list(body.get("properties", {}).keys()) - non_hash = [ - p - for p in all_props - if p not in wd_utils._HASH_ID_COLUMNS and p != "geometry" and p != "id" - ] + # Queryables (used only by the default code path). Fetched via the + # cached helper so the trimmed-payload request below sends the + # *exact* same property list the runtime would — otherwise the + # benchmark's "trimmed" fixture could drift from production. + print("Fetching queryables …") + non_hash = wd_utils._default_non_hash_properties("continuous", "continuous_id") + # Mirror the bytes back into a fixture for ``time_offline`` to load. + q_body = {"properties": {p: {} for p in wd_utils._service_queryables("continuous")}} + FIXTURE_QUERYABLES.write_bytes(json.dumps(q_body).encode()) base = f"{wd_utils.OGC_API_URL}/collections/continuous/items" common = { @@ -187,10 +187,13 @@ def time_offline(label: str, payload_path: Path, include_hash_ids: bool) -> RunR """Measure parsing + DataFrame construction time on a captured payload. ``client.send`` is patched to return the recorded response, so this isolates the local-CPU portion of a call from - network variability and rate-limit pressure.""" + network variability and rate-limit pressure. + + ``_queryables_cache`` and ``_default_props_cache`` are left warm + across rounds (see ``time_call`` for the rationale). + """ body = payload_path.read_bytes() queryables_body = FIXTURE_QUERYABLES.read_bytes() - wd_utils._queryables_cache.clear() gc.collect() def _send(req, *args, **kwargs): diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 3789bfd4..eb26304e 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -26,10 +26,10 @@ SERVICES, ) from dataretrieval.waterdata.utils import ( - _HASH_ID_COLUMNS, SAMPLES_URL, _check_profiles, _default_headers, + _drop_hash_columns, _get_args, get_ogc_data, get_stats_data, @@ -2483,15 +2483,7 @@ def get_samples( df = pd.read_csv(StringIO(response.text), delimiter=",") df = _attach_datetime_columns(df) - - # Drop hash-valued ID columns (``Activity_ActivityIdentifier``, - # ``Result_MeasureIdentifier`` — both UUIDs) by default. - # Stable identifiers like ``Org_Identifier``, - # ``Location_Identifier``, ``Project_Identifier`` are kept. - if not include_hash_ids: - drop_cols = [c for c in df.columns if c in _HASH_ID_COLUMNS] - if drop_cols: - df = df.drop(columns=drop_cols) + df = _drop_hash_columns(df, include_hash_ids) return df, BaseMetadata(response) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 19de9684..e020d85c 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -204,6 +204,11 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # ``include_hash_ids=False``. Keyed by service name; value is the full # list of property names the server exposes for that collection. _queryables_cache: dict[str, list[str]] = {} +# Cache of the derived non-hash property whitelist, keyed by +# ``(service, output_id)``. Both inputs determine the result, and +# both are stable per call site — re-deriving on every OGC request +# would do ~30–100 frozenset lookups per call for no reason. +_default_props_cache: dict[tuple[str, str], list[str]] = {} def _service_queryables(service: str) -> list[str]: @@ -228,28 +233,57 @@ def _default_non_hash_properties(service: str, output_id: str) -> list[str]: """Build the ``properties=`` whitelist sent to the server when the caller didn't supply one and ``include_hash_ids=False``. - Returns the service's queryables minus: - - any column whose wire-format name is in :data:`_HASH_ID_COLUMNS` - (the secondary hashes like ``time_series_id``, - ``parent_time_series_id``, ``field_visit_id``); - - the wire-format ``"id"`` column, but only when the service's - ``output_id`` (its post-rename name) is itself a hash column — - i.e., for ``daily``/``continuous``/``peaks``/etc., where ``id`` - becomes ``daily_id``/``continuous_id``/``peak_id``. For - ``monitoring-locations`` (where ``id`` becomes the AGENCY-ID - ``monitoring_location_id``) the ``id`` column is kept; - - ``"geometry"`` (the OGC server returns geometry via the feature - envelope, not as a property — listing it would be redundant and - some collections reject it). + The whitelist is the service's queryables minus :data:`_HASH_ID_COLUMNS`, + minus ``"geometry"`` (the OGC server returns geometry via the feature + envelope, not as a property — some collections reject it as a + property name), and minus the wire-format ``"id"`` column when the + service's ``output_id`` is itself a hash column (e.g. ``daily_id``). + For ``monitoring-locations``, ``id`` becomes the AGENCY-ID + ``monitoring_location_id``, so it's kept. """ + key = (service, output_id) + cached = _default_props_cache.get(key) + if cached is not None: + return cached drop_wire_id = output_id in _HASH_ID_COLUMNS - return [ + props = [ p for p in _service_queryables(service) if p not in _HASH_ID_COLUMNS and p != "geometry" and not (drop_wire_id and p == "id") ] + _default_props_cache[key] = props + return props + + +def _properties_unspecified(properties) -> bool: + """True when the caller didn't pin a ``properties`` list. + + A ``None``, empty list, or list-of-only-NaN counts as unspecified. + Centralizes the predicate so the (subtly different) ``not properties`` + vs ``properties is None`` variants across call sites stay aligned. + """ + return not properties or all(pd.isna(properties)) + + +def _drop_hash_columns( + df: pd.DataFrame, + include_hash_ids: bool, + keep: set[str] | None = None, +) -> pd.DataFrame: + """Drop hash-valued ID columns from ``df`` when not opting in. + + When ``include_hash_ids`` is True, returns ``df`` unchanged. Otherwise + drops every column whose name is in :data:`_HASH_ID_COLUMNS`, except + those the caller listed in ``keep`` (e.g. names appearing in an + explicit user ``properties=`` request — explicit beats default). + A no-op when no hash columns are present. + """ + if include_hash_ids: + return df + drop = (set(df.columns) & _HASH_ID_COLUMNS) - (keep or set()) + return df.drop(columns=drop) if drop else df def _parse_datetime(value: str) -> datetime | None: @@ -868,7 +902,9 @@ def _arrange_cols( # Rename id column to output_id df = df.rename(columns={"id": output_id}) - if properties and not all(pd.isna(properties)): + user_specified = not _properties_unspecified(properties) + + if user_specified: # Don't alias the caller's list — we mutate below. local_properties = list(properties) if "geometry" in df.columns and "geometry" not in local_properties: @@ -879,49 +915,32 @@ def _arrange_cols( local_properties[local_properties.index("id")] = output_id df = df.loc[:, [col for col in local_properties if col in df.columns]] - # Default: drop hash-valued ID columns the caller didn't ask for. - # This is the client-side counterpart to the server-side - # ``properties=`` trim done in ``get_ogc_data``; it's a no-op on - # the happy path (the server already omitted them) but catches the - # fallback case where the queryables fetch failed and we - # round-tripped a full payload. Drops apply uniformly even when the - # hash column is the service's renamed ``output_id`` (e.g., - # ``daily_id``) — the user-meaningful identifiers - # (``monitoring_location_id`` + ``time`` + ``parameter_code`` + - # ``statistic_id``) are sufficient to pin a row. - if not include_hash_ids: - requested = ( - set(properties) if properties and not all(pd.isna(properties)) else set() - ) - # ``"id"`` in ``properties`` resolves to the renamed ``output_id`` - # (matching the rename done above and in ``_switch_properties_id``), - # so treat the user as having asked for that output_id too. - if "id" in requested: - requested.add(output_id) - drop_cols = [ - col - for col in df.columns - if col in _HASH_ID_COLUMNS and col not in requested - ] - if drop_cols: - df = df.drop(columns=drop_cols) + # Client-side safety net for the server-side trim done in + # ``get_ogc_data``: no-op on the happy path (server already omitted + # hash columns), drops them here when the queryables fetch failed + # and we fell back to a full payload. An explicit caller + # ``properties`` list — including ``"id"``, which resolved to + # ``output_id`` above — wins over the default. + keep: set[str] = set() + if user_specified: + keep = set(properties) + if "id" in keep: + keep.add(output_id) + df = _drop_hash_columns(df, include_hash_ids, keep=keep) # Legacy ordering: when ``include_hash_ids=True`` and ``properties`` - # is unspecified, move the per-record version IDs to the end of the - # DataFrame so they don't crowd the front. With - # ``include_hash_ids=False`` those columns are gone above, so this - # branch becomes a no-op. - extra_id_col = set(df.columns).intersection( - { - "latest_continuous_id", - "latest_daily_id", - "daily_id", - "continuous_id", - "field_measurement_id", - } - ) + # is unspecified, move the per-record version IDs to the end so they + # don't crowd the front. With ``include_hash_ids=False`` those + # columns are gone above, so this branch is a no-op. + extra_id_col = set(df.columns) & { + "latest_continuous_id", + "latest_daily_id", + "daily_id", + "continuous_id", + "field_measurement_id", + } - if extra_id_col and (properties is None or all(pd.isna(properties))): + if extra_id_col and _properties_unspecified(properties): id_col_order = [col for col in df.columns if col not in extra_id_col] + list( extra_id_col ) @@ -1042,29 +1061,23 @@ def get_ogc_data( include_hash_ids = args.pop("include_hash_ids", False) # When the caller didn't pin ``properties`` and isn't opting into - # hash IDs, send a server-side whitelist of the non-hash columns so - # the server (a) skips serializing UUID/hex fields and (b) returns - # a smaller payload for us to parse. ``_arrange_cols`` still sees - # ``properties=None`` (the original user input), so columns retain - # the schema's natural order rather than being subset to whatever - # whitelist we synthesized here. - if not include_hash_ids and (properties is None or all(pd.isna(properties))): + # hash IDs, try a server-side whitelist of the non-hash columns so + # the server skips serializing UUID/hex fields. On any queryables + # failure, fall through to the full payload — ``_arrange_cols`` + # post-processes the drop as a safety net. + use_server_trim = not include_hash_ids and _properties_unspecified(properties) + if use_server_trim: try: args["properties"] = _default_non_hash_properties(service, output_id) except (requests.HTTPError, requests.RequestException, ValueError) as exc: - # Server-side trim is an optimization, not a correctness - # requirement — fall back to a full payload and rely on the - # ``_arrange_cols`` post-processing drop below. logger.warning( "Could not fetch queryables for %s (%s); " "falling back to client-side hash-ID drop.", service, exc, ) - args["properties"] = _switch_properties_id( - properties, id_name=output_id, service=service - ) - else: + use_server_trim = False + if not use_server_trim: args["properties"] = _switch_properties_id( properties, id_name=output_id, service=service ) @@ -1337,15 +1350,9 @@ def get_stats_data( if expand_percentiles: dfs = _expand_percentiles(dfs) - # Drop hash-valued ID columns at the end (after - # ``_expand_percentiles``, which still needs ``computation_id`` - # as a merge key while it explodes the percentile lists into - # rows). Stable identifiers (``monitoring_location_id``, - # ``parameter_code``, ``time_of_year``, …) are kept. - if not include_hash_ids: - drop_cols = [col for col in dfs.columns if col in _HASH_ID_COLUMNS] - if drop_cols: - dfs = dfs.drop(columns=drop_cols) + # Drop hash IDs after ``_expand_percentiles`` — it merges on + # ``computation_id`` while exploding the percentile lists. + dfs = _drop_hash_columns(dfs, include_hash_ids) return dfs, BaseMetadata(initial_response) finally: From 301cd492b15eb0ae9c4516ed94344e069a75199f Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 20:29:59 -0500 Subject: [PATCH 6/6] chore(waterdata): drop benchmark scaffolding from PR The ``benchmarks/`` directory was useful during development for measuring the hash-ID-drop impact, but it's not part of the runtime or test surface and the numbers are captured in the PR description. Removing to keep the diff focused on the library change. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/_fixtures/.gitignore | 4 - benchmarks/_fixtures/synthesize.py | 123 --------- benchmarks/bench_include_hash_ids.py | 359 --------------------------- benchmarks/results_offline.txt | 22 -- 4 files changed, 508 deletions(-) delete mode 100644 benchmarks/_fixtures/.gitignore delete mode 100644 benchmarks/_fixtures/synthesize.py delete mode 100644 benchmarks/bench_include_hash_ids.py delete mode 100644 benchmarks/results_offline.txt diff --git a/benchmarks/_fixtures/.gitignore b/benchmarks/_fixtures/.gitignore deleted file mode 100644 index 5efdd79b..00000000 --- a/benchmarks/_fixtures/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Synthesized or captured response payloads are large (≈10–15 MB each) -# and trivially regenerated. Keep them out of the repo; the script that -# produces them is committed. -*.json diff --git a/benchmarks/_fixtures/synthesize.py b/benchmarks/_fixtures/synthesize.py deleted file mode 100644 index 72e07ffb..00000000 --- a/benchmarks/_fixtures/synthesize.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Generate synthetic OGC API payloads for the offline benchmark. - -We can't always reach the live USGS API (rate limits, no token), but -the local cost of an ``include_hash_ids=False`` vs ``True`` call is -dominated by: - - JSON parsing (``response.json()``) - - ``pandas.json_normalize`` over the features list - - DataFrame column allocation - -All three scale with payload bytes and feature count. A synthetic -payload that mirrors the real wire format and the real per-row column -shape is sufficient to measure them. -""" - -from __future__ import annotations - -import json -import uuid -from datetime import datetime, timedelta, timezone -from pathlib import Path - -ROWS = 30000 # ~1 year of 15-minute continuous data -HERE = Path(__file__).parent - - -def _row(i: int) -> dict: - ts = datetime(2024, 1, 1, tzinfo=timezone.utc) + timedelta(minutes=15 * i) - return { - "id": str(uuid.uuid4()), - "time_series_id": uuid.uuid4().hex, - "monitoring_location_id": "USGS-02238500", - "parameter_code": "00060", - "statistic_id": "00011", - "time": ts.strftime("%Y-%m-%dT%H:%M:%S+00:00"), - "value": f"{100.0 + 0.01 * (i % 1000):.2f}", - "unit_of_measure": "ft^3/s", - "approval_status": "Approved", - "qualifier": None, - "last_modified": "2026-05-01T00:00:00+00:00", - } - - -def _feature(props: dict) -> dict: - return { - "type": "Feature", - "properties": props, - "id": props.get("id", ""), - "geometry": None, - } - - -HASH_COLS = {"id", "time_series_id"} - - -def build_full() -> dict: - features = [_feature(_row(i)) for i in range(ROWS)] - return { - "type": "FeatureCollection", - "features": features, - "numberReturned": ROWS, - "links": [], - } - - -def build_trimmed() -> dict: - features = [] - for i in range(ROWS): - props = {k: v for k, v in _row(i).items() if k not in HASH_COLS} - features.append( - { - "type": "Feature", - "properties": props, - "id": "", - "geometry": None, - } - ) - return { - "type": "FeatureCollection", - "features": features, - "numberReturned": ROWS, - "links": [], - } - - -def build_queryables() -> dict: - return { - "properties": { - "geometry": {}, - "id": {}, - "time_series_id": {}, - "monitoring_location_id": {}, - "parameter_code": {}, - "statistic_id": {}, - "time": {}, - "value": {}, - "unit_of_measure": {}, - "approval_status": {}, - "qualifier": {}, - "last_modified": {}, - } - } - - -def main() -> None: - HERE.mkdir(exist_ok=True) - full = build_full() - trimmed = build_trimmed() - queryables = build_queryables() - - (HERE / "continuous_full.json").write_text(json.dumps(full)) - (HERE / "continuous_trimmed.json").write_text(json.dumps(trimmed)) - (HERE / "continuous_queryables.json").write_text(json.dumps(queryables)) - - full_size = (HERE / "continuous_full.json").stat().st_size - trim_size = (HERE / "continuous_trimmed.json").stat().st_size - pct = 100 * (full_size - trim_size) / full_size - print(f"rows: {ROWS:,}") - print(f"full: {full_size:>12,} bytes") - print(f"trimmed: {trim_size:>12,} bytes ({pct:.1f}% smaller)") - - -if __name__ == "__main__": - main() diff --git a/benchmarks/bench_include_hash_ids.py b/benchmarks/bench_include_hash_ids.py deleted file mode 100644 index 6609e3c3..00000000 --- a/benchmarks/bench_include_hash_ids.py +++ /dev/null @@ -1,359 +0,0 @@ -"""Benchmark: default (``include_hash_ids=False``) vs legacy -(``include_hash_ids=True``) on a large ``get_daily`` query. - -The two settings are functionally equivalent except for the presence of -UUID/hex-hash ID columns in the response. The hash columns are: - - ``daily_id`` — 36-char UUID, one per record - - ``time_series_id`` — 32-char hex hash, one per record - -For a 50,000-row response that's 68 bytes/row of hash, plus JSON -overhead — roughly 4 MB of payload that we now neither transfer nor -parse. The expected wins: - - smaller HTTP payload (server-side ``properties=`` trim) - - fewer columns to ``json_normalize`` in pandas - - smaller DataFrame footprint - -Run with:: - - API_USGS_PAT= python benchmarks/bench_include_hash_ids.py - -Without a token, USGS rate-limits to ~120 requests/hour which is enough -for a single comparison run but not for retrying. The script clears the -queryables cache between runs so the schema-fetch cost is amortized -across both configurations. -""" - -from __future__ import annotations - -import argparse -import gc -import json -import sys -import time -import tracemalloc -from dataclasses import dataclass -from pathlib import Path -from unittest import mock - -import requests - -from dataretrieval.waterdata import get_continuous -from dataretrieval.waterdata import utils as wd_utils - -# A long-running, high-frequency gage. Continuous (sub-hourly) records -# yield O(10⁴) rows per year per parameter — the time window below is -# tuned so a single query returns ~one page worth of data, large enough -# that JSON parsing/DataFrame construction dominates over network -# round-trip variability. -SITE = "USGS-02238500" -PARAMETER = "00060" -# ~1 year of 15-min flow data ≈ 35,000 rows, just under one page. -TIME_RANGE = "2023-01-01/2024-01-01" - -# Where to stash the captured payload for the offline benchmark mode. -FIXTURE_DIR = Path(__file__).parent / "_fixtures" -FIXTURE_FULL = FIXTURE_DIR / "continuous_full.json" -FIXTURE_TRIMMED = FIXTURE_DIR / "continuous_trimmed.json" -FIXTURE_QUERYABLES = FIXTURE_DIR / "continuous_queryables.json" - - -@dataclass -class RunResult: - label: str - wall_seconds: float - rows: int - cols: int - mem_peak_bytes: int - memory_usage_bytes: int - - def __str__(self) -> str: - return ( - f" {self.label:>32}: " - f"{self.wall_seconds:6.2f}s " - f"rows={self.rows:>7} " - f"cols={self.cols:>2} " - f"peak_mem={self.mem_peak_bytes / 1024 / 1024:6.1f} MB " - f"df_mem={self.memory_usage_bytes / 1024 / 1024:6.1f} MB" - ) - - -def time_call(label: str, **kwargs) -> RunResult: - """One end-to-end ``get_continuous`` call, with wall time, peak RSS - and final DataFrame memory captured. - - Note: ``_queryables_cache`` is left warm across measured rounds — - clearing it each round would charge the default-path with an extra - HTTP request per round while the legacy path (which doesn't use - the cache) gets a free pass, inflating the default-path wall time. - Real-world callers issue many queries per process and pay the - queryables fetch only once. - """ - gc.collect() - - tracemalloc.start() - start = time.perf_counter() - df, _md = get_continuous( - monitoring_location_id=SITE, - parameter_code=PARAMETER, - time=TIME_RANGE, - **kwargs, - ) - wall = time.perf_counter() - start - _current, peak = tracemalloc.get_traced_memory() - tracemalloc.stop() - - return RunResult( - label=label, - wall_seconds=wall, - rows=len(df), - cols=df.shape[1], - mem_peak_bytes=peak, - memory_usage_bytes=int(df.memory_usage(deep=True).sum()), - ) - - -def _make_mock_response(body_bytes: bytes, status: int = 200) -> mock.Mock: - """Build a ``requests.Response``-shaped mock backed by ``body_bytes``. - - Only the attributes ``_walk_pages`` and ``_get_resp_data`` touch - need to be real; ``elapsed``, ``status_code``, ``headers``, ``json()``, - and ``raise_for_status()`` cover those callers. Pagination is - forced single-page by stripping any "next" link from the body - before the test patches it in. - """ - resp = mock.Mock(spec=requests.Response) - resp.status_code = status - resp.headers = {"x-ratelimit-remaining": "1000"} - resp.elapsed = __import__("datetime").timedelta(milliseconds=1) - resp.url = "https://test/mock" - resp.text = body_bytes.decode("utf-8") - resp.json = lambda: json.loads(body_bytes) - resp.raise_for_status = lambda: None - return resp - - -def capture_fixtures() -> None: - """Snapshot the two response payloads (with and without hash IDs) - plus the queryables response, so the ``--offline`` benchmark can - parse them locally with no network calls.""" - FIXTURE_DIR.mkdir(exist_ok=True) - session = requests.Session() - headers = wd_utils._default_headers() - - # Queryables (used only by the default code path). Fetched via the - # cached helper so the trimmed-payload request below sends the - # *exact* same property list the runtime would — otherwise the - # benchmark's "trimmed" fixture could drift from production. - print("Fetching queryables …") - non_hash = wd_utils._default_non_hash_properties("continuous", "continuous_id") - # Mirror the bytes back into a fixture for ``time_offline`` to load. - q_body = {"properties": {p: {} for p in wd_utils._service_queryables("continuous")}} - FIXTURE_QUERYABLES.write_bytes(json.dumps(q_body).encode()) - - base = f"{wd_utils.OGC_API_URL}/collections/continuous/items" - common = { - "monitoring_location_id": SITE, - "parameter_code": PARAMETER, - "time": TIME_RANGE, - "skipGeometry": True, - "limit": 50000, - } - - # Full payload (legacy behavior — every column). - print("Fetching full payload …") - r = session.get(base, headers=headers, params=common) - r.raise_for_status() - FIXTURE_FULL.write_bytes(r.content) - print(f" → {FIXTURE_FULL.name}: {len(r.content):,} bytes") - - # Trimmed payload (new default — non-hash columns only). - print("Fetching trimmed payload …") - params = dict(common, properties=",".join(non_hash)) - r = session.get(base, headers=headers, params=params) - r.raise_for_status() - FIXTURE_TRIMMED.write_bytes(r.content) - print(f" → {FIXTURE_TRIMMED.name}: {len(r.content):,} bytes") - - full_size = FIXTURE_FULL.stat().st_size - trim_size = FIXTURE_TRIMMED.stat().st_size - pct = 100 * (full_size - trim_size) / full_size - print() - print( - f"Server payload size: {full_size:,} → {trim_size:,} bytes ({pct:.1f}% smaller)" - ) - - -def time_offline(label: str, payload_path: Path, include_hash_ids: bool) -> RunResult: - """Measure parsing + DataFrame construction time on a captured - payload. ``client.send`` is patched to return the recorded - response, so this isolates the local-CPU portion of a call from - network variability and rate-limit pressure. - - ``_queryables_cache`` and ``_default_props_cache`` are left warm - across rounds (see ``time_call`` for the rationale). - """ - body = payload_path.read_bytes() - queryables_body = FIXTURE_QUERYABLES.read_bytes() - gc.collect() - - def _send(req, *args, **kwargs): - return _make_mock_response(body) - - def _get(url, *args, **kwargs): - return _make_mock_response(queryables_body) - - with ( - mock.patch.object(requests.Session, "send", _send), - mock.patch.object(requests, "get", _get), - ): - tracemalloc.start() - start = time.perf_counter() - df, _md = get_continuous( - monitoring_location_id=SITE, - parameter_code=PARAMETER, - time=TIME_RANGE, - include_hash_ids=include_hash_ids, - ) - wall = time.perf_counter() - start - _current, peak = tracemalloc.get_traced_memory() - tracemalloc.stop() - - return RunResult( - label=label, - wall_seconds=wall, - rows=len(df), - cols=df.shape[1], - mem_peak_bytes=peak, - memory_usage_bytes=int(df.memory_usage(deep=True).sum()), - ) - - -def main() -> int: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--capture", - action="store_true", - help="Fetch and save the two response payloads as fixtures, then exit.", - ) - parser.add_argument( - "--offline", - action="store_true", - help="Use captured fixtures instead of hitting the live API. " - "Isolates parsing/DataFrame cost from network variability.", - ) - parser.add_argument( - "--rounds", - type=int, - default=3, - help="Number of measured rounds per configuration (default: 3).", - ) - args = parser.parse_args() - - if args.capture: - capture_fixtures() - return 0 - - if args.offline: - if not ( - FIXTURE_FULL.exists() - and FIXTURE_TRIMMED.exists() - and FIXTURE_QUERYABLES.exists() - ): - print( - "Missing fixtures. Run with --capture first to record them.", - file=sys.stderr, - ) - return 1 - - full_size = FIXTURE_FULL.stat().st_size - trim_size = FIXTURE_TRIMMED.stat().st_size - print( - f"Offline benchmark on fixtures " - f"(full: {full_size:,} B, trimmed: {trim_size:,} B, " - f"server-side savings: {100 * (full_size - trim_size) / full_size:.1f}%)" - ) - print() - - # Warm up to load pandas/geopandas/numpy code paths. - time_offline("warmup_default", FIXTURE_TRIMMED, include_hash_ids=False) - time_offline("warmup_legacy", FIXTURE_FULL, include_hash_ids=True) - - runs = [] - for _ in range(args.rounds): - runs.append( - time_offline( - "default (hash IDs dropped)", - FIXTURE_TRIMMED, - include_hash_ids=False, - ) - ) - runs.append( - time_offline( - "include_hash_ids=True", - FIXTURE_FULL, - include_hash_ids=True, - ) - ) - else: - print( - f"Benchmarking get_continuous(site={SITE!r}, parameter={PARAMETER!r}, " - f"time={TIME_RANGE!r})" - ) - print() - - # Warmup once with each configuration to absorb DNS/TLS/cache - # cold-start effects, then run measured rounds. - print("Warming up …") - time_call("warmup_default") - time_call("warmup_legacy", include_hash_ids=True) - - runs = [] - for _ in range(args.rounds): - runs.append(time_call("default (hash IDs dropped)")) - runs.append(time_call("include_hash_ids=True", include_hash_ids=True)) - - print("All runs:") - for r in runs: - print(r) - print() - - best_default = min( - (r for r in runs if r.label.startswith("default")), - key=lambda r: r.wall_seconds, - ) - best_legacy = min( - (r for r in runs if r.label.startswith("include_hash_ids")), - key=lambda r: r.wall_seconds, - ) - - print("Best of each:") - print(best_default) - print(best_legacy) - print() - - wall_delta = best_legacy.wall_seconds - best_default.wall_seconds - wall_pct = ( - 100 * wall_delta / best_legacy.wall_seconds if best_legacy.wall_seconds else 0.0 - ) - mem_delta = best_legacy.memory_usage_bytes - best_default.memory_usage_bytes - mem_pct = ( - 100 * mem_delta / best_legacy.memory_usage_bytes - if best_legacy.memory_usage_bytes - else 0.0 - ) - peak_delta = best_legacy.mem_peak_bytes - best_default.mem_peak_bytes - peak_pct = ( - 100 * peak_delta / best_legacy.mem_peak_bytes - if best_legacy.mem_peak_bytes - else 0.0 - ) - - print(f"Wall-clock speedup: {wall_delta:+.2f}s ({wall_pct:+.1f}%)") - print(f"DataFrame memory: {mem_delta / 1024 / 1024:+.1f} MB ({mem_pct:+.1f}%)") - print(f"Peak traced memory: {peak_delta / 1024 / 1024:+.1f} MB ({peak_pct:+.1f}%)") - print(f"Columns dropped: {best_legacy.cols - best_default.cols}") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/benchmarks/results_offline.txt b/benchmarks/results_offline.txt deleted file mode 100644 index e7e10919..00000000 --- a/benchmarks/results_offline.txt +++ /dev/null @@ -1,22 +0,0 @@ -Offline benchmark on fixtures (full: 14,310,081 B, trimmed: 10,230,081 B, server-side savings: 28.5%) - -All runs: - default (hash IDs dropped): 0.96s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.09s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.08s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.06s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - default (hash IDs dropped): 0.97s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.09s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - default (hash IDs dropped): 0.97s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.05s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - -Best of each: - default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.05s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - -Wall-clock speedup: +0.11s (+10.8%) -DataFrame memory: +4.7 MB (+33.5%) -Peak traced memory: +20.2 MB (+21.5%) -Columns dropped: 2