From 127c3eac2ce9698f55ed205d0d080519e429cfc0 Mon Sep 17 00:00:00 2001 From: bw4sz Date: Wed, 29 Apr 2026 15:54:41 -0700 Subject: [PATCH 1/2] Add USGS aerial wildlife release download and FathomNet CSV export. ScienceBase item 658da7c1d34e3265ab14bb00 (DOI 10.5066/P9CBZQV1): script fetches manifest JSON, downloads annotations and FGDC metadata by default, optional codebase and imagery with S3 request URL hint. --peek-schema prints COCO layout from train/train.json inside the annotations zip. coco_to_fathomnet_localizations maps COCO bbox xywh to FathomNet CSV columns for public-hosted patch URLs (e.g. HiPerGator /orange/ewhite/web/public). Made-with: Cursor --- scripts/coco_to_fathomnet_localizations.py | 93 ++++++++ .../download_usgs_aerial_wildlife_release.py | 203 ++++++++++++++++++ 2 files changed, 296 insertions(+) create mode 100644 scripts/coco_to_fathomnet_localizations.py create mode 100644 scripts/download_usgs_aerial_wildlife_release.py diff --git a/scripts/coco_to_fathomnet_localizations.py b/scripts/coco_to_fathomnet_localizations.py new file mode 100644 index 0000000..1e4d1a0 --- /dev/null +++ b/scripts/coco_to_fathomnet_localizations.py @@ -0,0 +1,93 @@ +"""Convert COCO detection JSON (bbox xywh) to FathomNet localization CSV. + +FathomNet expects columns including: concept, image (public URL), x, y, width, height +with origin top-left (+Y down). COCO ``bbox`` uses the same convention. + +Use on HiPerGator (or any host) after imagery is reachable at a stable HTTPS URL, for +example under UF Orange public web space: + + /orange/ewhite/web/public/... + +so each patch URL looks like: + https:///.../0400AGL_P1_20170224_102133_631_136760_56257_0.png + +See: https://www.fathomnet.org/post/how-to-submit-localized-image-annotations-to-the-fathomnet-database + +Example: + uv run python scripts/coco_to_fathomnet_localizations.py \\ + --coco-json ./data/usgs_P9CBZQV1/annotations_extracted/train/train.json \\ + --image-base-url https://example.rc.ufl.edu/public/boem/usgs_patches/ \\ + --output-csv ./fathomnet_localizations.csv +""" + +from __future__ import annotations + +import argparse +import csv +import json +import sys +from pathlib import Path +from typing import Any +from urllib.parse import urljoin + + +def load_coco(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--coco-json", type=Path, required=True) + parser.add_argument( + "--image-base-url", + required=True, + help="Base URL ending with /; file_name from COCO is appended (urljoin).", + ) + parser.add_argument("--output-csv", type=Path, required=True) + args = parser.parse_args() + + coco_path: Path = args.coco_json.expanduser().resolve() + out_csv: Path = args.output_csv.expanduser().resolve() + base: str = args.image_base_url + if not base.endswith("/"): + base = base + "/" + + data = load_coco(coco_path) + id_to_name = {int(c["id"]): str(c.get("name", "")) for c in data.get("categories", [])} + id_to_image = {int(im["id"]): im for im in data.get("images", [])} + rows: list[dict[str, Any]] = [] + for ann in data.get("annotations", []): + bbox = ann.get("bbox") + if not bbox or len(bbox) != 4: + continue + x, y, w, h = (float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])) + image_id = int(ann["image_id"]) + image = id_to_image.get(image_id) + if image is None: + continue + file_name = str(image["file_name"]) + url = urljoin(base, file_name.split("/")[-1]) + cat_id = int(ann.get("category_id", 0)) + concept = id_to_name.get(cat_id, str(cat_id)) + rows.append( + { + "concept": concept, + "image": url, + "x": int(round(x)), + "y": int(round(y)), + "width": int(round(w)), + "height": int(round(h)), + } + ) + + out_csv.parent.mkdir(parents=True, exist_ok=True) + fieldnames = ["concept", "image", "x", "y", "width", "height"] + with out_csv.open("w", newline="", encoding="utf-8") as f: + w = csv.DictWriter(f, fieldnames=fieldnames) + w.writeheader() + w.writerows(rows) + print(f"Wrote {len(rows)} rows to {out_csv}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/scripts/download_usgs_aerial_wildlife_release.py b/scripts/download_usgs_aerial_wildlife_release.py new file mode 100644 index 0000000..d2d115c --- /dev/null +++ b/scripts/download_usgs_aerial_wildlife_release.py @@ -0,0 +1,203 @@ +"""Download assets from the USGS ScienceBase data release for Ke et al. aerial wildlife ML data. + +Catalog page: https://data.usgs.gov/datacatalog/data/USGS:658da7c1d34e3265ab14bb00 +DOI: https://doi.org/10.5066/P9CBZQV1 + +The release bundles Lake Michigan and Nantucket Shoals (Atlantic) imagery and COCO-format +annotations. Place keywords on the catalog include Cape Cod; the bundled FGDC metadata +describes geographic coverage as "Lake Michigan and Nantucket Shoals". COCO ``file_name`` +values do not encode region, so splitting "Cape Cod only" requires external flight metadata +from the data provider if you need a strict geographic subset. + +Large imagery (``02_Imagery.zip``, ~18 GB) is served via ScienceBase/S3 and may require +using the request-download flow in a browser when direct ``curl`` receives HTML instead +of bytes. This script prints the request URL when you pass ``--include-imagery``. + +Examples: + uv run python scripts/download_usgs_aerial_wildlife_release.py --out-dir ./data/usgs_P9CBZQV1 + uv run python scripts/download_usgs_aerial_wildlife_release.py --out-dir ./data/usgs_P9CBZQV1 --peek-schema +""" + +from __future__ import annotations + +import argparse +import json +import sys +import urllib.error +import urllib.request +import zipfile +from pathlib import Path +from typing import Any + +SCIENCEBASE_ITEM_ID = "658da7c1d34e3265ab14bb00" +ITEM_JSON_URL = f"https://www.sciencebase.gov/catalog/item/{SCIENCEBASE_ITEM_ID}?format=json" + + +def fetch_item_json() -> dict[str, Any]: + with urllib.request.urlopen(ITEM_JSON_URL, timeout=120) as resp: + return json.loads(resp.read().decode("utf-8")) + + +def file_entries(item: dict[str, Any]) -> list[dict[str, Any]]: + return list(item.get("files") or []) + + +def pick_download_uri(entry: dict[str, Any]) -> str: + uri = entry.get("downloadUri") or entry.get("url") + if not uri: + raise ValueError(f"No download URI for file: {entry.get('name')}") + return str(uri) + + +def human_size(n: int | None) -> str: + if n is None: + return "unknown" + units = ("B", "KB", "MB", "GB", "TB") + v = float(n) + u = 0 + while v >= 1024 and u < len(units) - 1: + v /= 1024.0 + u += 1 + if u == 0: + return f"{int(n)} {units[u]}" + return f"{v:.1f} {units[u]}" + + +def download_url(url: str, dest: Path) -> None: + dest.parent.mkdir(parents=True, exist_ok=True) + with urllib.request.urlopen(url, timeout=600) as resp: + body = resp.read() + if len(body) < 5000 and body.lstrip().startswith(b"<"): + raise RuntimeError( + f"Download from {url} returned HTML ({len(body)} bytes), not a file. " + "Try the ScienceBase request-download page for S3-backed assets." + ) + dest.write_bytes(body) + + +def peek_coco_schema(annotations_zip: Path) -> None: + with zipfile.ZipFile(annotations_zip) as zf: + names = [n for n in zf.namelist() if n.endswith("train/train.json")] + if not names: + print("No annotations/train/train.json inside zip; members:", zf.namelist()[:30]) + return + inner = names[0] + with zf.open(inner) as f: + data = json.load(f) + imgs = data.get("images") or [] + anns = data.get("annotations") or [] + cats = data.get("categories") or [] + print("COCO keys:", sorted(data.keys())) + print("n_images:", len(imgs), "n_annotations:", len(anns), "n_categories:", len(cats)) + if imgs: + print("sample image:", {k: imgs[0].get(k) for k in ("id", "file_name", "width", "height")}) + if anns: + a0 = anns[0] + print( + "sample annotation keys:", + sorted(a0.keys()), + ) + print("sample bbox (xywh):", a0.get("bbox"), "image_id:", a0.get("image_id")) + if cats: + print("sample category:", cats[0]) + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--out-dir", + type=Path, + default=Path("./data/usgs_P9CBZQV1"), + help="Directory to write downloaded zips and metadata.", + ) + parser.add_argument( + "--include-codebase", + action="store_true", + help="Also download 01_CodeBase.zip (~8 MB).", + ) + parser.add_argument( + "--include-imagery", + action="store_true", + help="Attempt 02_Imagery.zip (~18 GB). Often requires ScienceBase browser download.", + ) + parser.add_argument( + "--skip-annotations", + action="store_true", + help="Do not download 03_Annotations.zip.", + ) + parser.add_argument( + "--peek-schema", + action="store_true", + help="After annotations zip exists, print COCO schema summary from train/train.json.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="List resolved URLs and sizes without downloading.", + ) + args = parser.parse_args() + out_dir: Path = args.out_dir.expanduser().resolve() + + item = fetch_item_json() + by_name = {str(e.get("name")): e for e in file_entries(item)} + + plan: list[tuple[str, Path]] = [] + if not args.skip_annotations: + e = by_name.get("03_Annotations.zip") + if not e: + print("Item JSON missing 03_Annotations.zip", file=sys.stderr) + sys.exit(1) + plan.append(("03_Annotations.zip", out_dir / "03_Annotations.zip")) + if args.include_codebase: + e = by_name.get("01_CodeBase.zip") + if e: + plan.append(("01_CodeBase.zip", out_dir / "01_CodeBase.zip")) + if args.include_imagery: + e = by_name.get("02_Imagery.zip") + if e: + plan.append(("02_Imagery.zip", out_dir / "02_Imagery.zip")) + s3_page = e.get("s3DownloadRequestPageUri") + if s3_page: + print("If direct download fails, request the large imagery zip via:", s3_page) + + meta = by_name.get("Metadata for Deep Learning Wildlife Detection Model.xml") + if meta: + plan.append( + ( + "Metadata for Deep Learning Wildlife Detection Model.xml", + out_dir / "metadata_fgdc.xml", + ) + ) + + for logical, dest in plan: + entry = by_name.get(logical) or next( + (e for e in file_entries(item) if e.get("name") == logical), + None, + ) + if entry is None and logical == "metadata_fgdc.xml": + continue + name = str(entry.get("name")) if entry else logical + size = int(entry.get("size") or 0) if entry else 0 + url = pick_download_uri(entry) if entry else "" + print(f"{name} -> {dest} ({human_size(size)})") + print(f" {url}") + if args.dry_run: + continue + if dest.exists() and dest.stat().st_size > 0: + print(f" skip (exists): {dest}") + continue + if name == "02_Imagery.zip": + print(" Warning: very large file; ensure disk space and stable network.") + download_url(url, dest) + print(f" wrote {dest} ({human_size(dest.stat().st_size)})") + + ann_zip = out_dir / "03_Annotations.zip" + if args.peek_schema: + if not ann_zip.is_file(): + print("--peek-schema: annotations zip not found at", ann_zip, file=sys.stderr) + sys.exit(1) + peek_coco_schema(ann_zip) + + +if __name__ == "__main__": + main() From 61d7e6bfd875abafa386b49955e848901bf5e973 Mon Sep 17 00:00:00 2001 From: bw4sz Date: Fri, 1 May 2026 13:43:53 -0400 Subject: [PATCH 2/2] Update FathomNet conversion script with real HPC paths and --limit flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds --limit N to sample the first N images (useful for pilot submissions). Updates docstring with concrete HPC paths: - Imagery: /orange/ewhite/web/public/BOEM/usgs/images/ → https://data.rc.ufl.edu/pub/ewhite/BOEM/usgs/images/ - Annotations: data/usgs_P9CBZQV1/annotations_extracted/annotations/{train,eval,test}/ Co-Authored-By: Claude Sonnet 4.6 --- scripts/coco_to_fathomnet_localizations.py | 41 +++++++++++++++------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/scripts/coco_to_fathomnet_localizations.py b/scripts/coco_to_fathomnet_localizations.py index 1e4d1a0..eb181fc 100644 --- a/scripts/coco_to_fathomnet_localizations.py +++ b/scripts/coco_to_fathomnet_localizations.py @@ -1,23 +1,27 @@ """Convert COCO detection JSON (bbox xywh) to FathomNet localization CSV. -FathomNet expects columns including: concept, image (public URL), x, y, width, height +FathomNet expects columns: concept, image (public URL), x, y, width, height with origin top-left (+Y down). COCO ``bbox`` uses the same convention. -Use on HiPerGator (or any host) after imagery is reachable at a stable HTTPS URL, for -example under UF Orange public web space: +Imagery from the USGS ScienceBase release (DOI: 10.5066/P9CBZQV1) is served +from UF Orange public web space: - /orange/ewhite/web/public/... + /orange/ewhite/web/public/BOEM/usgs/images/ -so each patch URL looks like: - https:///.../0400AGL_P1_20170224_102133_631_136760_56257_0.png +which maps to: + https://data.rc.ufl.edu/pub/ewhite/BOEM/usgs/images/ + +Annotations are extracted from 03_Annotations.zip into: + data/usgs_P9CBZQV1/annotations_extracted/annotations/{train,eval,test}/ See: https://www.fathomnet.org/post/how-to-submit-localized-image-annotations-to-the-fathomnet-database -Example: +Example (10-image sample from test split): uv run python scripts/coco_to_fathomnet_localizations.py \\ - --coco-json ./data/usgs_P9CBZQV1/annotations_extracted/train/train.json \\ - --image-base-url https://example.rc.ufl.edu/public/boem/usgs_patches/ \\ - --output-csv ./fathomnet_localizations.csv + --coco-json data/usgs_P9CBZQV1/annotations_extracted/annotations/test/test_gt.json \\ + --image-base-url https://data.rc.ufl.edu/pub/ewhite/BOEM/usgs/images/ \\ + --output-csv output/usgs_fathomnet_sample.csv \\ + --limit 10 """ from __future__ import annotations @@ -44,6 +48,12 @@ def main() -> None: help="Base URL ending with /; file_name from COCO is appended (urljoin).", ) parser.add_argument("--output-csv", type=Path, required=True) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Keep only the first N unique images (useful for sampling).", + ) args = parser.parse_args() coco_path: Path = args.coco_json.expanduser().resolve() @@ -54,14 +64,21 @@ def main() -> None: data = load_coco(coco_path) id_to_name = {int(c["id"]): str(c.get("name", "")) for c in data.get("categories", [])} - id_to_image = {int(im["id"]): im for im in data.get("images", [])} + all_images = data.get("images", []) + if args.limit is not None: + kept_ids = {int(im["id"]) for im in all_images[: args.limit]} + else: + kept_ids = None + id_to_image = {int(im["id"]): im for im in all_images} rows: list[dict[str, Any]] = [] for ann in data.get("annotations", []): + image_id = int(ann["image_id"]) + if kept_ids is not None and image_id not in kept_ids: + continue bbox = ann.get("bbox") if not bbox or len(bbox) != 4: continue x, y, w, h = (float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])) - image_id = int(ann["image_id"]) image = id_to_image.get(image_id) if image is None: continue