From 127c3eac2ce9698f55ed205d0d080519e429cfc0 Mon Sep 17 00:00:00 2001
From: bw4sz <benweinstein2010@gmail.com>
Date: Wed, 29 Apr 2026 15:54:41 -0700
Subject: [PATCH 1/2] Add USGS aerial wildlife release download and FathomNet
 CSV export.

ScienceBase item 658da7c1d34e3265ab14bb00 (DOI 10.5066/P9CBZQV1): script fetches
manifest JSON, downloads annotations and FGDC metadata by default, optional
codebase and imagery with S3 request URL hint. --peek-schema prints COCO layout
from train/train.json inside the annotations zip.

coco_to_fathomnet_localizations maps COCO bbox xywh to FathomNet CSV columns for
public-hosted patch URLs (e.g. HiPerGator /orange/ewhite/web/public).

Made-with: Cursor
---
 scripts/coco_to_fathomnet_localizations.py    |  93 ++++++++
 .../download_usgs_aerial_wildlife_release.py  | 203 ++++++++++++++++++
 2 files changed, 296 insertions(+)
 create mode 100644 scripts/coco_to_fathomnet_localizations.py
 create mode 100644 scripts/download_usgs_aerial_wildlife_release.py
diff --git a/scripts/coco_to_fathomnet_localizations.py b/scripts/coco_to_fathomnet_localizations.py
new file mode 100644
index 0000000..1e4d1a0
--- /dev/null
+++ b/scripts/coco_to_fathomnet_localizations.py
@@ -0,0 +1,97 @@
+"""Convert COCO detection JSON (bbox xywh) to FathomNet localization CSV.
+
+FathomNet expects columns including: concept, image (public URL), x, y, width, height
+with origin top-left (+Y down). COCO ``bbox`` uses the same convention.
+
+Use on HiPerGator (or any host) after imagery is reachable at a stable HTTPS URL, for
+example under UF Orange public web space:
+
+  /orange/ewhite/web/public/...
+
+so each patch URL looks like:
+  https://<your-public-host>/.../0400AGL_P1_20170224_102133_631_136760_56257_0.png
+
+See: https://www.fathomnet.org/post/how-to-submit-localized-image-annotations-to-the-fathomnet-database
+
+Example:
+  uv run python scripts/coco_to_fathomnet_localizations.py \\
+    --coco-json ./data/usgs_P9CBZQV1/annotations_extracted/train/train.json \\
+    --image-base-url https://example.rc.ufl.edu/public/boem/usgs_patches/ \\
+    --output-csv ./fathomnet_localizations.csv
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import sys
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote, urljoin
+
+
+def load_coco(path: Path) -> dict[str, Any]:
+    """Read ``path`` as UTF-8 text and return the parsed COCO JSON object."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def main() -> None:
+    """Parse CLI flags, turn each usable COCO annotation into a row, and write the CSV."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--coco-json", type=Path, required=True)
+    parser.add_argument(
+        "--image-base-url",
+        required=True,
+        help="Base URL ending with /; file_name from COCO is appended (urljoin).",
+    )
+    parser.add_argument("--output-csv", type=Path, required=True)
+    args = parser.parse_args()
+
+    coco_path: Path = args.coco_json.expanduser().resolve()
+    out_csv: Path = args.output_csv.expanduser().resolve()
+    base: str = args.image_base_url
+    if not base.endswith("/"):
+        base = base + "/"
+
+    data = load_coco(coco_path)
+    id_to_name = {int(c["id"]): str(c.get("name", "")) for c in data.get("categories", [])}
+    id_to_image = {int(im["id"]): im for im in data.get("images", [])}
+    rows: list[dict[str, Any]] = []
+    for ann in data.get("annotations", []):
+        bbox = ann.get("bbox")
+        if not bbox or len(bbox) != 4:
+            continue
+        x, y, w, h = (float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3]))
+        image_id = int(ann["image_id"])
+        image = id_to_image.get(image_id)
+        if image is None:
+            continue
+        file_name = str(image["file_name"])
+        # Percent-encode the basename so a space, "?", "#" or ":" in a file name cannot be
+        # misread by urljoin as URL syntax (query, fragment, or scheme).
+        url = urljoin(base, quote(file_name.split("/")[-1]))
+        cat_id = int(ann.get("category_id", 0))
+        concept = id_to_name.get(cat_id, str(cat_id))
+        rows.append(
+            {
+                "concept": concept,
+                "image": url,
+                "x": int(round(x)),
+                "y": int(round(y)),
+                "width": int(round(w)),
+                "height": int(round(h)),
+            }
+        )
+
+    out_csv.parent.mkdir(parents=True, exist_ok=True)
+    fieldnames = ["concept", "image", "x", "y", "width", "height"]
+    with out_csv.open("w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows)
+    print(f"Wrote {len(rows)} rows to {out_csv}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/download_usgs_aerial_wildlife_release.py b/scripts/download_usgs_aerial_wildlife_release.py
new file mode 100644
index 0000000..d2d115c
--- /dev/null
+++ b/scripts/download_usgs_aerial_wildlife_release.py
@@ -0,0 +1,237 @@
+"""Download assets from the USGS ScienceBase data release for Ke et al. aerial wildlife ML data.
+
+Catalog page: https://data.usgs.gov/datacatalog/data/USGS:658da7c1d34e3265ab14bb00
+DOI: https://doi.org/10.5066/P9CBZQV1
+
+The release bundles Lake Michigan and Nantucket Shoals (Atlantic) imagery and COCO-format
+annotations. Place keywords on the catalog include Cape Cod; the bundled FGDC metadata
+describes geographic coverage as "Lake Michigan and Nantucket Shoals". COCO ``file_name``
+values do not encode region, so splitting "Cape Cod only" requires external flight metadata
+from the data provider if you need a strict geographic subset.
+
+Large imagery (``02_Imagery.zip``, ~18 GB) is served via ScienceBase/S3 and may require
+using the request-download flow in a browser when direct ``curl`` receives HTML instead
+of bytes. This script prints the request URL when you pass ``--include-imagery``.
+
+Examples:
+  uv run python scripts/download_usgs_aerial_wildlife_release.py --out-dir ./data/usgs_P9CBZQV1
+  uv run python scripts/download_usgs_aerial_wildlife_release.py --out-dir ./data/usgs_P9CBZQV1 --peek-schema
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import sys
+import urllib.error
+import urllib.request
+import zipfile
+from pathlib import Path
+from typing import Any
+
+SCIENCEBASE_ITEM_ID = "658da7c1d34e3265ab14bb00"
+ITEM_JSON_URL = f"https://www.sciencebase.gov/catalog/item/{SCIENCEBASE_ITEM_ID}?format=json"
+# A reply shorter than this that starts with "<" is treated as an HTML page, not a file.
+HTML_SNIFF_BYTES = 5000
+# Chunk size used when streaming a download to disk.
+COPY_CHUNK_BYTES = 1024 * 1024
+
+
+def fetch_item_json() -> dict[str, Any]:
+    """Fetch ``ITEM_JSON_URL`` and return the decoded JSON manifest."""
+    with urllib.request.urlopen(ITEM_JSON_URL, timeout=120) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def file_entries(item: dict[str, Any]) -> list[dict[str, Any]]:
+    """Return the manifest's ``files`` list, or an empty list when it is missing or null."""
+    return list(item.get("files") or [])
+
+
+def pick_download_uri(entry: dict[str, Any]) -> str:
+    """Return the entry's ``downloadUri``, falling back to ``url``.
+
+    Raises:
+        ValueError: If the entry has neither key (or both are empty).
+    """
+    uri = entry.get("downloadUri") or entry.get("url")
+    if not uri:
+        raise ValueError(f"No download URI for file: {entry.get('name')}")
+    return str(uri)
+
+
+def human_size(n: int | None) -> str:
+    """Format a byte count using 1024-based units; ``None`` becomes ``"unknown"``."""
+    if n is None:
+        return "unknown"
+    units = ("B", "KB", "MB", "GB", "TB")
+    v = float(n)
+    u = 0
+    while v >= 1024 and u < len(units) - 1:
+        v /= 1024.0
+        u += 1
+    if u == 0:
+        return f"{int(n)} {units[u]}"
+    return f"{v:.1f} {units[u]}"
+
+
+def download_url(url: str, dest: Path) -> None:
+    """Stream ``url`` to ``dest`` without buffering the whole payload in memory.
+
+    Bytes are written to a sibling ``<name>.part`` file that is renamed onto ``dest``
+    only after the transfer completes, so an interrupted run cannot leave a truncated
+    ``dest`` that a later run would skip as already downloaded.
+
+    Raises:
+        RuntimeError: If the reply is a short HTML page instead of file bytes.
+    """
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    partial = dest.with_name(dest.name + ".part")
+    try:
+        with urllib.request.urlopen(url, timeout=600) as resp:
+            head = resp.read(HTML_SNIFF_BYTES)
+            if len(head) < HTML_SNIFF_BYTES and head.lstrip().startswith(b"<"):
+                raise RuntimeError(
+                    f"Download from {url} returned HTML ({len(head)} bytes), not a file. "
+                    "Try the ScienceBase request-download page for S3-backed assets."
+                )
+            with partial.open("wb") as out:
+                out.write(head)
+                shutil.copyfileobj(resp, out, COPY_CHUNK_BYTES)
+        partial.replace(dest)
+    finally:
+        # No-op after a successful rename; removes the leftover on any failure.
+        partial.unlink(missing_ok=True)
+
+
+def peek_coco_schema(annotations_zip: Path) -> None:
+    """Print a summary of the first ``train/train.json`` member of ``annotations_zip``."""
+    with zipfile.ZipFile(annotations_zip) as zf:
+        names = [n for n in zf.namelist() if n.endswith("train/train.json")]
+        if not names:
+            print("No annotations/train/train.json inside zip; members:", zf.namelist()[:30])
+            return
+        inner = names[0]
+        with zf.open(inner) as f:
+            data = json.load(f)
+    imgs = data.get("images") or []
+    anns = data.get("annotations") or []
+    cats = data.get("categories") or []
+    print("COCO keys:", sorted(data.keys()))
+    print("n_images:", len(imgs), "n_annotations:", len(anns), "n_categories:", len(cats))
+    if imgs:
+        print("sample image:", {k: imgs[0].get(k) for k in ("id", "file_name", "width", "height")})
+    if anns:
+        a0 = anns[0]
+        print("sample annotation keys:", sorted(a0.keys()))
+        print("sample bbox (xywh):", a0.get("bbox"), "image_id:", a0.get("image_id"))
+    if cats:
+        print("sample category:", cats[0])
+
+
+def main() -> None:
+    """Parse CLI flags, resolve the requested release files, and download them."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--out-dir",
+        type=Path,
+        default=Path("./data/usgs_P9CBZQV1"),
+        help="Directory to write downloaded zips and metadata.",
+    )
+    parser.add_argument(
+        "--include-codebase",
+        action="store_true",
+        help="Also download 01_CodeBase.zip (~8 MB).",
+    )
+    parser.add_argument(
+        "--include-imagery",
+        action="store_true",
+        help="Attempt 02_Imagery.zip (~18 GB). Often requires ScienceBase browser download.",
+    )
+    parser.add_argument(
+        "--skip-annotations",
+        action="store_true",
+        help="Do not download 03_Annotations.zip.",
+    )
+    parser.add_argument(
+        "--peek-schema",
+        action="store_true",
+        help="After annotations zip exists, print COCO schema summary from train/train.json.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="List resolved URLs and sizes without downloading.",
+    )
+    args = parser.parse_args()
+    out_dir: Path = args.out_dir.expanduser().resolve()
+
+    try:
+        item = fetch_item_json()
+    except urllib.error.URLError as err:
+        print(f"Could not fetch {ITEM_JSON_URL}: {err}", file=sys.stderr)
+        sys.exit(1)
+    by_name = {str(e.get("name")): e for e in file_entries(item)}
+
+    # Each plan item pairs a manifest entry with its local destination, so the download
+    # loop below never has to look an entry up (or handle a missing one) again.
+    plan: list[tuple[dict[str, Any], Path]] = []
+    if not args.skip_annotations:
+        entry = by_name.get("03_Annotations.zip")
+        if not entry:
+            print("Item JSON missing 03_Annotations.zip", file=sys.stderr)
+            sys.exit(1)
+        plan.append((entry, out_dir / "03_Annotations.zip"))
+    if args.include_codebase:
+        entry = by_name.get("01_CodeBase.zip")
+        if entry:
+            plan.append((entry, out_dir / "01_CodeBase.zip"))
+        else:
+            print("Item JSON missing 01_CodeBase.zip; skipping", file=sys.stderr)
+    if args.include_imagery:
+        entry = by_name.get("02_Imagery.zip")
+        if entry:
+            plan.append((entry, out_dir / "02_Imagery.zip"))
+            s3_page = entry.get("s3DownloadRequestPageUri")
+            if s3_page:
+                print("If direct download fails, request the large imagery zip via:", s3_page)
+        else:
+            print("Item JSON missing 02_Imagery.zip; skipping", file=sys.stderr)
+
+    meta = by_name.get("Metadata for Deep Learning Wildlife Detection Model.xml")
+    if meta:
+        plan.append((meta, out_dir / "metadata_fgdc.xml"))
+
+    for entry, dest in plan:
+        name = str(entry.get("name"))
+        raw_size = entry.get("size")
+        # Pass None through so a missing (or zero) manifest size prints "unknown", not "0 B".
+        size = int(raw_size) if raw_size else None
+        url = pick_download_uri(entry)
+        print(f"{name} -> {dest} ({human_size(size)})")
+        print(f"  {url}")
+        if args.dry_run:
+            continue
+        if dest.exists() and dest.stat().st_size > 0:
+            print(f"  skip (exists): {dest}")
+            continue
+        if name == "02_Imagery.zip":
+            print("  Warning: very large file; ensure disk space and stable network.")
+        try:
+            download_url(url, dest)
+        except (urllib.error.URLError, RuntimeError) as err:
+            print(f"  download failed: {err}", file=sys.stderr)
+            sys.exit(1)
+        print(f"  wrote {dest} ({human_size(dest.stat().st_size)})")
+
+    ann_zip = out_dir / "03_Annotations.zip"
+    if args.peek_schema:
+        if not ann_zip.is_file():
+            print("--peek-schema: annotations zip not found at", ann_zip, file=sys.stderr)
+            sys.exit(1)
+        peek_coco_schema(ann_zip)
+
+
+if __name__ == "__main__":
+    main()

From 61d7e6bfd875abafa386b49955e848901bf5e973 Mon Sep 17 00:00:00 2001
From: bw4sz <benweinstein2010@gmail.com>
Date: Fri, 1 May 2026 13:43:53 -0400
Subject: [PATCH 2/2] Update FathomNet conversion script with real HPC paths
 and --limit flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds --limit N to sample the first N images (useful for pilot submissions).
Updates docstring with concrete HPC paths:
- Imagery: /orange/ewhite/web/public/BOEM/usgs/images/ → https://data.rc.ufl.edu/pub/ewhite/BOEM/usgs/images/
- Annotations: data/usgs_P9CBZQV1/annotations_extracted/annotations/{train,eval,test}/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scripts/coco_to_fathomnet_localizations.py | 41 +++++++++++++++-------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/scripts/coco_to_fathomnet_localizations.py b/scripts/coco_to_fathomnet_localizations.py
index 1e4d1a0..eb181fc 100644
--- a/scripts/coco_to_fathomnet_localizations.py
+++ b/scripts/coco_to_fathomnet_localizations.py
@@ -1,23 +1,27 @@
 """Convert COCO detection JSON (bbox xywh) to FathomNet localization CSV.
 
-FathomNet expects columns including: concept, image (public URL), x, y, width, height
+FathomNet expects columns: concept, image (public URL), x, y, width, height
 with origin top-left (+Y down). COCO ``bbox`` uses the same convention.
 
-Use on HiPerGator (or any host) after imagery is reachable at a stable HTTPS URL, for
-example under UF Orange public web space:
+Imagery from the USGS ScienceBase release (DOI: 10.5066/P9CBZQV1) is served
+from UF Orange public web space:
 
-  /orange/ewhite/web/public/...
+  /orange/ewhite/web/public/BOEM/usgs/images/
 
-so each patch URL looks like:
-  https://<your-public-host>/.../0400AGL_P1_20170224_102133_631_136760_56257_0.png
+which maps to:
+  https://data.rc.ufl.edu/pub/ewhite/BOEM/usgs/images/
+
+Annotations are extracted from 03_Annotations.zip into:
+  data/usgs_P9CBZQV1/annotations_extracted/annotations/{train,eval,test}/
 
 See: https://www.fathomnet.org/post/how-to-submit-localized-image-annotations-to-the-fathomnet-database
 
-Example:
+Example (10-image sample from test split):
   uv run python scripts/coco_to_fathomnet_localizations.py \\
-    --coco-json ./data/usgs_P9CBZQV1/annotations_extracted/train/train.json \\
-    --image-base-url https://example.rc.ufl.edu/public/boem/usgs_patches/ \\
-    --output-csv ./fathomnet_localizations.csv
+    --coco-json data/usgs_P9CBZQV1/annotations_extracted/annotations/test/test_gt.json \\
+    --image-base-url https://data.rc.ufl.edu/pub/ewhite/BOEM/usgs/images/ \\
+    --output-csv output/usgs_fathomnet_sample.csv \\
+    --limit 10
 """
 
 from __future__ import annotations
@@ -44,6 +48,14 @@ def main() -> None:
         help="Base URL ending with /; file_name from COCO is appended (urljoin).",
     )
     parser.add_argument("--output-csv", type=Path, required=True)
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Keep only the first N unique images (useful for sampling).",
+    )
     args = parser.parse_args()
+    if args.limit is not None and args.limit < 0:
+        parser.error("--limit must be a non-negative integer")
 
     coco_path: Path = args.coco_json.expanduser().resolve()
@@ -54,14 +66,20 @@ def main() -> None:
 
     data = load_coco(coco_path)
     id_to_name = {int(c["id"]): str(c.get("name", "")) for c in data.get("categories", [])}
-    id_to_image = {int(im["id"]): im for im in data.get("images", [])}
+    all_images = data.get("images", [])
+    # None means no sampling. --limit was checked to be >= 0 right after parsing, so the
+    # slice can only take a prefix (a negative bound would drop images from the end).
+    kept_ids = None if args.limit is None else {int(im["id"]) for im in all_images[: args.limit]}
+    id_to_image = {int(im["id"]): im for im in all_images}
     rows: list[dict[str, Any]] = []
     for ann in data.get("annotations", []):
+        image_id = int(ann["image_id"])
+        if kept_ids is not None and image_id not in kept_ids:
+            continue
         bbox = ann.get("bbox")
         if not bbox or len(bbox) != 4:
             continue
         x, y, w, h = (float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3]))
-        image_id = int(ann["image_id"])
         image = id_to_image.get(image_id)
         if image is None:
             continue