Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions scripts/coco_to_fathomnet_localizations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Convert COCO detection JSON (bbox xywh) to FathomNet localization CSV.

FathomNet expects columns: concept, image (public URL), x, y, width, height
with origin top-left (+Y down). COCO ``bbox`` uses the same convention.

Imagery from the USGS ScienceBase release (DOI: 10.5066/P9CBZQV1) is served
from UF Orange public web space:

/orange/ewhite/web/public/BOEM/usgs/images/

which maps to:
https://data.rc.ufl.edu/pub/ewhite/BOEM/usgs/images/

Annotations are extracted from 03_Annotations.zip into:
data/usgs_P9CBZQV1/annotations_extracted/annotations/{train,eval,test}/

See: https://www.fathomnet.org/post/how-to-submit-localized-image-annotations-to-the-fathomnet-database

Example (10-image sample from test split):
uv run python scripts/coco_to_fathomnet_localizations.py \\
--coco-json data/usgs_P9CBZQV1/annotations_extracted/annotations/test/test_gt.json \\
--image-base-url https://data.rc.ufl.edu/pub/ewhite/BOEM/usgs/images/ \\
--output-csv output/usgs_fathomnet_sample.csv \\
--limit 10
"""

from __future__ import annotations

import argparse
import csv
import json
import sys
from pathlib import Path
from typing import Any
from urllib.parse import urljoin


def load_coco(path: Path) -> dict[str, Any]:
    """Read the UTF-8 encoded JSON file at *path* and return the decoded object."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def _coco_to_rows(
    data: dict[str, Any],
    base: str,
    limit: int | None = None,
) -> list[dict[str, Any]]:
    """Turn a decoded COCO document into FathomNet localization rows.

    Args:
        data: Decoded COCO JSON with ``images``, ``annotations`` and
            ``categories`` lists (missing lists are treated as empty).
        base: Image base URL; expected to end with ``/``.
        limit: When given, only annotations belonging to the first ``limit``
            entries of ``images`` are kept.

    Returns:
        One dict per usable annotation with the keys ``concept``, ``image``,
        ``x``, ``y``, ``width`` and ``height`` (box values rounded to ints).
    """
    # Function-scope import: the module itself only imports ``urljoin``.
    from urllib.parse import quote

    id_to_name = {int(c["id"]): str(c.get("name", "")) for c in data.get("categories", [])}
    all_images = data.get("images", [])
    kept_ids = None if limit is None else {int(im["id"]) for im in all_images[:limit]}
    id_to_image = {int(im["id"]): im for im in all_images}

    rows: list[dict[str, Any]] = []
    for ann in data.get("annotations", []):
        image_id = int(ann["image_id"])
        if kept_ids is not None and image_id not in kept_ids:
            continue
        bbox = ann.get("bbox")
        if not bbox or len(bbox) != 4:
            continue
        x, y, width, height = (float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3]))
        image = id_to_image.get(image_id)
        if image is None:
            continue
        # Only the basename is used: images are served from one flat directory.
        basename = str(image["file_name"]).split("/")[-1]
        # Percent-encode before joining. Unencoded spaces, '#' or '?' produce
        # invalid/truncated URLs, and a ':' in the name makes urljoin treat the
        # name as an absolute URL and drop the base entirely.
        url = urljoin(base, quote(basename))
        cat_id = int(ann.get("category_id", 0))
        # Fall back to the numeric id when the category has no entry.
        concept = id_to_name.get(cat_id, str(cat_id))
        rows.append(
            {
                "concept": concept,
                "image": url,
                "x": int(round(x)),
                "y": int(round(y)),
                "width": int(round(width)),
                "height": int(round(height)),
            }
        )
    return rows


def main() -> None:
    """Command-line entry point: convert a COCO JSON file to a FathomNet CSV.

    Parses ``--coco-json``, ``--image-base-url``, ``--output-csv`` and the
    optional ``--limit``, builds one CSV row per annotation that has a
    four-element ``bbox`` and a known image, writes the CSV (creating parent
    directories as needed) and reports the row count on stderr.

    A negative ``--limit`` is rejected with an argparse usage error instead of
    being applied as a from-the-end slice of the image list.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--coco-json", type=Path, required=True)
    parser.add_argument(
        "--image-base-url",
        required=True,
        help="Base URL ending with /; file_name from COCO is appended (urljoin).",
    )
    parser.add_argument("--output-csv", type=Path, required=True)
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Keep only the first N unique images (useful for sampling).",
    )
    args = parser.parse_args()
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    coco_path: Path = args.coco_json.expanduser().resolve()
    out_csv: Path = args.output_csv.expanduser().resolve()
    base: str = args.image_base_url
    if not base.endswith("/"):
        # urljoin would otherwise replace the last path segment of the base.
        base = base + "/"

    rows = _coco_to_rows(load_coco(coco_path), base, args.limit)

    out_csv.parent.mkdir(parents=True, exist_ok=True)
    fieldnames = ["concept", "image", "x", "y", "width", "height"]
    with out_csv.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Wrote {len(rows)} rows to {out_csv}", file=sys.stderr)


# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
203 changes: 203 additions & 0 deletions scripts/download_usgs_aerial_wildlife_release.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
"""Download assets from the USGS ScienceBase data release for Ke et al. aerial wildlife ML data.

Catalog page: https://data.usgs.gov/datacatalog/data/USGS:658da7c1d34e3265ab14bb00
DOI: https://doi.org/10.5066/P9CBZQV1

The release bundles Lake Michigan and Nantucket Shoals (Atlantic) imagery and COCO-format
annotations. Place keywords on the catalog include Cape Cod; the bundled FGDC metadata
describes geographic coverage as "Lake Michigan and Nantucket Shoals". COCO ``file_name``
values do not encode region, so splitting "Cape Cod only" requires external flight metadata
from the data provider if you need a strict geographic subset.

Large imagery (``02_Imagery.zip``, ~18 GB) is served via ScienceBase/S3 and may require
using the request-download flow in a browser when direct ``curl`` receives HTML instead
of bytes. This script prints the request URL when you pass ``--include-imagery``.

Examples:
uv run python scripts/download_usgs_aerial_wildlife_release.py --out-dir ./data/usgs_P9CBZQV1
uv run python scripts/download_usgs_aerial_wildlife_release.py --out-dir ./data/usgs_P9CBZQV1 --peek-schema
"""

from __future__ import annotations

import argparse
import json
import sys
import urllib.error
import urllib.request
import zipfile
from pathlib import Path
from typing import Any

# ScienceBase catalog item id of the data release (the same id appears in the
# catalog page URL quoted in the module docstring).
SCIENCEBASE_ITEM_ID = "658da7c1d34e3265ab14bb00"
# URL requesting the item description as JSON; fetched by fetch_item_json().
ITEM_JSON_URL = f"https://www.sciencebase.gov/catalog/item/{SCIENCEBASE_ITEM_ID}?format=json"


def fetch_item_json() -> dict[str, Any]:
    """Fetch ``ITEM_JSON_URL`` (120 s timeout) and return the decoded JSON body."""
    response = urllib.request.urlopen(ITEM_JSON_URL, timeout=120)
    with response:
        raw = response.read()
        text = raw.decode("utf-8")
        return json.loads(text)


def file_entries(item: dict[str, Any]) -> list[dict[str, Any]]:
    """Return a new list with the item's ``files`` entries.

    A missing, ``None`` or otherwise falsy ``files`` value yields an empty list.
    """
    files = item.get("files")
    if not files:
        return []
    return list(files)


def pick_download_uri(entry: dict[str, Any]) -> str:
    """Return the entry's download location as a string.

    ``downloadUri`` is preferred; ``url`` is the fallback when the former is
    missing or empty.

    Raises:
        ValueError: If neither key holds a truthy value.
    """
    for key in ("downloadUri", "url"):
        candidate = entry.get(key)
        if candidate:
            return str(candidate)
    raise ValueError(f"No download URI for file: {entry.get('name')}")


def human_size(n: int | None) -> str:
    """Format a byte count for display using 1024-based units.

    Args:
        n: Size in bytes, or ``None`` when the size is not known.

    Returns:
        ``"unknown"`` for ``None``; an integer count with ``B`` below 1024;
        otherwise one decimal place with the largest fitting unit among KB,
        MB, GB and TB (sizes beyond TB stay expressed in TB).
    """
    if n is None:
        return "unknown"
    units = ("B", "KB", "MB", "GB", "TB")
    last = len(units) - 1
    value = float(n)
    idx = 0
    # Compare the value as it will be displayed (one decimal place). Testing
    # the unrounded value let sizes just under a boundary print as e.g.
    # "1024.0 KB" instead of "1.0 MB".
    while idx < last and round(value, 1) >= 1024:
        value /= 1024.0
        idx += 1
    if idx == 0:
        return f"{int(n)} {units[idx]}"
    return f"{value:.1f} {units[idx]}"


def download_url(url: str, dest: Path) -> None:
    """Download *url* to *dest*, streaming the body to disk.

    The response is copied in chunks to a sibling ``<name>.part`` file and
    renamed onto *dest* only once the transfer has finished. This keeps memory
    use flat for very large assets, and means an interrupted download never
    leaves a truncated file at *dest* to be mistaken for a complete one.

    Args:
        url: Location to fetch (opened with a 600 s timeout).
        dest: Final file path; parent directories are created as needed.

    Raises:
        RuntimeError: If the body is shorter than 5000 bytes and starts with
            ``<`` (ignoring leading whitespace), i.e. looks like an HTML page
            rather than the requested file. Nothing is written to *dest*.
    """
    # Function-scope import: the module does not import shutil at file level.
    import shutil

    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_name(dest.name + ".part")
    try:
        with urllib.request.urlopen(url, timeout=600) as resp, partial.open("wb") as out:
            shutil.copyfileobj(resp, out, 1024 * 1024)
        # Only small bodies are inspected, so reading the file back is cheap.
        size = partial.stat().st_size
        if size < 5000 and partial.read_bytes().lstrip().startswith(b"<"):
            raise RuntimeError(
                f"Download from {url} returned HTML ({size} bytes), not a file. "
                "Try the ScienceBase request-download page for S3-backed assets."
            )
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes leftovers on any failure.
        partial.unlink(missing_ok=True)


def peek_coco_schema(annotations_zip: Path) -> None:
    """Print a short summary of the COCO file found inside *annotations_zip*.

    The first archive member whose name ends with ``train/train.json`` is
    parsed. If there is none, up to 30 member names are printed instead and
    the function returns. Otherwise the top-level keys, the image, annotation
    and category counts, and one sample of each non-empty list are printed.
    """
    with zipfile.ZipFile(annotations_zip) as archive:
        members = archive.namelist()
        inner = next((m for m in members if m.endswith("train/train.json")), None)
        if inner is None:
            print("No annotations/train/train.json inside zip; members:", members[:30])
            return
        with archive.open(inner) as handle:
            data = json.load(handle)

    # ``or []`` also covers keys that are present but null.
    images = data.get("images") or []
    annotations = data.get("annotations") or []
    categories = data.get("categories") or []

    print("COCO keys:", sorted(data.keys()))
    print(
        "n_images:",
        len(images),
        "n_annotations:",
        len(annotations),
        "n_categories:",
        len(categories),
    )
    if images:
        first_image = images[0]
        sample = {}
        for field in ("id", "file_name", "width", "height"):
            sample[field] = first_image.get(field)
        print("sample image:", sample)
    if annotations:
        first_ann = annotations[0]
        print("sample annotation keys:", sorted(first_ann.keys()))
        print("sample bbox (xywh):", first_ann.get("bbox"), "image_id:", first_ann.get("image_id"))
    if categories:
        print("sample category:", categories[0])


def main() -> None:
    """Command-line entry point: download selected assets of the data release.

    Fetches the ScienceBase item JSON, builds a download plan from the
    requested flags, then prints each planned file (name, destination, size,
    URL) and downloads it. Files that already exist with non-zero size are
    skipped, and ``--dry-run`` only lists the plan. With ``--peek-schema`` the
    annotations zip in the output directory is summarised afterwards.

    Exits with status 1 when the required annotations zip is missing from the
    item JSON, or when ``--peek-schema`` cannot find the zip on disk.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--out-dir",
        type=Path,
        default=Path("./data/usgs_P9CBZQV1"),
        help="Directory to write downloaded zips and metadata.",
    )
    parser.add_argument(
        "--include-codebase",
        action="store_true",
        help="Also download 01_CodeBase.zip (~8 MB).",
    )
    parser.add_argument(
        "--include-imagery",
        action="store_true",
        help="Attempt 02_Imagery.zip (~18 GB). Often requires ScienceBase browser download.",
    )
    parser.add_argument(
        "--skip-annotations",
        action="store_true",
        help="Do not download 03_Annotations.zip.",
    )
    parser.add_argument(
        "--peek-schema",
        action="store_true",
        help="After annotations zip exists, print COCO schema summary from train/train.json.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="List resolved URLs and sizes without downloading.",
    )
    args = parser.parse_args()
    out_dir: Path = args.out_dir.expanduser().resolve()

    item = fetch_item_json()
    by_name = {str(e.get("name")): e for e in file_entries(item)}

    # Each plan element pairs the resolved item entry with its destination, so
    # the download loop needs no second lookup and never sees a missing entry.
    plan: list[tuple[dict[str, Any], Path]] = []
    if not args.skip_annotations:
        annotations = by_name.get("03_Annotations.zip")
        if not annotations:
            print("Item JSON missing 03_Annotations.zip", file=sys.stderr)
            sys.exit(1)
        plan.append((annotations, out_dir / "03_Annotations.zip"))
    if args.include_codebase:
        codebase = by_name.get("01_CodeBase.zip")
        if codebase:
            plan.append((codebase, out_dir / "01_CodeBase.zip"))
        else:
            # Optional asset: report instead of skipping silently.
            print("Item JSON missing 01_CodeBase.zip; skipping", file=sys.stderr)
    if args.include_imagery:
        imagery = by_name.get("02_Imagery.zip")
        if imagery:
            plan.append((imagery, out_dir / "02_Imagery.zip"))
            s3_page = imagery.get("s3DownloadRequestPageUri")
            if s3_page:
                print("If direct download fails, request the large imagery zip via:", s3_page)
        else:
            print("Item JSON missing 02_Imagery.zip; skipping", file=sys.stderr)

    # The FGDC metadata is stored under a shorter local file name.
    meta = by_name.get("Metadata for Deep Learning Wildlife Detection Model.xml")
    if meta:
        plan.append((meta, out_dir / "metadata_fgdc.xml"))

    for entry, dest in plan:
        name = str(entry.get("name"))
        raw_size = entry.get("size")
        # A missing size is shown as "unknown" rather than a misleading "0 B".
        size = int(raw_size) if raw_size not in (None, "") else None
        url = pick_download_uri(entry)
        print(f"{name} -> {dest} ({human_size(size)})")
        print(f"  {url}")
        if args.dry_run:
            continue
        if dest.exists() and dest.stat().st_size > 0:
            print(f"  skip (exists): {dest}")
            continue
        if name == "02_Imagery.zip":
            print("  Warning: very large file; ensure disk space and stable network.")
        download_url(url, dest)
        print(f"  wrote {dest} ({human_size(dest.stat().st_size)})")

    ann_zip = out_dir / "03_Annotations.zip"
    if args.peek_schema:
        if not ann_zip.is_file():
            print("--peek-schema: annotations zip not found at", ann_zip, file=sys.stderr)
            sys.exit(1)
        peek_coco_schema(ann_zip)


# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading