diff --git a/scripts/item_create.py b/scripts/item_create.py index 828577b..79f012d 100644 --- a/scripts/item_create.py +++ b/scripts/item_create.py @@ -149,14 +149,13 @@ def load_validation_cache(urls_to_check: list[str]) -> dict: existing_urls = set() logger.info("No existing validation cache found, will validate all URLs") - # Detect old-format rows missing spatial metadata - has_spatial = set() + # Detect old-format rows missing spatial metadata (all spatial columns null). + # Rows that were extracted but have epsg=None (no CRS) are NOT re-upgraded — + # they'll have height/width/transform populated from the extraction. needs_upgrade = set() - if "epsg" in df_existing.columns: + if "transform" in df_existing.columns: for _, row in df_existing.iterrows(): - if pd.notna(row.get("epsg")): - has_spatial.add(row["url"]) - elif row.get("is_geotiff"): + if row.get("is_geotiff") and pd.isna(row.get("transform")): needs_upgrade.add(row["url"]) else: needs_upgrade = {row["url"] for _, row in df_existing.iterrows() if row["is_geotiff"]} diff --git a/scripts/stac_utils.py b/scripts/stac_utils.py index 46a141b..913cbe6 100644 --- a/scripts/stac_utils.py +++ b/scripts/stac_utils.py @@ -9,6 +9,7 @@ """ import json +import logging import re from datetime import datetime, timezone @@ -19,6 +20,8 @@ from rio_cogeo.cogeo import cog_validate from shapely.geometry import box, mapping +logger = logging.getLogger(__name__) + # ============================================================================= # Path Configuration @@ -93,11 +96,12 @@ def datetime_parse_item(s: str | None) -> datetime | None: # ============================================================================= def geotiff_extract_metadata(url: str) -> dict: - """Extract spatial metadata and validate GeoTIFF/COG status in one remote read. + """Extract spatial metadata and validate GeoTIFF/COG status. - Opens the remote GeoTIFF via /vsicurl/, extracts CRS, bounds, shape, and - transform, then validates COG status. All metadata needed for STAC item - creation is returned so subsequent builds can skip the remote read. + Opens the remote GeoTIFF via /vsicurl/ to extract CRS, bounds, shape, and + transform, then validates COG status (second remote read via cog_validate). + All metadata needed for STAC item creation is cached so subsequent builds + skip remote reads entirely. Returns dict with url, is_geotiff, is_cog, epsg, height, width, transform, bounds. """ @@ -106,7 +110,7 @@ def geotiff_extract_metadata(url: str) -> dict: try: with rasterio.open(vsicurl_path) as src: - epsg = src.crs.to_epsg() + epsg = src.crs.to_epsg() if src.crs else None height = src.height width = src.width transform = list(src.transform)[:6] @@ -124,7 +128,8 @@ def geotiff_extract_metadata(url: str) -> dict: "transform": json.dumps(transform), "bounds": json.dumps(bounds), } - except Exception: + except Exception as e: + logger.warning("Failed to read %s: %s", url, e) return { "url": url, "is_geotiff": False,