Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 25 additions & 15 deletions src/opera_utils/nisar/_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,26 @@
from pathlib import Path

import h5py
import numpy as np
from shapely import from_wkt
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import process_map

from opera_utils.nisar._product import NISAR_GSLC_GRIDS, NISAR_POLARIZATIONS
from opera_utils.nisar._product import (
NISAR_GSLC_GRIDS,
NISAR_GSLC_IDENTIFICATION,
NISAR_GSLC_ROOT,
NISAR_POLARIZATIONS,
)

from ._product import GslcProduct, UrlType
from ._remote import open_h5
from ._search import search

# Metadata groups to copy into subset files (full path from root)
_IDENTIFICATION_GROUP = NISAR_GSLC_IDENTIFICATION
_ORBIT_GROUP = f"{NISAR_GSLC_ROOT}/metadata/orbit"

logger = logging.getLogger("opera_utils")

__all__ = ["process_file", "run_download"]
Expand Down Expand Up @@ -157,9 +167,15 @@ def _extract_subset_from_h5(
col_slice = cols if cols is not None else slice(None)

with h5py.File(outpath, "w") as dst:
# Note: We skip copying identification/metadata groups for remote files
# because h5py's copy() doesn't work well with fsspec byte-range access.
# The essential georeferencing info is in the frequency group.
# Copy identification and orbit metadata groups
for group in [_IDENTIFICATION_GROUP, _ORBIT_GROUP]:
if group in src:
dst.require_group("/".join(group.split("/")[:-1]))
src.copy(group, dst, name=group)

# Add complex64 type to root for GDAL recognition of datasets
ctype = h5py.h5t.py_create(np.complex64)
ctype.commit(dst["/"].id, np.bytes_("complex64"))

# Get available polarizations for the frequency
freq_path = f"{NISAR_GSLC_GRIDS}/frequency{frequency}"
Expand All @@ -183,9 +199,9 @@ def _extract_subset_from_h5(
msg = f"No polarizations to extract from {source_name}"
raise ValueError(msg)

# Create the grids structure
dst.create_group(NISAR_GSLC_GRIDS)
dst_freq_group = dst.create_group(freq_path)
# Create the grids structure (require_group since parents may already exist)
dst.require_group(NISAR_GSLC_GRIDS)
dst_freq_group = dst.require_group(freq_path)

# Copy coordinate datasets if present (x, y coordinates)
for coord_name in [
Expand Down Expand Up @@ -277,14 +293,8 @@ def _get_rowcol_slice(

lon_left, lat_bottom, lon_right, lat_top = bbox

# Need to open the file to get coordinate info
with open_h5(str(product.filename)) as h5f:
row_start, col_start = product.lonlat_to_rowcol(
h5f, lon_left, lat_top, frequency
)
row_stop, col_stop = product.lonlat_to_rowcol(
h5f, lon_right, lat_bottom, frequency
)
row_start, col_start = product.lonlat_to_rowcol(lon_left, lat_top, frequency)
row_stop, col_stop = product.lonlat_to_rowcol(lon_right, lat_bottom, frequency)

# Ensure start < stop
if row_start > row_stop:
Expand Down
130 changes: 65 additions & 65 deletions src/opera_utils/nisar/_product.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import os
import re
from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
Expand All @@ -17,6 +19,7 @@
from typing_extensions import Self

from opera_utils._cmr import get_download_url
from opera_utils._remote import open_h5
from opera_utils.constants import NISAR_SDS_FILE_REGEX, UrlType

# NISAR GSLC HDF5 dataset paths
Expand Down Expand Up @@ -133,9 +136,15 @@ def from_filename(cls, name: Path | str) -> Self:

@property
def track_frame_id(self) -> str:
"""Get the combined track and frame identifier."""
"""Get the combined track and frame identifier.

Format is ``RRR_D_TTT`` where RRR = relative orbit number,
D = orbit direction (A/D), TTT = track frame number.
These three fields are constant across repeat-pass acquisitions,
so this ID uniquely identifies a geographic footprint.
"""
return (
f"{self.cycle_number:03d}_{self.relative_orbit_number:03d}_"
f"{self.relative_orbit_number:03d}_"
f"{self.orbit_direction}_{self.track_frame_number:03d}"
)

Expand All @@ -144,6 +153,12 @@ def version(self) -> str:
"""Get the full version string."""
return f"{self.major_version}.{self.minor_version:03d}"

@contextmanager
def _open(self) -> Iterator[h5py.File]:
"""Open the HDF5 file (local or remote) as a context manager."""
with open_h5(str(self.filename)) as hf:
yield hf

def get_dataset_path(self, frequency: str = "A", polarization: str = "HH") -> str:
"""Get the HDF5 dataset path for a specific frequency and polarization.

Expand All @@ -159,11 +174,6 @@ def get_dataset_path(self, frequency: str = "A", polarization: str = "HH") -> st
str
The HDF5 dataset path.

Raises
------
ValueError
If the frequency or polarization is invalid.

"""
if frequency not in NISAR_FREQUENCIES:
msg = f"Invalid frequency {frequency}. Choices: {NISAR_FREQUENCIES}"
Expand All @@ -173,15 +183,11 @@ def get_dataset_path(self, frequency: str = "A", polarization: str = "HH") -> st
raise ValueError(msg)
return f"{NISAR_GSLC_GRIDS}/frequency{frequency}/{polarization}"

def get_available_polarizations(
self, h5file: h5py.File, frequency: str = "A"
) -> list[str]:
"""Get available polarizations for a frequency from an open HDF5 file.
def get_available_polarizations(self, frequency: str = "A") -> list[str]:
"""Get available polarizations for a frequency.

Parameters
----------
h5file : h5py.File
Open HDF5 file handle.
frequency : str
Frequency band, either "A" or "B". Default is "A".

Expand All @@ -192,29 +198,27 @@ def get_available_polarizations(

"""
freq_group = f"{NISAR_GSLC_GRIDS}/frequency{frequency}"
if freq_group not in h5file:
return []
return [name for name in h5file[freq_group] if name in NISAR_POLARIZATIONS]
with self._open() as hf:
group = hf.get(freq_group)
if group is None:
return []
return [name for name in NISAR_POLARIZATIONS if name in group]

def get_available_frequencies(self, h5file: h5py.File) -> list[str]:
"""Get available frequencies from an open HDF5 file.

Parameters
----------
h5file : h5py.File
Open HDF5 file handle.
def get_available_frequencies(self) -> list[str]:
"""Get available frequencies from the HDF5 file.

Returns
-------
list[str]
List of available frequency names ("A" and/or "B").

"""
return [
freq
for freq in NISAR_FREQUENCIES
if f"{NISAR_GSLC_GRIDS}/frequency{freq}" in h5file
]
with self._open() as hf:
return [
freq
for freq in NISAR_FREQUENCIES
if f"{NISAR_GSLC_GRIDS}/frequency{freq}" in hf
]

@cached_property
def _identification_cache(self) -> dict[str, Any]:
Expand All @@ -236,14 +240,14 @@ def bounding_polygon(self) -> str | None:
return bp

def get_shape(
self, h5file: h5py.File, frequency: str = "A", polarization: str = "HH"
self,
frequency: str = "A",
polarization: str = "HH",
) -> tuple[int, int]:
"""Get the shape of a GSLC dataset.

Parameters
----------
h5file : h5py.File
Open HDF5 file handle.
frequency : str
Frequency band. Default is "A".
polarization : str
Expand All @@ -255,26 +259,24 @@ def get_shape(
Shape as (rows, cols).

"""
dset_path = self.get_dataset_path(frequency, polarization)
return h5file[dset_path].shape
with self._open() as hf:
dset_path = self.get_dataset_path(frequency, polarization)
return hf[dset_path].shape # type: ignore[return-value]

def read_subset(
self,
h5file: h5py.File,
rows: slice,
cols: slice,
rows: slice | int | None,
cols: slice | int | None,
frequency: str = "A",
polarization: str = "HH",
) -> np.ndarray:
"""Read a subset of the GSLC data.

Parameters
----------
h5file : h5py.File
Open HDF5 file handle.
rows : slice
rows : slice | int | None
Row slice for subsetting.
cols : slice
cols : slice | int | None
Column slice for subsetting.
frequency : str
Frequency band. Default is "A".
Expand All @@ -287,19 +289,23 @@ def read_subset(
The subset of complex GSLC data.

"""
dset_path = self.get_dataset_path(frequency, polarization)
return h5file[dset_path][rows, cols]
if rows is None:
rows = slice(None)
if cols is None:
cols = slice(None)

with self._open() as hf:
dset_path = self.get_dataset_path(frequency, polarization)
return hf[dset_path][rows, cols]

def __fspath__(self) -> str:
return os.fspath(self.filename)

def get_epsg(self, h5file: h5py.File, frequency: str = "A") -> int:
"""Get the EPSG code from an open HDF5 file.
def get_epsg(self, frequency: str = "A") -> int:
"""Get the EPSG code for the coordinate system.

Parameters
----------
h5file : h5py.File
Open HDF5 file handle.
frequency : str
Frequency band. Default is "A".

Expand All @@ -310,21 +316,14 @@ def get_epsg(self, h5file: h5py.File, frequency: str = "A") -> int:

"""
freq_path = f"{NISAR_GSLC_GRIDS}/frequency{frequency}"
# The projection dataset contains the EPSG code as a scalar integer
if "projection" in h5file[freq_path]:
return int(h5file[freq_path]["projection"][()])
msg = f"No projection dataset found in {freq_path}"
raise ValueError(msg)
with self._open() as hf:
return int(hf[freq_path]["projection"][()]) # type: ignore[index]

def get_coordinates(
self, h5file: h5py.File, frequency: str = "A"
) -> tuple[np.ndarray, np.ndarray]:
"""Get the x and y coordinate arrays from an open HDF5 file.
def get_coordinates(self, frequency: str = "A") -> tuple[np.ndarray, np.ndarray]:
"""Get the x and y coordinate arrays.

Parameters
----------
h5file : h5py.File
Open HDF5 file handle.
frequency : str
Frequency band. Default is "A".

Expand All @@ -335,13 +334,13 @@ def get_coordinates(

"""
freq_path = f"{NISAR_GSLC_GRIDS}/frequency{frequency}"
x_coords = h5file[freq_path]["xCoordinates"][:]
y_coords = h5file[freq_path]["yCoordinates"][:]
return x_coords, y_coords
with self._open() as hf:
x_coords = hf[freq_path]["xCoordinates"][:] # type: ignore[index]
y_coords = hf[freq_path]["yCoordinates"][:] # type: ignore[index]
return x_coords, y_coords

def lonlat_to_rowcol(
self,
h5file: h5py.File,
lon: float,
lat: float,
frequency: str = "A",
Expand All @@ -350,8 +349,6 @@ def lonlat_to_rowcol(

Parameters
----------
h5file : h5py.File
Open HDF5 file handle.
lon : float
Longitude in degrees.
lat : float
Expand All @@ -370,8 +367,11 @@ def lonlat_to_rowcol(
If the coordinates are outside the image bounds.

"""
epsg = self.get_epsg(h5file, frequency)
x_coords, y_coords = self.get_coordinates(h5file, frequency)
with self._open() as hf:
freq_path = f"{NISAR_GSLC_GRIDS}/frequency{frequency}"
epsg = int(hf[freq_path]["projection"][()]) # type: ignore[index]
x_coords = hf[freq_path]["xCoordinates"][:] # type: ignore[index]
y_coords = hf[freq_path]["yCoordinates"][:] # type: ignore[index]

# Transform lon/lat to projected coordinates
transformer = pyproj.Transformer.from_crs(
Expand Down
Loading