Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ build
.pixi
*.egg-info

# ignore .env files
.env


# ignore the version file made by setuptools_scm
dem_handler/_version.py
6 changes: 5 additions & 1 deletion dem_handler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@
DATA_DIR = Path(__file__).parent / Path("data")
REMA_GPKG_PATH = DATA_DIR / Path("REMA_Mosaic_Index_v2.gpkg")
COP30_GPKG_PATH = DATA_DIR / Path("copdem_tindex_filename.gpkg")
REMA_SERIES_CONFIG_PATH = DATA_DIR / Path("rema_series_config.json")

REMAResolutions = typing.Literal[2, 10, 32]
REMAResolutions = typing.Literal[2, 10, 30, 32]
COPResolutions = typing.Literal[30]
ValidDEMResolutions = typing.Literal[REMAResolutions, COPResolutions]

REMABoundCheckSkipResolutions = typing.Literal[30]

REMA_VALID_RESOLUTIONS = typing.get_args(REMAResolutions)
COP_VALID_RESOLUTIONS = typing.get_args(COPResolutions)
REMA_BOUND_CHECK_SKIP_RESOLUTIONS = typing.get_args(REMABoundCheckSkipResolutions)
24 changes: 24 additions & 0 deletions dem_handler/data/rema_series_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"resolution": {
"30": "v0.5",
"32": "v0.4",
"10": "v0.4",
"2": "v0.4"
},
"aoi_name": {
"v0.4": "thwaites",
"v0.5": null
},
"v0.4": {
"endpoint": "umn1.osn.mghpcc.org",
"s3_bucket": "cse-pgc-test",
"remote_folder": "digital_earth_antarctica/v0.4/mosaics",
"indexing_file": null
},
"v0.5": {
"endpoint": "umn1.osn.mghpcc.org",
"s3_bucket": "cse-pgc-test",
"remote_folder": "digital_earth_antarctica/v0.5",
"indexing_file": "MultiTemporalREMAIndex.parquet"
}
}
263 changes: 201 additions & 62 deletions dem_handler/dem/rema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from __future__ import annotations
import json
from pathlib import Path
import shapely
from shapely import box
Expand All @@ -20,11 +21,15 @@
from dem_handler.dem.geoid import apply_geoid
from dem_handler.download.aws import download_egm_08_geoid


# Create a custom type that allows use of BoundingBox or tuple(left, bottom, right, top)
BBox = BoundingBox | tuple[float | int, float | int, float | int, float | int]

from dem_handler import REMA_GPKG_PATH, REMA_VALID_RESOLUTIONS, REMAResolutions
from dem_handler import (
REMA_GPKG_PATH,
REMA_SERIES_CONFIG_PATH,
REMA_VALID_RESOLUTIONS,
REMAResolutions,
)


@log_timing
Expand All @@ -45,6 +50,7 @@ def get_rema_dem_for_bounds(
num_tasks: int | None = None,
return_paths: bool = False,
download_dir: Path | str = "rema_dems_temp_folder",
rema_series_config_file: Path | str | None = None,
) -> tuple[np.ndarray, Profile | list[Path]] | list[Path] | tuple[None, None, None]:
"""Finds the REMA DEM tiles in a given bounding box and merges them into a single tile.

Expand Down Expand Up @@ -86,6 +92,12 @@ def get_rema_dem_for_bounds(
Flag to return the local paths for downloaded DEMs only, by default False
download_dir: Path | str , optional
Directory to download the REMA DEMs to, by default "rema_dems_temp_folder"
rema_series_config_file: Path | str | None, optional
Path to the REMA series config file, by default None.
If None, it will look for a file named "rema_series_config.json" in the same directory as this script.
This config file contains information on the remote folder and indexing file names for different versions of the REMA timeseries product.
This is only used when reading the REMA timeseries product.

Returns
-------
tuple[np.ndarray, Profile | list[Path]] | list[Path]
Expand Down Expand Up @@ -214,12 +226,32 @@ def get_rema_dem_for_bounds(

else:
# Read in the rema timeseries dem from appropriate vrt

if (rema_series_config_file is None) or (rema_series_config_file == ""):
rema_series_config_file = REMA_SERIES_CONFIG_PATH
else:
rema_series_config_file = Path(rema_series_config_file).resolve()
if not rema_series_config_file.exists():
raise FileNotFoundError(
f"REMA series config file not found at {rema_series_config_file}. "
f"Please provide the correct path to the config file."
)

with open(rema_series_config_file) as f:
rema_remote_config_data = json.load(f)

rema_version = rema_remote_config_data["resolution"][str(resolution)]
rema_remote_config = rema_remote_config_data[rema_version]
aoi_name = rema_remote_config_data["aoi_name"][rema_version]

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could maybe be hardcoded as thwaites if thats the only area in v0.3 and v0.4, so one less input in the config is needed. v0.5 moves to tiles we wont need to worry about it


logging.info("Reading in REMA timeseries .vrt")
dem_array, dem_profile = read_rema_timeseries_vrt(
year=rema_year,
bounds=bounds,
save_path=save_path,
resolution=resolution,
aoi_name=aoi_name,
**rema_remote_config,
)
raster_paths = []

Expand Down Expand Up @@ -353,16 +385,86 @@ def _round_up_to_step(coord, origin, res):
import tempfile
import boto3
from rasterio.session import AWSSession
from dem_handler.utils.raster import read_raster_from_vrt
from dem_handler.utils.raster import read_raster_from_vrt, read_raster_from_gdf


def read_indexing_file(indexing_file: str, storage_options: dict | None = None):
"""Reads the indexing file for the REMA timeseries product,
which contains the urls for the individual DEM tiles and their metadata.
The indexing file can be read from a local path or from a remote datastore using the provided storage options.

Parameters
----------
indexing_file : str
Path to the indexing file, either local or remote.
storage_options : dict, optional
Storage options for reading the indexing file from a remote datastore, by default None

Returns
-------
gpd.GeoDataFrame
GeoDataFrame containing the metadata and urls for the REMA DEM tiles
"""
if os.path.exists(indexing_file):
print(f"Reading indexing file from local path: {indexing_file}")
indexing_gdf = gpd.read_parquet(indexing_file)
else:
try:
indexing_gdf = gpd.read_parquet(
indexing_file, storage_options=storage_options
)
except Exception as e:
raise FileNotFoundError(
f"Indexing file not found at {indexing_file}. Please provide the correct path to the indexing file."
) from e
return indexing_gdf


def read_rema_timeseries_vrt(
year,
bounds,
save_path,
resolution,
version_number=0.3,
):
year: int,
bounds: BBox,
save_path: str | None,
resolution: int,
aoi_name: (
str | None
) = "thwaites", # Reading data using indexing file doesn't need the aoi name.
endpoint: str = "umn1.osn.mghpcc.org",
s3_bucket: str = "cse-pgc-test",
remote_folder: str = "digital_earth_antarctica/v0.4/mosaics", # or "digital_earth_antarctica/v0.5" for version 0.5
indexing_file: (
str | gpd.GeoDataFrame | None
) = None, # or "MultiTemporalREMAIndex.parquet" for version 0.5
) -> tuple[np.ndarray, Profile]:
"""Reads the REMA timeseries .vrt or a provided indexing file for a given year and bounds, merging the intersecting tiles into one raster.

Parameters
----------
year : int
The year for the DEM if the timeseries product is required.
bounds : BBox
Bounding box for the area of interest in 3031.
save_path : str | None
Local path to save the output tile.
resolution : int
Resolution of the required tiles, for example 10.
aoi_name : str | None, optional
Name of the area of interest, by default "thwaites". This is only used
when reading data without an indexing file.
endpoint : str, optional
Endpoint for the s3 datastore, by default "umn1.osn.mghpcc.org"
s3_bucket : str, optional
S3 bucket for the datastore, by default "cse-pgc-test"
remote_folder : str, optional
Remote folder in the s3 bucket where the data is stored, by default "digital_earth_antarctica/v0.4/mosaics"
indexing_file : str | gpd.GeoDataFrame | None, optional
Name of the indexing file in the remote datastore or a GeoDataFrame.
If None, it will attempt to read directly from the vrt, which is the case for version 0.4 of the REMA timeseries product

Returns
-------
tuple[np.ndarray, Profile]
Tuple of the output tile array and its profile
"""

logging.info(f"Reading DEM .vrt from remote datastore")
if not os.environ.get("REMA_AWS_ACCESS_KEY_ID"):
Expand All @@ -376,66 +478,103 @@ def read_rema_timeseries_vrt(
else:
aws_secret_access_key = os.environ.get("REMA_AWS_SECRET_ACCESS_KEY")

# S3-compatible endpoint
endpoint = "umn1.osn.mghpcc.org"
endpoint_url = "https://umn1.osn.mghpcc.org"

# File path inside the bucket
s3_bucket = "cse-pgc-test"
vrt_folder = f"digital_earth_antarctica/v{version_number}/mosaics/thwaites_remote"

# Start boto3 session
session = boto3.Session(
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)

# download and edit the vrt as the remote vrt references browse images and not the data
s3 = session.client("s3", endpoint_url=endpoint_url)

# check to ensure the year is there
vrt_pattern = re.compile(rf"thwaites_{resolution}m_remote_z_(\d+)\.vrt$")
# List all objects under prefix
paginator = s3.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=s3_bucket, Prefix=vrt_folder)
valid_years = set()

for page in pages:
for obj in page.get("Contents", []):
key = obj["Key"]
match = vrt_pattern.search(key)
if match:
valid_years.add(int(match.group(1)))

years_list = sorted(valid_years)

if year not in valid_years:
raise ValueError(
f'The requested year "{year}" does not have a rema timeseries. '
f"valid years are : {years_list}"
)

vrt_path = f"{vrt_folder}/thwaites_{resolution}m_remote_z_{year}.vrt"
logging.info(f"Downloading .vrt from : {endpoint_url}/{vrt_path}")

response = s3.get_object(Bucket=s3_bucket, Key=vrt_path)
vrt_content = response["Body"].read()

with tempfile.NamedTemporaryFile("w", delete=False) as f:
f.write(vrt_content.decode("utf-8"))
temp_vrt = f.name
endpoint_url = f"https://{endpoint}"

if indexing_file is not None:
if isinstance(indexing_file, gpd.GeoDataFrame):
print("Using provided GeoDataFrame as indexing file")
indexing_gdf = indexing_file
else:
if os.path.exists(indexing_file):
print(f"Reading indexing file from local path: {indexing_file}")
indexing_file_path = indexing_file
storage_options = None
else:
indexing_file_prefix = f"{remote_folder}/{indexing_file}"
print(
f"Looking for indexing file at s3://{s3_bucket}/{indexing_file_prefix}"
)
storage_options = {
"key": aws_access_key_id,
"secret": aws_secret_access_key,
"client_kwargs": {"endpoint_url": endpoint_url},
}
indexing_file_path = f"s3://{s3_bucket}/{indexing_file_prefix}"

indexing_gdf = read_indexing_file(indexing_file_path, storage_options)

indexing_gdf = indexing_gdf[indexing_gdf.year == str(year)]
bounds_poly = box(*bounds)
indexing_gdf = indexing_gdf[indexing_gdf.intersects(bounds_poly)]
indexing_gdf = indexing_gdf[
indexing_gdf.dem_url.apply(lambda x: os.path.basename(x).split("_")[2])
== str(resolution) + "m"
]

logging.info("Adjusting rema bounds to stop small offsets")
logging.info(f"original bounds : {bounds}")
bounds = adjust_bounds_for_rema_profile(bounds, [temp_vrt])
logging.info(f"adjusted bounds : {bounds}")
with rasterio.Env(
AWSSession(session),
AWS_S3_ENDPOINT=endpoint,
AWS_VIRTUAL_HOSTING=False, # critical for non-AWS S3 providers
):
logging.info(f"Reading raster for bounds from remote indexing file")
dem_array, dem_profile = read_raster_from_gdf(
indexing_gdf, bounds_poly, save_path
)
else:
# download and edit the vrt as the remote vrt references browse images and not the data
s3 = session.client("s3", endpoint_url=endpoint_url)

# check to ensure the year is there
vrt_pattern = re.compile(rf"{aoi_name}_.*?{resolution}m_dem_(\d+)\.vrt$")
# List all objects under prefix
paginator = s3.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=s3_bucket, Prefix=remote_folder)
valid_years = set()
valid_files = {}

for page in pages:
for obj in page.get("Contents", []):
key = obj["Key"]
match = vrt_pattern.search(key)
if match:
year_found = int(match.group(1))
valid_years.add(year_found)
valid_files[year_found] = key

years_list = sorted(valid_years)

if year not in valid_years:
raise ValueError(
f'The requested year "{year}" does not have a rema timeseries. '
f"valid years are : {years_list}"
)

with rasterio.Env(
AWSSession(session),
AWS_S3_ENDPOINT=endpoint,
AWS_VIRTUAL_HOSTING=False, # critical for non-AWS S3 providers
):
logging.info(f"Reading raster for bounds from remote vrt")
dem_array, dem_profile = read_raster_from_vrt(temp_vrt, bounds, save_path)
vrt_path = valid_files[year]
logging.info(f"Downloading .vrt from : {endpoint_url}/{vrt_path}")
response = s3.get_object(Bucket=s3_bucket, Key=vrt_path)
vrt_content = response["Body"].read()

with tempfile.NamedTemporaryFile("w", delete=False) as f:
f.write(vrt_content.decode("utf-8"))
temp_vrt = f.name

logging.info("Adjusting rema bounds to stop small offsets")
logging.info(f"original bounds : {bounds}")
bounds = adjust_bounds_for_rema_profile(bounds, [temp_vrt])
logging.info(f"adjusted bounds : {bounds}")

with rasterio.Env(
AWSSession(session),
AWS_S3_ENDPOINT=endpoint,
AWS_VIRTUAL_HOSTING=False, # critical for non-AWS S3 providers
):
logging.info(f"Reading raster for bounds from remote vrt")
dem_array, dem_profile = read_raster_from_vrt(temp_vrt, bounds, save_path)

return dem_array, dem_profile
Loading
Loading