From 61c0e575abad0621f27a28294425d14da65a19c5 Mon Sep 17 00:00:00 2001 From: RoxGamba Date: Tue, 27 Jan 2026 13:52:37 -0800 Subject: [PATCH 1/9] Add function to download GRA data from scholarsphere --- PyART/catalogs/gra.py | 238 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 224 insertions(+), 14 deletions(-) diff --git a/PyART/catalogs/gra.py b/PyART/catalogs/gra.py index ab037e1..cab984a 100644 --- a/PyART/catalogs/gra.py +++ b/PyART/catalogs/gra.py @@ -2,8 +2,23 @@ import os import h5py from ..waveform import Waveform -import glob as glob import json +import logging +import re +import time + +# librares for downloading +try: + import requests + from bs4 import BeautifulSoup + from urllib.parse import urljoin + from requests.adapters import HTTPAdapter + from urllib3.util.retry import Retry +except ImportError as e: + raise ImportError( + "To use the GRA catalog, please install the required " + "dependencies: requests, beautifulsoup4, urllib3" + ) from e class Waveform_GRA(Waveform): @@ -17,18 +32,22 @@ class Waveform_GRA(Waveform): def __init__( self, - path, + ID="0001", + path="../dat/GRA", ellmax=8, ext="ext", + res="128", r_ext=None, cut_N=None, cut_U=None, - mtdt_path=None, - rescale=False, + nu_rescale=False, modes=[(2, 2)], + download=False, + downloads=["hlm", "metadata"], ): super().__init__() + self.ID = ID self.path = path self.cut_N = cut_N self.cut_U = cut_U @@ -37,19 +56,79 @@ def __init__( self.extrap = ext self.domain = "Time" self.r_ext = r_ext - self.rescale = rescale + self.nu_rescale = nu_rescale + self.res = res # comment out the following for the moment - self.load_metadata(mtdt_path) + + if download: + self.download_simulation(ID=ID, path=path, downloads=downloads, res=res) + + self.load_metadata() self.load_hlm(extrap=ext, ellmax=ellmax, r_ext=r_ext) pass - def load_metadata(self, path): + def download_simulation( + self, + ID="0001", + path=None, + downloads=["hlm", "metadata"], + res=None, + ): + """ + Automatically download and unpack a GRAthena++ + simulation from scholarsphere. + """ + + session = make_session() + + logging.info("Fetching catalog...") + id_map = get_id_to_item_url(session) + + if ID not in id_map: + raise RuntimeError(f"ID {ID} not found in catalog") + + item_url = id_map[ID] + + soup = get_item_soup(session, item_url) + + if "hlm" in downloads: + logging.info("Downloading hlm data...") + if res is None: + res = "128" + self.res = res + logging.warning("No resolution specified, defaulting to res=128") + + filename, tar_url = find_tar_for_resolution(soup, res) + logging.info(f"Found .tar: {filename}") + logging.info(f"Downloading from: {tar_url}") + download_safe(session, tar_url, filename) + # untar, execute via os.system for the moment + extract_path = os.path.join(path, f"GRA_BHBH_{ID}") + os.makedirs(extract_path, exist_ok=True) + logging.info(f"Extracting to: {extract_path}") + os.system(f"tar -xf {filename} -C {extract_path}") + os.remove(filename) + + if "metadata" in downloads: + logging.info("Downloading metadata...") + filename, meta_url = find_metadata_file(soup) + logging.info(f"Found metadata file: {filename}") + logging.info(f"Downloading from: {meta_url}") + download_safe(session, meta_url, filename) + # move to correct location + extract_path = os.path.join(path, f"GRA_BHBH_{ID}", "metadata.json") + os.makedirs(os.path.dirname(extract_path), exist_ok=True) + os.rename(filename, extract_path) + + # Be polite to the server + time.sleep(3) + + def load_metadata(self): """ Load the metadata, if path is None assume that they are in the same dir as the .h5 files """ - if path is None: - path = self.path + path = os.path.join(self.path, f"GRA_BHBH_{self.ID}", self.res, "metadata.json") ometa = json.load(open(path, "r")) m1 = float(ometa["initial-mass1"]) @@ -124,11 +203,23 @@ def load_hlm(self, extrap="ext", ellmax=None, load_m0=False, r_ext=None): r_ext = "100.00" if extrap == "ext": - h5_file = os.path.join(self.path, "rh_Asymptotic_GeometricUnits.h5") + h5_file = os.path.join( + self.path, + f"GRA_BHBH_{self.ID}", + self.res, + "rh_Asymptotic_GeometricUnits.h5", + ) elif extrap == "CCE": - h5_file = os.path.join(self.path, "rh_CCE_GeometricUnits.h5") + h5_file = os.path.join( + self.path, f"GRA_BHBH_{self.ID}", self.res, "rh_CCE_GeometricUnits.h5" + ) elif extrap == "finite": - h5_file = os.path.join(self.path, "rh_FiniteRadii_GeometricUnits.h5") + h5_file = os.path.join( + self.path, + f"GRA_BHBH_{self.ID}", + self.res, + "rh_FiniteRadii_GeometricUnits.h5", + ) else: raise ValueError('extrap should be either "ext", "CCE" or "finite"') @@ -171,7 +262,7 @@ def load_hlm(self, extrap="ext", ellmax=None, load_m0=False, r_ext=None): mode = "Y_l" + str(l) + "_m" + str(m) + ".dat" hlm = nr[r_ext][mode] h = hlm[:, 1] + 1j * hlm[:, 2] - if self.rescale: + if self.nu_rescale: h /= self.metadata["nu"] # amp and phase Alm = abs(h)[self.cut_N :] @@ -282,7 +373,7 @@ def load_psi4lm( mode = "Y_l" + str(l) + "_m" + str(m) + ".dat" psi4lm = nr[r_ext][mode] psi4 = psi4lm[:, 1] + 1j * psi4lm[:, 2] - if self.rescale: + if self.nu_rescale: psi4 /= self.metadata["nu"] Alm = abs(psi4)[self.cut_N :] plm = -np.unwrap(np.angle(psi4))[self.cut_N :] @@ -297,3 +388,122 @@ def load_psi4lm( self._psi4lm = dict_psi4lm pass + + +# ---------------------------------------------------------------------- +# Functions needed to download data from GRAthena++ +# ---------------------------------------------------------------------- + +CATALOG_URL = ( + "https://scholarsphere.psu.edu/resources/610744ac-80b9-4689-8119-320dfd2e2b9a" +) +BASE_URL = "https://scholarsphere.psu.edu" + + +def make_session(): + session = requests.Session() + + retries = Retry( + total=5, + backoff_factor=1.5, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["GET"], + ) + + adapter = HTTPAdapter(max_retries=retries) + session.mount("https://", adapter) + session.mount("http://", adapter) + + session.headers.update( + { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/121.0 Safari/537.36" + ), + "Accept": "*/*", + "Accept-Encoding": "identity", # avoids chunked/gzip resets + "Connection": "keep-alive", + "Referer": "https://scholarsphere.psu.edu/", + } + ) + + return session + + +def get_id_to_item_url(session): + r = session.get(CATALOG_URL, timeout=30) + r.raise_for_status() + soup = BeautifulSoup(r.text, "html.parser") + + id_map = {} + + for a in soup.find_all("a", href=True): + text = a.get_text(strip=True) + m = re.search(r"GRAthena:BHBH:(\d{4})", text) + if m: + id_map[m.group(1)] = urljoin(BASE_URL, a["href"]) + + if not id_map: + raise RuntimeError("No GRAthena IDs found on catalog page") + + return id_map + + +def get_item_soup(session, item_url): + r = session.get(item_url, timeout=30) + r.raise_for_status() + return BeautifulSoup(r.text, "html.parser") + + +def find_tar_for_resolution(item_soup, resolution): + resolution = resolution.lower() + + for a in item_soup.find_all("a", href=True): + href = a["href"].lower() + text = a.get_text(strip=True).lower() + if ( + "/downloads/" in href + and text.endswith(".tar") + and resolution in (href + text) + ): + filename = os.path.basename(href) + return filename, urljoin(BASE_URL, a["href"]) + + raise RuntimeError(f"No .tar found for resolution '{resolution}'") + + +def find_metadata_file(item_soup): + for a in item_soup.find_all("a", href=True): + href = a["href"].lower() + text = a.get_text(strip=True).lower() + if "/downloads/" in href and text.endswith(".json"): + filename = os.path.basename(href) + return filename, urljoin(BASE_URL, a["href"]) + + raise RuntimeError(f"No metadata.json file found") + + +def download_safe(session, url, filename, chunk_size=1024 * 1024): + tmp_file = filename + ".part" + downloaded = 0 + + if os.path.exists(tmp_file): + downloaded = os.path.getsize(tmp_file) + logging.info(f"Resuming download from byte {downloaded}") + + headers = {} + if downloaded > 0: + headers["Range"] = f"bytes={downloaded}-" + + with session.get(url, stream=True, headers=headers, timeout=60) as r: + r.raise_for_status() + + mode = "ab" if downloaded > 0 else "wb" + with open(tmp_file, mode) as f: + for chunk in r.iter_content(chunk_size=chunk_size): + if chunk: + f.write(chunk) + + os.rename(tmp_file, filename) + logging.info(f"Download completed") From bd3c49e9b93c5ba6a1a4d4d283216ddb03142b88 Mon Sep 17 00:00:00 2001 From: RoxGamba Date: Tue, 27 Jan 2026 13:53:55 -0800 Subject: [PATCH 2/9] update libraries for testing gra --- .github/workflows/tests.yml | 2 +- tests/test_gra.py | 42 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 tests/test_gra.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a7f79d1..4a5e803 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -27,7 +27,7 @@ jobs: run: | python -m pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install flake8 pytest 'sxs==2025.0.9' romspline pycbc + pip install flake8 pytest 'sxs==2025.0.9' romspline pycbc requests bs4 urllib #- name: Lint with flake8 # run: | # stop the build if there are Python syntax errors or undefined names diff --git a/tests/test_gra.py b/tests/test_gra.py new file mode 100644 index 0000000..727462c --- /dev/null +++ b/tests/test_gra.py @@ -0,0 +1,42 @@ +""" +Tests for the SXS catalog. +""" + +from PyART.catalogs import gra +import os + +mode_keys = ["A", "p", "real", "imag", "z"] + + +def test_gra(): + """ + Test the SXS download function. + """ + wf = gra.Waveform_GRA( + ID="0001", + path="./", + download=True, + res="128", + downloads=["hlm", "metadata"], + ) + # check attributes + assert wf.ID == "0001" + + # check that the files were downloaded + assert os.path.exists("GRA_BHBH_0001") + assert os.path.exists(f"GRA_BHBH_0001/metadata.json") + assert os.path.exists(f"GRA_BHBH_0001/128/rh_CCE_GeometricUnits.h5") + # check that the modes loaded make sense + for mode in wf.hlm.keys(): + + # check ell, emm + assert mode[0] >= abs(mode[1]) + # check keys + for key in mode_keys: + assert key in wf.hlm[mode].keys() + # check length + assert len(wf.hlm[mode]["A"]) == len(wf.u) + + +if __name__ == "__main__": + test_gra() From 6b578a086f923e1fd6ee3c7d06029dcddcf06e73 Mon Sep 17 00:00:00 2001 From: RoxGamba Date: Tue, 27 Jan 2026 14:00:00 -0800 Subject: [PATCH 3/9] update libraries --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4a5e803..8efdd57 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -27,7 +27,7 @@ jobs: run: | python -m pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install flake8 pytest 'sxs==2025.0.9' romspline pycbc requests bs4 urllib + pip install flake8 pytest 'sxs==2025.0.9' romspline pycbc requests bs4 #- name: Lint with flake8 # run: | # stop the build if there are Python syntax errors or undefined names From b48f31d333ebc53c4fd8a1003a1e5b9b2fe933d2 Mon Sep 17 00:00:00 2001 From: RoxGamba Date: Tue, 27 Jan 2026 14:05:36 -0800 Subject: [PATCH 4/9] Fix: metadata path when loadng --- PyART/catalogs/gra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyART/catalogs/gra.py b/PyART/catalogs/gra.py index cab984a..844ec3e 100644 --- a/PyART/catalogs/gra.py +++ b/PyART/catalogs/gra.py @@ -128,7 +128,7 @@ def load_metadata(self): Load the metadata, if path is None assume that they are in the same dir as the .h5 files """ - path = os.path.join(self.path, f"GRA_BHBH_{self.ID}", self.res, "metadata.json") + path = os.path.join(self.path, f"GRA_BHBH_{self.ID}", "metadata.json") ometa = json.load(open(path, "r")) m1 = float(ometa["initial-mass1"]) From 46cdca29d4b7770f7ef0bb5b8194c537e7f45128 Mon Sep 17 00:00:00 2001 From: Rossella Gamba <72128273+RoxGamba@users.noreply.github.com> Date: Fri, 6 Feb 2026 11:09:51 -0800 Subject: [PATCH 5/9] Update tests/test_gra.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/test_gra.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_gra.py b/tests/test_gra.py index 727462c..33c0d73 100644 --- a/tests/test_gra.py +++ b/tests/test_gra.py @@ -18,6 +18,7 @@ def test_gra(): download=True, res="128", downloads=["hlm", "metadata"], + ext="CCE", ) # check attributes assert wf.ID == "0001" From ed37dc4252a026a10273c3bf465a0d6e9c44debe Mon Sep 17 00:00:00 2001 From: Rossella Gamba <72128273+RoxGamba@users.noreply.github.com> Date: Fri, 6 Feb 2026 11:10:10 -0800 Subject: [PATCH 6/9] Update PyART/catalogs/gra.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- PyART/catalogs/gra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyART/catalogs/gra.py b/PyART/catalogs/gra.py index 844ec3e..b01ae93 100644 --- a/PyART/catalogs/gra.py +++ b/PyART/catalogs/gra.py @@ -7,7 +7,7 @@ import re import time -# librares for downloading +# libraries for downloading try: import requests from bs4 import BeautifulSoup From e431d195449f666809e691a089107c063022c95c Mon Sep 17 00:00:00 2001 From: Rossella Gamba <72128273+RoxGamba@users.noreply.github.com> Date: Fri, 6 Feb 2026 11:12:03 -0800 Subject: [PATCH 7/9] Update PyART/catalogs/gra.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- PyART/catalogs/gra.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PyART/catalogs/gra.py b/PyART/catalogs/gra.py index b01ae93..7fb0cd9 100644 --- a/PyART/catalogs/gra.py +++ b/PyART/catalogs/gra.py @@ -47,6 +47,11 @@ def __init__( ): super().__init__() + # Normalize ID to a 4-digit zero-padded string for consistency + if isinstance(ID, int): + ID = f"{ID:04d}" + elif isinstance(ID, str) and ID.isdigit() and len(ID) < 4: + ID = ID.zfill(4) self.ID = ID self.path = path self.cut_N = cut_N From 4e57132c1900321cbc3cb2dd739bbbe0ee11a1d2 Mon Sep 17 00:00:00 2001 From: RoxGamba Date: Fri, 6 Feb 2026 11:27:00 -0800 Subject: [PATCH 8/9] Some updates after review --- PyART/catalogs/gra.py | 32 +++++++++++++++++++++++++------- tests/test_gra.py | 4 ++-- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/PyART/catalogs/gra.py b/PyART/catalogs/gra.py index 7fb0cd9..b341ab0 100644 --- a/PyART/catalogs/gra.py +++ b/PyART/catalogs/gra.py @@ -84,6 +84,9 @@ def download_simulation( simulation from scholarsphere. """ + if path is None: + path = self.path + session = make_session() logging.info("Fetching catalog...") @@ -130,8 +133,7 @@ def download_simulation( def load_metadata(self): """ - Load the metadata, if path is None assume - that they are in the same dir as the .h5 files + Load the metadata from the json file and store it in self.metadata """ path = os.path.join(self.path, f"GRA_BHBH_{self.ID}", "metadata.json") ometa = json.load(open(path, "r")) @@ -318,13 +320,14 @@ def get_indices_dict(self): def load_psi4lm( self, - path=None, - fname=None, ellmax=None, r_ext=None, extrap="ext", load_m0=False, ): + """ + Load the data from the h5 file, but for psi4 instead of h. + """ if ellmax == None: ellmax = self.ellmax @@ -332,11 +335,26 @@ def load_psi4lm( r_ext = "100.00" if extrap == "ext": - h5_file = os.path.join(self.path, "rPsi4_Asymptotic_GeometricUnits.h5") + h5_file = os.path.join( + self.path, + f"GRA_BHBH_{self.ID}", + self.res, + "rPsi4_Asymptotic_GeometricUnits.h5", + ) elif extrap == "CCE": - h5_file = os.path.join(self.path, "rPsi4_CCE_GeometricUnits.h5") + h5_file = os.path.join( + self.path, + f"GRA_BHBH_{self.ID}", + self.res, + "rPsi4_CCE_GeometricUnits.h5", + ) elif extrap == "finite": - h5_file = os.path.join(self.path, "rPsi4_FiniteRadii_GeometricUnits.h5") + h5_file = os.path.join( + self.path, + f"GRA_BHBH_{self.ID}", + self.res, + "rPsi4_FiniteRadii_GeometricUnits.h5", + ) else: raise ValueError('extrap should be either "ext", "CCE" or "finite"') diff --git a/tests/test_gra.py b/tests/test_gra.py index 33c0d73..60363aa 100644 --- a/tests/test_gra.py +++ b/tests/test_gra.py @@ -1,5 +1,5 @@ """ -Tests for the SXS catalog. +Tests for the GRA catalog. """ from PyART.catalogs import gra @@ -10,7 +10,7 @@ def test_gra(): """ - Test the SXS download function. + Test the GRA download function. """ wf = gra.Waveform_GRA( ID="0001", From 6c66270aca9953d5c1d37217685ebd5f1a17083e Mon Sep 17 00:00:00 2001 From: Rossella Gamba <72128273+RoxGamba@users.noreply.github.com> Date: Fri, 6 Feb 2026 11:32:14 -0800 Subject: [PATCH 9/9] Update PyART/catalogs/gra.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- PyART/catalogs/gra.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/PyART/catalogs/gra.py b/PyART/catalogs/gra.py index b341ab0..a7f1f13 100644 --- a/PyART/catalogs/gra.py +++ b/PyART/catalogs/gra.py @@ -522,7 +522,31 @@ def download_safe(session, url, filename, chunk_size=1024 * 1024): with session.get(url, stream=True, headers=headers, timeout=60) as r: r.raise_for_status() - mode = "ab" if downloaded > 0 else "wb" + # Decide whether we can safely resume or must restart from scratch. + resume_supported = False + if downloaded > 0: + if r.status_code == 206: + content_range = r.headers.get("Content-Range", "") + # Expect the content range to start at our downloaded offset. + expected = f"bytes {downloaded}-" + if content_range.startswith(expected) or expected in content_range: + resume_supported = True + else: + logging.info( + "Server did not honor Range header (status %s); " + "restarting full download", + r.status_code, + ) + + if not resume_supported: + # If we had a partial file, overwrite it rather than append, to avoid + # corrupting the file when the server sends the full content. + if downloaded > 0: + logging.info("Discarding existing partial download and restarting") + downloaded = 0 + mode = "wb" + else: + mode = "ab" with open(tmp_file, mode) as f: for chunk in r.iter_content(chunk_size=chunk_size): if chunk: