From fcc27c21f59dbd0578eb744511bf072912869ad5 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 2 Dec 2025 15:38:25 -0500 Subject: [PATCH 01/66] This initializes a uv package in this repository. --- .python-version | 1 + main.py | 6 ++++++ pyproject.toml | 7 +++++++ 3 files changed, 14 insertions(+) create mode 100644 .python-version create mode 100644 main.py create mode 100644 pyproject.toml diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..2c07333 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/main.py b/main.py new file mode 100644 index 0000000..55226a0 --- /dev/null +++ b/main.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from babel-xrefs!") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..826dbc8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "babel-xrefs" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [] From 876353dae4b62b7f6ea9872cbfb29cb2fe221bc7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 2 Dec 2025 15:42:12 -0500 Subject: [PATCH 02/66] Added basic CLI. --- main.py | 6 ------ pyproject.toml | 17 ++++++++++++++++- src/babel_xrefs/__init__.py | 0 src/babel_xrefs/cli.py | 9 +++++++++ 4 files changed, 25 insertions(+), 7 deletions(-) delete mode 100644 main.py create mode 100644 src/babel_xrefs/__init__.py create mode 100644 src/babel_xrefs/cli.py diff --git a/main.py b/main.py deleted file mode 100644 index 55226a0..0000000 --- a/main.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - print("Hello from babel-xrefs!") - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml index 826dbc8..7c84773 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,4 +4,19 @@ version = "0.1.0" description = "Add your description here" readme = "README.md" requires-python = ">=3.11" -dependencies = [] +dependencies = [ + "click>=8.3.1", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[dependency-groups] +dev = [ + "pytest>=8.3.5", + "ruff>=0.11.0", +] + +[project.scripts] +babel-xrefs = "babel_xrefs.cli:main" diff --git a/src/babel_xrefs/__init__.py b/src/babel_xrefs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py new file mode 100644 index 0000000..537fe60 --- /dev/null +++ b/src/babel_xrefs/cli.py @@ -0,0 +1,9 @@ +# Command line interface for babel-xrefs +import click + +@click.command() +def main(): + pass + +if __name__ == "__main__": + main() From ec1d1f09b11f45b5bc31466f32d52b442b1423ac Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 2 Dec 2025 16:09:07 -0500 Subject: [PATCH 03/66] Add /data to the .gitignore. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index b7faf40..67d8b31 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Ignore data files. +/data + # Byte-compiled / optimized / DLL files __pycache__/ *.py[codz] From eff8f26988981601c578e54ee5f47a867c77fdd0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 3 Dec 2025 02:27:10 -0500 Subject: [PATCH 04/66] Initial implementation of a basic xref query-er. 
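The core pattern this patch introduces is: download a Babel Parquet file, then point DuckDB at it and filter rows by CURIE. A minimal sketch of that query pattern, assuming only the subj/pred/obj columns used in the code below (the CURIE and local file path are illustrative):

```python
import duckdb

# Query a Babel Concord Parquet file for one CURIE, on either side of the
# cross-reference. DuckDB can read the Parquet file directly from SQL.
db = duckdb.connect()  # in-memory database
rows = db.execute(
    "SELECT subj, pred, obj FROM read_parquet('data/duckdb/Concord.parquet') "
    "WHERE subj IN $1 OR obj IN $1",
    [["MONDO:0004979"]],
).fetchall()
for subj, pred, obj in rows:
    print(subj, pred, obj)
```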
--- pyproject.toml | 4 +- src/__init__.py | 0 src/babel_xrefs/babel_xrefs.py | 33 ++++++++++ src/babel_xrefs/cli.py | 38 +++++++++++- src/babel_xrefs/core/__init__.py | 0 src/babel_xrefs/core/downloader.py | 98 ++++++++++++++++++++++++++++++ src/babel_xrefs/core/model.py | 2 + 7 files changed, 171 insertions(+), 4 deletions(-) create mode 100644 src/__init__.py create mode 100644 src/babel_xrefs/babel_xrefs.py create mode 100644 src/babel_xrefs/core/__init__.py create mode 100644 src/babel_xrefs/core/downloader.py create mode 100644 src/babel_xrefs/core/model.py diff --git a/pyproject.toml b/pyproject.toml index 7c84773..5696f67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,8 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "click>=8.3.1", + "duckdb>=1.4.2", + "requests>=2.32.5", ] [build-system] @@ -19,4 +21,4 @@ dev = [ ] [project.scripts] -babel-xrefs = "babel_xrefs.cli:main" +babel-xrefs = "babel_xrefs.cli:cli" diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py new file mode 100644 index 0000000..3ec44ad --- /dev/null +++ b/src/babel_xrefs/babel_xrefs.py @@ -0,0 +1,33 @@ +# Babel XRefs is a tool for accessing and querying the intermediate files +# that we make available with Babel builds. This allows you to find out +# why we consider two identifiers to be identical. +import logging +import duckdb + +from babel_xrefs.core.downloader import BabelDownloader + + +class BabelXRefs: + def __init__(self, downloader: BabelDownloader): + self.downloader = downloader + + def get_curie_xrefs(self, curies: list[str]): + """ + Search for all identifiers that are cross-referenced to the given CURIE. + + :param curie: A CURIE to search for. + :return: A list of cross-references containing that CURIE. + """ + + concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') + concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/ConcordMetadata.parquet') + + # Query the Parquet files using DuckDB. + duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') + db = duckdb.connect(duckdb_path) + concord_table = db.read_parquet(concord_parquet) + xrefs = db.execute(f"SELECT * FROM concord_table WHERE subj IN $1 OR obj in $1", [curies]) + + # TODO: convert into case classes. + + return xrefs.fetchall() diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index 537fe60..691fcd6 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -1,9 +1,41 @@ # Command line interface for babel-xrefs import click +import logging +import babel_xrefs +from babel_xrefs.core.downloader import BabelDownloader +from babel_xrefs.babel_xrefs import BabelXRefs -@click.command() -def main(): + +@click.group() +def cli(): pass +@cli.command("xrefs") +@click.argument("curies", type=str, required=True, nargs=-1) +@click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") +@click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") +def xrefs(curies: list[str], babel_url: str, local_dir: str): + """ + Fetches and prints the cross-references (xrefs) for the given CURIEs. + + This function searches for xrefs associated with the provided CURIEs. + + \f + + :param curies: A list of CURIEs (Compact URI) for which cross-references need + to be retrieved. 
+ :type curies: list[str] + :param babel_url: Base URL of the Babel server + :type babel_url: str + + :return: None + """ + logging.basicConfig(level=logging.INFO) + + bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir)) + xrefs = bxref.get_curie_xrefs(curies) + for xref in xrefs: + print(xref) + if __name__ == "__main__": - main() + cli() diff --git a/src/babel_xrefs/core/__init__.py b/src/babel_xrefs/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/babel_xrefs/core/downloader.py b/src/babel_xrefs/core/downloader.py new file mode 100644 index 0000000..34e4cb0 --- /dev/null +++ b/src/babel_xrefs/core/downloader.py @@ -0,0 +1,98 @@ +import os +import urllib.parse +import subprocess +import requests +import logging + +class BabelDownloader: + """ + Class for downloading Babel cross-reference files to a local directory as needed. + """ + + def __init__(self, url_base, local_path=None, retries=10): + # We assume the URL base is correct (if not, we can fix it later). + self.url_base = url_base + self.retries = retries + self.logger = logging.getLogger(BabelDownloader.__name__) + + if local_path is None: + # Default to using TMPDIR. + # TODO: replace with a real temporary directory. + tmpdir = os.environ.get("TMPDIR") + if tmpdir: + local_path = tmpdir + + # Make sure the local path is an existing directory or that we can create it. + if not os.path.exists(local_path): + os.makedirs(local_path, exist_ok=True) + self.local_path = local_path + elif os.path.exists(local_path) and os.path.isdir(local_path): + self.local_path = local_path + else: + raise ValueError(f"Invalid local_path (must be an existing directory): '{local_path}'") + + def get_output_file(self, filename): + filepath = os.path.join(self.local_path, filename) + os.makedirs(os.path.dirname(filepath), exist_ok=True) + return filepath + + def get_downloaded_file(self, dirpath: str, chunk_size:int=1024*1024): + local_path_to_download_to = os.path.join(self.local_path, dirpath) + os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) + + url_to_download = urllib.parse.urljoin(self.url_base, dirpath) + bytes_downloaded = 0 + + wget_command_line = [ + "wget", + "--progress=bar:force:noscroll", # Display progress bar. + "--compression=auto", # Compress files if available. + "--continue", # Continue downloading in case of interruption. + f"--tries={self.retries}", + "-O" + local_path_to_download_to, + ] + + # Add URL and output file. + wget_command_line.append(url_to_download) + + # Execute wget. + self.logger.info(f"Downloading {url_to_download} using wget: {wget_command_line}") + process = subprocess.run(wget_command_line) + if process.returncode != 0: + raise RuntimeError(f"Could not execute wget {wget_command_line}: {process.stderr}") + + bytes_downloaded = os.path.getsize(local_path_to_download_to) + self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") + return local_path_to_download_to + + + def get_downloaded_dir(self, dirpath: str): + local_path_to_download_to = os.path.join(self.local_path, dirpath) + os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) + + url_to_download_recursively = urllib.parse.urljoin(self.url_base, dirpath) + + wget_command_line = [ + "wget", + "--progress=bar:force:noscroll", # Display progress bar. + "--compression=auto", # Compress files if available. + "--continue", # Continue downloading in case of interruption. 
+ f"--tries={self.retries}", + "--recursive", + "--no-parent", + "--no-host-directories", + "--directory-prefix=" + local_path_to_download_to, + ] + + # Add URL and output file. + if url_to_download_recursively[-1] != "/": + url_to_download_recursively += "/" + wget_command_line.append(url_to_download_recursively) + + # Execute wget. + self.logger.info(f"Downloading {url_to_download_recursively} using wget: {wget_command_line}") + process = subprocess.run(wget_command_line) + if process.returncode != 0: + raise RuntimeError(f"Could not execute wget {wget_command_line}: {process.stderr}") + + return local_path_to_download_to diff --git a/src/babel_xrefs/core/model.py b/src/babel_xrefs/core/model.py new file mode 100644 index 0000000..139597f --- /dev/null +++ b/src/babel_xrefs/core/model.py @@ -0,0 +1,2 @@ + + From 4d04e2ae01a768b358a5f6d918edc6d8d442aca1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 3 Dec 2025 23:02:21 -0500 Subject: [PATCH 05/66] Added a method to look up a particular identifier. --- src/babel_xrefs/babel_xrefs.py | 24 +++++++++++++++++++++++- src/babel_xrefs/cli.py | 26 +++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py index 3ec44ad..6e171fa 100644 --- a/src/babel_xrefs/babel_xrefs.py +++ b/src/babel_xrefs/babel_xrefs.py @@ -11,6 +11,28 @@ class BabelXRefs: def __init__(self, downloader: BabelDownloader): self.downloader = downloader + def get_curie_ids(self, curies: list[str]): + """ + Search for all identifiers in the /ids/ files for a particular CURIE. + + :param curie: A CURIE to search for. + :return: A list of cross-references containing that CURIE. + """ + + identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') + concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') + + # Query the Parquet files using DuckDB. + duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') + db = duckdb.connect(duckdb_path) + identifier_table = db.read_parquet(identifier_parquet) + xrefs = db.execute(f"SELECT * FROM identifier_table WHERE curie IN $1", [curies]) + + # TODO: convert into case classes. + + return xrefs.fetchall() + + def get_curie_xrefs(self, curies: list[str]): """ Search for all identifiers that are cross-referenced to the given CURIE. @@ -20,7 +42,7 @@ def get_curie_xrefs(self, curies: list[str]): """ concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/ConcordMetadata.parquet') + concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') # Query the Parquet files using DuckDB. 
duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index 691fcd6..c77be3b 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -1,7 +1,6 @@ # Command line interface for babel-xrefs import click import logging -import babel_xrefs from babel_xrefs.core.downloader import BabelDownloader from babel_xrefs.babel_xrefs import BabelXRefs @@ -37,5 +36,30 @@ def xrefs(curies: list[str], babel_url: str, local_dir: str): for xref in xrefs: print(xref) +@cli.command("ids") +@click.argument("curies", type=str, required=True, nargs=-1) +@click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") +@click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") +def ids(curies: list[str], babel_url: str, local_dir: str): + """ + Fetches and prints the ID records for the given CURIEs, along with Biolink type if provided. + + \f + + :param curies: A list of CURIEs (Compact URIs) for which ID records need + to be retrieved. + :type curies: list[str] + :param babel_url: Base URL of the Babel server + :type babel_url: str + + :return: None + """ + logging.basicConfig(level=logging.INFO) + + bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir)) + xrefs = bxref.get_curie_ids(curies) + for xref in xrefs: + print(xref) + if __name__ == "__main__": cli() From 8531cb7a9e4c65ea5459a2e56d75cb09cb30bf9d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 3 Dec 2025 23:29:17 -0500 Subject: [PATCH 06/66] Added CURIE expansion/recursive lookup. --- src/babel_xrefs/babel_xrefs.py | 54 ++++++++++++++++++++++++------ src/babel_xrefs/cli.py | 5 +-- src/babel_xrefs/core/downloader.py | 6 +++- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py index 6e171fa..f321e5e 100644 --- a/src/babel_xrefs/babel_xrefs.py +++ b/src/babel_xrefs/babel_xrefs.py @@ -1,11 +1,30 @@ # Babel XRefs is a tool for accessing and querying the intermediate files # that we make available with Babel builds. This allows you to find out # why we consider two identifiers to be identical.
+import dataclasses import logging import duckdb +import functools from babel_xrefs.core.downloader import BabelDownloader +@dataclasses.dataclass(frozen=True) +class CrossReference: + filename: str + subj: str + pred: str + obj: str + + @staticmethod + def from_tuple(tuple: tuple[str, str, str, str]): + return CrossReference(filename=tuple[0], subj=tuple[1], pred=tuple[2], obj=tuple[3]) + + @property + def curies(self): + return frozenset([self.subj, self.obj]) + + def __lt__(self, other): + return (self.filename, self.subj, self.obj, self.pred) < (other.filename, other.subj, other.obj, other.pred) class BabelXRefs: def __init__(self, downloader: BabelDownloader): @@ -32,24 +51,37 @@ def get_curie_ids(self, curies: list[str]): return xrefs.fetchall() + @functools.lru_cache(maxsize=None) + def get_curie_xref(self, curie: str): + concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') + concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') + + duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') + db = duckdb.connect(duckdb_path) + concord_table = db.read_parquet(concord_parquet) + xref_tuples = db.execute(f"SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() + xrefs = list(map(lambda rec: CrossReference.from_tuple(rec), xref_tuples)) + return xrefs - def get_curie_xrefs(self, curies: list[str]): + def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies_in_expansion: set = set()): """ Search for all identifiers that are cross-referenced to the given CURIE. :param curie: A CURIE to search for. + :param expand: Whether to expand the cross-references (i.e. recursively follow all identifiers). :return: A list of cross-references containing that CURIE. """ - concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') + xrefs = set() + for curie in curies: + logging.info(f"Searching for cross-references for {curie}") + xrefs.update(self.get_curie_xref(curie)) - # Query the Parquet files using DuckDB. - duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') - db = duckdb.connect(duckdb_path) - concord_table = db.read_parquet(concord_parquet) - xrefs = db.execute(f"SELECT * FROM concord_table WHERE subj IN $1 OR obj in $1", [curies]) - - # TODO: convert into case classes. + if expand: + # Get a unique set of referenced curies, not including the ones currently queried. 
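+            # Subtracting both the CURIEs we just queried and the already-expanded set means each identifier is expanded at most once, so the recursion terminates.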
+ new_curies = list(set([curie for xref in xrefs for curie in xref.curies]) - set(curies) - ignore_curies_in_expansion) + if new_curies: + logging.info(f"Expanding cross-references to {new_curies}") + xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(new_curies))) - return xrefs.fetchall() + return sorted(xrefs) diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index c77be3b..c73d3db 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -13,7 +13,8 @@ def cli(): @click.argument("curies", type=str, required=True, nargs=-1) @click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") @click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") -def xrefs(curies: list[str], babel_url: str, local_dir: str): +@click.option("--expand", is_flag=True, help="Also display xrefs for returned CURIEs") +def xrefs(curies: list[str], babel_url: str, local_dir: str, expand: bool): """ Fetches and prints the cross-references (xrefs) for the given CURIEs. @@ -32,7 +33,7 @@ def xrefs(curies: list[str], babel_url: str, local_dir: str): logging.basicConfig(level=logging.INFO) bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir)) - xrefs = bxref.get_curie_xrefs(curies) + xrefs = bxref.get_curie_xrefs(curies, expand) for xref in xrefs: print(xref) diff --git a/src/babel_xrefs/core/downloader.py b/src/babel_xrefs/core/downloader.py index 34e4cb0..9313685 100644 --- a/src/babel_xrefs/core/downloader.py +++ b/src/babel_xrefs/core/downloader.py @@ -1,8 +1,10 @@ +import functools import os import urllib.parse import subprocess import requests import logging +import functools class BabelDownloader: """ @@ -31,11 +33,13 @@ def __init__(self, url_base, local_path=None, retries=10): else: raise ValueError(f"Invalid local_path (must be an existing directory): '{local_path}'") + @functools.lru_cache(maxsize=None) def get_output_file(self, filename): filepath = os.path.join(self.local_path, filename) os.makedirs(os.path.dirname(filepath), exist_ok=True) return filepath + @functools.lru_cache(maxsize=None) def get_downloaded_file(self, dirpath: str, chunk_size:int=1024*1024): local_path_to_download_to = os.path.join(self.local_path, dirpath) os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) @@ -65,7 +69,7 @@ def get_downloaded_file(self, dirpath: str, chunk_size:int=1024*1024): self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") return local_path_to_download_to - + @functools.lru_cache(maxsize=None) def get_downloaded_dir(self, dirpath: str): local_path_to_download_to = os.path.join(self.local_path, dirpath) os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) From a1aeec6be4ad11a3720c666d59483d34a77e0560 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 4 Dec 2025 00:56:06 -0500 Subject: [PATCH 07/66] Added a basic ConcordTester. 
--- src/babel_xrefs/cli.py | 18 ++++++++++++ src/babel_xrefs/core/nodenorm.py | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 src/babel_xrefs/core/nodenorm.py diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index c73d3db..4c65391 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -3,6 +3,7 @@ import logging from babel_xrefs.core.downloader import BabelDownloader from babel_xrefs.babel_xrefs import BabelXRefs +from babel_xrefs.core.nodenorm import NodeNorm @click.group() @@ -62,5 +63,22 @@ def ids(curies: list[str], babel_url: str, local_dir: str): for xref in xrefs: print(xref) +@cli.command("test-concord") +@click.argument("curies", type=str, required=True, nargs=-1) +@click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes") +def test_concord(curies, nodenorm_url): + # We're trying to answer a simple question here: if the CURIEs we mention were combined, how would the cliques change in NodeNorm? + # By definition, this can only combine all the cliques mentioned in the CURIEs. + + nodenorm = NodeNorm(nodenorm_url) + for curie in curies: + identifiers = nodenorm.get_clique_identifiers(curie) + for identifier in identifiers: + if identifier.label: + print(f"{curie}\t{identifier.curie}\t{identifier.label}") + else: + print(f"{curie}\t{identifier.curie}\t") + + if __name__ == "__main__": cli() diff --git a/src/babel_xrefs/core/nodenorm.py b/src/babel_xrefs/core/nodenorm.py new file mode 100644 index 0000000..6c45e02 --- /dev/null +++ b/src/babel_xrefs/core/nodenorm.py @@ -0,0 +1,50 @@ +import dataclasses +import functools +import requests + +@dataclasses.dataclass +class Identifier: + curie: str + label: str = "" + taxa: list[str] = dataclasses.field(default_factory=list) + description: list[str] = dataclasses.field(default_factory=list) + + def __lt__(self, other): + return self.curie < other.curie + + @staticmethod + def from_dict(d: dict): + identifier = Identifier(curie=d['identifier']) + if 'label' in d: + identifier.label = d['label'] + if 'taxa' in d: + identifier.taxa = d['taxa'] + if 'description' in d: + identifier.description = d['description'] + return identifier + +class NodeNorm: + def __init__(self, nodenorm_url: str=""): + self.nodenorm_url = nodenorm_url + + @functools.lru_cache(maxsize=None) + def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=False, description=False, individual_types=None, include_taxa=None): + response = requests.get(f"{self.nodenorm_url}get_normalized_nodes", params={ + "curie": curie, + "conflate": conflate, + "drug_chemical_conflate": drug_chemical_conflate, + "description": description, + "individual_types": individual_types, + "include_taxa": include_taxa, + }) + response.raise_for_status() + result = response.json() + + return result[curie] + + @functools.lru_cache(maxsize=None) + def get_clique_identifiers(self, curie, **kwargs): + result = self.normalize_curie(curie, **kwargs) + if 'equivalent_identifiers' not in result: + return None + return list(map(lambda x: Identifier.from_dict(x), result['equivalent_identifiers'])) From bb1eb996a851010726576c298e0bce54eeeb6414 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 4 Dec 2025 01:58:24 -0500 Subject: [PATCH 08/66] Added labels via NodeNorm. 
--- src/babel_xrefs/babel_xrefs.py | 42 ++++++++++++++++++++++++++++---- src/babel_xrefs/cli.py | 12 +++++---- src/babel_xrefs/core/nodenorm.py | 17 ++++++++++++- 3 files changed, 60 insertions(+), 11 deletions(-) diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py index f321e5e..e953c0f 100644 --- a/src/babel_xrefs/babel_xrefs.py +++ b/src/babel_xrefs/babel_xrefs.py @@ -7,6 +7,8 @@ import functools from babel_xrefs.core.downloader import BabelDownloader +from babel_xrefs.core.nodenorm import NodeNorm + @dataclasses.dataclass(frozen=True) class CrossReference: @@ -26,9 +28,26 @@ def curies(self): def __lt__(self, other): return (self.filename, self.subj, self.obj, self.pred) < (other.filename, other.subj, other.obj, other.pred) +class LabeledCrossReference(CrossReference): + subj_label: str + subj_biolink_type: str + obj_label: str + obj_biolink_type: str + + def __init__(self, subj: str, pred: str, obj: str, filename: str, subj_label: str, subj_biolink_type: str, obj_label: str, obj_biolink_type: str): + super().__init__(subj=subj, obj=obj, filename=filename, pred=pred) + # The parent dataclass is frozen, so set the extra fields with object.__setattr__. + object.__setattr__(self, "subj_label", subj_label) + object.__setattr__(self, "subj_biolink_type", subj_biolink_type) + object.__setattr__(self, "obj_label", obj_label) + object.__setattr__(self, "obj_biolink_type", obj_biolink_type) + + def __str__(self): + return f"""LabeledCrossReference(subj="{self.subj}", obj="{self.obj}", subj_label="{self.subj_label}", obj_label="{self.obj_label}")""" + class BabelXRefs: - def __init__(self, downloader: BabelDownloader): + def __init__(self, downloader: BabelDownloader, nodenorm: NodeNorm = None): self.downloader = downloader + self.nodenorm = nodenorm def get_curie_ids(self, curies: list[str]): """ @@ -52,7 +71,7 @@ def get_curie_ids(self, curies: list[str]): return xrefs.fetchall() @functools.lru_cache(maxsize=None) - def get_curie_xref(self, curie: str): + def get_curie_xref(self, curie: str, label_curies: bool = False): concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') @@ -61,9 +80,22 @@ def get_curie_xref(self, curie: str): concord_table = db.read_parquet(concord_parquet) xref_tuples = db.execute(f"SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() xrefs = list(map(lambda rec: CrossReference.from_tuple(rec), xref_tuples)) + + if label_curies: + xrefs = list(map(lambda xref: LabeledCrossReference( + subj=xref.subj, + obj=xref.obj, + filename=xref.filename, + pred=xref.pred, + subj_label=self.nodenorm.get_identifier(xref.subj).label, + subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, + obj_label=self.nodenorm.get_identifier(xref.obj).label, + obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, + ), xrefs)) + return xrefs - def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies_in_expansion: set = set()): + def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies_in_expansion: set = set(), label_curies: bool = False): """ Search for all identifiers that are cross-referenced to the given CURIE.
@@ -75,13 +107,13 @@ def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies xrefs = set() for curie in curies: logging.info(f"Searching for cross-references for {curie}") - xrefs.update(self.get_curie_xref(curie)) + xrefs.update(self.get_curie_xref(curie, label_curies)) if expand: # Get a unique set of referenced curies, not including the ones currently queried. new_curies = list(set([curie for xref in xrefs for curie in xref.curies]) - set(curies) - ignore_curies_in_expansion) if new_curies: logging.info(f"Expanding cross-references to {new_curies}") - xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(new_curies))) + xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(new_curies), label_curies=label_curies)) return sorted(xrefs) diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index 4c65391..673500d 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -14,8 +14,10 @@ def cli(): @click.argument("curies", type=str, required=True, nargs=-1) @click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") @click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") +@click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes") @click.option("--expand", is_flag=True, help="Also display xrefs for returned CURIEs") -def xrefs(curies: list[str], babel_url: str, local_dir: str, expand: bool): +@click.option("--labels", is_flag=True, help="Include labels for CURIEs") +def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expand: bool, labels: bool): """ Fetches and prints the cross-references (xrefs) for the given CURIEs. 
@@ -33,8 +35,8 @@ def xrefs(curies: list[str], babel_url: str, local_dir: str, expand: bool): """ logging.basicConfig(level=logging.INFO) - bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir)) - xrefs = bxref.get_curie_xrefs(curies, expand) + bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir), NodeNorm(nodenorm_url)) + xrefs = bxref.get_curie_xrefs(curies, expand, label_curies=labels) for xref in xrefs: print(xref) @@ -75,9 +77,9 @@ def test_concord(curies, nodenorm_url): identifiers = nodenorm.get_clique_identifiers(curie) for identifier in identifiers: if identifier.label: - print(f"{curie}\t{identifier.curie}\t{identifier.label}") + print(f"{curie}\t{identifier.curie}\t{identifier.label}\t{identifier.biolink_type}") else: - print(f"{curie}\t{identifier.curie}\t") + print(f"{curie}\t{identifier.curie}\t\t{identifier.biolink_type}") if __name__ == "__main__": diff --git a/src/babel_xrefs/core/nodenorm.py b/src/babel_xrefs/core/nodenorm.py index 6c45e02..5c8e0c9 100644 --- a/src/babel_xrefs/core/nodenorm.py +++ b/src/babel_xrefs/core/nodenorm.py @@ -1,11 +1,13 @@ import dataclasses import functools import requests +import logging @dataclasses.dataclass class Identifier: curie: str label: str = "" + biolink_type: str = "" taxa: list[str] = dataclasses.field(default_factory=list) description: list[str] = dataclasses.field(default_factory=list) @@ -21,6 +23,8 @@ def from_dict(d: dict): identifier.taxa = d['taxa'] if 'description' in d: identifier.description = d['description'] + if 'type' in d: + identifier.biolink_type = d['type'] return identifier class NodeNorm: @@ -28,7 +32,18 @@ def __init__(self, nodenorm_url: str=""): self.nodenorm_url = nodenorm_url @functools.lru_cache(maxsize=None) - def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=False, description=False, individual_types=None, include_taxa=None): + def get_identifier(self, curie): + result = self.normalize_curie(curie) + logging.debug(f"Normalizing {curie} with NodeNorm to result: {result}") + for identifier in result.get('equivalent_identifiers', []): + if identifier['identifier'] == curie: + logging.debug(f"Found exact match for {curie}: {identifier}") + return Identifier.from_dict(identifier) + + return Identifier(curie=curie) + + @functools.lru_cache(maxsize=None) + def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=True, description=True, individual_types=True, include_taxa=True): response = requests.get(f"{self.nodenorm_url}get_normalized_nodes", params={ "curie": curie, "conflate": conflate, From 40c3338d2d72b651e6f07da0dfbb87e226437b1c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 8 Jan 2026 14:58:30 -0500 Subject: [PATCH 09/66] Midnight commit: attempting to improve expansion. --- src/babel_xrefs/babel_xrefs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py index e953c0f..32829c1 100644 --- a/src/babel_xrefs/babel_xrefs.py +++ b/src/babel_xrefs/babel_xrefs.py @@ -104,6 +104,9 @@ def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies :return: A list of cross-references containing that CURIE. 
""" + if ignore_curies_in_expansion: + logging.info(f"Ignoring {len(ignore_curies_in_expansion)}: {ignore_curies_in_expansion}") + xrefs = set() for curie in curies: logging.info(f"Searching for cross-references for {curie}") @@ -114,6 +117,6 @@ def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies new_curies = list(set([curie for xref in xrefs for curie in xref.curies]) - set(curies) - ignore_curies_in_expansion) if new_curies: logging.info(f"Expanding cross-references to {new_curies}") - xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(new_curies), label_curies=label_curies)) + xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(curies) | set(new_curies), label_curies=label_curies)) return sorted(xrefs) From 8c41112940b8e60fa186c775e2d3f138c3b2a935 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 8 Jan 2026 16:04:56 -0500 Subject: [PATCH 10/66] Added some improvements. --- src/babel_xrefs/babel_xrefs.py | 2 +- src/babel_xrefs/core/nodenorm.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py index 32829c1..bf32521 100644 --- a/src/babel_xrefs/babel_xrefs.py +++ b/src/babel_xrefs/babel_xrefs.py @@ -42,7 +42,7 @@ def __init__(self, subj: str, pred: str, obj: str, filename: str, subj_label: st self.obj_biolink_type = obj_biolink_type def __str__(self): - return f"""LabeledCrossReference(subj="{self.subj}", obj="{self.obj}", subj_label="{self.subj_label}", obj_label="{self.obj_label}", subj_label="{self.subj_label}", obj_label="{self.obj_label}")""" + return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", obj_label="{self.obj_label}", subj_label="{self.subj_label}", obj_label="{self.obj_label}")""" class BabelXRefs: def __init__(self, downloader: BabelDownloader, nodenorm: NodeNorm = None): diff --git a/src/babel_xrefs/core/nodenorm.py b/src/babel_xrefs/core/nodenorm.py index 5c8e0c9..018f106 100644 --- a/src/babel_xrefs/core/nodenorm.py +++ b/src/babel_xrefs/core/nodenorm.py @@ -35,6 +35,8 @@ def __init__(self, nodenorm_url: str=""): def get_identifier(self, curie): result = self.normalize_curie(curie) logging.debug(f"Normalizing {curie} with NodeNorm to result: {result}") + if not result: + return Identifier(curie=curie) for identifier in result.get('equivalent_identifiers', []): if identifier['identifier'] == curie: logging.debug(f"Found exact match for {curie}: {identifier}") From 239c89f7bf6e9853cac75452234d0f2fce265c9c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Feb 2026 01:25:40 -0500 Subject: [PATCH 11/66] Added a CLAUDE.md by Claude.ai. --- CLAUDE.md | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..536af9b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,118 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +babel-xrefs is a tool for querying and exploring Babel intermediate files. It allows users to discover why two biological/chemical identifiers are considered identical by the Babel system, which handles cross-references between different ontology and database identifiers (e.g., MONDO, HP, UMLS, HGNC). 
## Development Setup + +This project uses **uv** for package management: + +```bash +# Install dependencies +uv sync + +# Install with dev dependencies +uv sync --group dev + +# Activate virtual environment (if needed) +source .venv/bin/activate + +# Run the CLI +uv run babel-xrefs --help +``` + +## Commands + +### Running the Application + +```bash +# Get cross-references for one or more CURIEs +uv run babel-xrefs xrefs MONDO:0004979 + +# Get cross-references with expansion (recursive lookup) +uv run babel-xrefs xrefs MONDO:0004979 --expand + +# Get cross-references with labels from NodeNorm +uv run babel-xrefs xrefs MONDO:0004979 --labels + +# Get ID records for CURIEs +uv run babel-xrefs ids MONDO:0004979 + +# Test concordance changes with NodeNorm +uv run babel-xrefs test-concord MONDO:0004979 HP:0000001 + +# Use custom Babel server or local directory +uv run babel-xrefs xrefs MONDO:0004979 --local-dir data/2025nov19 --babel-url https://stars.renci.org:443/var/babel_outputs/2025nov19/ +``` + +### Development Commands + +```bash +# Run tests +uv run pytest + +# Run linter +uv run ruff check + +# Format code +uv run ruff format +``` + +## Architecture + +### Core Components + +1. **BabelDownloader** (`src/babel_xrefs/core/downloader.py`): + - Downloads Babel intermediate files from a remote server using `wget` + - Caches files locally in configurable directory (default: `data/2025nov19/`) + - Uses `@functools.lru_cache` to avoid re-downloading + - **Important**: Requires `wget` to be installed on the system + +2. **BabelXRefs** (`src/babel_xrefs/babel_xrefs.py`): + - Main query engine for cross-references + - Uses DuckDB to query Parquet files (`Concord.parquet`, `Identifiers.parquet`, `Metadata.parquet`) + - Supports recursive expansion of cross-references + - Creates ephemeral DuckDB databases in `data/<version>/output/duckdbs/` + +3. **NodeNorm** (`src/babel_xrefs/core/nodenorm.py`): + - Integration with NodeNormalization API (https://nodenormalization-sri.renci.org/) + - Fetches labels, biolink types, and equivalent identifiers for CURIEs + - Uses `@functools.lru_cache` for performance + - Optional component for label enrichment + +4. **CLI** (`src/babel_xrefs/cli.py`): + - Click-based command-line interface + - Three main commands: `xrefs`, `ids`, `test-concord` + +### Data Flow + +1. User provides CURIEs via CLI +2. BabelDownloader ensures required Parquet files are downloaded +3. BabelXRefs queries files using DuckDB +4. If the `--labels` flag is set, NodeNorm is queried for additional metadata +5.
Results are printed to stdout + +### Key Design Patterns + +- **Lazy downloading**: Files are only downloaded when first accessed +- **LRU caching**: Heavy use of `@functools.lru_cache` to avoid redundant downloads and API calls +- **Recursive expansion**: The `--expand` flag recursively follows all cross-references to build complete graphs +- **DuckDB for querying**: In-memory SQL queries against Parquet files for fast lookups + +## Important Notes + +- **System dependency**: This project requires `wget` to be installed (used by BabelDownloader) +- **Data directory**: The `data/` directory is gitignored and contains downloaded Parquet files and generated DuckDB databases +- **Babel versions**: The default Babel version is `2025nov19`, but this can be customized via `--local-dir` and `--babel-url` +- **No tests yet**: The project currently has pytest configured but no test files exist +- **Empty model.py**: The `src/babel_xrefs/core/model.py` file exists but is currently empty; data classes are defined in `babel_xrefs.py` and `nodenorm.py` instead + +## File Locations + +- Source code: `src/babel_xrefs/` +- Downloaded Babel files: `data/<version>/duckdb/*.parquet` +- Generated DuckDB databases: `data/<version>/output/duckdbs/` +- Entry point: `src/babel_xrefs/cli.py` From 8132fe1f64d455557929059ec7981435e6d82a63 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Feb 2026 01:52:15 -0500 Subject: [PATCH 12/66] Reorganized file slightly. --- src/babel_xrefs/cli.py | 2 +- src/babel_xrefs/{ => core}/babel_xrefs.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/babel_xrefs/{ => core}/babel_xrefs.py (100%) diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index 673500d..ab2e283 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -2,7 +2,7 @@ import click import logging from babel_xrefs.core.downloader import BabelDownloader -from babel_xrefs.babel_xrefs import BabelXRefs +from babel_xrefs.core.babel_xrefs import BabelXRefs from babel_xrefs.core.nodenorm import NodeNorm diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/core/babel_xrefs.py similarity index 100% rename from src/babel_xrefs/babel_xrefs.py rename to src/babel_xrefs/core/babel_xrefs.py From bd009721c06dee9e482d2947c16afacaaf265a0e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Feb 2026 02:02:02 -0500 Subject: [PATCH 13/66] Claude wrote some tests. --- tests/__init__.py | 1 + tests/test_downloader.py | 194 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/test_downloader.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..61c04ac --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Tests for babel-xrefs diff --git a/tests/test_downloader.py b/tests/test_downloader.py new file mode 100644 index 0000000..461854c --- /dev/null +++ b/tests/test_downloader.py @@ -0,0 +1,194 @@ +""" +Tests for the BabelDownloader class. + +These tests verify that the downloader can successfully fetch large Parquet files +from the Babel server using wget and properly manage local file caching.
+""" + +import os +import shutil +import pytest +from babel_xrefs.core.downloader import BabelDownloader + + +# Constants for test configuration +BABEL_URL = "https://stars.renci.org/var/babel_outputs/2025nov19/" +TEST_DATA_DIR = "data/test" +IDENTIFIERS_FILE = "duckdb/Identifiers.parquet" +MINIMUM_FILE_SIZE_GB = 2 +MINIMUM_FILE_SIZE_BYTES = MINIMUM_FILE_SIZE_GB * 1024 * 1024 * 1024 # 2GB in bytes + + +@pytest.fixture(scope="module") +def test_data_dir(): + """ + Fixture that provides a clean test data directory. + + This fixture: + - Creates the test data directory before tests run + - Yields the directory path to tests + - Cleans up (removes) the directory after all tests complete + + Scope is 'module' so the directory persists across all tests in this file, + allowing downloaded files to be reused by multiple tests. + """ + # Setup: ensure clean test directory + if os.path.exists(TEST_DATA_DIR): + shutil.rmtree(TEST_DATA_DIR) + os.makedirs(TEST_DATA_DIR, exist_ok=True) + + yield TEST_DATA_DIR + + # Teardown: remove test directory and all contents + if os.path.exists(TEST_DATA_DIR): + shutil.rmtree(TEST_DATA_DIR) + + +@pytest.fixture(scope="module") +def downloader(test_data_dir): + """ + Fixture that provides a BabelDownloader instance configured for testing. + + Args: + test_data_dir: The test data directory fixture + + Returns: + BabelDownloader: Configured downloader instance + """ + return BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + +def test_downloader_initialization(test_data_dir): + """ + Test that BabelDownloader initializes correctly with custom parameters. + + Verifies: + - Downloader accepts URL and local path + - Local path is stored correctly + - Directory is created if it doesn't exist + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + assert downloader.url_base == BABEL_URL + assert downloader.local_path == test_data_dir + assert os.path.exists(test_data_dir) + assert os.path.isdir(test_data_dir) + + +def test_download_large_parquet_file(downloader): + """ + Test downloading a large Parquet file from the Babel server. + + This test: + 1. Downloads the Identifiers.parquet file (2GB+) from the real Babel server + 2. Verifies the file was downloaded successfully + 3. Confirms the file size is at least 2GB + + Note: This test takes several minutes to complete due to the large file size. + + Args: + downloader: BabelDownloader fixture + """ + # Download the Identifiers.parquet file + downloaded_path = downloader.get_downloaded_file(IDENTIFIERS_FILE) + + # Verify the file exists + assert os.path.exists(downloaded_path), \ + f"Downloaded file does not exist at {downloaded_path}" + + # Verify it's a file, not a directory + assert os.path.isfile(downloaded_path), \ + f"Downloaded path is not a file: {downloaded_path}" + + # Get the file size in bytes + file_size_bytes = os.path.getsize(downloaded_path) + file_size_gb = file_size_bytes / (1024 * 1024 * 1024) + + # Verify the file is at least 2GB + assert file_size_bytes >= MINIMUM_FILE_SIZE_BYTES, \ + f"Downloaded file is too small: {file_size_gb:.2f}GB (expected at least {MINIMUM_FILE_SIZE_GB}GB)" + + print(f"\n✓ Successfully downloaded {IDENTIFIERS_FILE}") + print(f" Size: {file_size_gb:.2f}GB ({file_size_bytes:,} bytes)") + print(f" Path: {downloaded_path}") + + +def test_download_caching(downloader): + """ + Test that the downloader uses LRU caching to avoid re-downloading files. + + This test: + 1. Downloads the same file twice + 2. 
Verifies both calls return the same path + 3. Confirms the file is only downloaded once (via caching) + + Args: + downloader: BabelDownloader fixture + """ + # First download + path1 = downloader.get_downloaded_file(IDENTIFIERS_FILE) + initial_mtime = os.path.getmtime(path1) + + # Second download - should use cache + path2 = downloader.get_downloaded_file(IDENTIFIERS_FILE) + second_mtime = os.path.getmtime(path2) + + # Verify same path returned + assert path1 == path2, "Cached download returned different path" + + # Verify file wasn't modified (i.e., wasn't re-downloaded) + assert initial_mtime == second_mtime, \ + "File was modified, suggesting it was re-downloaded instead of cached" + + print(f"\n✓ Caching works correctly - file not re-downloaded") + + +def test_get_output_file(downloader): + """ + Test the get_output_file method for creating output file paths. + + This test: + 1. Creates an output file path + 2. Verifies the directory structure is created + 3. Confirms the path is in the correct location + + Args: + downloader: BabelDownloader fixture + """ + output_filename = "output/duckdbs/test.duckdb" + output_path = downloader.get_output_file(output_filename) + + # Verify the path is correct + expected_path = os.path.join(TEST_DATA_DIR, output_filename) + assert output_path == expected_path, \ + f"Output path mismatch: expected {expected_path}, got {output_path}" + + # Verify the parent directory was created + assert os.path.exists(os.path.dirname(output_path)), \ + "Parent directory for output file was not created" + + print(f"\n✓ Output file path created correctly: {output_path}") + + +def test_invalid_local_path(): + """ + Test that BabelDownloader raises an error for invalid local paths. + + This test verifies error handling when attempting to use a file path + as the local directory (should be a directory, not a file). + """ + # Create a temporary file + invalid_path = "/tmp/test_babel_invalid_file.txt" + with open(invalid_path, 'w') as f: + f.write("test") + + try: + # Attempt to create downloader with a file path instead of directory + with pytest.raises(ValueError, match="Invalid local_path"): + BabelDownloader(url_base=BABEL_URL, local_path=invalid_path) + + print("\n✓ Correctly raised ValueError for invalid local path") + finally: + # Clean up + if os.path.exists(invalid_path): + os.remove(invalid_path) From 9cc06bc28fe29077bea689e847c5a946898bcd2b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Feb 2026 02:13:52 -0500 Subject: [PATCH 14/66] Improved downloader using Claude. 
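The wget subprocess is replaced here with a pure-requests streaming download. The resume behaviour at the heart of the new code (shown in full in the diff below) reduces to the following sketch; the function and variable names are illustrative:

```python
import os
import requests

def download_with_resume(url: str, path: str, chunk_size: int = 1024 * 1024) -> None:
    # Resume from however many bytes are already on disk.
    start = os.path.getsize(path) if os.path.exists(path) else 0
    headers = {"Range": f"bytes={start}-"} if start else {}
    with requests.get(url, headers=headers, stream=True, timeout=30) as response:
        if response.status_code == 416:  # Range Not Satisfiable: file already complete
            return
        # 206 Partial Content means the server honoured the Range header;
        # a plain 200 means it did not, so start again from byte zero.
        mode = "ab" if response.status_code == 206 else "wb"
        response.raise_for_status()
        with open(path, mode) as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
```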
--- pyproject.toml | 1 + src/babel_xrefs/core/downloader.py | 182 +++++++++++++++++++++-------- 2 files changed, 132 insertions(+), 51 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5696f67..ddc0a95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "click>=8.3.1", "duckdb>=1.4.2", "requests>=2.32.5", + "tqdm>=4.67.0", ] [build-system] diff --git a/src/babel_xrefs/core/downloader.py b/src/babel_xrefs/core/downloader.py index 9313685..8e9a07e 100644 --- a/src/babel_xrefs/core/downloader.py +++ b/src/babel_xrefs/core/downloader.py @@ -1,10 +1,11 @@ import functools import os import urllib.parse -import subprocess +import time import requests +from tqdm import tqdm import logging -import functools + class BabelDownloader: """ @@ -39,31 +40,126 @@ def get_output_file(self, filename): os.makedirs(os.path.dirname(filepath), exist_ok=True) return filepath + def _stream_download(self, response, local_path, resume_byte_pos, chunk_size): + """ + Stream download from response to file with progress bar. + + Args: + response: requests.Response object with stream=True + local_path: Local file path to write to + resume_byte_pos: Starting byte position (for resume) + chunk_size: Size of chunks to read/write + """ + # Get total size from Content-Length header (may not be present) + content_length = response.headers.get('Content-Length') + if content_length: + total_size = int(content_length) + resume_byte_pos + else: + total_size = None + + # Open file in append mode if resuming, write mode otherwise + mode = 'ab' if resume_byte_pos > 0 else 'wb' + + with open(local_path, mode) as f: + with tqdm( + total=total_size, + initial=resume_byte_pos, + unit='B', + unit_scale=True, + unit_divisor=1024, + desc=os.path.basename(local_path) + ) as progress_bar: + for chunk in response.iter_content(chunk_size=chunk_size): + if chunk: + f.write(chunk) + progress_bar.update(len(chunk)) + + def _download_with_retry(self, url, local_path, chunk_size): + """ + Download a file with retry logic and resume capability. 
+ + Args: + url: URL to download from + local_path: Local file path to save to + chunk_size: Size of chunks to read/write + + Raises: + RuntimeError: If all retry attempts fail + """ + for attempt in range(1, self.retries + 1): + try: + # Check if we're resuming a partial download + resume_byte_pos = 0 + if os.path.exists(local_path): + resume_byte_pos = os.path.getsize(local_path) + + # Prepare headers for resume + headers = {} + if resume_byte_pos > 0: + headers['Range'] = f'bytes={resume_byte_pos}-' + self.logger.info(f"Resuming download from byte {resume_byte_pos}") + + # Make streaming request with timeout for connection (not total time) + response = requests.get(url, headers=headers, stream=True, timeout=30) + + # Handle different response codes + if response.status_code == 416: + # Range Not Satisfiable - file already complete + self.logger.info(f"File already complete: {local_path}") + return + elif response.status_code == 206: + # Partial Content - resume successful + self.logger.info(f"Resuming download (HTTP 206)") + elif response.status_code == 200: + # OK - server doesn't support resume or no Range header was sent + if resume_byte_pos > 0: + self.logger.warning(f"Server doesn't support resume, restarting from beginning") + resume_byte_pos = 0 + # Remove partial file + if os.path.exists(local_path): + os.remove(local_path) + else: + response.raise_for_status() + + # Stream download with progress bar + self._stream_download(response, local_path, resume_byte_pos, chunk_size) + + # Success - exit retry loop + return + + except (requests.RequestException, IOError) as e: + self.logger.warning(f"Download attempt {attempt}/{self.retries} failed: {e}") + + if attempt < self.retries: + # Calculate exponential backoff with max of 60 seconds + wait_time = min(2 ** attempt, 60) + self.logger.info(f"Retrying in {wait_time} seconds...") + time.sleep(wait_time) + else: + # All retries exhausted + raise RuntimeError(f"Failed to download {url} after {self.retries} attempts: {e}") + @functools.lru_cache(maxsize=None) - def get_downloaded_file(self, dirpath: str, chunk_size:int=1024*1024): + def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): + """ + Download a file from the Babel server to local storage. + + Args: + dirpath: Relative path from url_base to the file + chunk_size: Size of chunks to download (default 1MB) + + Returns: + str: Local path to the downloaded file + """ local_path_to_download_to = os.path.join(self.local_path, dirpath) os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) url_to_download = urllib.parse.urljoin(self.url_base, dirpath) - bytes_downloaded = 0 - - wget_command_line = [ - "wget", - "--progress=bar:force:noscroll", # Display progress bar. - "--compression=auto", # Compress files if available. - "--continue", # Continue downloading in case of interruption. - f"--tries={self.retries}", - "-O" + local_path_to_download_to, - ] - - # Add URL and output file. - wget_command_line.append(url_to_download) - - # Execute wget. 
- self.logger.info(f"Downloading {url_to_download} using wget: {wget_command_line}") - process = subprocess.run(wget_command_line) - if process.returncode != 0: - raise RuntimeError(f"Could not execute wget {wget_command_line}: {process.stderr}") + + self.logger.info(f"Downloading {url_to_download} to {local_path_to_download_to}") + + # Download with retry logic + self._download_with_retry(url_to_download, local_path_to_download_to, chunk_size) bytes_downloaded = os.path.getsize(local_path_to_download_to) self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") @@ -71,32 +167,16 @@ def get_downloaded_file(self, dirpath: str, chunk_size:int=1024*1024): @functools.lru_cache(maxsize=None) def get_downloaded_dir(self, dirpath: str): - local_path_to_download_to = os.path.join(self.local_path, dirpath) - os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) - - url_to_download_recursively = urllib.parse.urljoin(self.url_base, dirpath) - - wget_command_line = [ - "wget", - "--progress=bar:force:noscroll", # Display progress bar. - "--compression=auto", # Compress files if available. - "--continue", # Continue downloading in case of interruption. - f"--tries={self.retries}", - "--recursive", - "--no-parent", - "--no-host-directories", - "--directory-prefix=" + local_path_to_download_to, - ] - - # Add URL and output file. - if url_to_download_recursively[-1] != "/": - url_to_download_recursively += "/" - wget_command_line.append(url_to_download_recursively) - - # Execute wget. - self.logger.info(f"Downloading {url_to_download_recursively} using wget: {wget_command_line}") - process = subprocess.run(wget_command_line) - if process.returncode != 0: - raise RuntimeError(f"Could not execute wget {wget_command_line}: {process.stderr}") - - return local_path_to_download_to + """ + Download a directory recursively. + + NOTE: This method is not implemented in the Python-based downloader. + Use get_downloaded_file() for individual files instead. + + Raises: + NotImplementedError: This method is not implemented + """ + raise NotImplementedError( + "Recursive directory downloads are not supported. " + "Use get_downloaded_file() for individual files." + ) From da8bb0cfa8493990faf934aa70f72ab0de434bc0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Feb 2026 02:19:09 -0500 Subject: [PATCH 15/66] Added MD5 download functionality. --- src/babel_xrefs/core/downloader.py | 96 +++++++++++++- tests/test_downloader.py | 204 ++++++++++++++++++++++++++++- 2 files changed, 298 insertions(+), 2 deletions(-) diff --git a/src/babel_xrefs/core/downloader.py b/src/babel_xrefs/core/downloader.py index 8e9a07e..93081c6 100644 --- a/src/babel_xrefs/core/downloader.py +++ b/src/babel_xrefs/core/downloader.py @@ -2,6 +2,7 @@ import os import urllib.parse import time +import hashlib import requests from tqdm import tqdm import logging @@ -40,6 +41,56 @@ def get_output_file(self, filename): os.makedirs(os.path.dirname(filepath), exist_ok=True) return filepath + def _calculate_md5(self, file_path, chunk_size=1024*1024): + """ + Calculate MD5 checksum of a file. 
+ + Args: + file_path: Path to the file to checksum + chunk_size: Size of chunks to read (default 1MB) + + Returns: + str: Hexadecimal MD5 checksum + """ + md5_hash = hashlib.md5() + with open(file_path, 'rb') as f: + for chunk in iter(lambda: f.read(chunk_size), b''): + md5_hash.update(chunk) + return md5_hash.hexdigest() + + def _fetch_remote_md5(self, url): + """ + Fetch MD5 checksum from remote .md5 file. + + Args: + url: URL to the .md5 file + + Returns: + str: MD5 checksum if found, None if file doesn't exist or is malformed + """ + try: + response = requests.get(url, timeout=10) + if response.status_code == 404: + self.logger.debug(f"No .md5 file found at {url}") + return None + response.raise_for_status() + + # Parse MD5 file content + # Format is typically: "md5hash filename" or just "md5hash" + content = response.text.strip() + md5_match = content.split()[0] # Take first token + + # Validate it's a valid MD5 (32 hex characters) + if len(md5_match) == 32 and all(c in '0123456789abcdef' for c in md5_match.lower()): + return md5_match.lower() + else: + self.logger.warning(f"Malformed .md5 file at {url}: {content}") + return None + + except requests.RequestException as e: + self.logger.debug(f"Could not fetch .md5 file from {url}: {e}") + return None + def _stream_download(self, response, local_path, resume_byte_pos, chunk_size): """ Stream download from response to file with progress bar. @@ -142,7 +193,13 @@ def _download_with_retry(self, url, local_path, chunk_size): @functools.lru_cache(maxsize=None) def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): """ - Download a file from the Babel server to local storage. + Download a file from the Babel server to local storage with MD5 validation. + + If a .md5 file exists on the server, this method will: + 1. Check if the local file exists + 2. Verify its MD5 checksum matches the expected value + 3. Delete and re-download if checksums don't match + 4. Skip download if checksums match Args: dirpath: Relative path from url_base to the file @@ -155,12 +212,49 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) url_to_download = urllib.parse.urljoin(self.url_base, dirpath) + md5_url = url_to_download + '.md5' + + # Check if file already exists and validate with MD5 if available + if os.path.exists(local_path_to_download_to): + self.logger.info(f"Local file exists: {local_path_to_download_to}") + + # Try to fetch remote MD5 checksum + expected_md5 = self._fetch_remote_md5(md5_url) + + if expected_md5: + self.logger.info(f"Validating MD5 checksum (expected: {expected_md5})") + + # Calculate local file's MD5 + actual_md5 = self._calculate_md5(local_path_to_download_to, chunk_size) + self.logger.info(f"Local file MD5: {actual_md5}") + + if actual_md5 == expected_md5: + # File is valid, skip download + self.logger.info(f"MD5 checksum matches - file is valid, skipping download") + bytes_downloaded = os.path.getsize(local_path_to_download_to) + self.logger.info(f"Using existing file: {local_path_to_download_to} ({bytes_downloaded} bytes)") + return local_path_to_download_to + else: + # Checksums don't match - delete and re-download + self.logger.warning(f"MD5 checksum mismatch! 
Expected {expected_md5}, got {actual_md5}") + self.logger.warning(f"Deleting corrupted file and re-downloading: {local_path_to_download_to}") + os.remove(local_path_to_download_to) self.logger.info(f"Downloading {url_to_download} to {local_path_to_download_to}") # Download with retry logic self._download_with_retry(url_to_download, local_path_to_download_to, chunk_size) + # Verify MD5 after download if available + expected_md5 = self._fetch_remote_md5(md5_url) + if expected_md5: + actual_md5 = self._calculate_md5(local_path_to_download_to, chunk_size) + if actual_md5 == expected_md5: + self.logger.info(f"Post-download MD5 verification passed: {actual_md5}") + else: + self.logger.error(f"Post-download MD5 verification failed! Expected {expected_md5}, got {actual_md5}") + raise RuntimeError(f"Downloaded file has incorrect MD5 checksum") + bytes_downloaded = os.path.getsize(local_path_to_download_to) self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") return local_path_to_download_to diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 461854c..11132f3 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -2,12 +2,14 @@ Tests for the BabelDownloader class. These tests verify that the downloader can successfully fetch large Parquet files -from the Babel server using wget and properly manage local file caching. +from the Babel server and properly manage local file caching with MD5 validation. """ import os import shutil +import hashlib import pytest +from unittest.mock import Mock, patch, MagicMock from babel_xrefs.core.downloader import BabelDownloader @@ -192,3 +194,203 @@ def test_invalid_local_path(): # Clean up if os.path.exists(invalid_path): os.remove(invalid_path) + + +def test_md5_validation_matching_checksum(test_data_dir): + """ + Test that MD5 validation skips download when checksums match. + + This test: + 1. Creates a local file with known content + 2. Mocks the .md5 file to return the correct checksum + 3. Verifies the download is skipped (no actual HTTP download occurs) + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + # Create a test file with known content + test_file = "test_file.txt" + local_path = os.path.join(test_data_dir, test_file) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + + test_content = b"This is test content for MD5 validation" + with open(local_path, 'wb') as f: + f.write(test_content) + + # Calculate the expected MD5 + expected_md5 = hashlib.md5(test_content).hexdigest() + + # Mock the _fetch_remote_md5 to return the matching checksum + with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): + # Mock _download_with_retry to ensure it's NOT called + with patch.object(downloader, '_download_with_retry') as mock_download: + # Clear the cache before testing + downloader.get_downloaded_file.cache_clear() + + result_path = downloader.get_downloaded_file(test_file) + + # Verify the download was skipped + mock_download.assert_not_called() + assert result_path == local_path + assert os.path.exists(result_path) + + print(f"\n✓ MD5 validation correctly skipped download for matching checksum: {expected_md5}") + + +def test_md5_validation_mismatched_checksum(test_data_dir): + """ + Test that MD5 validation deletes and re-downloads file when checksums don't match. + + This test: + 1. Creates a local file with wrong content + 2. Mocks the .md5 file to return a different checksum + 3. 
Verifies the file is deleted and re-downloaded + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + # Create a test file with incorrect content + test_file = "test_file_mismatch.txt" + local_path = os.path.join(test_data_dir, test_file) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + + wrong_content = b"This is WRONG content" + with open(local_path, 'wb') as f: + f.write(wrong_content) + + # Use a different MD5 (this is MD5 of "correct content") + correct_content = b"This is CORRECT content" + expected_md5 = hashlib.md5(correct_content).hexdigest() + + # Track whether file was deleted + original_exists = os.path.exists(local_path) + + # Mock the _fetch_remote_md5 to return the mismatched checksum + with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): + # Mock _download_with_retry to create the "correct" file + def mock_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(correct_content) + + with patch.object(downloader, '_download_with_retry', side_effect=mock_download): + # Clear the cache before testing + downloader.get_downloaded_file.cache_clear() + + result_path = downloader.get_downloaded_file(test_file) + + # Verify the file exists and has correct content + assert os.path.exists(result_path) + with open(result_path, 'rb') as f: + assert f.read() == correct_content + + print(f"\n✓ MD5 validation correctly deleted and re-downloaded file with mismatched checksum") + + +def test_md5_validation_no_md5_file(test_data_dir): + """ + Test that download proceeds normally when no .md5 file exists. + + This test: + 1. Mocks the .md5 file fetch to return None (404) + 2. Verifies the download proceeds normally + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + test_file = "test_file_no_md5.txt" + local_path = os.path.join(test_data_dir, test_file) + + test_content = b"Test content without MD5 file" + + # Mock the _fetch_remote_md5 to return None (no .md5 file) + with patch.object(downloader, '_fetch_remote_md5', return_value=None): + # Mock _download_with_retry to create the file + def mock_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(test_content) + + with patch.object(downloader, '_download_with_retry', side_effect=mock_download) as mock_download_method: + # Clear the cache before testing + downloader.get_downloaded_file.cache_clear() + + result_path = downloader.get_downloaded_file(test_file) + + # Verify download was called (normal download path) + mock_download_method.assert_called_once() + assert os.path.exists(result_path) + with open(result_path, 'rb') as f: + assert f.read() == test_content + + print(f"\n✓ Download proceeded normally when no .md5 file exists") + + +def test_md5_validation_malformed_md5_file(test_data_dir): + """ + Test that download proceeds normally when .md5 file is malformed. + + This test: + 1. Mocks the .md5 file fetch to return None (malformed content) + 2. 
Verifies the download proceeds normally with a warning + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + test_file = "test_file_malformed_md5.txt" + local_path = os.path.join(test_data_dir, test_file) + + test_content = b"Test content with malformed MD5 file" + + # Mock the _fetch_remote_md5 to return None (malformed .md5 file) + with patch.object(downloader, '_fetch_remote_md5', return_value=None): + # Mock _download_with_retry to create the file + def mock_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(test_content) + + with patch.object(downloader, '_download_with_retry', side_effect=mock_download) as mock_download_method: + # Clear the cache before testing + downloader.get_downloaded_file.cache_clear() + + result_path = downloader.get_downloaded_file(test_file) + + # Verify download was called (normal download path) + mock_download_method.assert_called_once() + assert os.path.exists(result_path) + + print(f"\n✓ Download proceeded normally when .md5 file is malformed") + + +def test_md5_post_download_validation(test_data_dir): + """ + Test that MD5 validation occurs after download and fails if checksum is wrong. + + This test: + 1. Downloads a new file + 2. Mocks the .md5 file to return a checksum + 3. Mocks the download to create a file with WRONG content + 4. Verifies a RuntimeError is raised for checksum mismatch + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + test_file = "test_file_post_validation.txt" + local_path = os.path.join(test_data_dir, test_file) + + # Expected content and MD5 + correct_content = b"Expected content" + expected_md5 = hashlib.md5(correct_content).hexdigest() + + # Wrong content that will be downloaded + wrong_content = b"Wrong content downloaded" + + # Mock the _fetch_remote_md5 to return the expected checksum + with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): + # Mock _download_with_retry to create a file with WRONG content + def mock_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(wrong_content) + + with patch.object(downloader, '_download_with_retry', side_effect=mock_download): + # Clear the cache before testing + downloader.get_downloaded_file.cache_clear() + + # Should raise RuntimeError due to post-download MD5 mismatch + with pytest.raises(RuntimeError, match="incorrect MD5 checksum"): + downloader.get_downloaded_file(test_file) + + print(f"\n✓ Post-download MD5 validation correctly detected checksum mismatch") From 8f36b74f9563f6e48e6cc929b88be6e3e2efbd72 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 15 Feb 2026 02:11:32 -0500 Subject: [PATCH 16/66] Removed empty model file. --- src/babel_xrefs/core/model.py | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 src/babel_xrefs/core/model.py diff --git a/src/babel_xrefs/core/model.py b/src/babel_xrefs/core/model.py deleted file mode 100644 index 139597f..0000000 --- a/src/babel_xrefs/core/model.py +++ /dev/null @@ -1,2 +0,0 @@ - - From 0534fd876c4b81374f88f76b4ed3561a2abe624d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 15 Feb 2026 02:17:13 -0500 Subject: [PATCH 17/66] Attempted to rename this package to babel-explorer. 
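
The import paths change with the package rename; for example, the CLI
imports (see the cli.py diff below) become:

    from babel_explorer.core.downloader import BabelDownloader
    from babel_explorer.core.babel_xrefs import BabelXRefs
    from babel_explorer.core.nodenorm import NodeNorm

The console script is renamed as well, so `uv run babel-xrefs ...`
becomes `uv run babel-explorer ...`.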
--- CLAUDE.md | 33 +++++++++---------- README.md | 2 +- pyproject.toml | 4 +-- .../__init__.py | 0 src/{babel_xrefs => babel_explorer}/cli.py | 8 ++--- .../core/__init__.py | 0 .../core/babel_xrefs.py | 4 +-- .../core/downloader.py | 0 .../core/nodenorm.py | 0 tests/__init__.py | 2 +- tests/test_downloader.py | 2 +- 11 files changed, 26 insertions(+), 29 deletions(-) rename src/{babel_xrefs => babel_explorer}/__init__.py (100%) rename src/{babel_xrefs => babel_explorer}/cli.py (94%) rename src/{babel_xrefs => babel_explorer}/core/__init__.py (100%) rename src/{babel_xrefs => babel_explorer}/core/babel_xrefs.py (97%) rename src/{babel_xrefs => babel_explorer}/core/downloader.py (100%) rename src/{babel_xrefs => babel_explorer}/core/nodenorm.py (100%) diff --git a/CLAUDE.md b/CLAUDE.md index 536af9b..1fc596f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -babel-xrefs is a tool for querying and exploring Babel intermediate files. It allows users to discover why two biological/chemical identifiers are considered identical by the Babel system, which handles cross-references between different ontology and database identifiers (e.g., MONDO, HP, UMLS, HGNC). +babel-explorer is a tool for querying and exploring Babel intermediate files. It allows users to discover why two biological/chemical identifiers are considered identical by the Babel system, which handles cross-references between different ontology and database identifiers (e.g., MONDO, HP, UMLS, HGNC). ## Development Setup @@ -17,11 +17,8 @@ uv sync # Install with dev dependencies uv sync --group dev -# Activate virtual environment (if needed) -source .venv/bin/activate - # Run the CLI -uv run babel-xrefs --help +uv run babel-explorer --help ``` ## Commands @@ -30,22 +27,22 @@ uv run babel-xrefs --help ```bash # Get cross-references for one or more CURIEs -uv run babel-xrefs xrefs MONDO:0004979 +uv run babel-explorer xrefs MONDO:0004979 # Get cross-references with expansion (recursive lookup) -uv run babel-xrefs xrefs MONDO:0004979 --expand +uv run babel-explorer xrefs MONDO:0004979 --expand # Get cross-references with labels from NodeNorm -uv run babel-xrefs xrefs MONDO:0004979 --labels +uv run babel-explorer xrefs MONDO:0004979 --labels # Get ID records for CURIEs -uv run babel-xrefs ids MONDO:0004979 +uv run babel-explorer ids MONDO:0004979 # Test concordance changes with NodeNorm -uv run babel-xrefs test-concord MONDO:0004979 HP:0000001 +uv run babel-explorer test-concord MONDO:0004979 HP:0000001 # Use custom Babel server or local directory -uv run babel-xrefs xrefs MONDO:0004979 --local-dir data/2025nov19 --babel-url https://stars.renci.org:443/var/babel_outputs/2025nov19/ +uv run babel-explorer xrefs MONDO:0004979 --local-dir data/2025nov19 --babel-url https://stars.renci.org:443/var/babel_outputs/2025nov19/ ``` ### Development Commands @@ -65,25 +62,25 @@ uv run ruff format ### Core Components -1. **BabelDownloader** (`src/babel_xrefs/core/downloader.py`): +1. **BabelDownloader** (`src/babel_explorer/core/downloader.py`): - Downloads Babel intermediate files from a remote server using `wget` - Caches files locally in configurable directory (default: `data/2025nov19/`) - Uses `@functools.lru_cache` to avoid re-downloading - **Important**: Requires `wget` to be installed on the system -2. **BabelXRefs** (`src/babel_xrefs/babel_xrefs.py`): +2. 
**BabelXRefs** (`src/babel_explorer/babel_xrefs.py`):
   - Main query engine for cross-references
   - Uses DuckDB to query Parquet files (`Concord.parquet`, `Identifiers.parquet`, `Metadata.parquet`)
   - Supports recursive expansion of cross-references
   - Creates ephemeral DuckDB databases in `data/<version>/output/duckdbs/`
 
-3. **NodeNorm** (`src/babel_xrefs/core/nodenorm.py`):
+3. **NodeNorm** (`src/babel_explorer/core/nodenorm.py`):
   - Integration with NodeNormalization API (https://nodenormalization-sri.renci.org/)
   - Fetches labels, biolink types, and equivalent identifiers for CURIEs
   - Uses `@functools.lru_cache` for performance
   - Optional component for label enrichment
 
-4. **CLI** (`src/babel_xrefs/cli.py`):
+4. **CLI** (`src/babel_explorer/cli.py`):
   - Click-based command-line interface
   - Three main commands: `xrefs`, `ids`, `test-concord`
 
@@ -108,11 +105,11 @@ uv run ruff format
 - **Data directory**: The `data/` directory is gitignored and contains downloaded Parquet files and generated DuckDB databases
 - **Babel versions**: The default Babel version is `2025nov19`, but this can be customized via `--local-dir` and `--babel-url`
 - **No tests yet**: The project currently has pytest configured but no test files exist
-- **Empty model.py**: The `src/babel_xrefs/core/model.py` file exists but is currently empty; data classes are defined in `babel_xrefs.py` and `nodenorm.py` instead
+- **Empty model.py**: The `src/babel_explorer/core/model.py` file exists but is currently empty; data classes are defined in `babel_explorer.py` and `nodenorm.py` instead
 
 ## File Locations
 
-- Source code: `src/babel_xrefs/`
+- Source code: `src/babel_explorer/`
 - Downloaded Babel files: `data/<version>/duckdb/*.parquet`
 - Generated DuckDB databases: `data/<version>/output/duckdbs/`
-- Entry point: `src/babel_xrefs/cli.py`
+- Entry point: `src/babel_explorer/cli.py`
diff --git a/README.md b/README.md
index 7e78ca5..d17c739 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,2 @@
-# babel-xrefs
+# babel-explorer
 Software for querying and exporting Babel intermediate files
diff --git a/pyproject.toml b/pyproject.toml
index ddc0a95..0fb8f09 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [project]
-name = "babel-xrefs"
+name = "babel-explorer"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
@@ -22,4 +22,4 @@ dev = [
 ]
 
 [project.scripts]
-babel-xrefs = "babel_xrefs.cli:cli"
+babel-explorer = "babel_explorer.cli:cli"
diff --git a/src/babel_xrefs/__init__.py b/src/babel_explorer/__init__.py
similarity index 100%
rename from src/babel_xrefs/__init__.py
rename to src/babel_explorer/__init__.py
diff --git a/src/babel_xrefs/cli.py b/src/babel_explorer/cli.py
similarity index 94%
rename from src/babel_xrefs/cli.py
rename to src/babel_explorer/cli.py
index ab2e283..8dd5fc4 100644
--- a/src/babel_xrefs/cli.py
+++ b/src/babel_explorer/cli.py
@@ -1,9 +1,9 @@
-# Command line interface for babel-xrefs
+# Command line interface for babel-explorer
 import click
 import logging
-from babel_xrefs.core.downloader import BabelDownloader
-from babel_xrefs.core.babel_xrefs import BabelXRefs
-from babel_xrefs.core.nodenorm import NodeNorm
+from babel_explorer.core.downloader import BabelDownloader
+from babel_explorer.core.babel_xrefs import BabelXRefs
+from babel_explorer.core.nodenorm import NodeNorm
 
 
 @click.group()
diff --git a/src/babel_xrefs/core/__init__.py b/src/babel_explorer/core/__init__.py
similarity index 100%
rename from src/babel_xrefs/core/__init__.py
rename to src/babel_explorer/core/__init__.py
diff --git a/src/babel_xrefs/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py
similarity index 97%
rename from src/babel_xrefs/core/babel_xrefs.py
rename to src/babel_explorer/core/babel_xrefs.py
index bf32521..6776a98 100644
--- a/src/babel_xrefs/core/babel_xrefs.py
+++ b/src/babel_explorer/core/babel_xrefs.py
@@ -6,8 +6,8 @@
 import duckdb
 import functools
 
-from babel_xrefs.core.downloader import BabelDownloader
-from babel_xrefs.core.nodenorm import NodeNorm
+from babel_explorer.core.downloader import BabelDownloader
+from babel_explorer.core.nodenorm import NodeNorm
 
 
 @dataclasses.dataclass(frozen=True)
diff --git a/src/babel_xrefs/core/downloader.py b/src/babel_explorer/core/downloader.py
similarity index 100%
rename from src/babel_xrefs/core/downloader.py
rename to src/babel_explorer/core/downloader.py
diff --git a/src/babel_xrefs/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py
similarity index 100%
rename from src/babel_xrefs/core/nodenorm.py
rename to src/babel_explorer/core/nodenorm.py
diff --git a/tests/__init__.py b/tests/__init__.py
index 61c04ac..588fec0 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1 +1 @@
-# Tests for babel-xrefs
+# Tests for babel-explorer
diff --git a/tests/test_downloader.py b/tests/test_downloader.py
index 11132f3..c16bf74 100644
--- a/tests/test_downloader.py
+++ b/tests/test_downloader.py
@@ -10,7 +10,7 @@
 import hashlib
 import pytest
 from unittest.mock import Mock, patch, MagicMock
-from babel_xrefs.core.downloader import BabelDownloader
+from babel_explorer.core.downloader import BabelDownloader
 
 
 # Constants for test configuration

From 0b3a9f512cf5e695fe51376e3f7d0b7557f19a58 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Sun, 15 Feb 2026 02:44:54 -0500
Subject: [PATCH 18/66] Add comprehensive pytest suite for all core modules

- Add IdentifierRecord dataclass to babel_xrefs.py (resolves TODO)
- Add 80 tests across 3 files: test_downloader (26), test_babel_xrefs (31), test_nodenorm (23)
- Unit tests (62) use mocks and run without network; integration tests (18) use real downloads/APIs
- Add session-scoped fixtures in conftest.py for shared Parquet file downloads
- Parametrize integration tests over tests/data/valid_curies.txt for easy expansion
- Add integration and slow pytest markers to pyproject.toml
- Update CLAUDE.md and README.md with testing documentation

Co-Authored-By: Claude Opus 4.6
---
 CLAUDE.md                              |  47 +-
 README.md                              |  56 ++-
 pyproject.toml                         |   6 +
 src/babel_explorer/core/babel_xrefs.py |  35 +-
 tests/conftest.py                      | 106 ++++
 tests/constants.py                     |  26 +
 tests/data/valid_curies.txt            |   3 +
 tests/test_babel_xrefs.py              | 333 ++++++++++++
 tests/test_downloader.py               | 669 +++++++++++++------------
 tests/test_nodenorm.py                 | 296 +++++++++++
 10 files changed, 1230 insertions(+), 347 deletions(-)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/constants.py
 create mode 100644 tests/data/valid_curies.txt
 create mode 100644 tests/test_babel_xrefs.py
 create mode 100644 tests/test_nodenorm.py
diff --git a/CLAUDE.md b/CLAUDE.md
index 1fc596f..ae2e78f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -48,8 +48,17 @@ uv run babel-explorer xrefs MONDO:0004979 --local-dir data/2025nov19 --babel-url https://stars.renci.org:443/var/babel_outputs/2025nov19/
 ### Development Commands
 
 ```bash
-# Run tests
-uv run pytest
+# Run all tests (includes large file downloads)
+uv run pytest -v
+
+# Run unit tests only (fast, no network)
+uv run pytest -v -m "not integration"
+
+# Run integration tests without 2GB+ downloads
+uv run pytest -v -m "integration and not slow"
+
+# Run a single test file
+uv run pytest -v tests/test_nodenorm.py
 
 # Run linter
 uv run ruff check
@@ -68,7 +77,7 @@ uv run ruff format
 - Uses `@functools.lru_cache` to avoid re-downloading
 - **Important**: Requires `wget` to be installed on the system
 
-2. **BabelXRefs** (`src/babel_explorer/babel_xrefs.py`):
+2. **BabelXRefs** (`src/babel_explorer/core/babel_xrefs.py`):
   - Main query engine for cross-references
   - Uses DuckDB to query Parquet files (`Concord.parquet`, `Identifiers.parquet`, `Metadata.parquet`)
   - Supports recursive expansion of cross-references
@@ -99,17 +108,43 @@ uv run ruff format
 - **Recursive expansion**: The `--expand` flag recursively follows all cross-references to build complete graphs
 - **DuckDB for querying**: In-memory SQL queries against Parquet files for fast lookups
 
+## Testing
+
+### Test Structure
+
+Tests live in `tests/` and are split into fast **unit tests** (mocked, no network) and slower **integration tests** (real downloads and API calls). Pytest markers control which tests run:
+
+- **`@pytest.mark.integration`** — requires network access (downloads Parquet files or calls NodeNorm API)
+- **`@pytest.mark.slow`** — downloads very large files (2 GB+)
+
+| File | Unit | Integration | Slow | Total |
+|------|------|-------------|------|-------|
+| `tests/test_downloader.py` | 22 | 3 | 1 | 26 |
+| `tests/test_babel_xrefs.py` | 22 | 8 | 1 | 31 |
+| `tests/test_nodenorm.py` | 18 | 5 | 0 | 23 |
+
+### Test Infrastructure
+
+- **`tests/conftest.py`** — Session-scoped fixtures that download Parquet files once and share them across all integration tests. Teardown removes the `data/test/` directory so the next run starts fresh.
+- **`tests/constants.py`** — Shared constants (URLs, file paths) and `load_curies()` helper.
+- **`tests/data/valid_curies.txt`** — One CURIE per line (`#` comments allowed). Integration tests are parametrized over this list — adding a new line automatically expands test coverage.
+
+### Key Dataclasses
+
+- **`CrossReference`** — Frozen dataclass for Concord.parquet rows (filename, subj, pred, obj)
+- **`LabeledCrossReference`** — Extends CrossReference with labels and biolink types from NodeNorm
+- **`IdentifierRecord`** — Frozen dataclass for Identifiers.parquet rows (curie + dynamic extra fields). Returned by `BabelXRefs.get_curie_ids()`.
+
 ## Important Notes
 
-- **System dependency**: This project requires `wget` to be installed (used by BabelDownloader)
 - **Data directory**: The `data/` directory is gitignored and contains downloaded Parquet files and generated DuckDB databases
 - **Babel versions**: The default Babel version is `2025nov19`, but this can be customized via `--local-dir` and `--babel-url`
-- **No tests yet**: The project currently has pytest configured but no test files exist
-- **Empty model.py**: The `src/babel_explorer/core/model.py` file exists but is currently empty; data classes are defined in `babel_explorer.py` and `nodenorm.py` instead
 
 ## File Locations
 
 - Source code: `src/babel_explorer/`
+- Tests: `tests/`
+- Test CURIEs: `tests/data/valid_curies.txt`
 - Downloaded Babel files: `data/<version>/duckdb/*.parquet`
 - Generated DuckDB databases: `data/<version>/output/duckdbs/`
 - Entry point: `src/babel_explorer/cli.py`
diff --git a/README.md b/README.md
index d17c739..b545c8c 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,56 @@
 # babel-explorer
-Software for querying and exporting Babel intermediate files
+Software for querying and exploring Babel intermediate files.
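+
+It can also be driven from Python directly. A minimal sketch (this wiring
+mirrors `cli.py`; the URL and local directory are the defaults used in this
+repository, and the first query downloads large Parquet files):
+
+```python
+from babel_explorer.core.downloader import BabelDownloader
+from babel_explorer.core.babel_xrefs import BabelXRefs
+
+downloader = BabelDownloader(
+    url_base="https://stars.renci.org/var/babel_outputs/2025nov19/",
+    local_path="data/2025nov19",
+)
+xrefs = BabelXRefs(downloader)
+
+# Print every cross-reference that mentions this CURIE.
+for xref in xrefs.get_curie_xrefs(["MONDO:0004979"], expand=False):
+    print(xref)
+```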
+ +babel-explorer allows you to discover why two biological/chemical identifiers are considered identical by the [Babel](https://github.com/TranslatorSRI/Babel) system, which handles cross-references between different ontology and database identifiers (e.g., MONDO, HP, UMLS, HGNC). + +## Setup + +This project uses [uv](https://docs.astral.sh/uv/) for package management: + +```bash +uv sync --group dev +``` + +## Usage + +```bash +# Get cross-references for one or more CURIEs +uv run babel-explorer xrefs MONDO:0004979 + +# Get cross-references with expansion (recursive lookup) +uv run babel-explorer xrefs MONDO:0004979 --expand + +# Get cross-references with labels from NodeNorm +uv run babel-explorer xrefs MONDO:0004979 --labels + +# Get ID records for CURIEs +uv run babel-explorer ids MONDO:0004979 + +# Test concordance changes with NodeNorm +uv run babel-explorer test-concord MONDO:0004979 HP:0000001 +``` + +## Testing + +Tests are split into fast **unit tests** (mocked, no network) and slower **integration tests** (real file downloads and API calls), controlled by pytest markers. + +```bash +# Unit tests only — fast, no network required +uv run pytest -v -m "not integration" + +# Integration tests without 2GB+ downloads +uv run pytest -v -m "integration and not slow" + +# Full suite including large file downloads +uv run pytest -v +``` + +### Adding Test CURIEs + +Integration tests are parametrized over the CURIEs listed in `tests/data/valid_curies.txt`. Add a new CURIE on its own line to automatically expand test coverage: + +``` +# tests/data/valid_curies.txt +MONDO:0004979 +HP:0000001 +``` diff --git a/pyproject.toml b/pyproject.toml index 0fb8f09..922fa1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,3 +23,9 @@ dev = [ [project.scripts] babel-explorer = "babel_explorer.cli:cli" + +[tool.pytest.ini_options] +markers = [ + "integration: tests requiring network access (deselect with '-m \"not integration\"')", + "slow: tests downloading very large files 2GB+ (deselect with '-m \"not slow\"')", +] diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 6776a98..1e82125 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -44,17 +44,39 @@ def __init__(self, subj: str, pred: str, obj: str, filename: str, subj_label: st def __str__(self): return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", obj_label="{self.obj_label}", subj_label="{self.subj_label}", obj_label="{self.obj_label}")""" +@dataclasses.dataclass(frozen=True) +class IdentifierRecord: + """A record from the Identifiers.parquet file.""" + curie: str + extra_fields: tuple = () + + @staticmethod + def from_row(row: tuple, column_names: list[str]): + """Create an IdentifierRecord from a DuckDB result row and its column names.""" + curie_idx = column_names.index('curie') + extra = tuple( + (col, row[i]) for i, col in enumerate(column_names) if i != curie_idx + ) + return IdentifierRecord(curie=row[curie_idx], extra_fields=extra) + + def __str__(self): + parts = [f"curie={self.curie!r}"] + for name, value in self.extra_fields: + parts.append(f"{name}={value!r}") + return f"IdentifierRecord({', '.join(parts)})" + + class BabelXRefs: def __init__(self, downloader: BabelDownloader, nodenorm: NodeNorm = None): self.downloader = downloader self.nodenorm = nodenorm - def get_curie_ids(self, curies: list[str]): + def get_curie_ids(self, curies: list[str]) -> 
list[IdentifierRecord]: """ Search for all identifiers in the /ids/ files for a particular CURIE. - :param curie: A CURIE to search for. - :return: A list of cross-references containing that CURIE. + :param curies: A list of CURIEs to search for. + :return: A list of IdentifierRecords containing those CURIEs. """ identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') @@ -64,11 +86,10 @@ def get_curie_ids(self, curies: list[str]): duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') db = duckdb.connect(duckdb_path) identifier_table = db.read_parquet(identifier_parquet) - xrefs = db.execute(f"SELECT * FROM identifier_table WHERE curie IN $1", [curies]) - - # TODO: convert into case classes. + result = db.execute(f"SELECT * FROM identifier_table WHERE curie IN $1", [curies]) - return xrefs.fetchall() + column_names = [desc[0] for desc in result.description] + return [IdentifierRecord.from_row(row, column_names) for row in result.fetchall()] @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..f3df2fe --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,106 @@ +""" +Shared fixtures for babel-explorer tests. + +Session-scoped fixtures download Babel files once and share them across all test modules. +Teardown removes the test data directory so the next run starts fresh. +""" + +import os +import shutil + +import pytest + +from babel_explorer.core.downloader import BabelDownloader +from babel_explorer.core.babel_xrefs import BabelXRefs +from babel_explorer.core.nodenorm import NodeNorm + +from tests.constants import ( + BABEL_URL, + NODENORM_URL, + TEST_DATA_DIR, + CONCORD_FILE, + METADATA_FILE, + IDENTIFIERS_FILE, + load_curies, +) + + +# --------------------------------------------------------------------------- +# Session-scoped fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def valid_curies() -> list[str]: + """Load test CURIEs from tests/data/valid_curies.txt.""" + curies = load_curies() + assert len(curies) > 0, "No CURIEs found in valid_curies.txt" + return curies + + +@pytest.fixture(scope="session") +def test_data_dir(): + """ + Provide a clean test data directory for the entire session. + + Creates the directory before tests, removes it after all tests complete. + """ + if os.path.exists(TEST_DATA_DIR): + shutil.rmtree(TEST_DATA_DIR) + os.makedirs(TEST_DATA_DIR, exist_ok=True) + + yield TEST_DATA_DIR + + if os.path.exists(TEST_DATA_DIR): + shutil.rmtree(TEST_DATA_DIR) + + +@pytest.fixture(scope="session") +def shared_downloader(test_data_dir) -> BabelDownloader: + """A BabelDownloader pointed at the test data directory.""" + return BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + +@pytest.fixture(scope="session") +def downloaded_concord(shared_downloader) -> str: + """Download duckdb/Concord.parquet (~626 MB). Returns the local path.""" + return shared_downloader.get_downloaded_file(CONCORD_FILE) + + +@pytest.fixture(scope="session") +def downloaded_metadata(shared_downloader) -> str: + """Download duckdb/Metadata.parquet (small). 
Returns the local path.""" + return shared_downloader.get_downloaded_file(METADATA_FILE) + + +@pytest.fixture(scope="session") +def downloaded_parquet_files(downloaded_concord, downloaded_metadata) -> dict[str, str]: + """Dict of {relative_name: local_path} for Concord and Metadata files.""" + return { + CONCORD_FILE: downloaded_concord, + METADATA_FILE: downloaded_metadata, + } + + +@pytest.fixture(scope="session") +def downloaded_identifiers(shared_downloader) -> str: + """Download duckdb/Identifiers.parquet (2 GB+). Returns the local path.""" + return shared_downloader.get_downloaded_file(IDENTIFIERS_FILE) + + +@pytest.fixture(scope="session") +def nodenorm() -> NodeNorm: + """A NodeNorm client pointed at the public API.""" + return NodeNorm(nodenorm_url=NODENORM_URL) + + +@pytest.fixture(scope="session") +def babel_xrefs(shared_downloader, downloaded_parquet_files) -> BabelXRefs: + """A BabelXRefs instance (no NodeNorm) with Concord + Metadata already downloaded.""" + return BabelXRefs(shared_downloader) + + +@pytest.fixture(scope="session") +def babel_xrefs_with_nodenorm(shared_downloader, nodenorm, downloaded_parquet_files) -> BabelXRefs: + """A BabelXRefs instance with NodeNorm, Concord + Metadata already downloaded.""" + return BabelXRefs(shared_downloader, nodenorm) diff --git a/tests/constants.py b/tests/constants.py new file mode 100644 index 0000000..01b75fa --- /dev/null +++ b/tests/constants.py @@ -0,0 +1,26 @@ +"""Shared constants for babel-explorer tests.""" + +import pathlib + +BABEL_URL = "https://stars.renci.org/var/babel_outputs/2025nov19/" +NODENORM_URL = "https://nodenormalization-sri.renci.org/" +TEST_DATA_DIR = "data/test" + +# Parquet file paths (relative to the Babel server / local data dir) +CONCORD_FILE = "duckdb/Concord.parquet" +METADATA_FILE = "duckdb/Metadata.parquet" +IDENTIFIERS_FILE = "duckdb/Identifiers.parquet" + +# Path to the valid CURIEs file +VALID_CURIES_PATH = pathlib.Path(__file__).parent / "data" / "valid_curies.txt" + + +def load_curies(path: pathlib.Path = VALID_CURIES_PATH) -> list[str]: + """Read CURIEs from a text file, skipping comments and blank lines.""" + curies = [] + with open(path) as f: + for line in f: + stripped = line.strip() + if stripped and not stripped.startswith("#"): + curies.append(stripped) + return curies diff --git a/tests/data/valid_curies.txt b/tests/data/valid_curies.txt new file mode 100644 index 0000000..9f2f87c --- /dev/null +++ b/tests/data/valid_curies.txt @@ -0,0 +1,3 @@ +# Valid CURIEs for integration tests. +# Add new CURIEs here to expand test coverage — tests are parametrized over this list. +MONDO:0004979 diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py new file mode 100644 index 0000000..052d09c --- /dev/null +++ b/tests/test_babel_xrefs.py @@ -0,0 +1,333 @@ +""" +Tests for BabelXRefs, CrossReference, LabeledCrossReference, and IdentifierRecord. + +Unit tests use mocks; integration tests query real Parquet files via DuckDB. 
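+
+Run just the unit tests here (no network needed) with, for example:
+
+    uv run pytest -m "not integration" tests/test_babel_xrefs.py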
+""" + +import pytest +from unittest.mock import patch, MagicMock + +from babel_explorer.core.babel_xrefs import ( + BabelXRefs, + CrossReference, + LabeledCrossReference, + IdentifierRecord, +) +from babel_explorer.core.downloader import BabelDownloader +from babel_explorer.core.nodenorm import NodeNorm + +from tests.constants import load_curies + +VALID_CURIES = load_curies() + + +# ========================================================================== +# Unit Tests — CrossReference +# ========================================================================== + + +class TestCrossReference: + def test_creation(self): + xr = CrossReference(filename="f.txt", subj="A:1", pred="skos:exactMatch", obj="B:2") + assert xr.filename == "f.txt" + assert xr.subj == "A:1" + assert xr.pred == "skos:exactMatch" + assert xr.obj == "B:2" + + def test_from_tuple(self): + t = ("file.tsv", "MONDO:1", "owl:sameAs", "HP:2") + xr = CrossReference.from_tuple(t) + assert xr.filename == "file.tsv" + assert xr.subj == "MONDO:1" + assert xr.pred == "owl:sameAs" + assert xr.obj == "HP:2" + + def test_curies_property(self): + xr = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + assert xr.curies == frozenset({"A:1", "B:2"}) + + def test_frozen_immutability(self): + xr = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + with pytest.raises(AttributeError): + xr.subj = "changed" + + def test_equality(self): + a = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + b = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + assert a == b + + def test_hashability(self): + a = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + b = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + assert hash(a) == hash(b) + assert len({a, b}) == 1 + + def test_lt_ordering(self): + a = CrossReference(filename="a.tsv", subj="A:1", pred="p", obj="B:2") + b = CrossReference(filename="b.tsv", subj="A:1", pred="p", obj="B:2") + assert a < b + + def test_sorting(self): + items = [ + CrossReference(filename="c", subj="C:1", pred="p", obj="D:1"), + CrossReference(filename="a", subj="A:1", pred="p", obj="B:1"), + CrossReference(filename="b", subj="B:1", pred="p", obj="C:1"), + ] + result = sorted(items) + assert [x.filename for x in result] == ["a", "b", "c"] + + +# ========================================================================== +# Unit Tests — LabeledCrossReference +# ========================================================================== + + +class TestLabeledCrossReference: + def test_creation(self): + lxr = LabeledCrossReference( + subj="A:1", pred="p", obj="B:2", filename="f", + subj_label="Alpha", subj_biolink_type="biolink:Disease", + obj_label="Beta", obj_biolink_type="biolink:Gene", + ) + assert lxr.subj == "A:1" + assert lxr.subj_label == "Alpha" + assert lxr.obj_biolink_type == "biolink:Gene" + + def test_inherits_from_cross_reference(self): + lxr = LabeledCrossReference( + subj="A:1", pred="p", obj="B:2", filename="f", + subj_label="", subj_biolink_type="", obj_label="", obj_biolink_type="", + ) + assert isinstance(lxr, CrossReference) + + def test_curies_property(self): + lxr = LabeledCrossReference( + subj="A:1", pred="p", obj="B:2", filename="f", + subj_label="", subj_biolink_type="", obj_label="", obj_biolink_type="", + ) + assert lxr.curies == frozenset({"A:1", "B:2"}) + + def test_str(self): + lxr = LabeledCrossReference( + subj="A:1", pred="p", obj="B:2", filename="f", + subj_label="Alpha", subj_biolink_type="biolink:Disease", + 
obj_label="Beta", obj_biolink_type="biolink:Gene", + ) + s = str(lxr) + assert "A:1" in s + assert "B:2" in s + assert "Alpha" in s + + +# ========================================================================== +# Unit Tests — IdentifierRecord +# ========================================================================== + + +class TestIdentifierRecord: + def test_creation(self): + rec = IdentifierRecord(curie="MONDO:0004979") + assert rec.curie == "MONDO:0004979" + assert rec.extra_fields == () + + def test_from_row(self): + row = ("MONDO:0004979", "Disease", "asthma") + cols = ["curie", "category", "label"] + rec = IdentifierRecord.from_row(row, cols) + assert rec.curie == "MONDO:0004979" + assert ("category", "Disease") in rec.extra_fields + assert ("label", "asthma") in rec.extra_fields + + def test_frozen(self): + rec = IdentifierRecord(curie="X:1") + with pytest.raises(AttributeError): + rec.curie = "changed" + + def test_str(self): + rec = IdentifierRecord(curie="X:1", extra_fields=(("type", "Gene"),)) + s = str(rec) + assert "X:1" in s + assert "type" in s + assert "Gene" in s + + +# ========================================================================== +# Unit Tests — BabelXRefs (mocked) +# ========================================================================== + + +class TestBabelXRefsInit: + def test_init_without_nodenorm(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + bx = BabelXRefs(dl) + assert bx.downloader is dl + assert bx.nodenorm is None + + def test_init_with_nodenorm(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + nn = NodeNorm("https://example.com/") + bx = BabelXRefs(dl, nn) + assert bx.nodenorm is nn + + +class TestBabelXRefsMocked: + """Mocked query tests — no DuckDB or Parquet files needed.""" + + def _make_bx(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + return BabelXRefs(dl) + + def test_get_curie_xref_calls_downloader(self, tmp_path): + bx = self._make_bx(tmp_path) + mock_result = MagicMock() + mock_result.fetchall.return_value = [ + ("concord.tsv", "A:1", "skos:exactMatch", "B:2"), + ] + mock_db = MagicMock() + mock_db.read_parquet.return_value = "table" + mock_db.execute.return_value = mock_result + + with patch.object(bx.downloader, 'get_downloaded_file', return_value="/fake/path") as mock_dl: + with patch.object(bx.downloader, 'get_output_file', return_value="/fake/db"): + with patch("babel_explorer.core.babel_xrefs.duckdb.connect", return_value=mock_db): + bx.get_curie_xref.cache_clear() + result = bx.get_curie_xref("A:1") + # Downloader should be called for Concord and Metadata + assert mock_dl.call_count == 2 + result_list = list(result) + assert len(result_list) == 1 + assert isinstance(result_list[0], CrossReference) + + def test_get_curie_xrefs_no_expand(self, tmp_path): + bx = self._make_bx(tmp_path) + xr = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + with patch.object(bx, 'get_curie_xref', return_value=[xr]): + bx.get_curie_xref.cache_clear() + result = bx.get_curie_xrefs(["A:1"], expand=False) + assert len(result) == 1 + assert result[0] == xr + + def test_get_curie_xrefs_with_expand(self, tmp_path): + bx = self._make_bx(tmp_path) + xr1 = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + xr2 = CrossReference(filename="f", subj="B:2", pred="p", obj="C:3") + + def mock_get_curie_xref(curie, label_curies=False): + if curie == "A:1": + return [xr1] + 
elif curie == "B:2": + return [xr2] + return [] + + with patch.object(bx, 'get_curie_xref', side_effect=mock_get_curie_xref): + result = bx.get_curie_xrefs(["A:1"], expand=True) + assert xr1 in result + assert xr2 in result + + def test_results_are_sorted(self, tmp_path): + bx = self._make_bx(tmp_path) + xr_b = CrossReference(filename="b", subj="B:1", pred="p", obj="C:1") + xr_a = CrossReference(filename="a", subj="A:1", pred="p", obj="B:1") + + with patch.object(bx, 'get_curie_xref', return_value=[xr_b, xr_a]): + result = bx.get_curie_xrefs(["X:1"], expand=False) + assert result == [xr_a, xr_b] + + +# ========================================================================== +# Integration Tests — require downloaded Parquet files +# ========================================================================== + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xref(babel_xrefs, curie): + """get_curie_xref returns non-empty CrossReferences with the queried CURIE.""" + babel_xrefs.get_curie_xref.cache_clear() + results = list(babel_xrefs.get_curie_xref(curie)) + assert len(results) > 0, f"No cross-references found for {curie}" + for xr in results: + assert isinstance(xr, CrossReference) + assert curie in (xr.subj, xr.obj) + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xref_returns_known_xrefs(babel_xrefs, curie): + """At least one cross-reference is found.""" + babel_xrefs.get_curie_xref.cache_clear() + results = list(babel_xrefs.get_curie_xref(curie)) + assert len(results) >= 1 + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xrefs_single_no_expand(babel_xrefs, curie): + """get_curie_xrefs without expansion returns sorted, non-empty results.""" + babel_xrefs.get_curie_xref.cache_clear() + results = babel_xrefs.get_curie_xrefs([curie], expand=False) + assert len(results) > 0 + assert results == sorted(results) + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xrefs_expansion_finds_more(babel_xrefs, curie): + """Expanded results are at least as many as non-expanded.""" + babel_xrefs.get_curie_xref.cache_clear() + non_expanded = babel_xrefs.get_curie_xrefs([curie], expand=False) + babel_xrefs.get_curie_xref.cache_clear() + expanded = babel_xrefs.get_curie_xrefs([curie], expand=True) + assert len(expanded) >= len(non_expanded) + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xrefs_expanded_includes_original(babel_xrefs, curie): + """Non-expanded results are a subset of expanded results.""" + babel_xrefs.get_curie_xref.cache_clear() + non_expanded = set(babel_xrefs.get_curie_xrefs([curie], expand=False)) + babel_xrefs.get_curie_xref.cache_clear() + expanded = set(babel_xrefs.get_curie_xrefs([curie], expand=True)) + assert non_expanded.issubset(expanded) + + +@pytest.mark.integration +def test_get_curie_xref_caching(babel_xrefs): + """Cached calls return the same object.""" + curie = VALID_CURIES[0] + babel_xrefs.get_curie_xref.cache_clear() + r1 = babel_xrefs.get_curie_xref(curie) + r2 = babel_xrefs.get_curie_xref(curie) + assert r1 is r2 + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xref_with_labels(babel_xrefs_with_nodenorm, curie): + """With labels, returns LabeledCrossReference objects.""" + babel_xrefs_with_nodenorm.get_curie_xref.cache_clear() + results = 
list(babel_xrefs_with_nodenorm.get_curie_xref(curie, label_curies=True)) + assert len(results) > 0 + for xr in results: + assert isinstance(xr, LabeledCrossReference) + + +@pytest.mark.integration +def test_get_curie_xref_nonexistent_curie(babel_xrefs): + """A made-up CURIE returns an empty list.""" + babel_xrefs.get_curie_xref.cache_clear() + results = list(babel_xrefs.get_curie_xref("FAKE:9999999999")) + assert results == [] + + +@pytest.mark.integration +@pytest.mark.slow +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_ids(babel_xrefs, downloaded_identifiers, curie): + """get_curie_ids returns non-empty IdentifierRecord objects.""" + results = babel_xrefs.get_curie_ids([curie]) + assert len(results) > 0 + for rec in results: + assert isinstance(rec, IdentifierRecord) + assert rec.curie == curie diff --git a/tests/test_downloader.py b/tests/test_downloader.py index c16bf74..912cd0a 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -1,396 +1,399 @@ """ Tests for the BabelDownloader class. -These tests verify that the downloader can successfully fetch large Parquet files -from the Babel server and properly manage local file caching with MD5 validation. +Unit tests use mocks and run without network access. +Integration tests download real files from the Babel server. """ -import os -import shutil import hashlib -import pytest -from unittest.mock import Mock, patch, MagicMock -from babel_explorer.core.downloader import BabelDownloader - - -# Constants for test configuration -BABEL_URL = "https://stars.renci.org/var/babel_outputs/2025nov19/" -TEST_DATA_DIR = "data/test" -IDENTIFIERS_FILE = "duckdb/Identifiers.parquet" -MINIMUM_FILE_SIZE_GB = 2 -MINIMUM_FILE_SIZE_BYTES = MINIMUM_FILE_SIZE_GB * 1024 * 1024 * 1024 # 2GB in bytes - - -@pytest.fixture(scope="module") -def test_data_dir(): - """ - Fixture that provides a clean test data directory. - - This fixture: - - Creates the test data directory before tests run - - Yields the directory path to tests - - Cleans up (removes) the directory after all tests complete - - Scope is 'module' so the directory persists across all tests in this file, - allowing downloaded files to be reused by multiple tests. - """ - # Setup: ensure clean test directory - if os.path.exists(TEST_DATA_DIR): - shutil.rmtree(TEST_DATA_DIR) - os.makedirs(TEST_DATA_DIR, exist_ok=True) - - yield TEST_DATA_DIR - - # Teardown: remove test directory and all contents - if os.path.exists(TEST_DATA_DIR): - shutil.rmtree(TEST_DATA_DIR) - - -@pytest.fixture(scope="module") -def downloader(test_data_dir): - """ - Fixture that provides a BabelDownloader instance configured for testing. - - Args: - test_data_dir: The test data directory fixture - - Returns: - BabelDownloader: Configured downloader instance - """ - return BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - - -def test_downloader_initialization(test_data_dir): - """ - Test that BabelDownloader initializes correctly with custom parameters. - - Verifies: - - Downloader accepts URL and local path - - Local path is stored correctly - - Directory is created if it doesn't exist - """ - downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - - assert downloader.url_base == BABEL_URL - assert downloader.local_path == test_data_dir - assert os.path.exists(test_data_dir) - assert os.path.isdir(test_data_dir) - - -def test_download_large_parquet_file(downloader): - """ - Test downloading a large Parquet file from the Babel server. - - This test: - 1. 
Downloads the Identifiers.parquet file (2GB+) from the real Babel server - 2. Verifies the file was downloaded successfully - 3. Confirms the file size is at least 2GB +import os +import tempfile - Note: This test takes several minutes to complete due to the large file size. +import pytest +import requests +from unittest.mock import Mock, patch - Args: - downloader: BabelDownloader fixture - """ - # Download the Identifiers.parquet file - downloaded_path = downloader.get_downloaded_file(IDENTIFIERS_FILE) +from babel_explorer.core.downloader import BabelDownloader - # Verify the file exists - assert os.path.exists(downloaded_path), \ - f"Downloaded file does not exist at {downloaded_path}" +from tests.constants import CONCORD_FILE - # Verify it's a file, not a directory - assert os.path.isfile(downloaded_path), \ - f"Downloaded path is not a file: {downloaded_path}" - # Get the file size in bytes - file_size_bytes = os.path.getsize(downloaded_path) - file_size_gb = file_size_bytes / (1024 * 1024 * 1024) +# ========================================================================== +# Unit Tests — no network required +# ========================================================================== + + +class TestBabelDownloaderInit: + """Tests for BabelDownloader constructor.""" + + def test_constructor_stores_url_and_path(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + assert dl.url_base == "https://example.com/" + assert dl.local_path == str(tmp_path) + + def test_creates_directory_if_missing(self, tmp_path): + new_dir = str(tmp_path / "nested" / "dir") + dl = BabelDownloader(url_base="https://example.com/", local_path=new_dir) + assert os.path.isdir(new_dir) + assert dl.local_path == new_dir + + def test_custom_retries(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=3) + assert dl.retries == 3 + + def test_default_retries(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + assert dl.retries == 10 + + def test_invalid_path_raises_value_error(self): + """Using a file path (not a directory) should raise ValueError.""" + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write(b"not a directory") + f.flush() + try: + with pytest.raises(ValueError, match="Invalid local_path"): + BabelDownloader(url_base="https://example.com/", local_path=f.name) + finally: + os.unlink(f.name) + + +class TestGetOutputFile: + """Tests for get_output_file.""" + + def test_returns_correct_path(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + result = dl.get_output_file("output/duckdbs/test.duckdb") + assert result == os.path.join(str(tmp_path), "output/duckdbs/test.duckdb") + + def test_creates_parent_directories(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + result = dl.get_output_file("deep/nested/dir/file.txt") + assert os.path.isdir(os.path.dirname(result)) + + def test_lru_caching(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + result1 = dl.get_output_file("some/file.txt") + result2 = dl.get_output_file("some/file.txt") + assert result1 is result2 # identity check — same cached object + + +class TestCalculateMd5: + """Tests for _calculate_md5.""" + + def test_correct_hash(self, tmp_path): + content = b"Hello, world!" 
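+        # Reference digest computed directly with hashlib, independently of the downloader.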
+ expected = hashlib.md5(content).hexdigest() + file_path = tmp_path / "test.bin" + file_path.write_bytes(content) + + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + assert dl._calculate_md5(str(file_path)) == expected + + def test_different_chunk_sizes_same_result(self, tmp_path): + content = b"A" * 5000 + expected = hashlib.md5(content).hexdigest() + file_path = tmp_path / "chunks.bin" + file_path.write_bytes(content) + + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + assert dl._calculate_md5(str(file_path), chunk_size=100) == expected + assert dl._calculate_md5(str(file_path), chunk_size=4096) == expected + + +class TestFetchRemoteMd5: + """Tests for _fetch_remote_md5.""" + + def _make_dl(self, tmp_path): + return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + + def test_valid_md5_response(self, tmp_path): + dl = self._make_dl(tmp_path) + mock_resp = Mock() + mock_resp.status_code = 200 + mock_resp.text = "d41d8cd98f00b204e9800998ecf8427e filename.parquet\n" + mock_resp.raise_for_status = Mock() + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): + result = dl._fetch_remote_md5("https://example.com/file.md5") + assert result == "d41d8cd98f00b204e9800998ecf8427e" + + def test_hash_only_format(self, tmp_path): + dl = self._make_dl(tmp_path) + mock_resp = Mock() + mock_resp.status_code = 200 + mock_resp.text = "d41d8cd98f00b204e9800998ecf8427e\n" + mock_resp.raise_for_status = Mock() + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): + result = dl._fetch_remote_md5("https://example.com/file.md5") + assert result == "d41d8cd98f00b204e9800998ecf8427e" + + def test_404_returns_none(self, tmp_path): + dl = self._make_dl(tmp_path) + mock_resp = Mock() + mock_resp.status_code = 404 + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): + assert dl._fetch_remote_md5("https://example.com/missing.md5") is None + + def test_malformed_returns_none(self, tmp_path): + dl = self._make_dl(tmp_path) + mock_resp = Mock() + mock_resp.status_code = 200 + mock_resp.text = "not-a-valid-md5-hash\n" + mock_resp.raise_for_status = Mock() + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): + assert dl._fetch_remote_md5("https://example.com/bad.md5") is None + + def test_network_error_returns_none(self, tmp_path): + dl = self._make_dl(tmp_path) + with patch("babel_explorer.core.downloader.requests.get", side_effect=requests.ConnectionError("fail")): + assert dl._fetch_remote_md5("https://example.com/err.md5") is None + + +class TestMd5ValidationFlow: + """Tests for the MD5 validation logic inside get_downloaded_file.""" + + def test_matching_checksum_skips_download(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + test_file = "test.txt" + content = b"test content" + local_path = tmp_path / test_file + local_path.write_bytes(content) + expected_md5 = hashlib.md5(content).hexdigest() + + with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): + with patch.object(dl, '_download_with_retry') as mock_dl: + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + mock_dl.assert_not_called() + assert result == str(local_path) + + def test_mismatched_checksum_triggers_redownload(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + test_file = "mismatch.txt" + 
local_path = tmp_path / test_file + local_path.write_bytes(b"wrong content") + correct_content = b"correct content" + expected_md5 = hashlib.md5(correct_content).hexdigest() + + def fake_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(correct_content) - # Verify the file is at least 2GB - assert file_size_bytes >= MINIMUM_FILE_SIZE_BYTES, \ - f"Downloaded file is too small: {file_size_gb:.2f}GB (expected at least {MINIMUM_FILE_SIZE_GB}GB)" + with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): + with patch.object(dl, '_download_with_retry', side_effect=fake_download): + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + assert os.path.exists(result) + with open(result, 'rb') as f: + assert f.read() == correct_content - print(f"\n✓ Successfully downloaded {IDENTIFIERS_FILE}") - print(f" Size: {file_size_gb:.2f}GB ({file_size_bytes:,} bytes)") - print(f" Path: {downloaded_path}") - - -def test_download_caching(downloader): - """ - Test that the downloader uses LRU caching to avoid re-downloading files. + def test_no_md5_proceeds_normally(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + test_file = "no_md5.txt" + content = b"downloaded content" - This test: - 1. Downloads the same file twice - 2. Verifies both calls return the same path - 3. Confirms the file is only downloaded once (via caching) - - Args: - downloader: BabelDownloader fixture - """ - # First download - path1 = downloader.get_downloaded_file(IDENTIFIERS_FILE) - initial_mtime = os.path.getmtime(path1) + def fake_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(content) - # Second download - should use cache - path2 = downloader.get_downloaded_file(IDENTIFIERS_FILE) - second_mtime = os.path.getmtime(path2) + with patch.object(dl, '_fetch_remote_md5', return_value=None): + with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + mock_dl.assert_called_once() + assert os.path.exists(result) - # Verify same path returned - assert path1 == path2, "Cached download returned different path" + def test_post_download_validation_fail_raises(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + test_file = "post_fail.txt" + correct_md5 = hashlib.md5(b"expected").hexdigest() - # Verify file wasn't modified (i.e., wasn't re-downloaded) - assert initial_mtime == second_mtime, \ - "File was modified, suggesting it was re-downloaded instead of cached" + def fake_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(b"wrong data after download") - print(f"\n✓ Caching works correctly - file not re-downloaded") + with patch.object(dl, '_fetch_remote_md5', return_value=correct_md5): + with patch.object(dl, '_download_with_retry', side_effect=fake_download): + dl.get_downloaded_file.cache_clear() + with pytest.raises(RuntimeError, match="incorrect MD5 checksum"): + dl.get_downloaded_file(test_file) + def test_post_download_validation_pass(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + test_file = "post_pass.txt" + content = b"correct content" + expected_md5 = hashlib.md5(content).hexdigest() -def test_get_output_file(downloader): - """ - Test the get_output_file method for creating output file paths. 
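+        # Stand-in download that writes bytes whose MD5 matches expected_md5.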
+ def fake_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(content) + + with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): + with patch.object(dl, '_download_with_retry', side_effect=fake_download): + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + assert os.path.exists(result) - This test: - 1. Creates an output file path - 2. Verifies the directory structure is created - 3. Confirms the path is in the correct location - Args: - downloader: BabelDownloader fixture - """ - output_filename = "output/duckdbs/test.duckdb" - output_path = downloader.get_output_file(output_filename) +class TestDownloadWithRetry: + """Tests for _download_with_retry.""" - # Verify the path is correct - expected_path = os.path.join(TEST_DATA_DIR, output_filename) - assert output_path == expected_path, \ - f"Output path mismatch: expected {expected_path}, got {output_path}" - - # Verify the parent directory was created - assert os.path.exists(os.path.dirname(output_path)), \ - "Parent directory for output file was not created" + def test_retries_exhausted_raises_runtime_error(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=2) + with patch("babel_explorer.core.downloader.requests.get", side_effect=requests.ConnectionError("fail")): + with patch("babel_explorer.core.downloader.time.sleep"): # skip waiting + with pytest.raises(RuntimeError, match="Failed to download"): + dl._download_with_retry("https://example.com/file", str(tmp_path / "f"), 1024) - print(f"\n✓ Output file path created correctly: {output_path}") + def test_succeeds_on_second_attempt(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=3) + out_path = str(tmp_path / "retry_success.bin") + mock_response = Mock() + mock_response.status_code = 200 + mock_response.headers = {'Content-Length': '5'} + mock_response.iter_content = Mock(return_value=[b"hello"]) -def test_invalid_local_path(): - """ - Test that BabelDownloader raises an error for invalid local paths. - - This test verifies error handling when attempting to use a file path - as the local directory (should be a directory, not a file). - """ - # Create a temporary file - invalid_path = "/tmp/test_babel_invalid_file.txt" - with open(invalid_path, 'w') as f: - f.write("test") - - try: - # Attempt to create downloader with a file path instead of directory - with pytest.raises(ValueError, match="Invalid local_path"): - BabelDownloader(url_base=BABEL_URL, local_path=invalid_path) - - print("\n✓ Correctly raised ValueError for invalid local path") - finally: - # Clean up - if os.path.exists(invalid_path): - os.remove(invalid_path) - - -def test_md5_validation_matching_checksum(test_data_dir): - """ - Test that MD5 validation skips download when checksums match. - - This test: - 1. Creates a local file with known content - 2. Mocks the .md5 file to return the correct checksum - 3. 
Verifies the download is skipped (no actual HTTP download occurs) - """ - downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - - # Create a test file with known content - test_file = "test_file.txt" - local_path = os.path.join(test_data_dir, test_file) - os.makedirs(os.path.dirname(local_path), exist_ok=True) - - test_content = b"This is test content for MD5 validation" - with open(local_path, 'wb') as f: - f.write(test_content) - - # Calculate the expected MD5 - expected_md5 = hashlib.md5(test_content).hexdigest() - - # Mock the _fetch_remote_md5 to return the matching checksum - with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): - # Mock _download_with_retry to ensure it's NOT called - with patch.object(downloader, '_download_with_retry') as mock_download: - # Clear the cache before testing - downloader.get_downloaded_file.cache_clear() - - result_path = downloader.get_downloaded_file(test_file) - - # Verify the download was skipped - mock_download.assert_not_called() - assert result_path == local_path - assert os.path.exists(result_path) + side_effects = [requests.ConnectionError("first fail"), mock_response] - print(f"\n✓ MD5 validation correctly skipped download for matching checksum: {expected_md5}") + with patch("babel_explorer.core.downloader.requests.get", side_effect=side_effects): + with patch("babel_explorer.core.downloader.time.sleep"): + dl._download_with_retry("https://example.com/file", out_path, 1024) + assert os.path.exists(out_path) + def test_resume_sends_range_header(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + out_path = tmp_path / "partial.bin" + out_path.write_bytes(b"partial") # 7 bytes -def test_md5_validation_mismatched_checksum(test_data_dir): - """ - Test that MD5 validation deletes and re-downloads file when checksums don't match. - - This test: - 1. Creates a local file with wrong content - 2. Mocks the .md5 file to return a different checksum - 3. 
Verifies the file is deleted and re-downloaded - """ - downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - - # Create a test file with incorrect content - test_file = "test_file_mismatch.txt" - local_path = os.path.join(test_data_dir, test_file) - os.makedirs(os.path.dirname(local_path), exist_ok=True) - - wrong_content = b"This is WRONG content" - with open(local_path, 'wb') as f: - f.write(wrong_content) - - # Use a different MD5 (this is MD5 of "correct content") - correct_content = b"This is CORRECT content" - expected_md5 = hashlib.md5(correct_content).hexdigest() - - # Track whether file was deleted - original_exists = os.path.exists(local_path) - - # Mock the _fetch_remote_md5 to return the mismatched checksum - with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): - # Mock _download_with_retry to create the "correct" file - def mock_download(url, path, chunk_size): - with open(path, 'wb') as f: - f.write(correct_content) - - with patch.object(downloader, '_download_with_retry', side_effect=mock_download): - # Clear the cache before testing - downloader.get_downloaded_file.cache_clear() + mock_response = Mock() + mock_response.status_code = 206 + mock_response.headers = {'Content-Length': '3'} + mock_response.iter_content = Mock(return_value=[b"end"]) - result_path = downloader.get_downloaded_file(test_file) + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response) as mock_get: + dl._download_with_retry("https://example.com/file", str(out_path), 1024) + _, kwargs = mock_get.call_args + assert kwargs['headers'] == {'Range': 'bytes=7-'} - # Verify the file exists and has correct content - assert os.path.exists(result_path) - with open(result_path, 'rb') as f: - assert f.read() == correct_content + def test_http_416_file_already_complete(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + out_path = tmp_path / "complete.bin" + out_path.write_bytes(b"full file") - print(f"\n✓ MD5 validation correctly deleted and re-downloaded file with mismatched checksum") + mock_response = Mock() + mock_response.status_code = 416 + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): + dl._download_with_retry("https://example.com/file", str(out_path), 1024) + # Should return without error + assert out_path.read_bytes() == b"full file" -def test_md5_validation_no_md5_file(test_data_dir): - """ - Test that download proceeds normally when no .md5 file exists. + def test_server_no_resume_restarts_download(self, tmp_path): + """When server responds 200 (instead of 206), partial file is removed and download restarts.""" + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + out_path = tmp_path / "no_resume.bin" + out_path.write_bytes(b"partial") - This test: - 1. Mocks the .md5 file fetch to return None (404) - 2. 
Verifies the download proceeds normally
-    """
-    downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir)

+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.headers = {'Content-Length': '12'}
+        mock_response.iter_content = Mock(return_value=[b"full content"])

-    test_file = "test_file_no_md5.txt"
-    local_path = os.path.join(test_data_dir, test_file)

+        with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response):
+            dl._download_with_retry("https://example.com/file", str(out_path), 1024)
+        assert out_path.read_bytes() == b"full content"

-    test_content = b"Test content without MD5 file"

-    # Mock the _fetch_remote_md5 to return None (no .md5 file)
-    with patch.object(downloader, '_fetch_remote_md5', return_value=None):
-        # Mock _download_with_retry to create the file
-        def mock_download(url, path, chunk_size):
-            with open(path, 'wb') as f:
-                f.write(test_content)

+class TestStreamDownload:
+    """Tests for _stream_download."""

-        with patch.object(downloader, '_download_with_retry', side_effect=mock_download) as mock_download_method:
-            # Clear the cache before testing
-            downloader.get_downloaded_file.cache_clear()

+    def test_writes_chunks(self, tmp_path):
+        dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path))
+        out_path = str(tmp_path / "stream.bin")

-            result_path = downloader.get_downloaded_file(test_file)

+        mock_response = Mock()
+        mock_response.headers = {'Content-Length': '10'}
+        mock_response.iter_content = Mock(return_value=[b"hello", b"world"])

-            # Verify download was called (normal download path)
-            mock_download_method.assert_called_once()
-            assert os.path.exists(result_path)
-            with open(result_path, 'rb') as f:
-                assert f.read() == test_content

+        dl._stream_download(mock_response, out_path, resume_byte_pos=0, chunk_size=1024)
+        with open(out_path, 'rb') as f:
+            assert f.read() == b"helloworld"

-    print(f"\n✓ Download proceeded normally when no .md5 file exists")

+    def test_append_mode_on_resume(self, tmp_path):
+        dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path))
+        out_path = tmp_path / "append.bin"
+        out_path.write_bytes(b"start")

+        mock_response = Mock()
+        mock_response.headers = {'Content-Length': '3'}
+        mock_response.iter_content = Mock(return_value=[b"end"])

-def test_md5_validation_malformed_md5_file(test_data_dir):
-    """
-    Test that download proceeds normally when .md5 file is malformed.

+        dl._stream_download(mock_response, str(out_path), resume_byte_pos=5, chunk_size=1024)
+        assert out_path.read_bytes() == b"startend"
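
Taken together, TestDownloadWithRetry and TestStreamDownload fix the resume semantics: a partial local file produces a Range: bytes=N- header, HTTP 416 means the file is already complete, a plain 200 on a resume attempt means the server ignored the range and the partial copy must be discarded, and connection errors are retried after a backoff sleep. Here is a minimal sketch of a loop with those semantics, written against the requests API; the function shape, timeout, and backoff policy are assumptions, only the observable behaviour is taken from the tests.

    # Sketch of the behaviour the tests above describe; not the real
    # _download_with_retry implementation.
    import os
    import time

    import requests


    def download_with_retry_sketch(url: str, path: str, chunk_size: int, retries: int = 3) -> None:
        for attempt in range(retries):
            try:
                resume_pos = os.path.getsize(path) if os.path.exists(path) else 0
                headers = {'Range': f'bytes={resume_pos}-'} if resume_pos else {}
                response = requests.get(url, headers=headers, stream=True, timeout=60)

                if response.status_code == 416:
                    return              # range unsatisfiable: file already complete
                if response.status_code == 200 and resume_pos:
                    os.remove(path)     # server ignored Range: restart from scratch
                    resume_pos = 0
                mode = 'ab' if resume_pos else 'wb'  # 206 Partial Content appends
                with open(path, mode) as f:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                return
            except requests.RequestException:
                time.sleep(2 ** attempt)  # back off, then retry
        raise RuntimeError(f"Failed to download {url} after {retries} attempts")

The append mode on resume is exactly what test_append_mode_on_resume checks at the _stream_download level.

-    This test:
-    1. Mocks the .md5 file fetch to return None (malformed content)
-    2. 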
Verifies the download proceeds normally with a warning - """ - downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - test_file = "test_file_malformed_md5.txt" - local_path = os.path.join(test_data_dir, test_file) +class TestGetDownloadedFileCaching: + """Tests for get_downloaded_file LRU caching.""" - test_content = b"Test content with malformed MD5 file" + def test_cache_returns_same_result(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + content = b"cached content" - # Mock the _fetch_remote_md5 to return None (malformed .md5 file) - with patch.object(downloader, '_fetch_remote_md5', return_value=None): - # Mock _download_with_retry to create the file - def mock_download(url, path, chunk_size): + def fake_download(url, path, chunk_size): with open(path, 'wb') as f: - f.write(test_content) + f.write(content) - with patch.object(downloader, '_download_with_retry', side_effect=mock_download) as mock_download_method: - # Clear the cache before testing - downloader.get_downloaded_file.cache_clear() + with patch.object(dl, '_fetch_remote_md5', return_value=None): + with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + dl.get_downloaded_file.cache_clear() + r1 = dl.get_downloaded_file("cached.txt") + r2 = dl.get_downloaded_file("cached.txt") + assert r1 == r2 + mock_dl.assert_called_once() # only one actual download - result_path = downloader.get_downloaded_file(test_file) - # Verify download was called (normal download path) - mock_download_method.assert_called_once() - assert os.path.exists(result_path) +class TestGetDownloadedDir: + """Tests for get_downloaded_dir.""" - print(f"\n✓ Download proceeded normally when .md5 file is malformed") + def test_raises_not_implemented(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + dl.get_downloaded_dir.cache_clear() + with pytest.raises(NotImplementedError): + dl.get_downloaded_dir("some/dir") -def test_md5_post_download_validation(test_data_dir): - """ - Test that MD5 validation occurs after download and fails if checksum is wrong. +# ========================================================================== +# Integration Tests — require network access +# ========================================================================== - This test: - 1. Downloads a new file - 2. Mocks the .md5 file to return a checksum - 3. Mocks the download to create a file with WRONG content - 4. 
Verifies a RuntimeError is raised for checksum mismatch - """ - downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - test_file = "test_file_post_validation.txt" - local_path = os.path.join(test_data_dir, test_file) +@pytest.mark.integration +def test_download_concord_parquet(downloaded_concord): + """Verify Concord.parquet downloads and is > 100 MB.""" + assert os.path.isfile(downloaded_concord) + size = os.path.getsize(downloaded_concord) + assert size > 100 * 1024 * 1024, f"Concord.parquet too small: {size} bytes" - # Expected content and MD5 - correct_content = b"Expected content" - expected_md5 = hashlib.md5(correct_content).hexdigest() - # Wrong content that will be downloaded - wrong_content = b"Wrong content downloaded" +@pytest.mark.integration +def test_download_metadata_parquet(downloaded_metadata): + """Verify Metadata.parquet downloads and is non-empty.""" + assert os.path.isfile(downloaded_metadata) + assert os.path.getsize(downloaded_metadata) > 0 - # Mock the _fetch_remote_md5 to return the expected checksum - with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): - # Mock _download_with_retry to create a file with WRONG content - def mock_download(url, path, chunk_size): - with open(path, 'wb') as f: - f.write(wrong_content) - with patch.object(downloader, '_download_with_retry', side_effect=mock_download): - # Clear the cache before testing - downloader.get_downloaded_file.cache_clear() +@pytest.mark.integration +def test_download_caching_real_files(shared_downloader, downloaded_concord): + """Second call returns same path and file is not re-downloaded.""" + path2 = shared_downloader.get_downloaded_file(CONCORD_FILE) + assert path2 == downloaded_concord + assert os.path.getmtime(downloaded_concord) == os.path.getmtime(path2) - # Should raise RuntimeError due to post-download MD5 mismatch - with pytest.raises(RuntimeError, match="incorrect MD5 checksum"): - downloader.get_downloaded_file(test_file) - print(f"\n✓ Post-download MD5 validation correctly detected checksum mismatch") +@pytest.mark.integration +@pytest.mark.slow +def test_download_identifiers_parquet(downloaded_identifiers): + """Verify Identifiers.parquet downloads and is > 2 GB.""" + assert os.path.isfile(downloaded_identifiers) + size = os.path.getsize(downloaded_identifiers) + assert size > 2 * 1024 * 1024 * 1024, f"Identifiers.parquet too small: {size} bytes" diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py new file mode 100644 index 0000000..2322eef --- /dev/null +++ b/tests/test_nodenorm.py @@ -0,0 +1,296 @@ +""" +Tests for NodeNorm and Identifier classes. + +Unit tests use mocks; integration tests call the real NodeNorm API. 
+""" + +import pytest +from unittest.mock import Mock, patch + +import requests + +from babel_explorer.core.nodenorm import NodeNorm, Identifier + +from tests.constants import load_curies + +VALID_CURIES = load_curies() + + +# ========================================================================== +# Unit Tests — Identifier +# ========================================================================== + + +class TestIdentifier: + def test_creation_with_defaults(self): + ident = Identifier(curie="MONDO:0004979") + assert ident.curie == "MONDO:0004979" + assert ident.label == "" + assert ident.biolink_type == "" + assert ident.taxa == [] + assert ident.description == [] + + def test_full_creation(self): + ident = Identifier( + curie="MONDO:0004979", + label="asthma", + biolink_type="biolink:Disease", + taxa=["NCBITaxon:9606"], + description=["A chronic respiratory disease"], + ) + assert ident.label == "asthma" + assert ident.biolink_type == "biolink:Disease" + assert ident.taxa == ["NCBITaxon:9606"] + + def test_from_dict_minimal(self): + d = {"identifier": "X:1"} + ident = Identifier.from_dict(d) + assert ident.curie == "X:1" + assert ident.label == "" + + def test_from_dict_full(self): + d = { + "identifier": "X:1", + "label": "Alpha", + "type": ["biolink:NamedThing"], + "taxa": ["NCBITaxon:9606"], + "description": ["Some thing"], + } + ident = Identifier.from_dict(d) + assert ident.curie == "X:1" + assert ident.label == "Alpha" + assert ident.biolink_type == ["biolink:NamedThing"] + assert ident.taxa == ["NCBITaxon:9606"] + + def test_from_dict_partial(self): + d = {"identifier": "X:1", "label": "Beta"} + ident = Identifier.from_dict(d) + assert ident.curie == "X:1" + assert ident.label == "Beta" + assert ident.biolink_type == "" + + def test_lt_ordering(self): + a = Identifier(curie="A:1") + b = Identifier(curie="B:2") + assert a < b + + def test_sorting(self): + items = [Identifier(curie="C:3"), Identifier(curie="A:1"), Identifier(curie="B:2")] + result = sorted(items) + assert [x.curie for x in result] == ["A:1", "B:2", "C:3"] + + +# ========================================================================== +# Unit Tests — NodeNorm (mocked) +# ========================================================================== + + +class TestNodeNormInit: + def test_default_url(self): + nn = NodeNorm() + assert nn.nodenorm_url == "" + + def test_custom_url(self): + nn = NodeNorm(nodenorm_url="https://custom.api/") + assert nn.nodenorm_url == "https://custom.api/" + + +class TestNormalizeCurieMocked: + def _make_nn(self): + nn = NodeNorm(nodenorm_url="https://example.com/") + nn.normalize_curie.cache_clear() + return nn + + def test_correct_api_endpoint_and_params(self): + nn = self._make_nn() + mock_resp = Mock() + mock_resp.status_code = 200 + mock_resp.json.return_value = {"X:1": {"id": {"identifier": "X:1"}}} + mock_resp.raise_for_status = Mock() + + with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp) as mock_get: + nn.normalize_curie("X:1") + mock_get.assert_called_once() + args, kwargs = mock_get.call_args + assert args[0] == "https://example.com/get_normalized_nodes" + assert kwargs["params"]["curie"] == "X:1" + + def test_returns_result_for_curie(self): + nn = self._make_nn() + expected = {"id": {"identifier": "X:1"}, "equivalent_identifiers": []} + mock_resp = Mock() + mock_resp.json.return_value = {"X:1": expected} + mock_resp.raise_for_status = Mock() + + with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp): + result = 
nn.normalize_curie("X:1")
+            assert result == expected
+
+    def test_lru_caching(self):
+        nn = self._make_nn()
+        mock_resp = Mock()
+        mock_resp.json.return_value = {"X:1": {"id": "X:1"}}
+        mock_resp.raise_for_status = Mock()
+
+        with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp) as mock_get:
+            nn.normalize_curie("X:1")
+            nn.normalize_curie("X:1")
+            mock_get.assert_called_once()
+
+    def test_http_error_raises(self):
+        nn = self._make_nn()
+        mock_resp = Mock()
+        mock_resp.raise_for_status.side_effect = requests.HTTPError("500 Server Error")
+
+        with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp):
+            with pytest.raises(requests.HTTPError):
+                nn.normalize_curie("BAD:1")
+
+
+class TestGetIdentifierMocked:
+    def _make_nn(self):
+        nn = NodeNorm(nodenorm_url="https://example.com/")
+        nn.normalize_curie.cache_clear()
+        nn.get_identifier.cache_clear()
+        return nn
+
+    def test_exact_match_found(self):
+        nn = self._make_nn()
+        api_result = {
+            "equivalent_identifiers": [
+                {"identifier": "X:1", "label": "Alpha", "type": ["biolink:Disease"]},
+                {"identifier": "X:2", "label": "Beta"},
+            ],
+        }
+        with patch.object(nn, 'normalize_curie', return_value=api_result):
+            ident = nn.get_identifier("X:1")
+            assert ident.curie == "X:1"
+            assert ident.label == "Alpha"
+
+    def test_no_match_returns_bare_identifier(self):
+        nn = self._make_nn()
+        api_result = {
+            "equivalent_identifiers": [
+                {"identifier": "X:2", "label": "Beta"},
+            ],
+        }
+        with patch.object(nn, 'normalize_curie', return_value=api_result):
+            ident = nn.get_identifier("X:1")
+            assert ident.curie == "X:1"
+            assert ident.label == ""
+
+    def test_falsy_result_returns_bare_identifier(self):
+        nn = self._make_nn()
+        with patch.object(nn, 'normalize_curie', return_value=None):
+            ident = nn.get_identifier("X:1")
+            assert ident.curie == "X:1"
+            assert ident.label == ""
+
+    def test_caching(self):
+        nn = self._make_nn()
+        api_result = {
+            "equivalent_identifiers": [
+                {"identifier": "X:1", "label": "Alpha"},
+            ],
+        }
+        with patch.object(nn, 'normalize_curie', return_value=api_result) as mock_norm:
+            nn.get_identifier("X:1")
+            nn.get_identifier("X:1")
+            mock_norm.assert_called_once()
+
+
+class TestGetCliqueIdentifiersMocked:
+    def _make_nn(self):
+        nn = NodeNorm(nodenorm_url="https://example.com/")
+        nn.normalize_curie.cache_clear()
+        nn.get_clique_identifiers.cache_clear()
+        return nn
+
+    def test_success_returns_list(self):
+        nn = self._make_nn()
+        api_result = {
+            "equivalent_identifiers": [
+                {"identifier": "X:1", "label": "Alpha"},
+                {"identifier": "X:2", "label": "Beta"},
+            ],
+        }
+        with patch.object(nn, 'normalize_curie', return_value=api_result):
+            result = nn.get_clique_identifiers("X:1")
+            assert len(result) == 2
+            assert all(isinstance(x, Identifier) for x in result)
+
+    def test_missing_key_returns_none(self):
+        nn = self._make_nn()
+        api_result = {"id": {"identifier": "X:1"}}  # no equivalent_identifiers
+        with patch.object(nn, 'normalize_curie', return_value=api_result):
+            result = nn.get_clique_identifiers("X:1")
+            assert result is None
+
+    def test_caching(self):
+        nn = self._make_nn()
+        api_result = {
+            "equivalent_identifiers": [{"identifier": "X:1"}],
+        }
+        with patch.object(nn, 'normalize_curie', return_value=api_result) as mock_norm:
+            nn.get_clique_identifiers("X:1")
+            nn.get_clique_identifiers("X:1")
+            mock_norm.assert_called_once()
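
Before the live-API cases, here is the client contract that the mocked tests above collectively specify, as a compact sketch. The get_normalized_nodes endpoint, the curie parameter, and the equivalent_identifiers key are asserted by the tests; the class name and method bodies are assumptions rather than the real babel_explorer.core.nodenorm code.

    # Sketch of a client that would satisfy the mocked tests above.
    from functools import lru_cache

    import requests

    from babel_explorer.core.nodenorm import Identifier


    class NodeNormSketch:
        def __init__(self, nodenorm_url: str):
            self.nodenorm_url = nodenorm_url

        @lru_cache(maxsize=None)
        def normalize_curie(self, curie: str):
            # The API returns a JSON object keyed by the query CURIE; the
            # value is None for CURIEs that NodeNorm does not recognise.
            response = requests.get(
                self.nodenorm_url + "get_normalized_nodes",
                params={"curie": curie},
            )
            response.raise_for_status()
            return response.json().get(curie)

        @lru_cache(maxsize=None)
        def get_clique_identifiers(self, curie: str):
            result = self.normalize_curie(curie)
            if not result or "equivalent_identifiers" not in result:
                return None
            return [Identifier.from_dict(d) for d in result["equivalent_identifiers"]]

        @lru_cache(maxsize=None)
        def get_identifier(self, curie: str):
            # Prefer the exact entry for this CURIE within its clique;
            # otherwise fall back to a bare Identifier carrying only the CURIE.
            for ident in self.get_clique_identifiers(curie) or []:
                if ident.curie == curie:
                    return ident
            return Identifier(curie=curie)

The lru_cache layer is what lets the tests assert mock_get.assert_called_once() after two identical calls, and it is why every _make_nn helper clears the caches first.

+
+
+# ==========================================================================
+# Integration Tests — require real NodeNorm API
+# 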
========================================================================== + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_normalize_curie_real_api(nodenorm, curie): + """normalize_curie returns a dict with expected keys.""" + nodenorm.normalize_curie.cache_clear() + result = nodenorm.normalize_curie(curie) + assert isinstance(result, dict) + assert "id" in result + assert "equivalent_identifiers" in result + assert "type" in result + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_identifier_real_api(nodenorm, curie): + """get_identifier returns an Identifier with non-empty label and biolink_type.""" + nodenorm.normalize_curie.cache_clear() + nodenorm.get_identifier.cache_clear() + ident = nodenorm.get_identifier(curie) + assert isinstance(ident, Identifier) + assert ident.curie == curie + assert ident.label != "" + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_clique_identifiers_real_api(nodenorm, curie): + """get_clique_identifiers returns a non-empty list of Identifiers.""" + nodenorm.normalize_curie.cache_clear() + nodenorm.get_clique_identifiers.cache_clear() + result = nodenorm.get_clique_identifiers(curie) + assert result is not None + assert len(result) > 0 + assert all(isinstance(x, Identifier) for x in result) + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_clique_identifiers_has_known_ids(nodenorm, curie): + """At least one equivalent identifier is returned.""" + nodenorm.normalize_curie.cache_clear() + nodenorm.get_clique_identifiers.cache_clear() + result = nodenorm.get_clique_identifiers(curie) + assert len(result) >= 1 + + +@pytest.mark.integration +def test_normalize_curie_nonexistent(nodenorm): + """A made-up CURIE returns None.""" + nodenorm.normalize_curie.cache_clear() + result = nodenorm.normalize_curie("FAKENS:9999999999") + assert result is None From ff0dacc744102f01e541f7b45d83f6ff7834fb69 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 17:35:29 -0500 Subject: [PATCH 19/66] Added uv.lock (not sure why it wasn't added previously). --- uv.lock | 295 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 uv.lock diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..56af50a --- /dev/null +++ b/uv.lock @@ -0,0 +1,295 @@ +version = 1 +revision = 3 +requires-python = ">=3.11" + +[[package]] +name = "babel-explorer" +version = "0.1.0" +source = { editable = "." 
} +dependencies = [ + { name = "click" }, + { name = "duckdb" }, + { name = "requests" }, + { name = "tqdm" }, +] + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, + { name = "ruff" }, +] + +[package.metadata] +requires-dist = [ + { name = "click", specifier = ">=8.3.1" }, + { name = "duckdb", specifier = ">=1.4.2" }, + { name = "requests", specifier = ">=2.32.5" }, + { name = "tqdm", specifier = ">=4.67.0" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "pytest", specifier = ">=8.3.5" }, + { name = "ruff", specifier = ">=0.11.0" }, +] + +[[package]] +name = "certifi" +version = "2026.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/2d/a891ca51311197f6ad14a7ef42e2399f36cf2f9bd44752b3dc4eab60fdc5/certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120", size = 154268, upload-time = "2026-01-04T02:42:41.825Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c", size = 152900, upload-time = "2026-01-04T02:42:40.15Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/13/69/33ddede1939fdd074bce5434295f38fae7136463422fe4fd3e0e89b98062/charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a", size = 129418, upload-time = "2025-10-14T04:42:32.879Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/27/c6491ff4954e58a10f69ad90aca8a1b6fe9c5d3c6f380907af3c37435b59/charset_normalizer-3.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6e1fcf0720908f200cd21aa4e6750a48ff6ce4afe7ff5a79a90d5ed8a08296f8", size = 206988, upload-time = "2025-10-14T04:40:33.79Z" }, + { url = "https://files.pythonhosted.org/packages/94/59/2e87300fe67ab820b5428580a53cad894272dbb97f38a7a814a2a1ac1011/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f819d5fe9234f9f82d75bdfa9aef3a3d72c4d24a6e57aeaebba32a704553aa0", size = 147324, upload-time = "2025-10-14T04:40:34.961Z" }, + { url = "https://files.pythonhosted.org/packages/07/fb/0cf61dc84b2b088391830f6274cb57c82e4da8bbc2efeac8c025edb88772/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:a59cb51917aa591b1c4e6a43c132f0cdc3c76dbad6155df4e28ee626cc77a0a3", size = 142742, upload-time = "2025-10-14T04:40:36.105Z" }, + { url = "https://files.pythonhosted.org/packages/62/8b/171935adf2312cd745d290ed93cf16cf0dfe320863ab7cbeeae1dcd6535f/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8ef3c867360f88ac904fd3f5e1f902f13307af9052646963ee08ff4f131adafc", size = 160863, upload-time = "2025-10-14T04:40:37.188Z" }, + { url = "https://files.pythonhosted.org/packages/09/73/ad875b192bda14f2173bfc1bc9a55e009808484a4b256748d931b6948442/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d9e45d7faa48ee908174d8fe84854479ef838fc6a705c9315372eacbc2f02897", size = 157837, upload-time = "2025-10-14T04:40:38.435Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/fc/de9cce525b2c5b94b47c70a4b4fb19f871b24995c728e957ee68ab1671ea/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:840c25fb618a231545cbab0564a799f101b63b9901f2569faecd6b222ac72381", size = 151550, upload-time = "2025-10-14T04:40:40.053Z" }, + { url = "https://files.pythonhosted.org/packages/55/c2/43edd615fdfba8c6f2dfbd459b25a6b3b551f24ea21981e23fb768503ce1/charset_normalizer-3.4.4-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ca5862d5b3928c4940729dacc329aa9102900382fea192fc5e52eb69d6093815", size = 149162, upload-time = "2025-10-14T04:40:41.163Z" }, + { url = "https://files.pythonhosted.org/packages/03/86/bde4ad8b4d0e9429a4e82c1e8f5c659993a9a863ad62c7df05cf7b678d75/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9c7f57c3d666a53421049053eaacdd14bbd0a528e2186fcb2e672effd053bb0", size = 150019, upload-time = "2025-10-14T04:40:42.276Z" }, + { url = "https://files.pythonhosted.org/packages/1f/86/a151eb2af293a7e7bac3a739b81072585ce36ccfb4493039f49f1d3cae8c/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:277e970e750505ed74c832b4bf75dac7476262ee2a013f5574dd49075879e161", size = 143310, upload-time = "2025-10-14T04:40:43.439Z" }, + { url = "https://files.pythonhosted.org/packages/b5/fe/43dae6144a7e07b87478fdfc4dbe9efd5defb0e7ec29f5f58a55aeef7bf7/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:31fd66405eaf47bb62e8cd575dc621c56c668f27d46a61d975a249930dd5e2a4", size = 162022, upload-time = "2025-10-14T04:40:44.547Z" }, + { url = "https://files.pythonhosted.org/packages/80/e6/7aab83774f5d2bca81f42ac58d04caf44f0cc2b65fc6db2b3b2e8a05f3b3/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:0d3d8f15c07f86e9ff82319b3d9ef6f4bf907608f53fe9d92b28ea9ae3d1fd89", size = 149383, upload-time = "2025-10-14T04:40:46.018Z" }, + { url = "https://files.pythonhosted.org/packages/4f/e8/b289173b4edae05c0dde07f69f8db476a0b511eac556dfe0d6bda3c43384/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:9f7fcd74d410a36883701fafa2482a6af2ff5ba96b9a620e9e0721e28ead5569", size = 159098, upload-time = "2025-10-14T04:40:47.081Z" }, + { url = "https://files.pythonhosted.org/packages/d8/df/fe699727754cae3f8478493c7f45f777b17c3ef0600e28abfec8619eb49c/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ebf3e58c7ec8a8bed6d66a75d7fb37b55e5015b03ceae72a8e7c74495551e224", size = 152991, upload-time = "2025-10-14T04:40:48.246Z" }, + { url = "https://files.pythonhosted.org/packages/1a/86/584869fe4ddb6ffa3bd9f491b87a01568797fb9bd8933f557dba9771beaf/charset_normalizer-3.4.4-cp311-cp311-win32.whl", hash = "sha256:eecbc200c7fd5ddb9a7f16c7decb07b566c29fa2161a16cf67b8d068bd21690a", size = 99456, upload-time = "2025-10-14T04:40:49.376Z" }, + { url = "https://files.pythonhosted.org/packages/65/f6/62fdd5feb60530f50f7e38b4f6a1d5203f4d16ff4f9f0952962c044e919a/charset_normalizer-3.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:5ae497466c7901d54b639cf42d5b8c1b6a4fead55215500d2f486d34db48d016", size = 106978, upload-time = "2025-10-14T04:40:50.844Z" }, + { url = "https://files.pythonhosted.org/packages/7a/9d/0710916e6c82948b3be62d9d398cb4fcf4e97b56d6a6aeccd66c4b2f2bd5/charset_normalizer-3.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:65e2befcd84bc6f37095f5961e68a6f077bf44946771354a28ad434c2cce0ae1", size = 99969, upload-time = 
"2025-10-14T04:40:52.272Z" }, + { url = "https://files.pythonhosted.org/packages/f3/85/1637cd4af66fa687396e757dec650f28025f2a2f5a5531a3208dc0ec43f2/charset_normalizer-3.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0a98e6759f854bd25a58a73fa88833fba3b7c491169f86ce1180c948ab3fd394", size = 208425, upload-time = "2025-10-14T04:40:53.353Z" }, + { url = "https://files.pythonhosted.org/packages/9d/6a/04130023fef2a0d9c62d0bae2649b69f7b7d8d24ea5536feef50551029df/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b5b290ccc2a263e8d185130284f8501e3e36c5e02750fc6b6bdeb2e9e96f1e25", size = 148162, upload-time = "2025-10-14T04:40:54.558Z" }, + { url = "https://files.pythonhosted.org/packages/78/29/62328d79aa60da22c9e0b9a66539feae06ca0f5a4171ac4f7dc285b83688/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74bb723680f9f7a6234dcf67aea57e708ec1fbdf5699fb91dfd6f511b0a320ef", size = 144558, upload-time = "2025-10-14T04:40:55.677Z" }, + { url = "https://files.pythonhosted.org/packages/86/bb/b32194a4bf15b88403537c2e120b817c61cd4ecffa9b6876e941c3ee38fe/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f1e34719c6ed0b92f418c7c780480b26b5d9c50349e9a9af7d76bf757530350d", size = 161497, upload-time = "2025-10-14T04:40:57.217Z" }, + { url = "https://files.pythonhosted.org/packages/19/89/a54c82b253d5b9b111dc74aca196ba5ccfcca8242d0fb64146d4d3183ff1/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2437418e20515acec67d86e12bf70056a33abdacb5cb1655042f6538d6b085a8", size = 159240, upload-time = "2025-10-14T04:40:58.358Z" }, + { url = "https://files.pythonhosted.org/packages/c0/10/d20b513afe03acc89ec33948320a5544d31f21b05368436d580dec4e234d/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11d694519d7f29d6cd09f6ac70028dba10f92f6cdd059096db198c283794ac86", size = 153471, upload-time = "2025-10-14T04:40:59.468Z" }, + { url = "https://files.pythonhosted.org/packages/61/fa/fbf177b55bdd727010f9c0a3c49eefa1d10f960e5f09d1d887bf93c2e698/charset_normalizer-3.4.4-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ac1c4a689edcc530fc9d9aa11f5774b9e2f33f9a0c6a57864e90908f5208d30a", size = 150864, upload-time = "2025-10-14T04:41:00.623Z" }, + { url = "https://files.pythonhosted.org/packages/05/12/9fbc6a4d39c0198adeebbde20b619790e9236557ca59fc40e0e3cebe6f40/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:21d142cc6c0ec30d2efee5068ca36c128a30b0f2c53c1c07bd78cb6bc1d3be5f", size = 150647, upload-time = "2025-10-14T04:41:01.754Z" }, + { url = "https://files.pythonhosted.org/packages/ad/1f/6a9a593d52e3e8c5d2b167daf8c6b968808efb57ef4c210acb907c365bc4/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:5dbe56a36425d26d6cfb40ce79c314a2e4dd6211d51d6d2191c00bed34f354cc", size = 145110, upload-time = "2025-10-14T04:41:03.231Z" }, + { url = "https://files.pythonhosted.org/packages/30/42/9a52c609e72471b0fc54386dc63c3781a387bb4fe61c20231a4ebcd58bdd/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5bfbb1b9acf3334612667b61bd3002196fe2a1eb4dd74d247e0f2a4d50ec9bbf", size = 162839, upload-time = "2025-10-14T04:41:04.715Z" }, + { url = 
"https://files.pythonhosted.org/packages/c4/5b/c0682bbf9f11597073052628ddd38344a3d673fda35a36773f7d19344b23/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:d055ec1e26e441f6187acf818b73564e6e6282709e9bcb5b63f5b23068356a15", size = 150667, upload-time = "2025-10-14T04:41:05.827Z" }, + { url = "https://files.pythonhosted.org/packages/e4/24/a41afeab6f990cf2daf6cb8c67419b63b48cf518e4f56022230840c9bfb2/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:af2d8c67d8e573d6de5bc30cdb27e9b95e49115cd9baad5ddbd1a6207aaa82a9", size = 160535, upload-time = "2025-10-14T04:41:06.938Z" }, + { url = "https://files.pythonhosted.org/packages/2a/e5/6a4ce77ed243c4a50a1fecca6aaaab419628c818a49434be428fe24c9957/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:780236ac706e66881f3b7f2f32dfe90507a09e67d1d454c762cf642e6e1586e0", size = 154816, upload-time = "2025-10-14T04:41:08.101Z" }, + { url = "https://files.pythonhosted.org/packages/a8/ef/89297262b8092b312d29cdb2517cb1237e51db8ecef2e9af5edbe7b683b1/charset_normalizer-3.4.4-cp312-cp312-win32.whl", hash = "sha256:5833d2c39d8896e4e19b689ffc198f08ea58116bee26dea51e362ecc7cd3ed26", size = 99694, upload-time = "2025-10-14T04:41:09.23Z" }, + { url = "https://files.pythonhosted.org/packages/3d/2d/1e5ed9dd3b3803994c155cd9aacb60c82c331bad84daf75bcb9c91b3295e/charset_normalizer-3.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:a79cfe37875f822425b89a82333404539ae63dbdddf97f84dcbc3d339aae9525", size = 107131, upload-time = "2025-10-14T04:41:10.467Z" }, + { url = "https://files.pythonhosted.org/packages/d0/d9/0ed4c7098a861482a7b6a95603edce4c0d9db2311af23da1fb2b75ec26fc/charset_normalizer-3.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:376bec83a63b8021bb5c8ea75e21c4ccb86e7e45ca4eb81146091b56599b80c3", size = 100390, upload-time = "2025-10-14T04:41:11.915Z" }, + { url = "https://files.pythonhosted.org/packages/97/45/4b3a1239bbacd321068ea6e7ac28875b03ab8bc0aa0966452db17cd36714/charset_normalizer-3.4.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e1f185f86a6f3403aa2420e815904c67b2f9ebc443f045edd0de921108345794", size = 208091, upload-time = "2025-10-14T04:41:13.346Z" }, + { url = "https://files.pythonhosted.org/packages/7d/62/73a6d7450829655a35bb88a88fca7d736f9882a27eacdca2c6d505b57e2e/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b39f987ae8ccdf0d2642338faf2abb1862340facc796048b604ef14919e55ed", size = 147936, upload-time = "2025-10-14T04:41:14.461Z" }, + { url = "https://files.pythonhosted.org/packages/89/c5/adb8c8b3d6625bef6d88b251bbb0d95f8205831b987631ab0c8bb5d937c2/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3162d5d8ce1bb98dd51af660f2121c55d0fa541b46dff7bb9b9f86ea1d87de72", size = 144180, upload-time = "2025-10-14T04:41:15.588Z" }, + { url = "https://files.pythonhosted.org/packages/91/ed/9706e4070682d1cc219050b6048bfd293ccf67b3d4f5a4f39207453d4b99/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:81d5eb2a312700f4ecaa977a8235b634ce853200e828fbadf3a9c50bab278328", size = 161346, upload-time = "2025-10-14T04:41:16.738Z" }, + { url = "https://files.pythonhosted.org/packages/d5/0d/031f0d95e4972901a2f6f09ef055751805ff541511dc1252ba3ca1f80cf5/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:5bd2293095d766545ec1a8f612559f6b40abc0eb18bb2f5d1171872d34036ede", size = 158874, upload-time = "2025-10-14T04:41:17.923Z" }, + { url = "https://files.pythonhosted.org/packages/f5/83/6ab5883f57c9c801ce5e5677242328aa45592be8a00644310a008d04f922/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8a8b89589086a25749f471e6a900d3f662d1d3b6e2e59dcecf787b1cc3a1894", size = 153076, upload-time = "2025-10-14T04:41:19.106Z" }, + { url = "https://files.pythonhosted.org/packages/75/1e/5ff781ddf5260e387d6419959ee89ef13878229732732ee73cdae01800f2/charset_normalizer-3.4.4-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc7637e2f80d8530ee4a78e878bce464f70087ce73cf7c1caf142416923b98f1", size = 150601, upload-time = "2025-10-14T04:41:20.245Z" }, + { url = "https://files.pythonhosted.org/packages/d7/57/71be810965493d3510a6ca79b90c19e48696fb1ff964da319334b12677f0/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f8bf04158c6b607d747e93949aa60618b61312fe647a6369f88ce2ff16043490", size = 150376, upload-time = "2025-10-14T04:41:21.398Z" }, + { url = "https://files.pythonhosted.org/packages/e5/d5/c3d057a78c181d007014feb7e9f2e65905a6c4ef182c0ddf0de2924edd65/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:554af85e960429cf30784dd47447d5125aaa3b99a6f0683589dbd27e2f45da44", size = 144825, upload-time = "2025-10-14T04:41:22.583Z" }, + { url = "https://files.pythonhosted.org/packages/e6/8c/d0406294828d4976f275ffbe66f00266c4b3136b7506941d87c00cab5272/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:74018750915ee7ad843a774364e13a3db91682f26142baddf775342c3f5b1133", size = 162583, upload-time = "2025-10-14T04:41:23.754Z" }, + { url = "https://files.pythonhosted.org/packages/d7/24/e2aa1f18c8f15c4c0e932d9287b8609dd30ad56dbe41d926bd846e22fb8d/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:c0463276121fdee9c49b98908b3a89c39be45d86d1dbaa22957e38f6321d4ce3", size = 150366, upload-time = "2025-10-14T04:41:25.27Z" }, + { url = "https://files.pythonhosted.org/packages/e4/5b/1e6160c7739aad1e2df054300cc618b06bf784a7a164b0f238360721ab86/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:362d61fd13843997c1c446760ef36f240cf81d3ebf74ac62652aebaf7838561e", size = 160300, upload-time = "2025-10-14T04:41:26.725Z" }, + { url = "https://files.pythonhosted.org/packages/7a/10/f882167cd207fbdd743e55534d5d9620e095089d176d55cb22d5322f2afd/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9a26f18905b8dd5d685d6d07b0cdf98a79f3c7a918906af7cc143ea2e164c8bc", size = 154465, upload-time = "2025-10-14T04:41:28.322Z" }, + { url = "https://files.pythonhosted.org/packages/89/66/c7a9e1b7429be72123441bfdbaf2bc13faab3f90b933f664db506dea5915/charset_normalizer-3.4.4-cp313-cp313-win32.whl", hash = "sha256:9b35f4c90079ff2e2edc5b26c0c77925e5d2d255c42c74fdb70fb49b172726ac", size = 99404, upload-time = "2025-10-14T04:41:29.95Z" }, + { url = "https://files.pythonhosted.org/packages/c4/26/b9924fa27db384bdcd97ab83b4f0a8058d96ad9626ead570674d5e737d90/charset_normalizer-3.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:b435cba5f4f750aa6c0a0d92c541fb79f69a387c91e61f1795227e4ed9cece14", size = 107092, upload-time = "2025-10-14T04:41:31.188Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/8f/3ed4bfa0c0c72a7ca17f0380cd9e4dd842b09f664e780c13cff1dcf2ef1b/charset_normalizer-3.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:542d2cee80be6f80247095cc36c418f7bddd14f4a6de45af91dfad36d817bba2", size = 100408, upload-time = "2025-10-14T04:41:32.624Z" }, + { url = "https://files.pythonhosted.org/packages/2a/35/7051599bd493e62411d6ede36fd5af83a38f37c4767b92884df7301db25d/charset_normalizer-3.4.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:da3326d9e65ef63a817ecbcc0df6e94463713b754fe293eaa03da99befb9a5bd", size = 207746, upload-time = "2025-10-14T04:41:33.773Z" }, + { url = "https://files.pythonhosted.org/packages/10/9a/97c8d48ef10d6cd4fcead2415523221624bf58bcf68a802721a6bc807c8f/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8af65f14dc14a79b924524b1e7fffe304517b2bff5a58bf64f30b98bbc5079eb", size = 147889, upload-time = "2025-10-14T04:41:34.897Z" }, + { url = "https://files.pythonhosted.org/packages/10/bf/979224a919a1b606c82bd2c5fa49b5c6d5727aa47b4312bb27b1734f53cd/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74664978bb272435107de04e36db5a9735e78232b85b77d45cfb38f758efd33e", size = 143641, upload-time = "2025-10-14T04:41:36.116Z" }, + { url = "https://files.pythonhosted.org/packages/ba/33/0ad65587441fc730dc7bd90e9716b30b4702dc7b617e6ba4997dc8651495/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:752944c7ffbfdd10c074dc58ec2d5a8a4cd9493b314d367c14d24c17684ddd14", size = 160779, upload-time = "2025-10-14T04:41:37.229Z" }, + { url = "https://files.pythonhosted.org/packages/67/ed/331d6b249259ee71ddea93f6f2f0a56cfebd46938bde6fcc6f7b9a3d0e09/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d1f13550535ad8cff21b8d757a3257963e951d96e20ec82ab44bc64aeb62a191", size = 159035, upload-time = "2025-10-14T04:41:38.368Z" }, + { url = "https://files.pythonhosted.org/packages/67/ff/f6b948ca32e4f2a4576aa129d8bed61f2e0543bf9f5f2b7fc3758ed005c9/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ecaae4149d99b1c9e7b88bb03e3221956f68fd6d50be2ef061b2381b61d20838", size = 152542, upload-time = "2025-10-14T04:41:39.862Z" }, + { url = "https://files.pythonhosted.org/packages/16/85/276033dcbcc369eb176594de22728541a925b2632f9716428c851b149e83/charset_normalizer-3.4.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cb6254dc36b47a990e59e1068afacdcd02958bdcce30bb50cc1700a8b9d624a6", size = 149524, upload-time = "2025-10-14T04:41:41.319Z" }, + { url = "https://files.pythonhosted.org/packages/9e/f2/6a2a1f722b6aba37050e626530a46a68f74e63683947a8acff92569f979a/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c8ae8a0f02f57a6e61203a31428fa1d677cbe50c93622b4149d5c0f319c1d19e", size = 150395, upload-time = "2025-10-14T04:41:42.539Z" }, + { url = "https://files.pythonhosted.org/packages/60/bb/2186cb2f2bbaea6338cad15ce23a67f9b0672929744381e28b0592676824/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:47cc91b2f4dd2833fddaedd2893006b0106129d4b94fdb6af1f4ce5a9965577c", size = 143680, upload-time = "2025-10-14T04:41:43.661Z" }, + { url = 
"https://files.pythonhosted.org/packages/7d/a5/bf6f13b772fbb2a90360eb620d52ed8f796f3c5caee8398c3b2eb7b1c60d/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:82004af6c302b5d3ab2cfc4cc5f29db16123b1a8417f2e25f9066f91d4411090", size = 162045, upload-time = "2025-10-14T04:41:44.821Z" }, + { url = "https://files.pythonhosted.org/packages/df/c5/d1be898bf0dc3ef9030c3825e5d3b83f2c528d207d246cbabe245966808d/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2b7d8f6c26245217bd2ad053761201e9f9680f8ce52f0fcd8d0755aeae5b2152", size = 149687, upload-time = "2025-10-14T04:41:46.442Z" }, + { url = "https://files.pythonhosted.org/packages/a5/42/90c1f7b9341eef50c8a1cb3f098ac43b0508413f33affd762855f67a410e/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:799a7a5e4fb2d5898c60b640fd4981d6a25f1c11790935a44ce38c54e985f828", size = 160014, upload-time = "2025-10-14T04:41:47.631Z" }, + { url = "https://files.pythonhosted.org/packages/76/be/4d3ee471e8145d12795ab655ece37baed0929462a86e72372fd25859047c/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:99ae2cffebb06e6c22bdc25801d7b30f503cc87dbd283479e7b606f70aff57ec", size = 154044, upload-time = "2025-10-14T04:41:48.81Z" }, + { url = "https://files.pythonhosted.org/packages/b0/6f/8f7af07237c34a1defe7defc565a9bc1807762f672c0fde711a4b22bf9c0/charset_normalizer-3.4.4-cp314-cp314-win32.whl", hash = "sha256:f9d332f8c2a2fcbffe1378594431458ddbef721c1769d78e2cbc06280d8155f9", size = 99940, upload-time = "2025-10-14T04:41:49.946Z" }, + { url = "https://files.pythonhosted.org/packages/4b/51/8ade005e5ca5b0d80fb4aff72a3775b325bdc3d27408c8113811a7cbe640/charset_normalizer-3.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:8a6562c3700cce886c5be75ade4a5db4214fda19fede41d9792d100288d8f94c", size = 107104, upload-time = "2025-10-14T04:41:51.051Z" }, + { url = "https://files.pythonhosted.org/packages/da/5f/6b8f83a55bb8278772c5ae54a577f3099025f9ade59d0136ac24a0df4bde/charset_normalizer-3.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:de00632ca48df9daf77a2c65a484531649261ec9f25489917f09e455cb09ddb2", size = 100743, upload-time = "2025-10-14T04:41:52.122Z" }, + { url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" }, +] + +[[package]] +name = "click" +version = "8.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = 
"sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "duckdb" +version = "1.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/36/9d/ab66a06e416d71b7bdcb9904cdf8d4db3379ef632bb8e9495646702d9718/duckdb-1.4.4.tar.gz", hash = "sha256:8bba52fd2acb67668a4615ee17ee51814124223de836d9e2fdcbc4c9021b3d3c", size = 18419763, upload-time = "2026-01-26T11:50:37.68Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/68/19233412033a2bc5a144a3f531f64e3548d4487251e3f16b56c31411a06f/duckdb-1.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5ba684f498d4e924c7e8f30dd157da8da34c8479746c5011b6c0e037e9c60ad2", size = 28883816, upload-time = "2026-01-26T11:49:01.009Z" }, + { url = "https://files.pythonhosted.org/packages/b3/3e/cec70e546c298ab76d80b990109e111068d82cca67942c42328eaa7d6fdb/duckdb-1.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5536eb952a8aa6ae56469362e344d4e6403cc945a80bc8c5c2ebdd85d85eb64b", size = 15339662, upload-time = "2026-01-26T11:49:04.058Z" }, + { url = "https://files.pythonhosted.org/packages/d3/f0/cf4241a040ec4f571859a738007ec773b642fbc27df4cbcf34b0c32ea559/duckdb-1.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:47dd4162da6a2be59a0aef640eb08d6360df1cf83c317dcc127836daaf3b7f7c", size = 13670044, upload-time = "2026-01-26T11:49:06.627Z" }, + { url = "https://files.pythonhosted.org/packages/11/64/de2bb4ec1e35ec9ebf6090a95b930fc56934a0ad6f34a24c5972a14a77ef/duckdb-1.4.4-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6cb357cfa3403910e79e2eb46c8e445bb1ee2fd62e9e9588c6b999df4256abc1", size = 18409951, upload-time = "2026-01-26T11:49:09.808Z" }, + { url = "https://files.pythonhosted.org/packages/79/a2/ac0f5ee16df890d141304bcd48733516b7202c0de34cd3555634d6eb4551/duckdb-1.4.4-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c25d5b0febda02b7944e94fdae95aecf952797afc8cb920f677b46a7c251955", size = 20411739, upload-time = "2026-01-26T11:49:12.652Z" }, + { url = "https://files.pythonhosted.org/packages/37/a2/9a3402edeedaecf72de05fe9ff7f0303d701b8dfc136aea4a4be1a5f7eee/duckdb-1.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:6703dd1bb650025b3771552333d305d62ddd7ff182de121483d4e042ea6e2e00", size = 12256972, upload-time = "2026-01-26T11:49:15.468Z" }, + { url = "https://files.pythonhosted.org/packages/f6/e6/052ea6dcdf35b259fd182eff3efd8d75a071de4010c9807556098df137b9/duckdb-1.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:bf138201f56e5d6fc276a25138341b3523e2f84733613fc43f02c54465619a95", size = 13006696, upload-time = "2026-01-26T11:49:18.054Z" }, + { url = "https://files.pythonhosted.org/packages/58/33/beadaa69f8458afe466126f2c5ee48c4759cc9d5d784f8703d44e0b52c3c/duckdb-1.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ddcfd9c6ff234da603a1edd5fd8ae6107f4d042f74951b65f91bc5e2643856b3", size = 28896535, upload-time = "2026-01-26T11:49:21.232Z" }, + { url = "https://files.pythonhosted.org/packages/76/66/82413f386df10467affc87f65bac095b7c88dbd9c767584164d5f4dc4cb8/duckdb-1.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:6792ca647216bd5c4ff16396e4591cfa9b4a72e5ad7cdd312cec6d67e8431a7c", size = 15349716, upload-time = "2026-01-26T11:49:23.989Z" }, + { url = "https://files.pythonhosted.org/packages/5d/8c/c13d396fd4e9bf970916dc5b4fea410c1b10fe531069aea65f1dcf849a71/duckdb-1.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1f8d55843cc940e36261689054f7dfb6ce35b1f5b0953b0d355b6adb654b0d52", size = 13672403, upload-time = "2026-01-26T11:49:26.741Z" }, + { url = "https://files.pythonhosted.org/packages/db/77/2446a0b44226bb95217748d911c7ca66a66ca10f6481d5178d9370819631/duckdb-1.4.4-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c65d15c440c31e06baaebfd2c06d71ce877e132779d309f1edf0a85d23c07e92", size = 18419001, upload-time = "2026-01-26T11:49:29.353Z" }, + { url = "https://files.pythonhosted.org/packages/2e/a3/97715bba30040572fb15d02c26f36be988d48bc00501e7ac02b1d65ef9d0/duckdb-1.4.4-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b297eff642503fd435a9de5a9cb7db4eccb6f61d61a55b30d2636023f149855f", size = 20437385, upload-time = "2026-01-26T11:49:32.302Z" }, + { url = "https://files.pythonhosted.org/packages/8b/0a/18b9167adf528cbe3867ef8a84a5f19f37bedccb606a8a9e59cfea1880c8/duckdb-1.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:d525de5f282b03aa8be6db86b1abffdceae5f1055113a03d5b50cd2fb8cf2ef8", size = 12267343, upload-time = "2026-01-26T11:49:34.985Z" }, + { url = "https://files.pythonhosted.org/packages/f8/15/37af97f5717818f3d82d57414299c293b321ac83e048c0a90bb8b6a09072/duckdb-1.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:50f2eb173c573811b44aba51176da7a4e5c487113982be6a6a1c37337ec5fa57", size = 13007490, upload-time = "2026-01-26T11:49:37.413Z" }, + { url = "https://files.pythonhosted.org/packages/7f/fe/64810fee20030f2bf96ce28b527060564864ce5b934b50888eda2cbf99dd/duckdb-1.4.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:337f8b24e89bc2e12dadcfe87b4eb1c00fd920f68ab07bc9b70960d6523b8bc3", size = 28899349, upload-time = "2026-01-26T11:49:40.294Z" }, + { url = "https://files.pythonhosted.org/packages/9c/9b/3c7c5e48456b69365d952ac201666053de2700f5b0144a699a4dc6854507/duckdb-1.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0509b39ea7af8cff0198a99d206dca753c62844adab54e545984c2e2c1381616", size = 15350691, upload-time = "2026-01-26T11:49:43.242Z" }, + { url = "https://files.pythonhosted.org/packages/a6/7b/64e68a7b857ed0340045501535a0da99ea5d9d5ea3708fec0afb8663eb27/duckdb-1.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fb94de6d023de9d79b7edc1ae07ee1d0b4f5fa8a9dcec799650b5befdf7aafec", size = 13672311, upload-time = "2026-01-26T11:49:46.069Z" }, + { url = "https://files.pythonhosted.org/packages/09/5b/3e7aa490841784d223de61beb2ae64e82331501bf5a415dc87a0e27b4663/duckdb-1.4.4-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0d636ceda422e7babd5e2f7275f6a0d1a3405e6a01873f00d38b72118d30c10b", size = 18422740, upload-time = "2026-01-26T11:49:49.034Z" }, + { url = "https://files.pythonhosted.org/packages/53/32/256df3dbaa198c58539ad94f9a41e98c2c8ff23f126b8f5f52c7dcd0a738/duckdb-1.4.4-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7df7351328ffb812a4a289732f500d621e7de9942a3a2c9b6d4afcf4c0e72526", size = 20435578, upload-time = "2026-01-26T11:49:51.946Z" }, + { url = "https://files.pythonhosted.org/packages/a4/f0/620323fd87062ea43e527a2d5ed9e55b525e0847c17d3b307094ddab98a2/duckdb-1.4.4-cp313-cp313-win_amd64.whl", hash = 
"sha256:6fb1225a9ea5877421481d59a6c556a9532c32c16c7ae6ca8d127e2b878c9389", size = 12268083, upload-time = "2026-01-26T11:49:54.615Z" }, + { url = "https://files.pythonhosted.org/packages/e5/07/a397fdb7c95388ba9c055b9a3d38dfee92093f4427bc6946cf9543b1d216/duckdb-1.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:f28a18cc790217e5b347bb91b2cab27aafc557c58d3d8382e04b4fe55d0c3f66", size = 13006123, upload-time = "2026-01-26T11:49:57.092Z" }, + { url = "https://files.pythonhosted.org/packages/97/a6/f19e2864e651b0bd8e4db2b0c455e7e0d71e0d4cd2cd9cc052f518e43eb3/duckdb-1.4.4-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:25874f8b1355e96178079e37312c3ba6d61a2354f51319dae860cf21335c3a20", size = 28909554, upload-time = "2026-01-26T11:50:00.107Z" }, + { url = "https://files.pythonhosted.org/packages/0e/93/8a24e932c67414fd2c45bed83218e62b73348996bf859eda020c224774b2/duckdb-1.4.4-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:452c5b5d6c349dc5d1154eb2062ee547296fcbd0c20e9df1ed00b5e1809089da", size = 15353804, upload-time = "2026-01-26T11:50:03.382Z" }, + { url = "https://files.pythonhosted.org/packages/62/13/e5378ff5bb1d4397655d840b34b642b1b23cdd82ae19599e62dc4b9461c9/duckdb-1.4.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8e5c2d8a0452df55e092959c0bfc8ab8897ac3ea0f754cb3b0ab3e165cd79aff", size = 13676157, upload-time = "2026-01-26T11:50:06.232Z" }, + { url = "https://files.pythonhosted.org/packages/2d/94/24364da564b27aeebe44481f15bd0197a0b535ec93f188a6b1b98c22f082/duckdb-1.4.4-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1af6e76fe8bd24875dc56dd8e38300d64dc708cd2e772f67b9fbc635cc3066a3", size = 18426882, upload-time = "2026-01-26T11:50:08.97Z" }, + { url = "https://files.pythonhosted.org/packages/26/0a/6ae31b2914b4dc34243279b2301554bcbc5f1a09ccc82600486c49ab71d1/duckdb-1.4.4-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0440f59e0cd9936a9ebfcf7a13312eda480c79214ffed3878d75947fc3b7d6d", size = 20435641, upload-time = "2026-01-26T11:50:12.188Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b1/fd5c37c53d45efe979f67e9bd49aaceef640147bb18f0699a19edd1874d6/duckdb-1.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:59c8d76016dde854beab844935b1ec31de358d4053e792988108e995b18c08e7", size = 12762360, upload-time = "2026-01-26T11:50:14.76Z" }, + { url = "https://files.pythonhosted.org/packages/dd/2d/13e6024e613679d8a489dd922f199ef4b1d08a456a58eadd96dc2f05171f/duckdb-1.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:53cd6423136ab44383ec9955aefe7599b3fb3dd1fe006161e6396d8167e0e0d4", size = 13458633, upload-time = "2026-01-26T11:50:17.657Z" }, +] + +[[package]] +name = "idna" +version = "3.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "pygments" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, 
+] + +[[package]] +name = "requests" +version = "2.32.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, +] + +[[package]] +name = "ruff" +version = "0.15.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/06/04/eab13a954e763b0606f460443fcbf6bb5a0faf06890ea3754ff16523dce5/ruff-0.15.2.tar.gz", hash = "sha256:14b965afee0969e68bb871eba625343b8673375f457af4abe98553e8bbb98342", size = 4558148, upload-time = "2026-02-19T22:32:20.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/70/3a4dc6d09b13cb3e695f28307e5d889b2e1a66b7af9c5e257e796695b0e6/ruff-0.15.2-py3-none-linux_armv6l.whl", hash = "sha256:120691a6fdae2f16d65435648160f5b81a9625288f75544dc40637436b5d3c0d", size = 10430565, upload-time = "2026-02-19T22:32:41.824Z" }, + { url = "https://files.pythonhosted.org/packages/71/0b/bb8457b56185ece1305c666dc895832946d24055be90692381c31d57466d/ruff-0.15.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:a89056d831256099658b6bba4037ac6dd06f49d194199215befe2bb10457ea5e", size = 10820354, upload-time = "2026-02-19T22:32:07.366Z" }, + { url = "https://files.pythonhosted.org/packages/2d/c1/e0532d7f9c9e0b14c46f61b14afd563298b8b83f337b6789ddd987e46121/ruff-0.15.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:e36dee3a64be0ebd23c86ffa3aa3fd3ac9a712ff295e192243f814a830b6bd87", size = 10170767, upload-time = "2026-02-19T22:32:13.188Z" }, + { url = "https://files.pythonhosted.org/packages/47/e8/da1aa341d3af017a21c7a62fb5ec31d4e7ad0a93ab80e3a508316efbcb23/ruff-0.15.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9fb47b6d9764677f8c0a193c0943ce9a05d6763523f132325af8a858eadc2b9", size = 10529591, upload-time = "2026-02-19T22:32:02.547Z" }, + { url = "https://files.pythonhosted.org/packages/93/74/184fbf38e9f3510231fbc5e437e808f0b48c42d1df9434b208821efcd8d6/ruff-0.15.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f376990f9d0d6442ea9014b19621d8f2aaf2b8e39fdbfc79220b7f0c596c9b80", size = 10260771, upload-time = "2026-02-19T22:32:36.938Z" }, + { url = "https://files.pythonhosted.org/packages/05/ac/605c20b8e059a0bc4b42360414baa4892ff278cec1c91fff4be0dceedefd/ruff-0.15.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2dcc987551952d73cbf5c88d9fdee815618d497e4df86cd4c4824cc59d5dd75f", size = 11045791, upload-time = "2026-02-19T22:32:31.642Z" }, + { url = "https://files.pythonhosted.org/packages/fd/52/db6e419908f45a894924d410ac77d64bdd98ff86901d833364251bd08e22/ruff-0.15.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:42a47fd785cbe8c01b9ff45031af875d101b040ad8f4de7bbb716487c74c9a77", size = 11879271, upload-time = "2026-02-19T22:32:29.305Z" }, + { url = 
"https://files.pythonhosted.org/packages/3e/d8/7992b18f2008bdc9231d0f10b16df7dda964dbf639e2b8b4c1b4e91b83af/ruff-0.15.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbe9f49354866e575b4c6943856989f966421870e85cd2ac94dccb0a9dcb2fea", size = 11303707, upload-time = "2026-02-19T22:32:22.492Z" }, + { url = "https://files.pythonhosted.org/packages/d7/02/849b46184bcfdd4b64cde61752cc9a146c54759ed036edd11857e9b8443b/ruff-0.15.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7a672c82b5f9887576087d97be5ce439f04bbaf548ee987b92d3a7dede41d3a", size = 11149151, upload-time = "2026-02-19T22:32:44.234Z" }, + { url = "https://files.pythonhosted.org/packages/70/04/f5284e388bab60d1d3b99614a5a9aeb03e0f333847e2429bebd2aaa1feec/ruff-0.15.2-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:72ecc64f46f7019e2bcc3cdc05d4a7da958b629a5ab7033195e11a438403d956", size = 11091132, upload-time = "2026-02-19T22:32:24.691Z" }, + { url = "https://files.pythonhosted.org/packages/fa/ae/88d844a21110e14d92cf73d57363fab59b727ebeabe78009b9ccb23500af/ruff-0.15.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:8dcf243b15b561c655c1ef2f2b0050e5d50db37fe90115507f6ff37d865dc8b4", size = 10504717, upload-time = "2026-02-19T22:32:26.75Z" }, + { url = "https://files.pythonhosted.org/packages/64/27/867076a6ada7f2b9c8292884ab44d08fd2ba71bd2b5364d4136f3cd537e1/ruff-0.15.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dab6941c862c05739774677c6273166d2510d254dac0695c0e3f5efa1b5585de", size = 10263122, upload-time = "2026-02-19T22:32:10.036Z" }, + { url = "https://files.pythonhosted.org/packages/e7/ef/faf9321d550f8ebf0c6373696e70d1758e20ccdc3951ad7af00c0956be7c/ruff-0.15.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1b9164f57fc36058e9a6806eb92af185b0697c9fe4c7c52caa431c6554521e5c", size = 10735295, upload-time = "2026-02-19T22:32:39.227Z" }, + { url = "https://files.pythonhosted.org/packages/2f/55/e8089fec62e050ba84d71b70e7834b97709ca9b7aba10c1a0b196e493f97/ruff-0.15.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:80d24fcae24d42659db7e335b9e1531697a7102c19185b8dc4a028b952865fd8", size = 11241641, upload-time = "2026-02-19T22:32:34.617Z" }, + { url = "https://files.pythonhosted.org/packages/23/01/1c30526460f4d23222d0fabd5888868262fd0e2b71a00570ca26483cd993/ruff-0.15.2-py3-none-win32.whl", hash = "sha256:fd5ff9e5f519a7e1bd99cbe8daa324010a74f5e2ebc97c6242c08f26f3714f6f", size = 10507885, upload-time = "2026-02-19T22:32:15.635Z" }, + { url = "https://files.pythonhosted.org/packages/5c/10/3d18e3bbdf8fc50bbb4ac3cc45970aa5a9753c5cb51bf9ed9a3cd8b79fa3/ruff-0.15.2-py3-none-win_amd64.whl", hash = "sha256:d20014e3dfa400f3ff84830dfb5755ece2de45ab62ecea4af6b7262d0fb4f7c5", size = 11623725, upload-time = "2026-02-19T22:32:04.947Z" }, + { url = "https://files.pythonhosted.org/packages/6d/78/097c0798b1dab9f8affe73da9642bb4500e098cb27fd8dc9724816ac747b/ruff-0.15.2-py3-none-win_arm64.whl", hash = "sha256:cabddc5822acdc8f7b5527b36ceac55cc51eec7b1946e60181de8fe83ca8876e", size = 10941649, upload-time = "2026-02-19T22:32:18.108Z" }, +] + +[[package]] +name = "tqdm" +version = "4.67.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" 
} +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, +] + +[[package]] +name = "urllib3" +version = "2.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, +] From bacc72de3a370e7d539162dbeb2cfbe3900a7d3e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 17:36:52 -0500 Subject: [PATCH 20/66] Update CLAUDE.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- CLAUDE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index ae2e78f..3cb238c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -72,10 +72,10 @@ uv run ruff format ### Core Components 1. **BabelDownloader** (`src/babel_explorer/core/downloader.py`): - - Downloads Babel intermediate files from a remote server using `wget` + - Downloads Babel intermediate files from a remote HTTP(S) server using Python's `requests` library (streaming downloads) - Caches files locally in configurable directory (default: `data/2025nov19/`) - Uses `@functools.lru_cache` to avoid re-downloading - - **Important**: Requires `wget` to be installed on the system + - **Important**: Requires network access but no external tools like `wget` 2. 
**BabelXRefs** (`src/babel_explorer/core/babel_xrefs.py`): - Main query engine for cross-references From 96d9609c9a499047f8e2efdcef3fd149090a0b06 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 17:38:01 -0500 Subject: [PATCH 21/66] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 922fa1b..eafcfc6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "babel-explorer" version = "0.1.0" -description = "Add your description here" +description = "Tool for querying and exploring Babel APIs and intermediate files" readme = "README.md" requires-python = ">=3.11" dependencies = [ From 0c33e7ea60e1ad110a272f1ea94d9ed8b1aec75c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 17:46:17 -0500 Subject: [PATCH 22/66] Update src/babel_explorer/core/babel_xrefs.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/babel_xrefs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 1e82125..95fda85 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -42,7 +42,7 @@ def __init__(self, subj: str, pred: str, obj: str, filename: str, subj_label: st self.obj_biolink_type = obj_biolink_type def __str__(self): - return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", obj_label="{self.obj_label}", subj_label="{self.subj_label}", obj_label="{self.obj_label}")""" + return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", subj_biolink_type="{self.subj_biolink_type}", obj_label="{self.obj_label}", obj_biolink_type="{self.obj_biolink_type}")""" @dataclasses.dataclass(frozen=True) class IdentifierRecord: From 1aff01367ec6ea1be038e41352bf108b469b8c9d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 17:47:47 -0500 Subject: [PATCH 23/66] Update src/babel_explorer/core/nodenorm.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/nodenorm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index 018f106..a9c6752 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -32,7 +32,7 @@ def __init__(self, nodenorm_url: str=""): self.nodenorm_url = nodenorm_url @functools.lru_cache(maxsize=None) - def get_identifier(self, curie): + def get_identifier(self, curie: str): result = self.normalize_curie(curie) logging.debug(f"Normalizing {curie} with NodeNorm to result: {result}") if not result: From af76c151689429c517766f53adbc42bb43366bea Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 18:26:05 -0500 Subject: [PATCH 24/66] Replace MD5 checksumming with HTTP header caching and freshness window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove _calculate_md5/_fetch_remote_md5 (too slow on 2.5-3.9 GB files) - Add sidecar .meta JSON files (ETag, Last-Modified, Content-Length, last_checked) - Three-tier logic: freshness window → HEAD/ETag check → full re-download - Add freshness_seconds param to BabelDownloader (default 3h) - Add --check-download CLI option to xrefs and ids commands (e.g. 
3h, never)
- Update tests: replace MD5 test classes with meta/ETag/tier coverage

Co-Authored-By: Claude Sonnet 4.6
---
 src/babel_explorer/cli.py             |  28 +-
 src/babel_explorer/core/downloader.py | 203 ++++++-----
 tests/test_downloader.py              | 463 +++++++++++++++++++-------
 3 files changed, 490 insertions(+), 204 deletions(-)

diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py
index 8dd5fc4..251ca0f 100644
--- a/src/babel_explorer/cli.py
+++ b/src/babel_explorer/cli.py
@@ -6,6 +6,17 @@
 from babel_explorer.core.nodenorm import NodeNorm
 
 
+def parse_duration(value: str) -> float:
+    """Parse a duration string like '3h', '30m', '1d', '7200', or 'never' → seconds."""
+    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
+    lower = value.lower()
+    if lower == "never":
+        return float("inf")
+    if lower[-1] in units:
+        return int(lower[:-1]) * units[lower[-1]]
+    return int(lower)  # bare seconds
+
+
 @click.group()
 def cli():
     pass
@@ -17,7 +28,11 @@ def cli():
 @click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes")
 @click.option("--expand", is_flag=True, help="Also display xrefs for returned CURIEs")
 @click.option("--labels", is_flag=True, help="Include labels for CURIEs")
-def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expand: bool, labels: bool):
+@click.option("--check-download", type=str, default="3h", show_default=True,
+              help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). "
+                   "'never' disables re-checking entirely; '0' re-checks via HTTP HEAD on every run.")
+def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expand: bool, labels: bool,
+          check_download: str):
     """
     Fetches and prints the cross-references (xrefs) for the given CURIEs.
 
@@ -35,7 +50,8 @@ def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expan
     """
     logging.basicConfig(level=logging.INFO)
 
-    bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir), NodeNorm(nodenorm_url))
+    freshness = parse_duration(check_download)
+    bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness), NodeNorm(nodenorm_url))
     xrefs = bxref.get_curie_xrefs(curies, expand, label_curies=labels)
     for xref in xrefs:
         print(xref)
@@ -44,7 +60,10 @@ def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expan
 @click.argument("curies", type=str, required=True, nargs=-1)
 @click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to")
 @click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server")
-def ids(curies: list[str], babel_url: str, local_dir: str):
+@click.option("--check-download", type=str, default="3h", show_default=True,
+              help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). "
+                   "'never' disables re-checking entirely; '0' re-checks via HTTP HEAD on every run.")
+def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str):
    """
    Fetches and prints the ID records for the given CURIEs, along with Biolink type if provided.
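
# (Illustrative aside, not part of the diff itself.) Given parse_duration()
# as defined in this patch, --check-download values map to freshness_seconds
# as follows; the .meta sketch further down uses made-up header values.
#
#     parse_duration("3h")     # 10800 (3 * 3600 seconds)
#     parse_duration("30m")    # 1800
#     parse_duration("7200")   # 7200 (bare seconds)
#     parse_duration("0")      # 0: re-check via HTTP HEAD on every run
#     parse_duration("never")  # float("inf"): never re-check
#
# A sidecar file such as data/2025nov19/duckdb/Concord.parquet.meta is plain
# JSON written by _save_meta() in the downloader changes below, e.g.:
#
#     {
#       "etag": "\"68b3297-6254d2015f5ee\"",
#       "last_modified": "Wed, 19 Nov 2025 15:54:19 GMT",
#       "content_length": 2684354560,
#       "last_checked": "2026-03-02T22:15:04+00:00"
#     }
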
@@ -60,7 +79,8 @@ def ids(curies: list[str], babel_url: str, local_dir: str): """ logging.basicConfig(level=logging.INFO) - bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir)) + freshness = parse_duration(check_download) + bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness)) xrefs = bxref.get_curie_ids(curies) for xref in xrefs: print(xref) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 93081c6..43c3daf 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -1,9 +1,10 @@ import functools +import json import os import urllib.parse import time -import hashlib import requests +from datetime import datetime, timezone from tqdm import tqdm import logging @@ -13,10 +14,11 @@ class BabelDownloader: Class for downloading Babel cross-reference files to a local directory as needed. """ - def __init__(self, url_base, local_path=None, retries=10): + def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * 3600): # We assume the URL base is correct (if not, we can fix it later). self.url_base = url_base self.retries = retries + self.freshness_seconds = freshness_seconds self.logger = logging.getLogger(BabelDownloader.__name__) if local_path is None: @@ -41,55 +43,115 @@ def get_output_file(self, filename): os.makedirs(os.path.dirname(filepath), exist_ok=True) return filepath - def _calculate_md5(self, file_path, chunk_size=1024*1024): + def _get_meta_path(self, local_path): + """Return the sidecar metadata file path for a given local file.""" + return local_path + ".meta" + + def _load_meta(self, local_path): + """Load sidecar metadata JSON, or return None if not found/invalid.""" + meta_path = self._get_meta_path(local_path) + if not os.path.exists(meta_path): + return None + try: + with open(meta_path, "r") as f: + return json.load(f) + except (json.JSONDecodeError, OSError): + return None + + def _save_meta(self, local_path, headers, update_last_checked=True): + """ + Write a sidecar .meta JSON file next to local_path. + + Args: + local_path: Path to the downloaded file + headers: Response headers dict (or requests.structures.CaseInsensitiveDict) + update_last_checked: If True, set last_checked to now + """ + meta = {} + if "ETag" in headers: + meta["etag"] = headers["ETag"] + if "Last-Modified" in headers: + meta["last_modified"] = headers["Last-Modified"] + if "Content-Length" in headers: + meta["content_length"] = int(headers["Content-Length"]) + if update_last_checked: + meta["last_checked"] = datetime.now(timezone.utc).isoformat() + + meta_path = self._get_meta_path(local_path) + with open(meta_path, "w") as f: + json.dump(meta, f, indent=2) + + def _is_within_freshness(self, meta, freshness_seconds): """ - Calculate MD5 checksum of a file. + Return True if last_checked is within freshness_seconds of now. 
        Args:
-            file_path: Path to the file to checksum
-            chunk_size: Size of chunks to read (default 1MB)
+            meta: dict loaded from .meta file
+            freshness_seconds: Number of seconds; float('inf') means always fresh
 
         Returns:
-            str: Hexadecimal MD5 checksum
+            bool
         """
-        md5_hash = hashlib.md5()
-        with open(file_path, 'rb') as f:
-            for chunk in iter(lambda: f.read(chunk_size), b''):
-                md5_hash.update(chunk)
-        return md5_hash.hexdigest()
+        if freshness_seconds == float("inf"):
+            return True
+        last_checked_str = meta.get("last_checked")
+        if not last_checked_str:
+            return False
+        try:
+            last_checked = datetime.fromisoformat(last_checked_str)
+            age = (datetime.now(timezone.utc) - last_checked).total_seconds()
+            return age < freshness_seconds
+        except (ValueError, TypeError):
+            return False
 
-    def _fetch_remote_md5(self, url):
+    def _etag_matches(self, url, meta):
         """
-        Fetch MD5 checksum from remote .md5 file.
+        Do a HEAD request and check if the ETag (or Last-Modified + Content-Length)
+        matches the stored metadata; on a match, the caller refreshes last_checked in the .meta file.
 
         Args:
-            url: URL to the .md5 file
+            url: URL to HEAD
+            meta: dict loaded from .meta file (may have etag, last_modified, content_length)
 
         Returns:
-            str: MD5 checksum if found, None if file doesn't exist or is malformed
+            bool: True if remote matches local meta (file is still current)
         """
         try:
-            response = requests.get(url, timeout=10)
-            if response.status_code == 404:
-                self.logger.debug(f"No .md5 file found at {url}")
-                return None
+            response = requests.head(url, timeout=30)
             response.raise_for_status()
+        except requests.RequestException as e:
+            self.logger.warning(f"HEAD request failed for {url}: {e}")
+            return False
+
+        remote_headers = response.headers
+
+        # Primary check: ETag
+        local_etag = meta.get("etag")
+        remote_etag = remote_headers.get("ETag")
+        if local_etag and remote_etag:
+            if local_etag == remote_etag:
+                self.logger.info(f"ETag matches ({remote_etag}), file is current")
+                # This method does not touch the sidecar .meta file itself;
+                # the caller updates last_checked there once we have confirmed
+                # that the remote copy is unchanged.
+                return True
+            else:
+                self.logger.info(f"ETag changed: {local_etag!r} → {remote_etag!r}, re-downloading")
+                return False
 
-        # Parse MD5 file content
-        # Format is typically: "md5hash filename" or just "md5hash"
-        content = response.text.strip()
-        md5_match = content.split()[0]  # Take first token
+        # Fallback: Last-Modified + Content-Length
+        local_lm = meta.get("last_modified")
+        remote_lm = remote_headers.get("Last-Modified")
+        local_cl = meta.get("content_length")
+        remote_cl = remote_headers.get("Content-Length")
 
-        # Validate it's a valid MD5 (32 hex characters)
-        if len(md5_match) == 32 and all(c in '0123456789abcdef' for c in md5_match.lower()):
-            return md5_match.lower()
-        else:
-            self.logger.warning(f"Malformed .md5 file at {url}: {content}")
-            return None
+        if local_lm and remote_lm and local_lm == remote_lm:
+            if local_cl is None or remote_cl is None or int(remote_cl) == local_cl:
+                self.logger.info(f"Last-Modified matches ({remote_lm}), file is current")
+                return True
 
-        except requests.RequestException as e:
-            self.logger.debug(f"Could not fetch .md5 file from {url}: {e}")
-            return None
+        self.logger.info("Cannot confirm file is current (no matching ETag or Last-Modified), will re-download")
+        return False
 
     def _stream_download(self, response, local_path, resume_byte_pos, chunk_size):
         """
@@ -134,6 +196,9 @@ def _download_with_retry(self, url, local_path, chunk_size):
         local_path: Local file path to save to
chunk_size: Size of chunks to read/write + Returns: + requests.structures.CaseInsensitiveDict: Response headers from the final request + Raises: RuntimeError: If all retry attempts fail """ @@ -157,7 +222,7 @@ def _download_with_retry(self, url, local_path, chunk_size): if response.status_code == 416: # Range Not Satisfiable - file already complete self.logger.info(f"File already complete: {local_path}") - return + return response.headers elif response.status_code == 206: # Partial Content - resume successful self.logger.info(f"Resuming download (HTTP 206)") @@ -176,7 +241,7 @@ def _download_with_retry(self, url, local_path, chunk_size): self._stream_download(response, local_path, resume_byte_pos, chunk_size) # Success - exit retry loop - return + return response.headers except (requests.RequestException, IOError) as e: self.logger.warning(f"Download attempt {attempt}/{self.retries} failed: {e}") @@ -193,13 +258,12 @@ def _download_with_retry(self, url, local_path, chunk_size): @functools.lru_cache(maxsize=None) def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): """ - Download a file from the Babel server to local storage with MD5 validation. + Download a file from the Babel server to local storage with ETag-based caching. - If a .md5 file exists on the server, this method will: - 1. Check if the local file exists - 2. Verify its MD5 checksum matches the expected value - 3. Delete and re-download if checksums don't match - 4. Skip download if checksums match + Three-tier freshness logic: + 1. If .meta exists and last_checked is within freshness window → return immediately + 2. If .meta exists but stale → HEAD request to compare ETag; return if unchanged + 3. If ETag changed or no .meta → full re-download Args: dirpath: Relative path from url_base to the file @@ -212,48 +276,37 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) url_to_download = urllib.parse.urljoin(self.url_base, dirpath) - md5_url = url_to_download + '.md5' - # Check if file already exists and validate with MD5 if available if os.path.exists(local_path_to_download_to): - self.logger.info(f"Local file exists: {local_path_to_download_to}") - - # Try to fetch remote MD5 checksum - expected_md5 = self._fetch_remote_md5(md5_url) - - if expected_md5: - self.logger.info(f"Validating MD5 checksum (expected: {expected_md5})") - - # Calculate local file's MD5 - actual_md5 = self._calculate_md5(local_path_to_download_to, chunk_size) - self.logger.info(f"Local file MD5: {actual_md5}") + meta = self._load_meta(local_path_to_download_to) + if meta is not None: + # Tier 1: within freshness window — skip all network calls + if self._is_within_freshness(meta, self.freshness_seconds): + self.logger.info(f"File within freshness window, skipping check: {local_path_to_download_to}") + return local_path_to_download_to - if actual_md5 == expected_md5: - # File is valid, skip download - self.logger.info(f"MD5 checksum matches - file is valid, skipping download") - bytes_downloaded = os.path.getsize(local_path_to_download_to) - self.logger.info(f"Using existing file: {local_path_to_download_to} ({bytes_downloaded} bytes)") + # Tier 2: stale but maybe unchanged — HEAD request + if self._etag_matches(url_to_download, meta): + # Update last_checked timestamp + meta["last_checked"] = datetime.now(timezone.utc).isoformat() + meta_path = self._get_meta_path(local_path_to_download_to) + with open(meta_path, "w") as f: + json.dump(meta, 
f, indent=2) + self.logger.info(f"ETag matches, using existing file: {local_path_to_download_to}") return local_path_to_download_to - else: - # Checksums don't match - delete and re-download - self.logger.warning(f"MD5 checksum mismatch! Expected {expected_md5}, got {actual_md5}") - self.logger.warning(f"Deleting corrupted file and re-downloading: {local_path_to_download_to}") - os.remove(local_path_to_download_to) + + # Tier 3: ETag changed — delete and re-download + self.logger.warning(f"Remote file changed, re-downloading: {local_path_to_download_to}") + os.remove(local_path_to_download_to) self.logger.info(f"Downloading {url_to_download} to {local_path_to_download_to}") - # Download with retry logic - self._download_with_retry(url_to_download, local_path_to_download_to, chunk_size) + # Download with retry logic; get response headers back + response_headers = self._download_with_retry(url_to_download, local_path_to_download_to, chunk_size) - # Verify MD5 after download if available - expected_md5 = self._fetch_remote_md5(md5_url) - if expected_md5: - actual_md5 = self._calculate_md5(local_path_to_download_to, chunk_size) - if actual_md5 == expected_md5: - self.logger.info(f"Post-download MD5 verification passed: {actual_md5}") - else: - self.logger.error(f"Post-download MD5 verification failed! Expected {expected_md5}, got {actual_md5}") - raise RuntimeError(f"Downloaded file has incorrect MD5 checksum") + # Save sidecar metadata + if response_headers is not None: + self._save_meta(local_path_to_download_to, response_headers) bytes_downloaded = os.path.getsize(local_path_to_download_to) self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 912cd0a..045e402 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -5,13 +5,14 @@ Integration tests download real files from the Babel server. """ -import hashlib +import json import os import tempfile +from datetime import datetime, timezone, timedelta import pytest import requests -from unittest.mock import Mock, patch +from unittest.mock import Mock, patch, MagicMock from babel_explorer.core.downloader import BabelDownloader @@ -45,6 +46,14 @@ def test_default_retries(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) assert dl.retries == 10 + def test_default_freshness_seconds(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + assert dl.freshness_seconds == 3 * 3600 + + def test_custom_freshness_seconds(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), freshness_seconds=0) + assert dl.freshness_seconds == 0 + def test_invalid_path_raises_value_error(self): """Using a file path (not a directory) should raise ValueError.""" with tempfile.NamedTemporaryFile(delete=False) as f: @@ -77,161 +86,360 @@ def test_lru_caching(self, tmp_path): assert result1 is result2 # identity check — same cached object -class TestCalculateMd5: - """Tests for _calculate_md5.""" +class TestSaveMeta: + """Tests for _save_meta.""" - def test_correct_hash(self, tmp_path): - content = b"Hello, world!" 
- expected = hashlib.md5(content).hexdigest() - file_path = tmp_path / "test.bin" - file_path.write_bytes(content) + def _make_dl(self, tmp_path): + return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - assert dl._calculate_md5(str(file_path)) == expected + def test_writes_all_fields(self, tmp_path): + dl = self._make_dl(tmp_path) + file_path = str(tmp_path / "test.parquet") + # Create the file so the path is valid + open(file_path, 'wb').close() + + headers = { + "ETag": '"abc123"', + "Last-Modified": "Wed, 03 Dec 2025 15:54:19 GMT", + "Content-Length": "12345", + } + dl._save_meta(file_path, headers) + + meta_path = file_path + ".meta" + assert os.path.exists(meta_path) + with open(meta_path) as f: + meta = json.load(f) + + assert meta["etag"] == '"abc123"' + assert meta["last_modified"] == "Wed, 03 Dec 2025 15:54:19 GMT" + assert meta["content_length"] == 12345 + assert "last_checked" in meta + + def test_last_checked_is_recent_utc(self, tmp_path): + dl = self._make_dl(tmp_path) + file_path = str(tmp_path / "f.parquet") + open(file_path, 'wb').close() - def test_different_chunk_sizes_same_result(self, tmp_path): - content = b"A" * 5000 - expected = hashlib.md5(content).hexdigest() - file_path = tmp_path / "chunks.bin" - file_path.write_bytes(content) + dl._save_meta(file_path, {"ETag": '"x"'}) - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - assert dl._calculate_md5(str(file_path), chunk_size=100) == expected - assert dl._calculate_md5(str(file_path), chunk_size=4096) == expected + with open(file_path + ".meta") as f: + meta = json.load(f) + + last_checked = datetime.fromisoformat(meta["last_checked"]) + age = (datetime.now(timezone.utc) - last_checked).total_seconds() + assert age < 5 # written less than 5 seconds ago + + def test_missing_headers_not_written(self, tmp_path): + """Headers not present in the response should not appear in .meta.""" + dl = self._make_dl(tmp_path) + file_path = str(tmp_path / "sparse.parquet") + open(file_path, 'wb').close() + + dl._save_meta(file_path, {}) + with open(file_path + ".meta") as f: + meta = json.load(f) -class TestFetchRemoteMd5: - """Tests for _fetch_remote_md5.""" + assert "etag" not in meta + assert "last_modified" not in meta + assert "content_length" not in meta + assert "last_checked" in meta + + +class TestLoadMeta: + """Tests for _load_meta.""" def _make_dl(self, tmp_path): return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - def test_valid_md5_response(self, tmp_path): + def test_returns_none_if_no_meta_file(self, tmp_path): dl = self._make_dl(tmp_path) - mock_resp = Mock() - mock_resp.status_code = 200 - mock_resp.text = "d41d8cd98f00b204e9800998ecf8427e filename.parquet\n" - mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): - result = dl._fetch_remote_md5("https://example.com/file.md5") - assert result == "d41d8cd98f00b204e9800998ecf8427e" + assert dl._load_meta(str(tmp_path / "nonexistent.parquet")) is None + + def test_returns_dict_for_valid_meta(self, tmp_path): + dl = self._make_dl(tmp_path) + file_path = str(tmp_path / "f.parquet") + open(file_path, 'wb').close() + meta_data = {"etag": '"abc"', "last_checked": "2026-01-01T00:00:00+00:00"} + with open(file_path + ".meta", "w") as f: + json.dump(meta_data, f) + + result = dl._load_meta(file_path) + assert result == meta_data + + def 
test_returns_none_for_corrupt_meta(self, tmp_path): + dl = self._make_dl(tmp_path) + file_path = str(tmp_path / "corrupt.parquet") + open(file_path, 'wb').close() + with open(file_path + ".meta", "w") as f: + f.write("not valid json {{{") + + assert dl._load_meta(file_path) is None + + +class TestIsWithinFreshness: + """Tests for _is_within_freshness.""" - def test_hash_only_format(self, tmp_path): + def _make_dl(self, tmp_path): + return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + + def test_returns_true_when_recent(self, tmp_path): + dl = self._make_dl(tmp_path) + recent = datetime.now(timezone.utc).isoformat() + meta = {"last_checked": recent} + assert dl._is_within_freshness(meta, 3600) is True + + def test_returns_false_when_stale(self, tmp_path): + dl = self._make_dl(tmp_path) + old = (datetime.now(timezone.utc) - timedelta(hours=5)).isoformat() + meta = {"last_checked": old} + assert dl._is_within_freshness(meta, 3600) is False + + def test_returns_false_when_missing_last_checked(self, tmp_path): dl = self._make_dl(tmp_path) + assert dl._is_within_freshness({}, 3600) is False + + def test_returns_true_when_freshness_is_inf(self, tmp_path): + dl = self._make_dl(tmp_path) + old = (datetime.now(timezone.utc) - timedelta(days=365)).isoformat() + meta = {"last_checked": old} + assert dl._is_within_freshness(meta, float("inf")) is True + + def test_returns_false_when_freshness_is_zero(self, tmp_path): + dl = self._make_dl(tmp_path) + just_now = datetime.now(timezone.utc).isoformat() + meta = {"last_checked": just_now} + # Even with freshness=0, age >= 0 so it's not < 0 + assert dl._is_within_freshness(meta, 0) is False + + +class TestEtagMatches: + """Tests for _etag_matches.""" + + def _make_dl(self, tmp_path): + return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + + def test_returns_true_on_matching_etag(self, tmp_path): + dl = self._make_dl(tmp_path) + meta = {"etag": '"abc123"'} mock_resp = Mock() - mock_resp.status_code = 200 - mock_resp.text = "d41d8cd98f00b204e9800998ecf8427e\n" + mock_resp.headers = {"ETag": '"abc123"'} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): - result = dl._fetch_remote_md5("https://example.com/file.md5") - assert result == "d41d8cd98f00b204e9800998ecf8427e" + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + assert dl._etag_matches("https://example.com/f.parquet", meta) is True - def test_404_returns_none(self, tmp_path): + def test_returns_false_on_different_etag(self, tmp_path): dl = self._make_dl(tmp_path) + meta = {"etag": '"old"'} mock_resp = Mock() - mock_resp.status_code = 404 - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): - assert dl._fetch_remote_md5("https://example.com/missing.md5") is None + mock_resp.headers = {"ETag": '"new"'} + mock_resp.raise_for_status = Mock() + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + assert dl._etag_matches("https://example.com/f.parquet", meta) is False - def test_malformed_returns_none(self, tmp_path): + def test_fallback_last_modified_match(self, tmp_path): dl = self._make_dl(tmp_path) + lm = "Wed, 03 Dec 2025 15:54:19 GMT" + meta = {"last_modified": lm, "content_length": 100} mock_resp = Mock() - mock_resp.status_code = 200 - mock_resp.text = "not-a-valid-md5-hash\n" + mock_resp.headers = {"Last-Modified": lm, "Content-Length": "100"} mock_resp.raise_for_status = 
Mock() - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): - assert dl._fetch_remote_md5("https://example.com/bad.md5") is None + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + assert dl._etag_matches("https://example.com/f.parquet", meta) is True - def test_network_error_returns_none(self, tmp_path): + def test_returns_false_on_request_error(self, tmp_path): dl = self._make_dl(tmp_path) - with patch("babel_explorer.core.downloader.requests.get", side_effect=requests.ConnectionError("fail")): - assert dl._fetch_remote_md5("https://example.com/err.md5") is None + meta = {"etag": '"abc"'} + with patch("babel_explorer.core.downloader.requests.head", + side_effect=requests.ConnectionError("fail")): + assert dl._etag_matches("https://example.com/f.parquet", meta) is False -class TestMd5ValidationFlow: - """Tests for the MD5 validation logic inside get_downloaded_file.""" +class TestGetDownloadedFileTiers: + """Tests for the three-tier logic in get_downloaded_file.""" - def test_matching_checksum_skips_download(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - test_file = "test.txt" - content = b"test content" - local_path = tmp_path / test_file - local_path.write_bytes(content) - expected_md5 = hashlib.md5(content).hexdigest() - - with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): - with patch.object(dl, '_download_with_retry') as mock_dl: + def _make_dl(self, tmp_path, freshness=3600): + return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), + freshness_seconds=freshness) + + # --- Tier 1: within freshness window --- + + def test_tier1_returns_immediately_no_http(self, tmp_path): + """File + fresh .meta → no network calls at all.""" + dl = self._make_dl(tmp_path, freshness=3600) + test_file = "duckdb/test.parquet" + local = tmp_path / "duckdb" / "test.parquet" + local.parent.mkdir(parents=True) + local.write_bytes(b"data") + + meta = {"etag": '"abc"', "last_checked": datetime.now(timezone.utc).isoformat()} + with open(str(local) + ".meta", "w") as f: + json.dump(meta, f) + + with patch("babel_explorer.core.downloader.requests.head") as mock_head: + with patch("babel_explorer.core.downloader.requests.get") as mock_get: dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) - mock_dl.assert_not_called() - assert result == str(local_path) - - def test_mismatched_checksum_triggers_redownload(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - test_file = "mismatch.txt" - local_path = tmp_path / test_file - local_path.write_bytes(b"wrong content") - correct_content = b"correct content" - expected_md5 = hashlib.md5(correct_content).hexdigest() + mock_head.assert_not_called() + mock_get.assert_not_called() + assert result == str(local) + + # --- Tier 2: stale .meta, ETag matches --- + + def test_tier2_head_check_no_redownload(self, tmp_path): + """Stale .meta + matching ETag → HEAD only, no GET.""" + dl = self._make_dl(tmp_path, freshness=0) + test_file = "duckdb/test.parquet" + local = tmp_path / "duckdb" / "test.parquet" + local.parent.mkdir(parents=True) + local.write_bytes(b"data") + + old_ts = (datetime.now(timezone.utc) - timedelta(hours=5)).isoformat() + meta = {"etag": '"abc"', "last_checked": old_ts} + with open(str(local) + ".meta", "w") as f: + json.dump(meta, f) + + mock_head_resp = Mock() + mock_head_resp.headers = {"ETag": '"abc"'} + 
mock_head_resp.raise_for_status = Mock() + + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): + with patch("babel_explorer.core.downloader.requests.get") as mock_get: + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + mock_get.assert_not_called() + assert result == str(local) + + def test_tier2_updates_last_checked_after_head(self, tmp_path): + """After successful HEAD match, last_checked in .meta is updated.""" + dl = self._make_dl(tmp_path, freshness=0) + test_file = "duckdb/upd.parquet" + local = tmp_path / "duckdb" / "upd.parquet" + local.parent.mkdir(parents=True) + local.write_bytes(b"data") + + old_ts = (datetime.now(timezone.utc) - timedelta(hours=5)).isoformat() + meta = {"etag": '"abc"', "last_checked": old_ts} + with open(str(local) + ".meta", "w") as f: + json.dump(meta, f) + + mock_head_resp = Mock() + mock_head_resp.headers = {"ETag": '"abc"'} + mock_head_resp.raise_for_status = Mock() + + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): + dl.get_downloaded_file.cache_clear() + dl.get_downloaded_file(test_file) + + with open(str(local) + ".meta") as f: + updated_meta = json.load(f) + updated_ts = datetime.fromisoformat(updated_meta["last_checked"]) + assert (datetime.now(timezone.utc) - updated_ts).total_seconds() < 5 + + # --- Tier 3: ETag changed, re-download --- + + def test_tier3_redownloads_when_etag_changed(self, tmp_path): + """Changed ETag → file deleted and re-downloaded.""" + dl = self._make_dl(tmp_path, freshness=0) + test_file = "duckdb/changed.parquet" + local = tmp_path / "duckdb" / "changed.parquet" + local.parent.mkdir(parents=True) + local.write_bytes(b"old data") + + old_ts = (datetime.now(timezone.utc) - timedelta(hours=5)).isoformat() + meta = {"etag": '"old"', "last_checked": old_ts} + with open(str(local) + ".meta", "w") as f: + json.dump(meta, f) + + mock_head_resp = Mock() + mock_head_resp.headers = {"ETag": '"new"'} + mock_head_resp.raise_for_status = Mock() + + new_content = b"new data" def fake_download(url, path, chunk_size): with open(path, 'wb') as f: - f.write(correct_content) + f.write(new_content) + return {"ETag": '"new"', "Content-Length": str(len(new_content))} - with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): with patch.object(dl, '_download_with_retry', side_effect=fake_download): dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) - assert os.path.exists(result) - with open(result, 'rb') as f: - assert f.read() == correct_content - def test_no_md5_proceeds_normally(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - test_file = "no_md5.txt" - content = b"downloaded content" + assert open(result, 'rb').read() == new_content + + # --- No .meta: fresh download --- + + def test_downloads_when_no_meta(self, tmp_path): + """No file and no .meta → download happens, .meta is saved.""" + dl = self._make_dl(tmp_path) + test_file = "duckdb/new.parquet" + local_path = str(tmp_path / "duckdb" / "new.parquet") + content = b"fresh download" def fake_download(url, path, chunk_size): + os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, 'wb') as f: f.write(content) - - with patch.object(dl, '_fetch_remote_md5', return_value=None): - with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: - 
dl.get_downloaded_file.cache_clear() - result = dl.get_downloaded_file(test_file) - mock_dl.assert_called_once() - assert os.path.exists(result) - - def test_post_download_validation_fail_raises(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - test_file = "post_fail.txt" - correct_md5 = hashlib.md5(b"expected").hexdigest() + return {"ETag": '"fresh"', "Content-Length": str(len(content))} + + with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + mock_dl.assert_called_once() + + assert os.path.exists(result) + assert open(result, 'rb').read() == content + # .meta should be saved + meta_path = result + ".meta" + assert os.path.exists(meta_path) + with open(meta_path) as f: + saved_meta = json.load(f) + assert saved_meta["etag"] == '"fresh"' + + def test_downloads_when_file_exists_but_no_meta(self, tmp_path): + """File exists but no .meta → treats as unknown, triggers full download flow.""" + dl = self._make_dl(tmp_path, freshness=3600) + test_file = "duckdb/nometa.parquet" + local = tmp_path / "duckdb" / "nometa.parquet" + local.parent.mkdir(parents=True) + local.write_bytes(b"old content") + # No .meta file + + new_content = b"refreshed" def fake_download(url, path, chunk_size): with open(path, 'wb') as f: - f.write(b"wrong data after download") + f.write(new_content) + return {"ETag": '"new"'} - with patch.object(dl, '_fetch_remote_md5', return_value=correct_md5): - with patch.object(dl, '_download_with_retry', side_effect=fake_download): - dl.get_downloaded_file.cache_clear() - with pytest.raises(RuntimeError, match="incorrect MD5 checksum"): - dl.get_downloaded_file(test_file) + with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + mock_dl.assert_called_once() + + assert open(result, 'rb').read() == new_content + + +class TestGetDownloadedFileCaching: + """Tests for get_downloaded_file LRU caching.""" - def test_post_download_validation_pass(self, tmp_path): + def test_cache_returns_same_result(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - test_file = "post_pass.txt" - content = b"correct content" - expected_md5 = hashlib.md5(content).hexdigest() + content = b"cached content" def fake_download(url, path, chunk_size): with open(path, 'wb') as f: f.write(content) + return {} - with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): - with patch.object(dl, '_download_with_retry', side_effect=fake_download): - dl.get_downloaded_file.cache_clear() - result = dl.get_downloaded_file(test_file) - assert os.path.exists(result) + with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + dl.get_downloaded_file.cache_clear() + r1 = dl.get_downloaded_file("cached.txt") + r2 = dl.get_downloaded_file("cached.txt") + assert r1 == r2 + mock_dl.assert_called_once() # only one actual download class TestDownloadWithRetry: @@ -282,6 +490,7 @@ def test_http_416_file_already_complete(self, tmp_path): mock_response = Mock() mock_response.status_code = 416 + mock_response.headers = {} with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): dl._download_with_retry("https://example.com/file", str(out_path), 1024) @@ -303,6 +512,20 @@ def test_server_no_resume_restarts_download(self, tmp_path): 
dl._download_with_retry("https://example.com/file", str(out_path), 1024) assert out_path.read_bytes() == b"full content" + def test_returns_response_headers(self, tmp_path): + """_download_with_retry should return response headers.""" + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + out_path = str(tmp_path / "headers.bin") + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.headers = {'Content-Length': '5', 'ETag': '"abc"'} + mock_response.iter_content = Mock(return_value=[b"hello"]) + + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): + headers = dl._download_with_retry("https://example.com/file", out_path, 1024) + assert headers['ETag'] == '"abc"' + class TestStreamDownload: """Tests for _stream_download.""" @@ -332,26 +555,6 @@ def test_append_mode_on_resume(self, tmp_path): assert out_path.read_bytes() == b"startend" -class TestGetDownloadedFileCaching: - """Tests for get_downloaded_file LRU caching.""" - - def test_cache_returns_same_result(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - content = b"cached content" - - def fake_download(url, path, chunk_size): - with open(path, 'wb') as f: - f.write(content) - - with patch.object(dl, '_fetch_remote_md5', return_value=None): - with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: - dl.get_downloaded_file.cache_clear() - r1 = dl.get_downloaded_file("cached.txt") - r2 = dl.get_downloaded_file("cached.txt") - assert r1 == r2 - mock_dl.assert_called_once() # only one actual download - - class TestGetDownloadedDir: """Tests for get_downloaded_dir.""" @@ -382,6 +585,16 @@ def test_download_metadata_parquet(downloaded_metadata): assert os.path.getsize(downloaded_metadata) > 0 +@pytest.mark.integration +def test_download_creates_meta_file(downloaded_concord): + """After download, a .meta sidecar file should exist.""" + meta_path = downloaded_concord + ".meta" + assert os.path.isfile(meta_path), f"Missing .meta file: {meta_path}" + with open(meta_path) as f: + meta = json.load(f) + assert "last_checked" in meta + + @pytest.mark.integration def test_download_caching_real_files(shared_downloader, downloaded_concord): """Second call returns same path and file is not re-downloaded.""" From fb41da09ae1622740c43ed9cb82105e25306d0af Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 14:38:42 -0500 Subject: [PATCH 25/66] Added some CURIEs to test. --- tests/data/valid_curies.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/data/valid_curies.txt b/tests/data/valid_curies.txt index 9f2f87c..89a53b3 100644 --- a/tests/data/valid_curies.txt +++ b/tests/data/valid_curies.txt @@ -1,3 +1,5 @@ # Valid CURIEs for integration tests. # Add new CURIEs here to expand test coverage — tests are parametrized over this list. MONDO:0004979 +MONDO:0005044 +NCIT:C55060 From 5c544a2d43fddcad0fc9f07774a1b67e7338c261 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 14:49:07 -0500 Subject: [PATCH 26/66] Partially changed --expand to --recurse. 
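
For example (hypothetical xref data; real results depend on the Babel build
being queried, and bxref here stands for a BabelXRefs instance), a recursive
lookup now reads:

    bxref.get_curie_xrefs(["MONDO:0004979"], recurse=True)
    # pass 1: xrefs for MONDO:0004979 reference, say, DOID:2841
    # pass 2: recurse into ["DOID:2841"] with
    #         ignore_curies_in_expansion={"MONDO:0004979", "DOID:2841"}
    # done: a pass that yields no CURIEs outside the ignore set stops

The growing ignore set is what guarantees termination even when the
cross-references are cyclic.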
--- src/babel_explorer/core/babel_xrefs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 95fda85..012f009 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -116,12 +116,12 @@ def get_curie_xref(self, curie: str, label_curies: bool = False): return xrefs - def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies_in_expansion: set = set(), label_curies: bool = False): + def get_curie_xrefs(self, curies: list[str], recurse: bool = False, ignore_curies_in_expansion: set = set(), label_curies: bool = False): """ Search for all identifiers that are cross-referenced to the given CURIE. :param curie: A CURIE to search for. - :param expand: Whether to expand the cross-references (i.e. recursively follow all identifiers). + :param recurse: Whether to expand the cross-references (i.e. recursively follow all identifiers). :return: A list of cross-references containing that CURIE. """ @@ -133,11 +133,11 @@ def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies logging.info(f"Searching for cross-references for {curie}") xrefs.update(self.get_curie_xref(curie, label_curies)) - if expand: + if recurse: # Get a unique set of referenced curies, not including the ones currently queried. new_curies = list(set([curie for xref in xrefs for curie in xref.curies]) - set(curies) - ignore_curies_in_expansion) if new_curies: logging.info(f"Expanding cross-references to {new_curies}") - xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(curies) | set(new_curies), label_curies=label_curies)) + xrefs.update(self.get_curie_xrefs(new_curies, recurse=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(curies) | set(new_curies), label_curies=label_curies)) return sorted(xrefs) From 280212aa6bbfdb9bfbd1dbdcff5655183317db6c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 14:49:47 -0500 Subject: [PATCH 27/66] More fully changed --expand to --recurse. --- src/babel_explorer/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 251ca0f..4af31a8 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -26,12 +26,12 @@ def cli(): @click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") @click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") @click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes") -@click.option("--expand", is_flag=True, help="Also display xrefs for returned CURIEs") +@click.option("--recurse", is_flag=True, help="Recursively query returned xrefs") @click.option("--labels", is_flag=True, help="Include labels for CURIEs") @click.option("--check-download", type=str, default="3h", show_default=True, help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). 
" "'never' always checks via HTTP HEAD; '0' same as 'never'.") -def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expand: bool, labels: bool, +def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, recurse: bool, labels: bool, check_download: str): """ Fetches and prints the cross-references (xrefs) for the given CURIEs. @@ -52,7 +52,7 @@ def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expan freshness = parse_duration(check_download) bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness), NodeNorm(nodenorm_url)) - xrefs = bxref.get_curie_xrefs(curies, expand, label_curies=labels) + xrefs = bxref.get_curie_xrefs(curies, recurse, label_curies=labels) for xref in xrefs: print(xref) From b522e6e5e779f78879193e931ba070d59b4eb0a7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 14:52:07 -0500 Subject: [PATCH 28/66] Add pytest-xdist for parallel test execution - Add pytest-xdist[psutil] and filelock to dev dependencies - Enable parallel execution by default with addopts = "-n auto" - Switch DuckDB connections to in-memory mode (duckdb.connect()) to eliminate file locking that would deadlock parallel workers - Make test_data_dir teardown worker-aware (only gw0 cleans up) - Wrap download fixtures with FileLock to serialize concurrent downloads - Fix test_babel_xrefs.py: update expand= to recurse= to match renamed param Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 3 ++ src/babel_explorer/core/babel_xrefs.py | 8 ++- tests/conftest.py | 33 ++++++++----- tests/test_babel_xrefs.py | 16 +++--- uv.lock | 68 ++++++++++++++++++++++++++ 5 files changed, 103 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index eafcfc6..59c1b68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,9 @@ build-backend = "hatchling.build" [dependency-groups] dev = [ + "filelock>=3.16", "pytest>=8.3.5", + "pytest-xdist[psutil]>=3.6", "ruff>=0.11.0", ] @@ -25,6 +27,7 @@ dev = [ babel-explorer = "babel_explorer.cli:cli" [tool.pytest.ini_options] +addopts = "-n auto" markers = [ "integration: tests requiring network access (deselect with '-m \"not integration\"')", "slow: tests downloading very large files 2GB+ (deselect with '-m \"not slow\"')", diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 012f009..de8e661 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -82,9 +82,8 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') - # Query the Parquet files using DuckDB. - duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') - db = duckdb.connect(duckdb_path) + # Query the Parquet files using DuckDB (in-memory; nothing is persisted). 
+ db = duckdb.connect() identifier_table = db.read_parquet(identifier_parquet) result = db.execute(f"SELECT * FROM identifier_table WHERE curie IN $1", [curies]) @@ -96,8 +95,7 @@ def get_curie_xref(self, curie: str, label_curies: bool = False): concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') - duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') - db = duckdb.connect(duckdb_path) + db = duckdb.connect() concord_table = db.read_parquet(concord_parquet) xref_tuples = db.execute(f"SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() xrefs = list(map(lambda rec: CrossReference.from_tuple(rec), xref_tuples)) diff --git a/tests/conftest.py b/tests/conftest.py index f3df2fe..fc61599 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,6 +9,7 @@ import shutil import pytest +from filelock import FileLock from babel_explorer.core.downloader import BabelDownloader from babel_explorer.core.babel_xrefs import BabelXRefs @@ -39,20 +40,22 @@ def valid_curies() -> list[str]: @pytest.fixture(scope="session") -def test_data_dir(): +def test_data_dir(request): """ - Provide a clean test data directory for the entire session. + Provide a test data directory for the entire session. Creates the directory before tests, removes it after all tests complete. + When running under pytest-xdist, only the first worker (gw0) performs cleanup. """ - if os.path.exists(TEST_DATA_DIR): - shutil.rmtree(TEST_DATA_DIR) + worker_id = getattr(request.config, "workerinput", {}).get("workerid", "master") os.makedirs(TEST_DATA_DIR, exist_ok=True) yield TEST_DATA_DIR - if os.path.exists(TEST_DATA_DIR): - shutil.rmtree(TEST_DATA_DIR) + # Only the first xdist worker (or a non-xdist run) cleans up the directory. + if worker_id in ("master", "gw0"): + if os.path.exists(TEST_DATA_DIR): + shutil.rmtree(TEST_DATA_DIR) @pytest.fixture(scope="session") @@ -62,15 +65,19 @@ def shared_downloader(test_data_dir) -> BabelDownloader: @pytest.fixture(scope="session") -def downloaded_concord(shared_downloader) -> str: +def downloaded_concord(shared_downloader, test_data_dir) -> str: """Download duckdb/Concord.parquet (~626 MB). Returns the local path.""" - return shared_downloader.get_downloaded_file(CONCORD_FILE) + lock_path = os.path.join(test_data_dir, "concord.lock") + with FileLock(lock_path): + return shared_downloader.get_downloaded_file(CONCORD_FILE) @pytest.fixture(scope="session") -def downloaded_metadata(shared_downloader) -> str: +def downloaded_metadata(shared_downloader, test_data_dir) -> str: """Download duckdb/Metadata.parquet (small). Returns the local path.""" - return shared_downloader.get_downloaded_file(METADATA_FILE) + lock_path = os.path.join(test_data_dir, "metadata.lock") + with FileLock(lock_path): + return shared_downloader.get_downloaded_file(METADATA_FILE) @pytest.fixture(scope="session") @@ -83,9 +90,11 @@ def downloaded_parquet_files(downloaded_concord, downloaded_metadata) -> dict[st @pytest.fixture(scope="session") -def downloaded_identifiers(shared_downloader) -> str: +def downloaded_identifiers(shared_downloader, test_data_dir) -> str: """Download duckdb/Identifiers.parquet (2 GB+). 
Returns the local path.""" - return shared_downloader.get_downloaded_file(IDENTIFIERS_FILE) + lock_path = os.path.join(test_data_dir, "identifiers.lock") + with FileLock(lock_path): + return shared_downloader.get_downloaded_file(IDENTIFIERS_FILE) @pytest.fixture(scope="session") diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 052d09c..774ccae 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -203,7 +203,7 @@ def test_get_curie_xrefs_no_expand(self, tmp_path): xr = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") with patch.object(bx, 'get_curie_xref', return_value=[xr]): bx.get_curie_xref.cache_clear() - result = bx.get_curie_xrefs(["A:1"], expand=False) + result = bx.get_curie_xrefs(["A:1"], recurse=False) assert len(result) == 1 assert result[0] == xr @@ -220,7 +220,7 @@ def mock_get_curie_xref(curie, label_curies=False): return [] with patch.object(bx, 'get_curie_xref', side_effect=mock_get_curie_xref): - result = bx.get_curie_xrefs(["A:1"], expand=True) + result = bx.get_curie_xrefs(["A:1"], recurse=True) assert xr1 in result assert xr2 in result @@ -230,7 +230,7 @@ def test_results_are_sorted(self, tmp_path): xr_a = CrossReference(filename="a", subj="A:1", pred="p", obj="B:1") with patch.object(bx, 'get_curie_xref', return_value=[xr_b, xr_a]): - result = bx.get_curie_xrefs(["X:1"], expand=False) + result = bx.get_curie_xrefs(["X:1"], recurse=False) assert result == [xr_a, xr_b] @@ -265,7 +265,7 @@ def test_get_curie_xref_returns_known_xrefs(babel_xrefs, curie): def test_get_curie_xrefs_single_no_expand(babel_xrefs, curie): """get_curie_xrefs without expansion returns sorted, non-empty results.""" babel_xrefs.get_curie_xref.cache_clear() - results = babel_xrefs.get_curie_xrefs([curie], expand=False) + results = babel_xrefs.get_curie_xrefs([curie], recurse=False) assert len(results) > 0 assert results == sorted(results) @@ -275,9 +275,9 @@ def test_get_curie_xrefs_single_no_expand(babel_xrefs, curie): def test_get_curie_xrefs_expansion_finds_more(babel_xrefs, curie): """Expanded results are at least as many as non-expanded.""" babel_xrefs.get_curie_xref.cache_clear() - non_expanded = babel_xrefs.get_curie_xrefs([curie], expand=False) + non_expanded = babel_xrefs.get_curie_xrefs([curie], recurse=False) babel_xrefs.get_curie_xref.cache_clear() - expanded = babel_xrefs.get_curie_xrefs([curie], expand=True) + expanded = babel_xrefs.get_curie_xrefs([curie], recurse=True) assert len(expanded) >= len(non_expanded) @@ -286,9 +286,9 @@ def test_get_curie_xrefs_expansion_finds_more(babel_xrefs, curie): def test_get_curie_xrefs_expanded_includes_original(babel_xrefs, curie): """Non-expanded results are a subset of expanded results.""" babel_xrefs.get_curie_xref.cache_clear() - non_expanded = set(babel_xrefs.get_curie_xrefs([curie], expand=False)) + non_expanded = set(babel_xrefs.get_curie_xrefs([curie], recurse=False)) babel_xrefs.get_curie_xref.cache_clear() - expanded = set(babel_xrefs.get_curie_xrefs([curie], expand=True)) + expanded = set(babel_xrefs.get_curie_xrefs([curie], recurse=True)) assert non_expanded.issubset(expanded) diff --git a/uv.lock b/uv.lock index 56af50a..b8496b5 100644 --- a/uv.lock +++ b/uv.lock @@ -15,7 +15,9 @@ dependencies = [ [package.dev-dependencies] dev = [ + { name = "filelock" }, { name = "pytest" }, + { name = "pytest-xdist", extra = ["psutil"] }, { name = "ruff" }, ] @@ -29,7 +31,9 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ + { name = "filelock", specifier = ">=3.16" }, { name = 
"pytest", specifier = ">=8.3.5" }, + { name = "pytest-xdist", extras = ["psutil"], specifier = ">=3.6" }, { name = "ruff", specifier = ">=0.11.0" }, ] @@ -172,6 +176,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dd/2d/13e6024e613679d8a489dd922f199ef4b1d08a456a58eadd96dc2f05171f/duckdb-1.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:53cd6423136ab44383ec9955aefe7599b3fb3dd1fe006161e6396d8167e0e0d4", size = 13458633, upload-time = "2026-01-26T11:50:17.657Z" }, ] +[[package]] +name = "execnet" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bf/89/780e11f9588d9e7128a3f87788354c7946a9cbb1401ad38a48c4db9a4f07/execnet-2.1.2.tar.gz", hash = "sha256:63d83bfdd9a23e35b9c6a3261412324f964c2ec8dcd8d3c6916ee9373e0befcd", size = 166622, upload-time = "2025-11-12T09:56:37.75Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" }, +] + +[[package]] +name = "filelock" +version = "3.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/77/18/a1fd2231c679dcb9726204645721b12498aeac28e1ad0601038f94b42556/filelock-3.25.0.tar.gz", hash = "sha256:8f00faf3abf9dc730a1ffe9c354ae5c04e079ab7d3a683b7c32da5dd05f26af3", size = 40158, upload-time = "2026-03-01T15:08:45.916Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/0b/de6f54d4a8bedfe8645c41497f3c18d749f0bd3218170c667bf4b81d0cdd/filelock-3.25.0-py3-none-any.whl", hash = "sha256:5ccf8069f7948f494968fc0713c10e5c182a9c9d9eef3a636307a20c2490f047", size = 26427, upload-time = "2026-03-01T15:08:44.593Z" }, +] + [[package]] name = "idna" version = "3.11" @@ -208,6 +230,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "psutil" +version = "7.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740, upload-time = "2026-01-28T18:14:54.428Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/08/510cbdb69c25a96f4ae523f733cdc963ae654904e8db864c07585ef99875/psutil-7.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2edccc433cbfa046b980b0df0171cd25bcaeb3a68fe9022db0979e7aa74a826b", size = 130595, upload-time = "2026-01-28T18:14:57.293Z" }, + { url = "https://files.pythonhosted.org/packages/d6/f5/97baea3fe7a5a9af7436301f85490905379b1c6f2dd51fe3ecf24b4c5fbf/psutil-7.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e78c8603dcd9a04c7364f1a3e670cea95d51ee865e4efb3556a3a63adef958ea", size = 131082, upload-time = "2026-01-28T18:14:59.732Z" }, + { url = "https://files.pythonhosted.org/packages/37/d6/246513fbf9fa174af531f28412297dd05241d97a75911ac8febefa1a53c6/psutil-7.2.2-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:1a571f2330c966c62aeda00dd24620425d4b0cc86881c89861fbc04549e5dc63", size = 181476, upload-time = "2026-01-28T18:15:01.884Z" }, + { url = "https://files.pythonhosted.org/packages/b8/b5/9182c9af3836cca61696dabe4fd1304e17bc56cb62f17439e1154f225dd3/psutil-7.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:917e891983ca3c1887b4ef36447b1e0873e70c933afc831c6b6da078ba474312", size = 184062, upload-time = "2026-01-28T18:15:04.436Z" }, + { url = "https://files.pythonhosted.org/packages/16/ba/0756dca669f5a9300d0cbcbfae9a4c30e446dfc7440ffe43ded5724bfd93/psutil-7.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:ab486563df44c17f5173621c7b198955bd6b613fb87c71c161f827d3fb149a9b", size = 139893, upload-time = "2026-01-28T18:15:06.378Z" }, + { url = "https://files.pythonhosted.org/packages/1c/61/8fa0e26f33623b49949346de05ec1ddaad02ed8ba64af45f40a147dbfa97/psutil-7.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:ae0aefdd8796a7737eccea863f80f81e468a1e4cf14d926bd9b6f5f2d5f90ca9", size = 135589, upload-time = "2026-01-28T18:15:08.03Z" }, + { url = "https://files.pythonhosted.org/packages/81/69/ef179ab5ca24f32acc1dac0c247fd6a13b501fd5534dbae0e05a1c48b66d/psutil-7.2.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:eed63d3b4d62449571547b60578c5b2c4bcccc5387148db46e0c2313dad0ee00", size = 130664, upload-time = "2026-01-28T18:15:09.469Z" }, + { url = "https://files.pythonhosted.org/packages/7b/64/665248b557a236d3fa9efc378d60d95ef56dd0a490c2cd37dafc7660d4a9/psutil-7.2.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7b6d09433a10592ce39b13d7be5a54fbac1d1228ed29abc880fb23df7cb694c9", size = 131087, upload-time = "2026-01-28T18:15:11.724Z" }, + { url = "https://files.pythonhosted.org/packages/d5/2e/e6782744700d6759ebce3043dcfa661fb61e2fb752b91cdeae9af12c2178/psutil-7.2.2-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fa4ecf83bcdf6e6c8f4449aff98eefb5d0604bf88cb883d7da3d8d2d909546a", size = 182383, upload-time = "2026-01-28T18:15:13.445Z" }, + { url = "https://files.pythonhosted.org/packages/57/49/0a41cefd10cb7505cdc04dab3eacf24c0c2cb158a998b8c7b1d27ee2c1f5/psutil-7.2.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e452c464a02e7dc7822a05d25db4cde564444a67e58539a00f929c51eddda0cf", size = 185210, upload-time = "2026-01-28T18:15:16.002Z" }, + { url = "https://files.pythonhosted.org/packages/dd/2c/ff9bfb544f283ba5f83ba725a3c5fec6d6b10b8f27ac1dc641c473dc390d/psutil-7.2.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c7663d4e37f13e884d13994247449e9f8f574bc4655d509c3b95e9ec9e2b9dc1", size = 141228, upload-time = "2026-01-28T18:15:18.385Z" }, + { url = "https://files.pythonhosted.org/packages/f2/fc/f8d9c31db14fcec13748d373e668bc3bed94d9077dbc17fb0eebc073233c/psutil-7.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:11fe5a4f613759764e79c65cf11ebdf26e33d6dd34336f8a337aa2996d71c841", size = 136284, upload-time = "2026-01-28T18:15:19.912Z" }, + { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090, upload-time = "2026-01-28T18:15:22.168Z" }, + { url = "https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = 
"sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859, upload-time = "2026-01-28T18:15:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560, upload-time = "2026-01-28T18:15:25.976Z" }, + { url = "https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997, upload-time = "2026-01-28T18:15:27.794Z" }, + { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972, upload-time = "2026-01-28T18:15:29.342Z" }, + { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266, upload-time = "2026-01-28T18:15:31.597Z" }, + { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737, upload-time = "2026-01-28T18:15:33.849Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -233,6 +283,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, ] +[[package]] +name = "pytest-xdist" +version = "3.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "execnet" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069, upload-time = "2025-07-01T13:30:59.346Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" }, +] + +[package.optional-dependencies] +psutil = [ + { name = "psutil" }, +] + [[package]] name = "requests" version = "2.32.5" From e137c31c199f9da2a4c7fa55d8956d6f74faac3a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 15:06:18 -0500 Subject: [PATCH 29/66] Replace Python recursion in get_curie_xrefs with DuckDB WITH 
RECURSIVE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recurse=True path previously issued one DuckDB query per CURIE and called itself recursively (O(diameter) queries, Python stack growth). It now delegates to _get_curie_xrefs_recursive, which traverses the full connected component in a single SQL query using WITH RECURSIVE. A bidirectional `edges` CTE (subj→obj and obj→subj) collapses the two traversal directions into one recursive arm; UNION (not UNION ALL) provides automatic cycle detection. ignore_curies_in_expansion is now a no-op on the recurse=True path and emits a DeprecationWarning. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/babel_xrefs.py | 72 ++++++++++++++++++++++---- tests/test_babel_xrefs.py | 48 ++++++++++++++--- 2 files changed, 101 insertions(+), 19 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index de8e661..fba5b61 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -3,6 +3,7 @@ # why we consider two identifiers to be identical. import dataclasses import logging +import warnings import duckdb import functools @@ -114,28 +115,77 @@ def get_curie_xref(self, curie: str, label_curies: bool = False): return xrefs + def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = False): + """Traverse the cross-reference graph in one DuckDB WITH RECURSIVE query.""" + if not curies: + return [] + + concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') + concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') + + db = duckdb.connect() + concord_table = db.read_parquet(concord_parquet) + result = db.execute(""" + WITH RECURSIVE + edges(a, b) AS ( + SELECT subj, obj FROM concord_table + UNION ALL + SELECT obj, subj FROM concord_table + ), + frontier(curie) AS ( + SELECT unnest($1::VARCHAR[]) + UNION + SELECT e.b + FROM edges e + INNER JOIN frontier f ON e.a = f.curie + ) + SELECT DISTINCT c.filename, c.subj, c.pred, c.obj + FROM concord_table c + WHERE c.subj IN (SELECT curie FROM frontier) + OR c.obj IN (SELECT curie FROM frontier) + ORDER BY c.filename, c.subj, c.obj, c.pred + """, [curies]) + + xrefs = [CrossReference.from_tuple(row) for row in result.fetchall()] + + if label_curies: + xrefs = [LabeledCrossReference( + subj=xref.subj, + obj=xref.obj, + filename=xref.filename, + pred=xref.pred, + subj_label=self.nodenorm.get_identifier(xref.subj).label, + subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, + obj_label=self.nodenorm.get_identifier(xref.obj).label, + obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, + ) for xref in xrefs] + + return xrefs + def get_curie_xrefs(self, curies: list[str], recurse: bool = False, ignore_curies_in_expansion: set = set(), label_curies: bool = False): """ Search for all identifiers that are cross-referenced to the given CURIE. - :param curie: A CURIE to search for. + :param curies: A list of CURIEs to search for. :param recurse: Whether to expand the cross-references (i.e. recursively follow all identifiers). - :return: A list of cross-references containing that CURIE. + :param ignore_curies_in_expansion: Deprecated when recurse=True; has no effect. + :param label_curies: Whether to annotate results with labels from NodeNorm. + :return: A list of cross-references containing those CURIEs. 
""" - if ignore_curies_in_expansion: - logging.info(f"Ignoring {len(ignore_curies_in_expansion)}: {ignore_curies_in_expansion}") + if recurse: + if ignore_curies_in_expansion: + warnings.warn( + "ignore_curies_in_expansion has no effect when recurse=True; " + "cycle detection is handled automatically by the SQL query.", + DeprecationWarning, + stacklevel=2, + ) + return self._get_curie_xrefs_recursive(curies, label_curies) xrefs = set() for curie in curies: logging.info(f"Searching for cross-references for {curie}") xrefs.update(self.get_curie_xref(curie, label_curies)) - if recurse: - # Get a unique set of referenced curies, not including the ones currently queried. - new_curies = list(set([curie for xref in xrefs for curie in xref.curies]) - set(curies) - ignore_curies_in_expansion) - if new_curies: - logging.info(f"Expanding cross-references to {new_curies}") - xrefs.update(self.get_curie_xrefs(new_curies, recurse=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(curies) | set(new_curies), label_curies=label_curies)) - return sorted(xrefs) diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 774ccae..42fa6aa 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -212,18 +212,50 @@ def test_get_curie_xrefs_with_expand(self, tmp_path): xr1 = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") xr2 = CrossReference(filename="f", subj="B:2", pred="p", obj="C:3") - def mock_get_curie_xref(curie, label_curies=False): - if curie == "A:1": - return [xr1] - elif curie == "B:2": - return [xr2] - return [] - - with patch.object(bx, 'get_curie_xref', side_effect=mock_get_curie_xref): + with patch.object(bx, '_get_curie_xrefs_recursive', return_value=[xr1, xr2]) as mock_rec: result = bx.get_curie_xrefs(["A:1"], recurse=True) + mock_rec.assert_called_once_with(["A:1"], False) assert xr1 in result assert xr2 in result + def test_get_curie_xrefs_recursive_sql_traversal(self, tmp_path): + """_get_curie_xrefs_recursive uses SQL graph traversal, not Python recursion.""" + import duckdb as real_duckdb + + bx = self._make_bx(tmp_path) + + # Write a tiny Parquet file: graph A-B, B-C, D-E (disconnected from A-B-C) + parquet_path = str(tmp_path / "test_concord.parquet") + setup_db = real_duckdb.connect() + setup_db.execute(f""" + COPY ( + SELECT * FROM (VALUES + ('f1.tsv', 'A:1', 'skos:exactMatch', 'B:2'), + ('f1.tsv', 'B:2', 'skos:exactMatch', 'C:3'), + ('f2.tsv', 'D:4', 'skos:exactMatch', 'E:5') + ) AS t(filename, subj, pred, obj) + ) TO '{parquet_path}' (FORMAT PARQUET) + """) + setup_db.close() + + with patch.object(bx.downloader, 'get_downloaded_file', return_value=parquet_path): + # Starting from A:1 should reach B:2 and C:3 but not the D-E component + result = bx._get_curie_xrefs_recursive(["A:1"]) + pairs = {(xr.subj, xr.obj) for xr in result} + assert ("A:1", "B:2") in pairs + assert ("B:2", "C:3") in pairs + assert ("D:4", "E:5") not in pairs + + # Starting from D:4 should only reach E:5 + result = bx._get_curie_xrefs_recursive(["D:4"]) + pairs = {(xr.subj, xr.obj) for xr in result} + assert ("D:4", "E:5") in pairs + assert ("A:1", "B:2") not in pairs + + # Empty input returns empty list + result = bx._get_curie_xrefs_recursive([]) + assert result == [] + def test_results_are_sorted(self, tmp_path): bx = self._make_bx(tmp_path) xr_b = CrossReference(filename="b", subj="B:1", pred="p", obj="C:1") From b115d0293262d78f04db6c3dc78c3ec03fa543da Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 15:44:56 -0500 Subject: [PATCH 
30/66] Fix xdist race condition: skip test-data cleanup in parallel runs When pytest-xdist runs 8 workers, each worker session ends independently. gw0 was deleting data/test/ as soon as it finished its own tests, but other workers were still reading Concord.parquet. This caused sporadic IOException failures on any test that opened a fresh DuckDB connection (e.g. _get_curie_xrefs_recursive) after gw0's teardown deleted the file. Fix: only delete the shared test data directory in a sequential (non-xdist) run where worker_id == "master". In parallel runs the directory persists; BabelDownloader's freshness-window logic re-validates or re-downloads the files on the next run as needed. Co-Authored-By: Claude Sonnet 4.6 --- tests/conftest.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index fc61599..f1e0df6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -45,15 +45,21 @@ def test_data_dir(request): Provide a test data directory for the entire session. Creates the directory before tests, removes it after all tests complete. - When running under pytest-xdist, only the first worker (gw0) performs cleanup. + When running under pytest-xdist, cleanup is skipped: worker sessions end at + unpredictable times and deleting the shared directory from one worker while + others are still reading the same files causes flaky IO errors. The files + are re-used (or re-validated) on the next run via the freshness-window logic + in BabelDownloader.get_downloaded_file. """ worker_id = getattr(request.config, "workerinput", {}).get("workerid", "master") os.makedirs(TEST_DATA_DIR, exist_ok=True) yield TEST_DATA_DIR - # Only the first xdist worker (or a non-xdist run) cleans up the directory. - if worker_id in ("master", "gw0"): + # Only clean up when running without xdist (sequential run). In a parallel + # run each worker session may finish at a different time; gw0 cleaning up + # while gw5 is still reading Concord.parquet causes spurious failures. + if worker_id == "master": if os.path.exists(TEST_DATA_DIR): shutil.rmtree(TEST_DATA_DIR) From 5a0f75874cd4acd51be04b65fd4d0205312c2cd9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 15:50:18 -0500 Subject: [PATCH 31/66] Made output a bit prettier. 
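The window reported in the updated message is the downloader's freshness_seconds
constructor argument. A sketch of how a caller sets it (the duration is
illustrative, not the default):

    # Within a 30-minute window, get_downloaded_file() skips all network
    # checks and returns the cached local path immediately (Tier 1).
    dl = BabelDownloader(
        "https://stars.renci.org:443/var/babel_outputs/2025nov19/",
        local_path="data/2025nov19",
        freshness_seconds=30 * 60,
    )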
--- src/babel_explorer/core/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 43c3daf..653ccc8 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -282,7 +282,7 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): if meta is not None: # Tier 1: within freshness window — skip all network calls if self._is_within_freshness(meta, self.freshness_seconds): - self.logger.info(f"File within freshness window, skipping check: {local_path_to_download_to}") + self.logger.info(f"File within freshness window ({self.freshness_seconds} seconds), skipping check: {local_path_to_download_to}") return local_path_to_download_to # Tier 2: stale but maybe unchanged — HEAD request From be2fa36ceae4cf4abe81fb59c390222243638d23 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 00:41:23 -0400 Subject: [PATCH 32/66] Update src/babel_explorer/core/nodenorm.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/nodenorm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index a9c6752..1e1e24e 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -57,11 +57,17 @@ def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=True response.raise_for_status() result = response.json() - return result[curie] + try: + return result[curie] + except KeyError: + logging.debug(f"NodeNorm response did not contain CURIE {curie!r}; returning None") + return None @functools.lru_cache(maxsize=None) def get_clique_identifiers(self, curie, **kwargs): result = self.normalize_curie(curie, **kwargs) + if not result: + return None if 'equivalent_identifiers' not in result: return None return list(map(lambda x: Identifier.from_dict(x), result['equivalent_identifiers'])) From 2b2aa7f20561e57ad811697bd33394a846a64bfc Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 01:07:11 -0400 Subject: [PATCH 33/66] Simplify babel_xrefs: extract helper, remove dead fetches, fix default arg - Extract _to_labeled_xref() to eliminate duplicated LabeledCrossReference construction in get_curie_xref and _get_curie_xrefs_recursive - Remove unused concord_metadata_parquet fetches from get_curie_ids, get_curie_xref, and _get_curie_xrefs_recursive (Metadata.parquet was downloaded but never queried in any of these methods) - Fix mutable default argument: ignore_curies_in_expansion: set = set() -> set | None = None - Return list (not lazy map) from get_curie_xref for consistency - Update test expectation to match single downloader call (Concord only) Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/babel_xrefs.py | 42 +++++++++++--------------- tests/test_babel_xrefs.py | 4 +-- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index fba5b61..c07c095 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -81,7 +81,6 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: """ identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') - concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') # Query the Parquet files using DuckDB 
(in-memory; nothing is persisted). db = duckdb.connect() @@ -94,34 +93,36 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') db = duckdb.connect() concord_table = db.read_parquet(concord_parquet) xref_tuples = db.execute(f"SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() - xrefs = list(map(lambda rec: CrossReference.from_tuple(rec), xref_tuples)) + xrefs = [CrossReference.from_tuple(rec) for rec in xref_tuples] if label_curies: - xrefs = map(lambda xref: LabeledCrossReference( - subj=xref.subj, - obj=xref.obj, - filename=xref.filename, - pred=xref.pred, - subj_label=self.nodenorm.get_identifier(xref.subj).label, - subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, - obj_label=self.nodenorm.get_identifier(xref.obj).label, - obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, - ), xrefs) + xrefs = [self._to_labeled_xref(xref) for xref in xrefs] return xrefs + def _to_labeled_xref(self, xref: CrossReference) -> LabeledCrossReference: + """Convert a CrossReference to a LabeledCrossReference using NodeNorm.""" + return LabeledCrossReference( + subj=xref.subj, + obj=xref.obj, + filename=xref.filename, + pred=xref.pred, + subj_label=self.nodenorm.get_identifier(xref.subj).label, + subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, + obj_label=self.nodenorm.get_identifier(xref.obj).label, + obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, + ) + def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = False): """Traverse the cross-reference graph in one DuckDB WITH RECURSIVE query.""" if not curies: return [] concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') db = duckdb.connect() concord_table = db.read_parquet(concord_parquet) @@ -149,20 +150,11 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal xrefs = [CrossReference.from_tuple(row) for row in result.fetchall()] if label_curies: - xrefs = [LabeledCrossReference( - subj=xref.subj, - obj=xref.obj, - filename=xref.filename, - pred=xref.pred, - subj_label=self.nodenorm.get_identifier(xref.subj).label, - subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, - obj_label=self.nodenorm.get_identifier(xref.obj).label, - obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, - ) for xref in xrefs] + xrefs = [self._to_labeled_xref(xref) for xref in xrefs] return xrefs - def get_curie_xrefs(self, curies: list[str], recurse: bool = False, ignore_curies_in_expansion: set = set(), label_curies: bool = False): + def get_curie_xrefs(self, curies: list[str], recurse: bool = False, ignore_curies_in_expansion: set | None = None, label_curies: bool = False): """ Search for all identifiers that are cross-referenced to the given CURIE. 
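The mutable-default fix above guards against a standard Python pitfall: a
default such as ignore_curies_in_expansion=set() is evaluated once, at function
definition time, so every call shares (and can mutate) the same set object. A
self-contained illustration of the failure mode and the fix (names are
hypothetical, not from this codebase):

    def bad(items, seen=set()):          # one set shared across ALL calls
        seen.update(items)
        return sorted(seen)

    def good(items, seen=None):          # fresh set per call
        if seen is None:
            seen = set()
        seen.update(items)
        return sorted(seen)

    assert bad(["a"]) == ["a"]
    assert bad(["b"]) == ["a", "b"]      # "a" leaked in from the first call
    assert good(["a"]) == ["a"]
    assert good(["b"]) == ["b"]          # no cross-call leakage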
diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 42fa6aa..41ad777 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -192,8 +192,8 @@ def test_get_curie_xref_calls_downloader(self, tmp_path): with patch("babel_explorer.core.babel_xrefs.duckdb.connect", return_value=mock_db): bx.get_curie_xref.cache_clear() result = bx.get_curie_xref("A:1") - # Downloader should be called for Concord and Metadata - assert mock_dl.call_count == 2 + # Downloader should be called for Concord only (Metadata unused here) + assert mock_dl.call_count == 1 result_list = list(result) assert len(result_list) == 1 assert isinstance(result_list[0], CrossReference) From 3cdd19c787517a2d4ec9ea6d683e598119901454 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 01:45:18 -0400 Subject: [PATCH 34/66] Fix LabeledCrossReference: make it a frozen dataclass subclass Hand-written __init__ with post-construction setattr raised FrozenInstanceError since CrossReference is frozen=True. Adding @dataclasses.dataclass(frozen=True) lets Python generate the correct __init__ using object.__setattr__ internally. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/babel_xrefs.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index c07c095..51727ab 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -29,19 +29,13 @@ def curies(self): def __lt__(self, other): return (self.filename, self.subj, self.obj, self.pred) < (other.filename, other.subj, other.obj, other.pred) +@dataclasses.dataclass(frozen=True) class LabeledCrossReference(CrossReference): subj_label: str subj_biolink_type: str obj_label: str obj_biolink_type: str - def __init__(self, subj: str, pred: str, obj: str, filename: str, subj_label: str, subj_biolink_type: str, obj_label: str, obj_biolink_type: str): - super().__init__(subj=subj, obj=obj, filename=filename, pred=pred) - self.subj_label = subj_label - self.subj_biolink_type = subj_biolink_type - self.obj_label = obj_label - self.obj_biolink_type = obj_biolink_type - def __str__(self): return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", subj_biolink_type="{self.subj_biolink_type}", obj_label="{self.obj_label}", obj_biolink_type="{self.obj_biolink_type}")""" From c7a3f16f9cf45c83b554f62544f7daebb67aa18d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 01:53:41 -0400 Subject: [PATCH 35/66] Fix BabelDownloader: use tempfile.gettempdir() when local_path is None Replace the fragile TMPDIR-only env var check with tempfile.gettempdir(), which has a cross-platform fallback chain and always returns a valid path. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/downloader.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 653ccc8..57a0911 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -1,6 +1,7 @@ import functools import json import os +import tempfile import urllib.parse import time import requests @@ -22,11 +23,7 @@ def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * self.logger = logging.getLogger(BabelDownloader.__name__) if local_path is None: - # Default to using TMPDIR. - # TODO: replace with a real temporary directory. 
- tmpdir = os.environ.get("TMPDIR") - if tmpdir: - local_path = tmpdir + local_path = tempfile.gettempdir() # Make sure the local path is an existing directory or that we can create it. if not os.path.exists(local_path): From c6635bc42c88240b128e074ab694eac7e2ff2ee0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 02:09:28 -0400 Subject: [PATCH 36/66] Fix test-concord: guard against None from get_clique_identifiers When NodeNorm doesn't recognise a CURIE, get_clique_identifiers returns None, causing a TypeError on iteration. Use (identifiers or []) to skip gracefully. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 4af31a8..fc5d314 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -95,7 +95,7 @@ def test_concord(curies, nodenorm_url): nodenorm = NodeNorm(nodenorm_url) for curie in curies: identifiers = nodenorm.get_clique_identifiers(curie) - for identifier in identifiers: + for identifier in (identifiers or []): if identifier.label: print(f"{curie}\t{identifier.curie}\t{identifier.label}\t{identifier.biolink_type}") else: From d74110ebb88d38897c1fd76f76652bff5ecb3629 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:10:07 -0400 Subject: [PATCH 37/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/nodenorm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index 1e1e24e..b48c4ec 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -30,6 +30,8 @@ def from_dict(d: dict): class NodeNorm: def __init__(self, nodenorm_url: str=""): self.nodenorm_url = nodenorm_url + if self.nodenorm_url and not self.nodenorm_url.endswith("/"): + self.nodenorm_url += "/" @functools.lru_cache(maxsize=None) def get_identifier(self, curie: str): From 4338bdcf12aecbd377941d73677d690e9abc9c9b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:10:46 -0400 Subject: [PATCH 38/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index fc5d314..56afde7 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -30,7 +30,7 @@ def cli(): @click.option("--labels", is_flag=True, help="Include labels for CURIEs") @click.option("--check-download", type=str, default="3h", show_default=True, help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " - "'never' always checks via HTTP HEAD; '0' same as 'never'.") + "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.") def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, recurse: bool, labels: bool, check_download: str): """ @@ -62,7 +62,7 @@ def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, recur @click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") @click.option("--check-download", type=str, default="3h", show_default=True, help="How often to re-check downloads (e.g. 
'3h', '30m', '1d', '0', 'never'). " - "'never' always checks via HTTP HEAD; '0' same as 'never'.") + "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.") def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): """ Fetches and prints the ID records for the given CURIEs, along with Biolink type if provided. From be3e42756dc014c7cb7997fc9d2f1b7eb7f6683e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:11:30 -0400 Subject: [PATCH 39/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 78fa8e9..00fff8c 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ uv sync --group dev uv run babel-explorer xrefs MONDO:0004979 # Get cross-references with expansion (recursive lookup) -uv run babel-explorer xrefs MONDO:0004979 --expand +uv run babel-explorer xrefs MONDO:0004979 --recurse # Get cross-references with labels from NodeNorm uv run babel-explorer xrefs MONDO:0004979 --labels From f8b718b5d346fa1aa590661e67559d4b8b9be3c2 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:13:26 -0400 Subject: [PATCH 40/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/cli.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 56afde7..23854c5 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -9,12 +9,33 @@ def parse_duration(value: str) -> float: """Parse a duration string like '3h', '30m', '1d', '7200', or 'never' → seconds.""" units = {"s": 1, "m": 60, "h": 3600, "d": 86400} - lower = value.lower() + lower = (value or "").strip().lower() + if not lower: + raise click.BadParameter( + "Invalid duration: value cannot be empty. " + "Use an integer number of seconds, optionally followed by 's', 'm', 'h', or 'd', " + "or 'never'." + ) if lower == "never": return float("inf") + # Value with unit suffix (e.g. '3h', '30m') if lower[-1] in units: - return int(lower[:-1]) * units[lower[-1]] - return int(lower) # bare seconds + try: + amount = int(lower[:-1]) + except ValueError: + raise click.BadParameter( + f"Invalid duration {value!r}: expected an integer followed by an optional unit " + "('s', 'm', 'h', or 'd'), or 'never'." + ) + return amount * units[lower[-1]] + # Bare integer seconds + try: + return int(lower) + except ValueError: + raise click.BadParameter( + f"Invalid duration {value!r}: expected an integer number of seconds, optionally " + "followed by 's', 'm', 'h', or 'd', or 'never'." 
+ ) @click.group() From 48c8e960cfb9f6e300945372e4109959ee63e42d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:13:40 -0400 Subject: [PATCH 41/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 3cb238c..7ad79fb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -30,7 +30,7 @@ uv run babel-explorer --help uv run babel-explorer xrefs MONDO:0004979 # Get cross-references with expansion (recursive lookup) -uv run babel-explorer xrefs MONDO:0004979 --expand +uv run babel-explorer xrefs MONDO:0004979 --recurse # Get cross-references with labels from NodeNorm uv run babel-explorer xrefs MONDO:0004979 --labels From 8fb37d65916b9d43aa7bde9526377292347ae38e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:15:10 -0400 Subject: [PATCH 42/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/downloader.py | 50 +++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 57a0911..39ea3fa 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -213,32 +213,32 @@ def _download_with_retry(self, url, local_path, chunk_size): self.logger.info(f"Resuming download from byte {resume_byte_pos}") # Make streaming request with timeout for connection (not total time) - response = requests.get(url, headers=headers, stream=True, timeout=30) - - # Handle different response codes - if response.status_code == 416: - # Range Not Satisfiable - file already complete - self.logger.info(f"File already complete: {local_path}") + with requests.get(url, headers=headers, stream=True, timeout=30) as response: + + # Handle different response codes + if response.status_code == 416: + # Range Not Satisfiable - file already complete + self.logger.info(f"File already complete: {local_path}") + return response.headers + elif response.status_code == 206: + # Partial Content - resume successful + self.logger.info(f"Resuming download (HTTP 206)") + elif response.status_code == 200: + # OK - server doesn't support resume or no Range header was sent + if resume_byte_pos > 0: + self.logger.warning(f"Server doesn't support resume, restarting from beginning") + resume_byte_pos = 0 + # Remove partial file + if os.path.exists(local_path): + os.remove(local_path) + else: + response.raise_for_status() + + # Stream download with progress bar + self._stream_download(response, local_path, resume_byte_pos, chunk_size) + + # Success - exit retry loop return response.headers - elif response.status_code == 206: - # Partial Content - resume successful - self.logger.info(f"Resuming download (HTTP 206)") - elif response.status_code == 200: - # OK - server doesn't support resume or no Range header was sent - if resume_byte_pos > 0: - self.logger.warning(f"Server doesn't support resume, restarting from beginning") - resume_byte_pos = 0 - # Remove partial file - if os.path.exists(local_path): - os.remove(local_path) - else: - response.raise_for_status() - - # Stream download with progress bar - self._stream_download(response, local_path, resume_byte_pos, chunk_size) - - # Success - exit retry loop - return response.headers except (requests.RequestException, IOError) as e: 
self.logger.warning(f"Download attempt {attempt}/{self.retries} failed: {e}") From 49f5c3ba6931a8454a3e7b73dda20a9aeee8c8b7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:16:56 -0400 Subject: [PATCH 43/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/babel_xrefs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 51727ab..725e1f7 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -86,6 +86,9 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): + if label_curies and self.nodenorm is None: + raise ValueError("label_curies=True requires a configured NodeNorm instance (nodenorm was None).") + concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') db = duckdb.connect() @@ -113,6 +116,8 @@ def _to_labeled_xref(self, xref: CrossReference) -> LabeledCrossReference: def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = False): """Traverse the cross-reference graph in one DuckDB WITH RECURSIVE query.""" + if label_curies and self.nodenorm is None: + raise ValueError("label_curies=True requires a configured NodeNorm instance (nodenorm was None).") if not curies: return [] From c952c1277da41b89c119e00c9fb170e64ac400eb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:21:57 -0400 Subject: [PATCH 44/66] Fix DuckDB connection leaks by using context managers Wrap all three duckdb.connect() calls in `with` statements so connections are deterministically closed after each query rather than relying on GC. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/babel_xrefs.py | 30 ++++++++++++-------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 725e1f7..f11f3f4 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -77,12 +77,11 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') # Query the Parquet files using DuckDB (in-memory; nothing is persisted). 
- db = duckdb.connect() - identifier_table = db.read_parquet(identifier_parquet) - result = db.execute(f"SELECT * FROM identifier_table WHERE curie IN $1", [curies]) - - column_names = [desc[0] for desc in result.description] - return [IdentifierRecord.from_row(row, column_names) for row in result.fetchall()] + with duckdb.connect() as db: + identifier_table = db.read_parquet(identifier_parquet) + result = db.execute("SELECT * FROM identifier_table WHERE curie IN $1", [curies]) + column_names = [desc[0] for desc in result.description] + return [IdentifierRecord.from_row(row, column_names) for row in result.fetchall()] @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): @@ -91,14 +90,13 @@ def get_curie_xref(self, curie: str, label_curies: bool = False): concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - db = duckdb.connect() - concord_table = db.read_parquet(concord_parquet) - xref_tuples = db.execute(f"SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() - xrefs = [CrossReference.from_tuple(rec) for rec in xref_tuples] + with duckdb.connect() as db: + concord_table = db.read_parquet(concord_parquet) + xref_tuples = db.execute("SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() + xrefs = [CrossReference.from_tuple(rec) for rec in xref_tuples] if label_curies: xrefs = [self._to_labeled_xref(xref) for xref in xrefs] - return xrefs def _to_labeled_xref(self, xref: CrossReference) -> LabeledCrossReference: @@ -123,9 +121,9 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - db = duckdb.connect() - concord_table = db.read_parquet(concord_parquet) - result = db.execute(""" + with duckdb.connect() as db: + concord_table = db.read_parquet(concord_parquet) + rows = db.execute(""" WITH RECURSIVE edges(a, b) AS ( SELECT subj, obj FROM concord_table @@ -144,9 +142,9 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal WHERE c.subj IN (SELECT curie FROM frontier) OR c.obj IN (SELECT curie FROM frontier) ORDER BY c.filename, c.subj, c.obj, c.pred - """, [curies]) + """, [curies]).fetchall() - xrefs = [CrossReference.from_tuple(row) for row in result.fetchall()] + xrefs = [CrossReference.from_tuple(row) for row in rows] if label_curies: xrefs = [self._to_labeled_xref(xref) for xref in xrefs] From b0539bb44cb577ccfdeff8ffb3931b3c88071d48 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:36:53 -0400 Subject: [PATCH 45/66] Fix and simplify test mocks for context manager protocol After production code was updated to use `with duckdb.connect()` and `with requests.get()`, the test mocks (plain Mock()) no longer supported the context manager protocol. Updated affected mocks to MagicMock() with __enter__.return_value = self. Also extracted _make_response() helper in TestDownloadWithRetry to eliminate five near-identical 4-line mock setup blocks. 
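A minimal, standalone sketch of the mock pattern (independent of this test
suite; only unittest.mock is assumed):

    from unittest.mock import MagicMock

    m = MagicMock()
    m.__enter__.return_value = m    # `with m as x:` now binds x to m itself

    with m as x:
        assert x is m               # same shape as the duckdb/requests mocks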
Co-Authored-By: Claude Sonnet 4.6 --- tests/test_babel_xrefs.py | 1 + tests/test_downloader.py | 39 +++++++++++++++------------------------ 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 41ad777..75e33cb 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -184,6 +184,7 @@ def test_get_curie_xref_calls_downloader(self, tmp_path): ("concord.tsv", "A:1", "skos:exactMatch", "B:2"), ] mock_db = MagicMock() + mock_db.__enter__.return_value = mock_db mock_db.read_parquet.return_value = "table" mock_db.execute.return_value = mock_result diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 045e402..16a7e9b 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -445,6 +445,16 @@ def fake_download(url, path, chunk_size): class TestDownloadWithRetry: """Tests for _download_with_retry.""" + @staticmethod + def _make_response(status_code, headers=None, content=None): + m = MagicMock() + m.__enter__.return_value = m + m.status_code = status_code + m.headers = headers or {} + if content is not None: + m.iter_content = Mock(return_value=content) + return m + def test_retries_exhausted_raises_runtime_error(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=2) with patch("babel_explorer.core.downloader.requests.get", side_effect=requests.ConnectionError("fail")): @@ -456,11 +466,7 @@ def test_succeeds_on_second_attempt(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=3) out_path = str(tmp_path / "retry_success.bin") - mock_response = Mock() - mock_response.status_code = 200 - mock_response.headers = {'Content-Length': '5'} - mock_response.iter_content = Mock(return_value=[b"hello"]) - + mock_response = self._make_response(200, {'Content-Length': '5'}, [b"hello"]) side_effects = [requests.ConnectionError("first fail"), mock_response] with patch("babel_explorer.core.downloader.requests.get", side_effect=side_effects): @@ -473,11 +479,7 @@ def test_resume_sends_range_header(self, tmp_path): out_path = tmp_path / "partial.bin" out_path.write_bytes(b"partial") # 7 bytes - mock_response = Mock() - mock_response.status_code = 206 - mock_response.headers = {'Content-Length': '3'} - mock_response.iter_content = Mock(return_value=[b"end"]) - + mock_response = self._make_response(206, {'Content-Length': '3'}, [b"end"]) with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response) as mock_get: dl._download_with_retry("https://example.com/file", str(out_path), 1024) _, kwargs = mock_get.call_args @@ -488,10 +490,7 @@ def test_http_416_file_already_complete(self, tmp_path): out_path = tmp_path / "complete.bin" out_path.write_bytes(b"full file") - mock_response = Mock() - mock_response.status_code = 416 - mock_response.headers = {} - + mock_response = self._make_response(416) with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): dl._download_with_retry("https://example.com/file", str(out_path), 1024) # Should return without error @@ -503,11 +502,7 @@ def test_server_no_resume_restarts_download(self, tmp_path): out_path = tmp_path / "no_resume.bin" out_path.write_bytes(b"partial") - mock_response = Mock() - mock_response.status_code = 200 - mock_response.headers = {'Content-Length': '12'} - mock_response.iter_content = Mock(return_value=[b"full content"]) - + mock_response = self._make_response(200, {'Content-Length': '12'}, 
[b"full content"]) with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): dl._download_with_retry("https://example.com/file", str(out_path), 1024) assert out_path.read_bytes() == b"full content" @@ -517,11 +512,7 @@ def test_returns_response_headers(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) out_path = str(tmp_path / "headers.bin") - mock_response = Mock() - mock_response.status_code = 200 - mock_response.headers = {'Content-Length': '5', 'ETag': '"abc"'} - mock_response.iter_content = Mock(return_value=[b"hello"]) - + mock_response = self._make_response(200, {'Content-Length': '5', 'ETag': '"abc"'}, [b"hello"]) with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): headers = dl._download_with_retry("https://example.com/file", out_path, 1024) assert headers['ETag'] == '"abc"' From 6319212e9a606652350a70f6ad8d0cd3a568687a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:41:51 -0400 Subject: [PATCH 46/66] Add configurable HTTP timeout to NodeNorm and BabelDownloader normalize_curie() was calling requests.get() with no timeout, risking an indefinite hang if the NodeNorm service stalls. The downloader had timeout=30 hardcoded in two places with no way to override it. Add timeout: int = 30 to both constructors and thread self.timeout through all three request call sites, making the default consistent and the value overridable without patching. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/downloader.py | 7 ++++--- src/babel_explorer/core/nodenorm.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 39ea3fa..7b628d0 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -15,11 +15,12 @@ class BabelDownloader: Class for downloading Babel cross-reference files to a local directory as needed. """ - def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * 3600): + def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * 3600, timeout: int = 30): # We assume the URL base is correct (if not, we can fix it later). 
self.url_base = url_base self.retries = retries self.freshness_seconds = freshness_seconds + self.timeout = timeout self.logger = logging.getLogger(BabelDownloader.__name__) if local_path is None: @@ -114,7 +115,7 @@ def _etag_matches(self, url, meta): bool: True if remote matches local meta (file is still current) """ try: - response = requests.head(url, timeout=30) + response = requests.head(url, timeout=self.timeout) response.raise_for_status() except requests.RequestException as e: self.logger.warning(f"HEAD request failed for {url}: {e}") @@ -213,7 +214,7 @@ def _download_with_retry(self, url, local_path, chunk_size): self.logger.info(f"Resuming download from byte {resume_byte_pos}") # Make streaming request with timeout for connection (not total time) - with requests.get(url, headers=headers, stream=True, timeout=30) as response: + with requests.get(url, headers=headers, stream=True, timeout=self.timeout) as response: # Handle different response codes if response.status_code == 416: diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index b48c4ec..04a3629 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -28,8 +28,9 @@ def from_dict(d: dict): return identifier class NodeNorm: - def __init__(self, nodenorm_url: str=""): + def __init__(self, nodenorm_url: str = "", timeout: int = 30): self.nodenorm_url = nodenorm_url + self.timeout = timeout if self.nodenorm_url and not self.nodenorm_url.endswith("/"): self.nodenorm_url += "/" @@ -55,7 +56,7 @@ def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=True "description": description, "individual_types": individual_types, "include_taxa": include_taxa, - }) + }, timeout=self.timeout) response.raise_for_status() result = response.json() From b634d11bf4b0a4455aca920ad02746752bcf76bb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:45:18 -0400 Subject: [PATCH 47/66] Fix _etag_matches docstring to match actual behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docstring claimed the method updates last_checked in the .meta file, but it is a pure predicate — the caller (get_downloaded_file) owns that write. Updated the docstring and removed stale inline comments that said the same thing. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/downloader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 7b628d0..016fb01 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -105,7 +105,10 @@ def _is_within_freshness(self, meta, freshness_seconds): def _etag_matches(self, url, meta): """ Do a HEAD request and check if the ETag (or Last-Modified + Content-Length) - matches the stored metadata. If they match, update last_checked in the .meta file. + matches the stored metadata. + + Does not write to disk — the caller is responsible for updating last_checked + when this returns True. 
Args: url: URL to HEAD @@ -129,9 +132,6 @@ def _etag_matches(self, url, meta): if local_etag and remote_etag: if local_etag == remote_etag: self.logger.info(f"ETag matches ({remote_etag}), file is current") - # Update last_checked in the .meta file - # We need the local_path to update — derive it from URL - # Caller will handle updating; return True return True else: self.logger.info(f"ETag changed: {local_etag!r} → {remote_etag!r}, re-downloading") From a7eb8c1db94b63fbaf3eba3566c31a08640b1eb3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:51:16 -0400 Subject: [PATCH 48/66] Got rid of ignore_curies_in_expansion, which is no longer used. --- src/babel_explorer/core/babel_xrefs.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index f11f3f4..c017d6b 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -151,25 +151,17 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal return xrefs - def get_curie_xrefs(self, curies: list[str], recurse: bool = False, ignore_curies_in_expansion: set | None = None, label_curies: bool = False): + def get_curie_xrefs(self, curies: list[str], recurse: bool = False, label_curies: bool = False): """ Search for all identifiers that are cross-referenced to the given CURIE. :param curies: A list of CURIEs to search for. :param recurse: Whether to expand the cross-references (i.e. recursively follow all identifiers). - :param ignore_curies_in_expansion: Deprecated when recurse=True; has no effect. :param label_curies: Whether to annotate results with labels from NodeNorm. :return: A list of cross-references containing those CURIEs. """ if recurse: - if ignore_curies_in_expansion: - warnings.warn( - "ignore_curies_in_expansion has no effect when recurse=True; " - "cycle detection is handled automatically by the SQL query.", - DeprecationWarning, - stacklevel=2, - ) return self._get_curie_xrefs_recursive(curies, label_curies) xrefs = set() From 7163a643ca148f4da20f15c71e9ac30748264a81 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:56:11 -0400 Subject: [PATCH 49/66] Add ruff CI and fix all lint errors Add .github/workflows/lint.yml to run ruff check and ruff format --check on every pull request. 
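The F841 suppressions described below rely on DuckDB's replacement scans, which
let SQL refer to an in-scope Python variable by name. A minimal sketch of the
pattern (file name illustrative, not this repo's actual path):

    import duckdb

    with duckdb.connect() as db:
        # ruff flags this as unused, but the SQL below resolves
        # 'concord_table' through this Python variable.
        concord_table = db.read_parquet("Concord.parquet")  # noqa: F841
        rows = db.execute("SELECT * FROM concord_table").fetchall()
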
Fix the 7 errors ruff found, and apply ruff format to all files: - Remove unused `import warnings` in babel_xrefs.py (left over after ignore_curies_in_expansion was removed) - Add # noqa: F841 to the three read_parquet() assignments: ruff flags them as unused, but DuckDB resolves SQL table names by matching the Python variable name, so the assignments are load-bearing - Remove spurious f-prefix from two string literals in downloader.py - Drop unused `local_path` variable in test_downloader.py Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/lint.yml | 14 ++ src/babel_explorer/cli.py | 93 +++++++++++--- src/babel_explorer/core/babel_xrefs.py | 64 +++++++--- src/babel_explorer/core/downloader.py | 86 +++++++++---- src/babel_explorer/core/nodenorm.py | 64 ++++++---- tests/conftest.py | 4 +- tests/test_babel_xrefs.py | 71 ++++++++--- tests/test_downloader.py | 169 +++++++++++++++++-------- tests/test_nodenorm.py | 28 ++-- 9 files changed, 427 insertions(+), 166 deletions(-) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..f4771d4 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,14 @@ +name: Lint + +on: + pull_request: + +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - run: uv sync --group dev + - run: uv run ruff check src/ tests/ + - run: uv run ruff format --check src/ tests/ diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 23854c5..0e25ea3 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -42,18 +42,46 @@ def parse_duration(value: str) -> float: def cli(): pass + @cli.command("xrefs") @click.argument("curies", type=str, required=True, nargs=-1) -@click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") -@click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") -@click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes") +@click.option( + "--local-dir", + type=str, + default="data/2025nov19", + help="Local location to save Babel download files to", +) +@click.option( + "--babel-url", + type=str, + default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", + help="Base URL of the Babel server", +) +@click.option( + "--nodenorm-url", + type=str, + default="https://nodenormalization-sri.renci.org/", + help="NodeNorm URL to check for concord changes", +) @click.option("--recurse", is_flag=True, help="Recursively query returned xrefs") @click.option("--labels", is_flag=True, help="Include labels for CURIEs") -@click.option("--check-download", type=str, default="3h", show_default=True, - help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " - "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.") -def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, recurse: bool, labels: bool, - check_download: str): +@click.option( + "--check-download", + type=str, + default="3h", + show_default=True, + help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). 
" + "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.", +) +def xrefs( + curies: list[str], + babel_url: str, + nodenorm_url, + local_dir: str, + recurse: bool, + labels: bool, + check_download: str, +): """ Fetches and prints the cross-references (xrefs) for the given CURIEs. @@ -72,18 +100,37 @@ def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, recur logging.basicConfig(level=logging.INFO) freshness = parse_duration(check_download) - bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness), NodeNorm(nodenorm_url)) + bxref = BabelXRefs( + BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness), + NodeNorm(nodenorm_url), + ) xrefs = bxref.get_curie_xrefs(curies, recurse, label_curies=labels) for xref in xrefs: print(xref) + @cli.command("ids") @click.argument("curies", type=str, required=True, nargs=-1) -@click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") -@click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") -@click.option("--check-download", type=str, default="3h", show_default=True, - help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " - "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.") +@click.option( + "--local-dir", + type=str, + default="data/2025nov19", + help="Local location to save Babel download files to", +) +@click.option( + "--babel-url", + type=str, + default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", + help="Base URL of the Babel server", +) +@click.option( + "--check-download", + type=str, + default="3h", + show_default=True, + help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " + "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.", +) def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): """ Fetches and prints the ID records for the given CURIEs, along with Biolink type if provided. @@ -101,14 +148,22 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): logging.basicConfig(level=logging.INFO) freshness = parse_duration(check_download) - bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness)) + bxref = BabelXRefs( + BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness) + ) xrefs = bxref.get_curie_ids(curies) for xref in xrefs: print(xref) + @cli.command("test-concord") @click.argument("curies", type=str, required=True, nargs=-1) -@click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes") +@click.option( + "--nodenorm-url", + type=str, + default="https://nodenormalization-sri.renci.org/", + help="NodeNorm URL to check for concord changes", +) def test_concord(curies, nodenorm_url): # We're trying to answer a simple question here: if the CURIEs we mention were combined, how would the cliques change in NodeNorm? # By definition, this can only combine all the cliques mentioned in the CURIEs. 
@@ -116,9 +171,11 @@ def test_concord(curies, nodenorm_url): nodenorm = NodeNorm(nodenorm_url) for curie in curies: identifiers = nodenorm.get_clique_identifiers(curie) - for identifier in (identifiers or []): + for identifier in identifiers or []: if identifier.label: - print(f"{curie}\t{identifier.curie}\t{identifier.label}\t{identifier.biolink_type}") + print( + f"{curie}\t{identifier.curie}\t{identifier.label}\t{identifier.biolink_type}" + ) else: print(f"{curie}\t{identifier.curie}\t\t{identifier.biolink_type}") diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index c017d6b..c218761 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -3,7 +3,6 @@ # why we consider two identifiers to be identical. import dataclasses import logging -import warnings import duckdb import functools @@ -20,14 +19,22 @@ class CrossReference: @staticmethod def from_tuple(tuple: tuple[str, str, str, str]): - return CrossReference(filename=tuple[0], subj=tuple[1], pred=tuple[2], obj=tuple[3]) + return CrossReference( + filename=tuple[0], subj=tuple[1], pred=tuple[2], obj=tuple[3] + ) @property def curies(self): return frozenset([self.subj, self.obj]) def __lt__(self, other): - return (self.filename, self.subj, self.obj, self.pred) < (other.filename, other.subj, other.obj, other.pred) + return (self.filename, self.subj, self.obj, self.pred) < ( + other.filename, + other.subj, + other.obj, + other.pred, + ) + @dataclasses.dataclass(frozen=True) class LabeledCrossReference(CrossReference): @@ -39,16 +46,18 @@ class LabeledCrossReference(CrossReference): def __str__(self): return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", subj_biolink_type="{self.subj_biolink_type}", obj_label="{self.obj_label}", obj_biolink_type="{self.obj_biolink_type}")""" + @dataclasses.dataclass(frozen=True) class IdentifierRecord: """A record from the Identifiers.parquet file.""" + curie: str extra_fields: tuple = () @staticmethod def from_row(row: tuple, column_names: list[str]): """Create an IdentifierRecord from a DuckDB result row and its column names.""" - curie_idx = column_names.index('curie') + curie_idx = column_names.index("curie") extra = tuple( (col, row[i]) for i, col in enumerate(column_names) if i != curie_idx ) @@ -74,25 +83,37 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: :return: A list of IdentifierRecords containing those CURIEs. """ - identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') + identifier_parquet = self.downloader.get_downloaded_file( + "duckdb/Identifiers.parquet" + ) # Query the Parquet files using DuckDB (in-memory; nothing is persisted). 
with duckdb.connect() as db: - identifier_table = db.read_parquet(identifier_parquet) - result = db.execute("SELECT * FROM identifier_table WHERE curie IN $1", [curies]) + identifier_table = db.read_parquet(identifier_parquet) # noqa: F841 — DuckDB resolves 'identifier_table' by Python variable name in the SQL query + result = db.execute( + "SELECT * FROM identifier_table WHERE curie IN $1", [curies] + ) column_names = [desc[0] for desc in result.description] - return [IdentifierRecord.from_row(row, column_names) for row in result.fetchall()] + return [ + IdentifierRecord.from_row(row, column_names) + for row in result.fetchall() + ] @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): if label_curies and self.nodenorm is None: - raise ValueError("label_curies=True requires a configured NodeNorm instance (nodenorm was None).") + raise ValueError( + "label_curies=True requires a configured NodeNorm instance (nodenorm was None)." + ) - concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') + concord_parquet = self.downloader.get_downloaded_file("duckdb/Concord.parquet") with duckdb.connect() as db: - concord_table = db.read_parquet(concord_parquet) - xref_tuples = db.execute("SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() + concord_table = db.read_parquet(concord_parquet) # noqa: F841 — DuckDB resolves 'concord_table' by Python variable name in the SQL query + xref_tuples = db.execute( + "SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", + [curie], + ).fetchall() xrefs = [CrossReference.from_tuple(rec) for rec in xref_tuples] if label_curies: @@ -115,15 +136,18 @@ def _to_labeled_xref(self, xref: CrossReference) -> LabeledCrossReference: def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = False): """Traverse the cross-reference graph in one DuckDB WITH RECURSIVE query.""" if label_curies and self.nodenorm is None: - raise ValueError("label_curies=True requires a configured NodeNorm instance (nodenorm was None).") + raise ValueError( + "label_curies=True requires a configured NodeNorm instance (nodenorm was None)." + ) if not curies: return [] - concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') + concord_parquet = self.downloader.get_downloaded_file("duckdb/Concord.parquet") with duckdb.connect() as db: - concord_table = db.read_parquet(concord_parquet) - rows = db.execute(""" + concord_table = db.read_parquet(concord_parquet) # noqa: F841 — DuckDB resolves 'concord_table' by Python variable name in the SQL query + rows = db.execute( + """ WITH RECURSIVE edges(a, b) AS ( SELECT subj, obj FROM concord_table @@ -142,7 +166,9 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal WHERE c.subj IN (SELECT curie FROM frontier) OR c.obj IN (SELECT curie FROM frontier) ORDER BY c.filename, c.subj, c.obj, c.pred - """, [curies]).fetchall() + """, + [curies], + ).fetchall() xrefs = [CrossReference.from_tuple(row) for row in rows] @@ -151,7 +177,9 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal return xrefs - def get_curie_xrefs(self, curies: list[str], recurse: bool = False, label_curies: bool = False): + def get_curie_xrefs( + self, curies: list[str], recurse: bool = False, label_curies: bool = False + ): """ Search for all identifiers that are cross-referenced to the given CURIE. 
diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 016fb01..6ba9a38 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -15,7 +15,14 @@ class BabelDownloader: Class for downloading Babel cross-reference files to a local directory as needed. """ - def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * 3600, timeout: int = 30): + def __init__( + self, + url_base, + local_path=None, + retries=10, + freshness_seconds=3 * 3600, + timeout: int = 30, + ): # We assume the URL base is correct (if not, we can fix it later). self.url_base = url_base self.retries = retries @@ -33,7 +40,9 @@ def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * elif os.path.exists(local_path) and os.path.isdir(local_path): self.local_path = local_path else: - raise ValueError(f"Invalid local_path (must be an existing directory): '{local_path}'") + raise ValueError( + f"Invalid local_path (must be an existing directory): '{local_path}'" + ) @functools.lru_cache(maxsize=None) def get_output_file(self, filename): @@ -134,7 +143,9 @@ def _etag_matches(self, url, meta): self.logger.info(f"ETag matches ({remote_etag}), file is current") return True else: - self.logger.info(f"ETag changed: {local_etag!r} → {remote_etag!r}, re-downloading") + self.logger.info( + f"ETag changed: {local_etag!r} → {remote_etag!r}, re-downloading" + ) return False # Fallback: Last-Modified + Content-Length @@ -145,10 +156,14 @@ def _etag_matches(self, url, meta): if local_lm and remote_lm and local_lm == remote_lm: if local_cl is None or remote_cl is None or int(remote_cl) == local_cl: - self.logger.info(f"Last-Modified matches ({remote_lm}), file is current") + self.logger.info( + f"Last-Modified matches ({remote_lm}), file is current" + ) return True - self.logger.info("Cannot confirm file is current (no matching ETag or Last-Modified), will re-download") + self.logger.info( + "Cannot confirm file is current (no matching ETag or Last-Modified), will re-download" + ) return False def _stream_download(self, response, local_path, resume_byte_pos, chunk_size): @@ -162,23 +177,23 @@ def _stream_download(self, response, local_path, resume_byte_pos, chunk_size): chunk_size: Size of chunks to read/write """ # Get total size from Content-Length header (may not be present) - content_length = response.headers.get('Content-Length') + content_length = response.headers.get("Content-Length") if content_length: total_size = int(content_length) + resume_byte_pos else: total_size = None # Open file in append mode if resuming, write mode otherwise - mode = 'ab' if resume_byte_pos > 0 else 'wb' + mode = "ab" if resume_byte_pos > 0 else "wb" with open(local_path, mode) as f: with tqdm( total=total_size, initial=resume_byte_pos, - unit='B', + unit="B", unit_scale=True, unit_divisor=1024, - desc=os.path.basename(local_path) + desc=os.path.basename(local_path), ) as progress_bar: for chunk in response.iter_content(chunk_size=chunk_size): if chunk: @@ -210,12 +225,13 @@ def _download_with_retry(self, url, local_path, chunk_size): # Prepare headers for resume headers = {} if resume_byte_pos > 0: - headers['Range'] = f'bytes={resume_byte_pos}-' + headers["Range"] = f"bytes={resume_byte_pos}-" self.logger.info(f"Resuming download from byte {resume_byte_pos}") # Make streaming request with timeout for connection (not total time) - with requests.get(url, headers=headers, stream=True, timeout=self.timeout) as response: - + with 
requests.get( + url, headers=headers, stream=True, timeout=self.timeout + ) as response: # Handle different response codes if response.status_code == 416: # Range Not Satisfiable - file already complete @@ -223,11 +239,13 @@ def _download_with_retry(self, url, local_path, chunk_size): return response.headers elif response.status_code == 206: # Partial Content - resume successful - self.logger.info(f"Resuming download (HTTP 206)") + self.logger.info("Resuming download (HTTP 206)") elif response.status_code == 200: # OK - server doesn't support resume or no Range header was sent if resume_byte_pos > 0: - self.logger.warning(f"Server doesn't support resume, restarting from beginning") + self.logger.warning( + "Server doesn't support resume, restarting from beginning" + ) resume_byte_pos = 0 # Remove partial file if os.path.exists(local_path): @@ -236,25 +254,31 @@ def _download_with_retry(self, url, local_path, chunk_size): response.raise_for_status() # Stream download with progress bar - self._stream_download(response, local_path, resume_byte_pos, chunk_size) + self._stream_download( + response, local_path, resume_byte_pos, chunk_size + ) # Success - exit retry loop return response.headers except (requests.RequestException, IOError) as e: - self.logger.warning(f"Download attempt {attempt}/{self.retries} failed: {e}") + self.logger.warning( + f"Download attempt {attempt}/{self.retries} failed: {e}" + ) if attempt < self.retries: # Calculate exponential backoff with max of 60 seconds - wait_time = min(2 ** attempt, 60) + wait_time = min(2**attempt, 60) self.logger.info(f"Retrying in {wait_time} seconds...") time.sleep(wait_time) else: # All retries exhausted - raise RuntimeError(f"Failed to download {url} after {self.retries} attempts: {e}") + raise RuntimeError( + f"Failed to download {url} after {self.retries} attempts: {e}" + ) @functools.lru_cache(maxsize=None) - def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): + def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024 * 1024): """ Download a file from the Babel server to local storage with ETag-based caching. 
@@ -280,7 +304,9 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): if meta is not None: # Tier 1: within freshness window — skip all network calls if self._is_within_freshness(meta, self.freshness_seconds): - self.logger.info(f"File within freshness window ({self.freshness_seconds} seconds), skipping check: {local_path_to_download_to}") + self.logger.info( + f"File within freshness window ({self.freshness_seconds} seconds), skipping check: {local_path_to_download_to}" + ) return local_path_to_download_to # Tier 2: stale but maybe unchanged — HEAD request @@ -290,24 +316,34 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): meta_path = self._get_meta_path(local_path_to_download_to) with open(meta_path, "w") as f: json.dump(meta, f, indent=2) - self.logger.info(f"ETag matches, using existing file: {local_path_to_download_to}") + self.logger.info( + f"ETag matches, using existing file: {local_path_to_download_to}" + ) return local_path_to_download_to # Tier 3: ETag changed — delete and re-download - self.logger.warning(f"Remote file changed, re-downloading: {local_path_to_download_to}") + self.logger.warning( + f"Remote file changed, re-downloading: {local_path_to_download_to}" + ) os.remove(local_path_to_download_to) - self.logger.info(f"Downloading {url_to_download} to {local_path_to_download_to}") + self.logger.info( + f"Downloading {url_to_download} to {local_path_to_download_to}" + ) # Download with retry logic; get response headers back - response_headers = self._download_with_retry(url_to_download, local_path_to_download_to, chunk_size) + response_headers = self._download_with_retry( + url_to_download, local_path_to_download_to, chunk_size + ) # Save sidecar metadata if response_headers is not None: self._save_meta(local_path_to_download_to, response_headers) bytes_downloaded = os.path.getsize(local_path_to_download_to) - self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") + self.logger.info( + f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes" + ) return local_path_to_download_to @functools.lru_cache(maxsize=None) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index 04a3629..ec5b0a8 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -3,6 +3,7 @@ import requests import logging + @dataclasses.dataclass class Identifier: curie: str @@ -16,17 +17,18 @@ def __lt__(self, other): @staticmethod def from_dict(d: dict): - identifier = Identifier(curie=d['identifier']) - if 'label' in d: - identifier.label = d['label'] - if 'taxa' in d: - identifier.taxa = d['taxa'] - if 'description' in d: - identifier.description = d['description'] - if 'type' in d: - identifier.biolink_type = d['type'] + identifier = Identifier(curie=d["identifier"]) + if "label" in d: + identifier.label = d["label"] + if "taxa" in d: + identifier.taxa = d["taxa"] + if "description" in d: + identifier.description = d["description"] + if "type" in d: + identifier.biolink_type = d["type"] return identifier + class NodeNorm: def __init__(self, nodenorm_url: str = "", timeout: int = 30): self.nodenorm_url = nodenorm_url @@ -40,30 +42,44 @@ def get_identifier(self, curie: str): logging.debug(f"Normalizing {curie} with NodeNorm to result: {result}") if not result: return Identifier(curie=curie) - for identifier in result.get('equivalent_identifiers', []): - if identifier['identifier'] == curie: + for 
identifier in result.get("equivalent_identifiers", []): + if identifier["identifier"] == curie: logging.debug(f"Found exact match for {curie}: {identifier}") return Identifier.from_dict(identifier) return Identifier(curie=curie) @functools.lru_cache(maxsize=None) - def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=True, description=True, individual_types=True, include_taxa=True): - response = requests.get(f"{self.nodenorm_url}get_normalized_nodes", params={ - "curie": curie, - "conflate": conflate, - "drug_chemical_conflate": drug_chemical_conflate, - "description": description, - "individual_types": individual_types, - "include_taxa": include_taxa, - }, timeout=self.timeout) + def normalize_curie( + self, + curie: str, + conflate=True, + drug_chemical_conflate=True, + description=True, + individual_types=True, + include_taxa=True, + ): + response = requests.get( + f"{self.nodenorm_url}get_normalized_nodes", + params={ + "curie": curie, + "conflate": conflate, + "drug_chemical_conflate": drug_chemical_conflate, + "description": description, + "individual_types": individual_types, + "include_taxa": include_taxa, + }, + timeout=self.timeout, + ) response.raise_for_status() result = response.json() try: return result[curie] except KeyError: - logging.debug(f"NodeNorm response did not contain CURIE {curie!r}; returning None") + logging.debug( + f"NodeNorm response did not contain CURIE {curie!r}; returning None" + ) return None @functools.lru_cache(maxsize=None) @@ -71,6 +87,8 @@ def get_clique_identifiers(self, curie, **kwargs): result = self.normalize_curie(curie, **kwargs) if not result: return None - if 'equivalent_identifiers' not in result: + if "equivalent_identifiers" not in result: return None - return list(map(lambda x: Identifier.from_dict(x), result['equivalent_identifiers'])) + return list( + map(lambda x: Identifier.from_dict(x), result["equivalent_identifiers"]) + ) diff --git a/tests/conftest.py b/tests/conftest.py index f1e0df6..92ecb06 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -116,6 +116,8 @@ def babel_xrefs(shared_downloader, downloaded_parquet_files) -> BabelXRefs: @pytest.fixture(scope="session") -def babel_xrefs_with_nodenorm(shared_downloader, nodenorm, downloaded_parquet_files) -> BabelXRefs: +def babel_xrefs_with_nodenorm( + shared_downloader, nodenorm, downloaded_parquet_files +) -> BabelXRefs: """A BabelXRefs instance with NodeNorm, Concord + Metadata already downloaded.""" return BabelXRefs(shared_downloader, nodenorm) diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 75e33cb..d67f81b 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -28,7 +28,9 @@ class TestCrossReference: def test_creation(self): - xr = CrossReference(filename="f.txt", subj="A:1", pred="skos:exactMatch", obj="B:2") + xr = CrossReference( + filename="f.txt", subj="A:1", pred="skos:exactMatch", obj="B:2" + ) assert xr.filename == "f.txt" assert xr.subj == "A:1" assert xr.pred == "skos:exactMatch" @@ -85,9 +87,14 @@ def test_sorting(self): class TestLabeledCrossReference: def test_creation(self): lxr = LabeledCrossReference( - subj="A:1", pred="p", obj="B:2", filename="f", - subj_label="Alpha", subj_biolink_type="biolink:Disease", - obj_label="Beta", obj_biolink_type="biolink:Gene", + subj="A:1", + pred="p", + obj="B:2", + filename="f", + subj_label="Alpha", + subj_biolink_type="biolink:Disease", + obj_label="Beta", + obj_biolink_type="biolink:Gene", ) assert lxr.subj == "A:1" assert lxr.subj_label == "Alpha" @@ 
-95,23 +102,40 @@ def test_creation(self): def test_inherits_from_cross_reference(self): lxr = LabeledCrossReference( - subj="A:1", pred="p", obj="B:2", filename="f", - subj_label="", subj_biolink_type="", obj_label="", obj_biolink_type="", + subj="A:1", + pred="p", + obj="B:2", + filename="f", + subj_label="", + subj_biolink_type="", + obj_label="", + obj_biolink_type="", ) assert isinstance(lxr, CrossReference) def test_curies_property(self): lxr = LabeledCrossReference( - subj="A:1", pred="p", obj="B:2", filename="f", - subj_label="", subj_biolink_type="", obj_label="", obj_biolink_type="", + subj="A:1", + pred="p", + obj="B:2", + filename="f", + subj_label="", + subj_biolink_type="", + obj_label="", + obj_biolink_type="", ) assert lxr.curies == frozenset({"A:1", "B:2"}) def test_str(self): lxr = LabeledCrossReference( - subj="A:1", pred="p", obj="B:2", filename="f", - subj_label="Alpha", subj_biolink_type="biolink:Disease", - obj_label="Beta", obj_biolink_type="biolink:Gene", + subj="A:1", + pred="p", + obj="B:2", + filename="f", + subj_label="Alpha", + subj_biolink_type="biolink:Disease", + obj_label="Beta", + obj_biolink_type="biolink:Gene", ) s = str(lxr) assert "A:1" in s @@ -188,9 +212,16 @@ def test_get_curie_xref_calls_downloader(self, tmp_path): mock_db.read_parquet.return_value = "table" mock_db.execute.return_value = mock_result - with patch.object(bx.downloader, 'get_downloaded_file', return_value="/fake/path") as mock_dl: - with patch.object(bx.downloader, 'get_output_file', return_value="/fake/db"): - with patch("babel_explorer.core.babel_xrefs.duckdb.connect", return_value=mock_db): + with patch.object( + bx.downloader, "get_downloaded_file", return_value="/fake/path" + ) as mock_dl: + with patch.object( + bx.downloader, "get_output_file", return_value="/fake/db" + ): + with patch( + "babel_explorer.core.babel_xrefs.duckdb.connect", + return_value=mock_db, + ): bx.get_curie_xref.cache_clear() result = bx.get_curie_xref("A:1") # Downloader should be called for Concord only (Metadata unused here) @@ -202,7 +233,7 @@ def test_get_curie_xref_calls_downloader(self, tmp_path): def test_get_curie_xrefs_no_expand(self, tmp_path): bx = self._make_bx(tmp_path) xr = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") - with patch.object(bx, 'get_curie_xref', return_value=[xr]): + with patch.object(bx, "get_curie_xref", return_value=[xr]): bx.get_curie_xref.cache_clear() result = bx.get_curie_xrefs(["A:1"], recurse=False) assert len(result) == 1 @@ -213,7 +244,9 @@ def test_get_curie_xrefs_with_expand(self, tmp_path): xr1 = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") xr2 = CrossReference(filename="f", subj="B:2", pred="p", obj="C:3") - with patch.object(bx, '_get_curie_xrefs_recursive', return_value=[xr1, xr2]) as mock_rec: + with patch.object( + bx, "_get_curie_xrefs_recursive", return_value=[xr1, xr2] + ) as mock_rec: result = bx.get_curie_xrefs(["A:1"], recurse=True) mock_rec.assert_called_once_with(["A:1"], False) assert xr1 in result @@ -239,7 +272,9 @@ def test_get_curie_xrefs_recursive_sql_traversal(self, tmp_path): """) setup_db.close() - with patch.object(bx.downloader, 'get_downloaded_file', return_value=parquet_path): + with patch.object( + bx.downloader, "get_downloaded_file", return_value=parquet_path + ): # Starting from A:1 should reach B:2 and C:3 but not the D-E component result = bx._get_curie_xrefs_recursive(["A:1"]) pairs = {(xr.subj, xr.obj) for xr in result} @@ -262,7 +297,7 @@ def test_results_are_sorted(self, tmp_path): xr_b = 
CrossReference(filename="b", subj="B:1", pred="p", obj="C:1") xr_a = CrossReference(filename="a", subj="A:1", pred="p", obj="B:1") - with patch.object(bx, 'get_curie_xref', return_value=[xr_b, xr_a]): + with patch.object(bx, "get_curie_xref", return_value=[xr_b, xr_a]): result = bx.get_curie_xrefs(["X:1"], recurse=False) assert result == [xr_a, xr_b] diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 16a7e9b..9b33e7a 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -39,7 +39,9 @@ def test_creates_directory_if_missing(self, tmp_path): assert dl.local_path == new_dir def test_custom_retries(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=3) + dl = BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path), retries=3 + ) assert dl.retries == 3 def test_default_retries(self, tmp_path): @@ -51,7 +53,11 @@ def test_default_freshness_seconds(self, tmp_path): assert dl.freshness_seconds == 3 * 3600 def test_custom_freshness_seconds(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), freshness_seconds=0) + dl = BabelDownloader( + url_base="https://example.com/", + local_path=str(tmp_path), + freshness_seconds=0, + ) assert dl.freshness_seconds == 0 def test_invalid_path_raises_value_error(self): @@ -90,13 +96,15 @@ class TestSaveMeta: """Tests for _save_meta.""" def _make_dl(self, tmp_path): - return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + return BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path) + ) def test_writes_all_fields(self, tmp_path): dl = self._make_dl(tmp_path) file_path = str(tmp_path / "test.parquet") # Create the file so the path is valid - open(file_path, 'wb').close() + open(file_path, "wb").close() headers = { "ETag": '"abc123"', @@ -118,7 +126,7 @@ def test_writes_all_fields(self, tmp_path): def test_last_checked_is_recent_utc(self, tmp_path): dl = self._make_dl(tmp_path) file_path = str(tmp_path / "f.parquet") - open(file_path, 'wb').close() + open(file_path, "wb").close() dl._save_meta(file_path, {"ETag": '"x"'}) @@ -133,7 +141,7 @@ def test_missing_headers_not_written(self, tmp_path): """Headers not present in the response should not appear in .meta.""" dl = self._make_dl(tmp_path) file_path = str(tmp_path / "sparse.parquet") - open(file_path, 'wb').close() + open(file_path, "wb").close() dl._save_meta(file_path, {}) @@ -150,7 +158,9 @@ class TestLoadMeta: """Tests for _load_meta.""" def _make_dl(self, tmp_path): - return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + return BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path) + ) def test_returns_none_if_no_meta_file(self, tmp_path): dl = self._make_dl(tmp_path) @@ -159,7 +169,7 @@ def test_returns_none_if_no_meta_file(self, tmp_path): def test_returns_dict_for_valid_meta(self, tmp_path): dl = self._make_dl(tmp_path) file_path = str(tmp_path / "f.parquet") - open(file_path, 'wb').close() + open(file_path, "wb").close() meta_data = {"etag": '"abc"', "last_checked": "2026-01-01T00:00:00+00:00"} with open(file_path + ".meta", "w") as f: json.dump(meta_data, f) @@ -170,7 +180,7 @@ def test_returns_dict_for_valid_meta(self, tmp_path): def test_returns_none_for_corrupt_meta(self, tmp_path): dl = self._make_dl(tmp_path) file_path = str(tmp_path / "corrupt.parquet") - open(file_path, 'wb').close() + open(file_path, "wb").close() with open(file_path + 
".meta", "w") as f: f.write("not valid json {{{") @@ -181,7 +191,9 @@ class TestIsWithinFreshness: """Tests for _is_within_freshness.""" def _make_dl(self, tmp_path): - return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + return BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path) + ) def test_returns_true_when_recent(self, tmp_path): dl = self._make_dl(tmp_path) @@ -217,7 +229,9 @@ class TestEtagMatches: """Tests for _etag_matches.""" def _make_dl(self, tmp_path): - return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + return BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path) + ) def test_returns_true_on_matching_etag(self, tmp_path): dl = self._make_dl(tmp_path) @@ -225,7 +239,9 @@ def test_returns_true_on_matching_etag(self, tmp_path): mock_resp = Mock() mock_resp.headers = {"ETag": '"abc123"'} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_resp + ): assert dl._etag_matches("https://example.com/f.parquet", meta) is True def test_returns_false_on_different_etag(self, tmp_path): @@ -234,7 +250,9 @@ def test_returns_false_on_different_etag(self, tmp_path): mock_resp = Mock() mock_resp.headers = {"ETag": '"new"'} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_resp + ): assert dl._etag_matches("https://example.com/f.parquet", meta) is False def test_fallback_last_modified_match(self, tmp_path): @@ -244,14 +262,18 @@ def test_fallback_last_modified_match(self, tmp_path): mock_resp = Mock() mock_resp.headers = {"Last-Modified": lm, "Content-Length": "100"} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_resp + ): assert dl._etag_matches("https://example.com/f.parquet", meta) is True def test_returns_false_on_request_error(self, tmp_path): dl = self._make_dl(tmp_path) meta = {"etag": '"abc"'} - with patch("babel_explorer.core.downloader.requests.head", - side_effect=requests.ConnectionError("fail")): + with patch( + "babel_explorer.core.downloader.requests.head", + side_effect=requests.ConnectionError("fail"), + ): assert dl._etag_matches("https://example.com/f.parquet", meta) is False @@ -259,8 +281,11 @@ class TestGetDownloadedFileTiers: """Tests for the three-tier logic in get_downloaded_file.""" def _make_dl(self, tmp_path, freshness=3600): - return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), - freshness_seconds=freshness) + return BabelDownloader( + url_base="https://example.com/", + local_path=str(tmp_path), + freshness_seconds=freshness, + ) # --- Tier 1: within freshness window --- @@ -303,7 +328,9 @@ def test_tier2_head_check_no_redownload(self, tmp_path): mock_head_resp.headers = {"ETag": '"abc"'} mock_head_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_head_resp + ): with patch("babel_explorer.core.downloader.requests.get") as mock_get: dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) @@ -327,7 
+354,9 @@ def test_tier2_updates_last_checked_after_head(self, tmp_path): mock_head_resp.headers = {"ETag": '"abc"'} mock_head_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_head_resp + ): dl.get_downloaded_file.cache_clear() dl.get_downloaded_file(test_file) @@ -358,16 +387,18 @@ def test_tier3_redownloads_when_etag_changed(self, tmp_path): new_content = b"new data" def fake_download(url, path, chunk_size): - with open(path, 'wb') as f: + with open(path, "wb") as f: f.write(new_content) return {"ETag": '"new"', "Content-Length": str(len(new_content))} - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): - with patch.object(dl, '_download_with_retry', side_effect=fake_download): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_head_resp + ): + with patch.object(dl, "_download_with_retry", side_effect=fake_download): dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) - assert open(result, 'rb').read() == new_content + assert open(result, "rb").read() == new_content # --- No .meta: fresh download --- @@ -375,22 +406,23 @@ def test_downloads_when_no_meta(self, tmp_path): """No file and no .meta → download happens, .meta is saved.""" dl = self._make_dl(tmp_path) test_file = "duckdb/new.parquet" - local_path = str(tmp_path / "duckdb" / "new.parquet") content = b"fresh download" def fake_download(url, path, chunk_size): os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, 'wb') as f: + with open(path, "wb") as f: f.write(content) return {"ETag": '"fresh"', "Content-Length": str(len(content))} - with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + with patch.object( + dl, "_download_with_retry", side_effect=fake_download + ) as mock_dl: dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) mock_dl.assert_called_once() assert os.path.exists(result) - assert open(result, 'rb').read() == content + assert open(result, "rb").read() == content # .meta should be saved meta_path = result + ".meta" assert os.path.exists(meta_path) @@ -410,16 +442,18 @@ def test_downloads_when_file_exists_but_no_meta(self, tmp_path): new_content = b"refreshed" def fake_download(url, path, chunk_size): - with open(path, 'wb') as f: + with open(path, "wb") as f: f.write(new_content) return {"ETag": '"new"'} - with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + with patch.object( + dl, "_download_with_retry", side_effect=fake_download + ) as mock_dl: dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) mock_dl.assert_called_once() - assert open(result, 'rb').read() == new_content + assert open(result, "rb").read() == new_content class TestGetDownloadedFileCaching: @@ -430,11 +464,13 @@ def test_cache_returns_same_result(self, tmp_path): content = b"cached content" def fake_download(url, path, chunk_size): - with open(path, 'wb') as f: + with open(path, "wb") as f: f.write(content) return {} - with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + with patch.object( + dl, "_download_with_retry", side_effect=fake_download + ) as mock_dl: dl.get_downloaded_file.cache_clear() r1 = dl.get_downloaded_file("cached.txt") r2 = dl.get_downloaded_file("cached.txt") @@ -456,20 +492,31 @@ def _make_response(status_code, 
headers=None, content=None): return m def test_retries_exhausted_raises_runtime_error(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=2) - with patch("babel_explorer.core.downloader.requests.get", side_effect=requests.ConnectionError("fail")): + dl = BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path), retries=2 + ) + with patch( + "babel_explorer.core.downloader.requests.get", + side_effect=requests.ConnectionError("fail"), + ): with patch("babel_explorer.core.downloader.time.sleep"): # skip waiting with pytest.raises(RuntimeError, match="Failed to download"): - dl._download_with_retry("https://example.com/file", str(tmp_path / "f"), 1024) + dl._download_with_retry( + "https://example.com/file", str(tmp_path / "f"), 1024 + ) def test_succeeds_on_second_attempt(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=3) + dl = BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path), retries=3 + ) out_path = str(tmp_path / "retry_success.bin") - mock_response = self._make_response(200, {'Content-Length': '5'}, [b"hello"]) + mock_response = self._make_response(200, {"Content-Length": "5"}, [b"hello"]) side_effects = [requests.ConnectionError("first fail"), mock_response] - with patch("babel_explorer.core.downloader.requests.get", side_effect=side_effects): + with patch( + "babel_explorer.core.downloader.requests.get", side_effect=side_effects + ): with patch("babel_explorer.core.downloader.time.sleep"): dl._download_with_retry("https://example.com/file", out_path, 1024) assert os.path.exists(out_path) @@ -479,11 +526,13 @@ def test_resume_sends_range_header(self, tmp_path): out_path = tmp_path / "partial.bin" out_path.write_bytes(b"partial") # 7 bytes - mock_response = self._make_response(206, {'Content-Length': '3'}, [b"end"]) - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response) as mock_get: + mock_response = self._make_response(206, {"Content-Length": "3"}, [b"end"]) + with patch( + "babel_explorer.core.downloader.requests.get", return_value=mock_response + ) as mock_get: dl._download_with_retry("https://example.com/file", str(out_path), 1024) _, kwargs = mock_get.call_args - assert kwargs['headers'] == {'Range': 'bytes=7-'} + assert kwargs["headers"] == {"Range": "bytes=7-"} def test_http_416_file_already_complete(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) @@ -491,7 +540,9 @@ def test_http_416_file_already_complete(self, tmp_path): out_path.write_bytes(b"full file") mock_response = self._make_response(416) - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): + with patch( + "babel_explorer.core.downloader.requests.get", return_value=mock_response + ): dl._download_with_retry("https://example.com/file", str(out_path), 1024) # Should return without error assert out_path.read_bytes() == b"full file" @@ -502,8 +553,12 @@ def test_server_no_resume_restarts_download(self, tmp_path): out_path = tmp_path / "no_resume.bin" out_path.write_bytes(b"partial") - mock_response = self._make_response(200, {'Content-Length': '12'}, [b"full content"]) - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): + mock_response = self._make_response( + 200, {"Content-Length": "12"}, [b"full content"] + ) + with patch( + "babel_explorer.core.downloader.requests.get", return_value=mock_response + ): 
dl._download_with_retry("https://example.com/file", str(out_path), 1024) assert out_path.read_bytes() == b"full content" @@ -512,10 +567,16 @@ def test_returns_response_headers(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) out_path = str(tmp_path / "headers.bin") - mock_response = self._make_response(200, {'Content-Length': '5', 'ETag': '"abc"'}, [b"hello"]) - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): - headers = dl._download_with_retry("https://example.com/file", out_path, 1024) - assert headers['ETag'] == '"abc"' + mock_response = self._make_response( + 200, {"Content-Length": "5", "ETag": '"abc"'}, [b"hello"] + ) + with patch( + "babel_explorer.core.downloader.requests.get", return_value=mock_response + ): + headers = dl._download_with_retry( + "https://example.com/file", out_path, 1024 + ) + assert headers["ETag"] == '"abc"' class TestStreamDownload: @@ -526,11 +587,11 @@ def test_writes_chunks(self, tmp_path): out_path = str(tmp_path / "stream.bin") mock_response = Mock() - mock_response.headers = {'Content-Length': '10'} + mock_response.headers = {"Content-Length": "10"} mock_response.iter_content = Mock(return_value=[b"hello", b"world"]) dl._stream_download(mock_response, out_path, resume_byte_pos=0, chunk_size=1024) - with open(out_path, 'rb') as f: + with open(out_path, "rb") as f: assert f.read() == b"helloworld" def test_append_mode_on_resume(self, tmp_path): @@ -539,10 +600,12 @@ def test_append_mode_on_resume(self, tmp_path): out_path.write_bytes(b"start") mock_response = Mock() - mock_response.headers = {'Content-Length': '3'} + mock_response.headers = {"Content-Length": "3"} mock_response.iter_content = Mock(return_value=[b"end"]) - dl._stream_download(mock_response, str(out_path), resume_byte_pos=5, chunk_size=1024) + dl._stream_download( + mock_response, str(out_path), resume_byte_pos=5, chunk_size=1024 + ) assert out_path.read_bytes() == b"startend" diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py index 2322eef..363d077 100644 --- a/tests/test_nodenorm.py +++ b/tests/test_nodenorm.py @@ -75,7 +75,11 @@ def test_lt_ordering(self): assert a < b def test_sorting(self): - items = [Identifier(curie="C:3"), Identifier(curie="A:1"), Identifier(curie="B:2")] + items = [ + Identifier(curie="C:3"), + Identifier(curie="A:1"), + Identifier(curie="B:2"), + ] result = sorted(items) assert [x.curie for x in result] == ["A:1", "B:2", "C:3"] @@ -108,7 +112,9 @@ def test_correct_api_endpoint_and_params(self): mock_resp.json.return_value = {"X:1": {"id": {"identifier": "X:1"}}} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp) as mock_get: + with patch( + "babel_explorer.core.nodenorm.requests.get", return_value=mock_resp + ) as mock_get: nn.normalize_curie("X:1") mock_get.assert_called_once() args, kwargs = mock_get.call_args @@ -132,7 +138,9 @@ def test_lru_caching(self): mock_resp.json.return_value = {"X:1": {"id": "X:1"}} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp) as mock_get: + with patch( + "babel_explorer.core.nodenorm.requests.get", return_value=mock_resp + ) as mock_get: nn.normalize_curie("X:1") nn.normalize_curie("X:1") mock_get.assert_called_once() @@ -162,7 +170,7 @@ def test_exact_match_found(self): {"identifier": "X:2", "label": "Beta"}, ], } - with patch.object(nn, 'normalize_curie', return_value=api_result): + with 
patch.object(nn, "normalize_curie", return_value=api_result): ident = nn.get_identifier("X:1") assert ident.curie == "X:1" assert ident.label == "Alpha" @@ -174,14 +182,14 @@ def test_no_match_returns_bare_identifier(self): {"identifier": "X:2", "label": "Beta"}, ], } - with patch.object(nn, 'normalize_curie', return_value=api_result): + with patch.object(nn, "normalize_curie", return_value=api_result): ident = nn.get_identifier("X:1") assert ident.curie == "X:1" assert ident.label == "" def test_falsy_result_returns_bare_identifier(self): nn = self._make_nn() - with patch.object(nn, 'normalize_curie', return_value=None): + with patch.object(nn, "normalize_curie", return_value=None): ident = nn.get_identifier("X:1") assert ident.curie == "X:1" assert ident.label == "" @@ -193,7 +201,7 @@ def test_caching(self): {"identifier": "X:1", "label": "Alpha"}, ], } - with patch.object(nn, 'normalize_curie', return_value=api_result) as mock_norm: + with patch.object(nn, "normalize_curie", return_value=api_result) as mock_norm: nn.get_identifier("X:1") nn.get_identifier("X:1") mock_norm.assert_called_once() @@ -214,7 +222,7 @@ def test_success_returns_list(self): {"identifier": "X:2", "label": "Beta"}, ], } - with patch.object(nn, 'normalize_curie', return_value=api_result): + with patch.object(nn, "normalize_curie", return_value=api_result): result = nn.get_clique_identifiers("X:1") assert len(result) == 2 assert all(isinstance(x, Identifier) for x in result) @@ -222,7 +230,7 @@ def test_success_returns_list(self): def test_missing_key_returns_none(self): nn = self._make_nn() api_result = {"id": {"identifier": "X:1"}} # no equivalent_identifiers - with patch.object(nn, 'normalize_curie', return_value=api_result): + with patch.object(nn, "normalize_curie", return_value=api_result): result = nn.get_clique_identifiers("X:1") assert result is None @@ -231,7 +239,7 @@ def test_caching(self): api_result = { "equivalent_identifiers": [{"identifier": "X:1"}], } - with patch.object(nn, 'normalize_curie', return_value=api_result) as mock_norm: + with patch.object(nn, "normalize_curie", return_value=api_result) as mock_norm: nn.get_clique_identifiers("X:1") nn.get_clique_identifiers("X:1") mock_norm.assert_called_once() From f9e549e62c9f5d150d582eacd1210fc9d004191b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 17:00:36 -0400 Subject: [PATCH 50/66] Rename lint workflow to CI and add unit test job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaced .github/workflows/lint.yml with ci.yml. The new file keeps the existing ruff lint/format job and adds a parallel test job that runs `pytest -v -m "not integration"` (unit tests only — no network required). 
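For reference, the job selects tests via the pytest marker already used in
this repo. A minimal sketch of how a test opts in (where the marker is
registered, e.g. pyproject.toml under [tool.pytest.ini_options], is an
assumption):

    import pytest

    def test_parse_duration_unit():
        # Plain unit test: selected by -m "not integration" on every PR.
        assert True

    @pytest.mark.integration
    def test_live_server_roundtrip():
        # Network-backed test: excluded from the per-PR run.
        assert True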
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/{lint.yml => ci.yml} | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) rename .github/workflows/{lint.yml => ci.yml} (54%) diff --git a/.github/workflows/lint.yml b/.github/workflows/ci.yml similarity index 54% rename from .github/workflows/lint.yml rename to .github/workflows/ci.yml index f4771d4..8ce9dd1 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/ci.yml @@ -1,10 +1,10 @@ -name: Lint +name: CI on: pull_request: jobs: - ruff: + lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -12,3 +12,11 @@ jobs: - run: uv sync --group dev - run: uv run ruff check src/ tests/ - run: uv run ruff format --check src/ tests/ + + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - run: uv sync --group dev + - run: uv run pytest -v -m "not integration" From 7d1d5ca438b029f8ca82032083295c1b46530f8c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 17:25:40 -0400 Subject: [PATCH 51/66] Improved documentation. --- src/babel_explorer/cli.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 0e25ea3..878720a 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -85,14 +85,12 @@ def xrefs( """ Fetches and prints the cross-references (xrefs) for the given CURIEs. - This function searches for xrefs associated with the provided CURIEs. - \f :param curies: A list of CURIEs (Compact URI) for which cross-references need to be retrieved. :type curies: list[str] - :param babel_url: Base URL of the Babel server + :param babel_url: Base URL of the Babel server from which to download DuckDB files. :type babel_url: str :return: None From 0d9e32c14b7c16a89e8e180c7437752b65bc9156 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 17:30:45 -0400 Subject: [PATCH 52/66] Add tests for parse_duration() in cli.py Covers all 5 branches: empty input, 'never', unit suffix, bare integer, and invalid values that raise click.BadParameter. Co-Authored-By: Claude Sonnet 4.6 --- tests/test_cli.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 tests/test_cli.py diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..ed12a69 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,52 @@ +""" +Tests for CLI helper functions. + +Unit tests — no network required. 
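+
+Run directly with: uv run pytest tests/test_cli.py -v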
+""" + +import pytest +import click + +from babel_explorer.cli import parse_duration + + +# ========================================================================== +# Unit Tests — no network required +# ========================================================================== + + +class TestParseDuration: + """Tests for parse_duration().""" + + @pytest.mark.parametrize( + "value, expected", + [ + ("never", float("inf")), + ("NEVER", float("inf")), + ("3h", 10800), + ("3H", 10800), + ("30m", 1800), + ("1d", 86400), + ("7200s", 7200), + ("7200", 7200), + ("0", 0), + (" 3h ", 10800), + ], + ) + def test_valid_inputs(self, value, expected): + assert parse_duration(value) == expected + + @pytest.mark.parametrize( + "value", + [ + "", + None, + "abc", + "3.5h", + "1.5", + "3x", + ], + ) + def test_invalid_inputs_raise_bad_parameter(self, value): + with pytest.raises(click.BadParameter): + parse_duration(value) From 0ca35eb7b881c7dc9d834746eb25ec6bd4492aa7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 17:39:41 -0400 Subject: [PATCH 53/66] Add CliRunner tests for xrefs, ids, and test-concord commands Co-Authored-By: Claude Sonnet 4.6 --- tests/test_cli.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index ed12a69..c8cc924 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,8 +6,10 @@ import pytest import click +from click.testing import CliRunner +from unittest.mock import patch, MagicMock -from babel_explorer.cli import parse_duration +from babel_explorer.cli import parse_duration, cli # ========================================================================== @@ -50,3 +52,95 @@ def test_valid_inputs(self, value, expected): def test_invalid_inputs_raise_bad_parameter(self, value): with pytest.raises(click.BadParameter): parse_duration(value) + + +class TestCliCommands: + """Tests for CLI commands using CliRunner — no network required.""" + + def test_xrefs_happy_path(self): + runner = CliRunner() + mock_xref = MagicMock() + mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" + + with patch("babel_explorer.cli.BabelDownloader"), \ + patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ + patch("babel_explorer.cli.NodeNorm"): + mock_bx.return_value.get_curie_xrefs.return_value = [mock_xref] + result = runner.invoke(cli, ["xrefs", "MONDO:0004979"]) + + assert result.exit_code == 0 + mock_bx.return_value.get_curie_xrefs.assert_called_once_with( + ("MONDO:0004979",), False, label_curies=False + ) + + def test_xrefs_recurse_and_labels_flags(self): + runner = CliRunner() + mock_xref = MagicMock() + mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" + + with patch("babel_explorer.cli.BabelDownloader"), \ + patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ + patch("babel_explorer.cli.NodeNorm"): + mock_bx.return_value.get_curie_xrefs.return_value = [mock_xref] + result = runner.invoke(cli, ["xrefs", "MONDO:0004979", "--recurse", "--labels"]) + + assert result.exit_code == 0 + mock_bx.return_value.get_curie_xrefs.assert_called_once_with( + ("MONDO:0004979",), True, label_curies=True + ) + + def test_xrefs_check_download_option(self): + runner = CliRunner() + + with patch("babel_explorer.cli.BabelDownloader") as mock_dl, \ + patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ + patch("babel_explorer.cli.NodeNorm"): + mock_bx.return_value.get_curie_xrefs.return_value = [] + result = runner.invoke(cli, ["xrefs", "MONDO:0004979", "--check-download", 
"1h"]) + + assert result.exit_code == 0 + _, kwargs = mock_dl.call_args + assert kwargs.get("freshness_seconds") == 3600 + + def test_ids_happy_path(self): + runner = CliRunner() + mock_id = MagicMock() + mock_id.__str__ = lambda self: "MONDO:0004979 record" + + with patch("babel_explorer.cli.BabelDownloader"), \ + patch("babel_explorer.cli.BabelXRefs") as mock_bx: + mock_bx.return_value.get_curie_ids.return_value = [mock_id] + result = runner.invoke(cli, ["ids", "MONDO:0004979"]) + + assert result.exit_code == 0 + mock_bx.return_value.get_curie_ids.assert_called_once_with(("MONDO:0004979",)) + + def test_test_concord_happy_path(self): + runner = CliRunner() + mock_ident = MagicMock() + mock_ident.curie = "MONDO:0004979" + mock_ident.label = "asthma" + mock_ident.biolink_type = "biolink:Disease" + + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [mock_ident] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979"]) + + assert result.exit_code == 0 + assert "asthma" in result.output + mock_nn.return_value.get_clique_identifiers.assert_called_once_with("MONDO:0004979") + + def test_test_concord_no_label(self): + runner = CliRunner() + mock_ident = MagicMock() + mock_ident.curie = "MONDO:0004979" + mock_ident.label = None + mock_ident.biolink_type = "biolink:Disease" + + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [mock_ident] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979"]) + + assert result.exit_code == 0 + assert "MONDO:0004979" in result.output + assert "biolink:Disease" in result.output From 17782a2149229c6ac33356bf54ed5b416b67ed8d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 26 Mar 2026 00:49:42 -0400 Subject: [PATCH 54/66] Reformatted code with ruff. 
--- tests/test_cli.py | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index c8cc924..09d415d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -62,9 +62,11 @@ def test_xrefs_happy_path(self): mock_xref = MagicMock() mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" - with patch("babel_explorer.cli.BabelDownloader"), \ - patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ - patch("babel_explorer.cli.NodeNorm"): + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): mock_bx.return_value.get_curie_xrefs.return_value = [mock_xref] result = runner.invoke(cli, ["xrefs", "MONDO:0004979"]) @@ -78,11 +80,15 @@ def test_xrefs_recurse_and_labels_flags(self): mock_xref = MagicMock() mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" - with patch("babel_explorer.cli.BabelDownloader"), \ - patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ - patch("babel_explorer.cli.NodeNorm"): + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): mock_bx.return_value.get_curie_xrefs.return_value = [mock_xref] - result = runner.invoke(cli, ["xrefs", "MONDO:0004979", "--recurse", "--labels"]) + result = runner.invoke( + cli, ["xrefs", "MONDO:0004979", "--recurse", "--labels"] + ) assert result.exit_code == 0 mock_bx.return_value.get_curie_xrefs.assert_called_once_with( @@ -92,11 +98,15 @@ def test_xrefs_recurse_and_labels_flags(self): def test_xrefs_check_download_option(self): runner = CliRunner() - with patch("babel_explorer.cli.BabelDownloader") as mock_dl, \ - patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ - patch("babel_explorer.cli.NodeNorm"): + with ( + patch("babel_explorer.cli.BabelDownloader") as mock_dl, + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): mock_bx.return_value.get_curie_xrefs.return_value = [] - result = runner.invoke(cli, ["xrefs", "MONDO:0004979", "--check-download", "1h"]) + result = runner.invoke( + cli, ["xrefs", "MONDO:0004979", "--check-download", "1h"] + ) assert result.exit_code == 0 _, kwargs = mock_dl.call_args @@ -107,8 +117,10 @@ def test_ids_happy_path(self): mock_id = MagicMock() mock_id.__str__ = lambda self: "MONDO:0004979 record" - with patch("babel_explorer.cli.BabelDownloader"), \ - patch("babel_explorer.cli.BabelXRefs") as mock_bx: + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + ): mock_bx.return_value.get_curie_ids.return_value = [mock_id] result = runner.invoke(cli, ["ids", "MONDO:0004979"]) @@ -128,7 +140,9 @@ def test_test_concord_happy_path(self): assert result.exit_code == 0 assert "asthma" in result.output - mock_nn.return_value.get_clique_identifiers.assert_called_once_with("MONDO:0004979") + mock_nn.return_value.get_clique_identifiers.assert_called_once_with( + "MONDO:0004979" + ) def test_test_concord_no_label(self): runner = CliRunner() From d34c5c32ee272bbab4af4c241be3d95f25a9238a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 16:14:01 -0400 Subject: [PATCH 55/66] Update src/babel_explorer/cli.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_explorer/cli.py 
b/src/babel_explorer/cli.py index 878720a..eb2cbe1 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -61,7 +61,7 @@ def cli(): "--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", - help="NodeNorm URL to check for concord changes", + help="NodeNorm base URL used for node normalization and label enrichment", ) @click.option("--recurse", is_flag=True, help="Recursively query returned xrefs") @click.option("--labels", is_flag=True, help="Include labels for CURIEs") From ec0a71c03b1b69acb06b3d572f01266edc7de5f8 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 16:28:10 -0400 Subject: [PATCH 56/66] Cache get_identifier() locals in _to_labeled_xref; root-anchor lib/ in .gitignore Both backported from add-nodenorm-frontend: - babel_xrefs.py: avoid calling get_identifier() twice per CURIE in _to_labeled_xref - .gitignore: anchor lib/ and lib64/ to repo root so nested lib dirs aren't ignored Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 5 +++-- src/babel_explorer/core/babel_xrefs.py | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 67d8b31..55c6b2a 100644 --- a/.gitignore +++ b/.gitignore @@ -17,8 +17,9 @@ dist/ downloads/ eggs/ .eggs/ -lib/ -lib64/ +# Python distribution lib directories (not web/src/lib/) +/lib/ +/lib64/ parts/ sdist/ var/ diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index c218761..53dfb28 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -122,15 +122,17 @@ def get_curie_xref(self, curie: str, label_curies: bool = False): def _to_labeled_xref(self, xref: CrossReference) -> LabeledCrossReference: """Convert a CrossReference to a LabeledCrossReference using NodeNorm.""" + subj_ident = self.nodenorm.get_identifier(xref.subj) + obj_ident = self.nodenorm.get_identifier(xref.obj) return LabeledCrossReference( subj=xref.subj, obj=xref.obj, filename=xref.filename, pred=xref.pred, - subj_label=self.nodenorm.get_identifier(xref.subj).label, - subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, - obj_label=self.nodenorm.get_identifier(xref.obj).label, - obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, + subj_label=subj_ident.label, + subj_biolink_type=subj_ident.biolink_type, + obj_label=obj_ident.label, + obj_biolink_type=obj_ident.biolink_type, ) def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = False): From 67be81e0b535de1a6a75fbbacc27153d483d6f98 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 16:47:31 -0400 Subject: [PATCH 57/66] Fix bugs and gaps identified in PR #1 code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - nodenorm.py: Identifier.biolink_type str→list[str] to match NodeNorm API - nodenorm.py: get_clique_identifiers returns [] instead of None; add return type annotation - nodenorm.py: log debug message when get_identifier finds no exact match - cli.py: parse_duration return type int|float; join biolink_type list for display - tests: update assertions for new biolink_type type; add test-concord edge cases (unknown CURIE producing no output, multiple CURIEs queried independently) - ci.yml: add workflow_dispatch trigger and integration-test job Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 10 ++++++++ src/babel_explorer/cli.py | 11 ++++----- src/babel_explorer/core/nodenorm.py | 11 +++++---- 
tests/test_cli.py | 37 +++++++++++++++++++++++++++-- tests/test_nodenorm.py | 10 ++++---- 5 files changed, 62 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8ce9dd1..5ee70c1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,6 +2,7 @@ name: CI on: pull_request: + workflow_dispatch: jobs: lint: @@ -20,3 +21,12 @@ jobs: - uses: astral-sh/setup-uv@v5 - run: uv sync --group dev - run: uv run pytest -v -m "not integration" + + integration-test: + runs-on: ubuntu-latest + if: github.event_name == 'workflow_dispatch' + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - run: uv sync --group dev + - run: uv run pytest -v -m "integration and not slow" diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index eb2cbe1..e6e3fab 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -6,7 +6,7 @@ from babel_explorer.core.nodenorm import NodeNorm -def parse_duration(value: str) -> float: +def parse_duration(value: str) -> int | float: """Parse a duration string like '3h', '30m', '1d', '7200', or 'never' → seconds.""" units = {"s": 1, "m": 60, "h": 3600, "d": 86400} lower = (value or "").strip().lower() @@ -169,13 +169,12 @@ def test_concord(curies, nodenorm_url): nodenorm = NodeNorm(nodenorm_url) for curie in curies: identifiers = nodenorm.get_clique_identifiers(curie) - for identifier in identifiers or []: + for identifier in identifiers: + biolink = ", ".join(identifier.biolink_type) if identifier.label: - print( - f"{curie}\t{identifier.curie}\t{identifier.label}\t{identifier.biolink_type}" - ) + print(f"{curie}\t{identifier.curie}\t{identifier.label}\t{biolink}") else: - print(f"{curie}\t{identifier.curie}\t\t{identifier.biolink_type}") + print(f"{curie}\t{identifier.curie}\t\t{biolink}") if __name__ == "__main__": diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index ec5b0a8..f4ead30 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -8,7 +8,7 @@ class Identifier: curie: str label: str = "" - biolink_type: str = "" + biolink_type: list[str] = dataclasses.field(default_factory=list) taxa: list[str] = dataclasses.field(default_factory=list) description: list[str] = dataclasses.field(default_factory=list) @@ -47,6 +47,9 @@ def get_identifier(self, curie: str): logging.debug(f"Found exact match for {curie}: {identifier}") return Identifier.from_dict(identifier) + logging.debug( + f"No exact match for {curie!r} in equivalent_identifiers; returning bare Identifier" + ) return Identifier(curie=curie) @functools.lru_cache(maxsize=None) @@ -83,12 +86,12 @@ def normalize_curie( return None @functools.lru_cache(maxsize=None) - def get_clique_identifiers(self, curie, **kwargs): + def get_clique_identifiers(self, curie, **kwargs) -> list[Identifier]: result = self.normalize_curie(curie, **kwargs) if not result: - return None + return [] if "equivalent_identifiers" not in result: - return None + return [] return list( map(lambda x: Identifier.from_dict(x), result["equivalent_identifiers"]) ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 09d415d..ac75fe6 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -132,7 +132,7 @@ def test_test_concord_happy_path(self): mock_ident = MagicMock() mock_ident.curie = "MONDO:0004979" mock_ident.label = "asthma" - mock_ident.biolink_type = "biolink:Disease" + mock_ident.biolink_type = ["biolink:Disease"] with patch("babel_explorer.cli.NodeNorm") as 
mock_nn: mock_nn.return_value.get_clique_identifiers.return_value = [mock_ident] @@ -149,7 +149,7 @@ def test_test_concord_no_label(self): mock_ident = MagicMock() mock_ident.curie = "MONDO:0004979" mock_ident.label = None - mock_ident.biolink_type = "biolink:Disease" + mock_ident.biolink_type = ["biolink:Disease"] with patch("babel_explorer.cli.NodeNorm") as mock_nn: mock_nn.return_value.get_clique_identifiers.return_value = [mock_ident] @@ -158,3 +158,36 @@ def test_test_concord_no_label(self): assert result.exit_code == 0 assert "MONDO:0004979" in result.output assert "biolink:Disease" in result.output + + def test_test_concord_unknown_curie_produces_no_output(self): + """When get_clique_identifiers returns [], no output is produced and exit code is 0.""" + runner = CliRunner() + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [] + result = runner.invoke(cli, ["test-concord", "UNKNOWN:9999"]) + assert result.exit_code == 0 + assert result.output.strip() == "" + + def test_test_concord_multiple_curies(self): + """Each CURIE is looked up independently.""" + runner = CliRunner() + mock_a = MagicMock() + mock_a.curie = "A:1" + mock_a.label = "Alpha" + mock_a.biolink_type = ["biolink:Disease"] + mock_b = MagicMock() + mock_b.curie = "B:2" + mock_b.label = "Beta" + mock_b.biolink_type = ["biolink:Gene"] + + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.side_effect = [ + [mock_a], + [mock_b], + ] + result = runner.invoke(cli, ["test-concord", "A:1", "B:2"]) + + assert result.exit_code == 0 + assert mock_nn.return_value.get_clique_identifiers.call_count == 2 + assert "Alpha" in result.output + assert "Beta" in result.output diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py index 363d077..71459e7 100644 --- a/tests/test_nodenorm.py +++ b/tests/test_nodenorm.py @@ -26,7 +26,7 @@ def test_creation_with_defaults(self): ident = Identifier(curie="MONDO:0004979") assert ident.curie == "MONDO:0004979" assert ident.label == "" - assert ident.biolink_type == "" + assert ident.biolink_type == [] assert ident.taxa == [] assert ident.description == [] @@ -34,12 +34,12 @@ def test_full_creation(self): ident = Identifier( curie="MONDO:0004979", label="asthma", - biolink_type="biolink:Disease", + biolink_type=["biolink:Disease"], taxa=["NCBITaxon:9606"], description=["A chronic respiratory disease"], ) assert ident.label == "asthma" - assert ident.biolink_type == "biolink:Disease" + assert ident.biolink_type == ["biolink:Disease"] assert ident.taxa == ["NCBITaxon:9606"] def test_from_dict_minimal(self): @@ -67,7 +67,7 @@ def test_from_dict_partial(self): ident = Identifier.from_dict(d) assert ident.curie == "X:1" assert ident.label == "Beta" - assert ident.biolink_type == "" + assert ident.biolink_type == [] def test_lt_ordering(self): a = Identifier(curie="A:1") @@ -232,7 +232,7 @@ def test_missing_key_returns_none(self): api_result = {"id": {"identifier": "X:1"}} # no equivalent_identifiers with patch.object(nn, "normalize_curie", return_value=api_result): result = nn.get_clique_identifiers("X:1") - assert result is None + assert result == [] def test_caching(self): nn = self._make_nn() From 4199ae2151745e9be86fe00dbd334c35f44af5a6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 16:55:35 -0400 Subject: [PATCH 58/66] Run integration tests on push to master and weekly on Tuesdays - Add push trigger for master branch (fires when PRs are merged) - Add schedule 
trigger: Tuesdays at 17:00 UTC (12pm EST / 1pm EDT) - Change integration-test job condition to run on all non-PR events Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5ee70c1..7f712ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,6 +2,10 @@ name: CI on: pull_request: + push: + branches: [master] + schedule: + - cron: "0 17 * * 2" # Tuesdays at 12pm EST (17:00 UTC); 1pm during EDT workflow_dispatch: jobs: @@ -24,7 +28,7 @@ jobs: integration-test: runs-on: ubuntu-latest - if: github.event_name == 'workflow_dispatch' + if: github.event_name != 'pull_request' steps: - uses: actions/checkout@v4 - uses: astral-sh/setup-uv@v5 From 17f6b09f9a610be97bd30eb918c2278e8b76e8b6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 18:09:17 -0400 Subject: [PATCH 59/66] Add module, class, and method docstrings to new files in PR #1 Source files: - nodenorm.py: module, Identifier class/from_dict, NodeNorm class/__init__/ get_identifier/normalize_curie/get_clique_identifiers - babel_xrefs.py: convert # comment to module docstring; CrossReference class/ from_tuple/curies property; LabeledCrossReference class; IdentifierRecord.__str__; BabelXRefs class/__init__/get_curie_xref - downloader.py: module, BabelDownloader.__init__, get_output_file - cli.py: cli() group, test_concord() command Test files (class docstrings only): - test_babel_xrefs.py: TestCrossReference, TestLabeledCrossReference, TestIdentifierRecord, TestBabelXRefsInit - test_nodenorm.py: TestIdentifier, TestNodeNormInit, TestNormalizeCurieMocked, TestGetIdentifierMocked, TestGetCliqueIdentifiersMocked Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/cli.py | 6 ++++ src/babel_explorer/core/babel_xrefs.py | 40 ++++++++++++++++++++++++-- src/babel_explorer/core/downloader.py | 14 +++++++++ src/babel_explorer/core/nodenorm.py | 33 ++++++++++++++++++++- tests/test_babel_xrefs.py | 8 ++++++ tests/test_nodenorm.py | 10 +++++++ 6 files changed, 107 insertions(+), 4 deletions(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index e6e3fab..6e899df 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -40,6 +40,7 @@ def parse_duration(value: str) -> int | float: @click.group() def cli(): + """babel-explorer: query and explore Babel intermediate files.""" pass @@ -163,6 +164,11 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): help="NodeNorm URL to check for concord changes", ) def test_concord(curies, nodenorm_url): + """For each CURIE, print the current NodeNorm clique (all equivalent identifiers, labels, and Biolink types). + + Useful for inspecting how a potential Babel concordance change would affect NodeNorm: + run before and after a Babel rebuild to see how cliques would shift. + """ # We're trying to answer a simple question here: if the CURIEs we mention were combined, how would the cliques change in NodeNorm? # By definition, this can only combine all the cliques mentioned in the CURIEs. diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 53dfb28..e89acb1 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -1,6 +1,10 @@ -# Babel XRefs is a tool for accessing and querying the intermediate files -# that we make available with Babel builds. 
This allows you to find out -# why we consider two identifiers to be identical. +"""Query engine for Babel cross-reference intermediate files. + +Provides access to Concord.parquet and Identifiers.parquet via DuckDB, +allowing callers to discover why two biological/chemical identifiers are +considered identical in a Babel build. +""" + import dataclasses import logging import duckdb @@ -12,6 +16,8 @@ @dataclasses.dataclass(frozen=True) class CrossReference: + """A single cross-reference edge read from Concord.parquet.""" + filename: str subj: str pred: str @@ -19,12 +25,14 @@ class CrossReference: @staticmethod def from_tuple(tuple: tuple[str, str, str, str]): + """Construct from a ``(filename, subj, pred, obj)`` database row tuple.""" return CrossReference( filename=tuple[0], subj=tuple[1], pred=tuple[2], obj=tuple[3] ) @property def curies(self): + """The frozenset of both CURIEs in this edge (subject and object).""" return frozenset([self.subj, self.obj]) def __lt__(self, other): @@ -38,6 +46,8 @@ def __lt__(self, other): @dataclasses.dataclass(frozen=True) class LabeledCrossReference(CrossReference): + """A CrossReference enriched with human-readable labels and Biolink types from NodeNorm.""" + subj_label: str subj_biolink_type: str obj_label: str @@ -64,6 +74,7 @@ def from_row(row: tuple, column_names: list[str]): return IdentifierRecord(curie=row[curie_idx], extra_fields=extra) def __str__(self): + """Return a ``key=value`` string of the CURIE and all extra fields.""" parts = [f"curie={self.curie!r}"] for name, value in self.extra_fields: parts.append(f"{name}={value!r}") @@ -71,7 +82,20 @@ def __str__(self): class BabelXRefs: + """Query engine for Babel cross-reference and identifier Parquet files. + + Uses DuckDB for in-memory SQL queries against Concord.parquet and + Identifiers.parquet. NodeNorm is optional and only required when + ``label_curies=True`` is passed to enrichment-aware methods. + """ + def __init__(self, downloader: BabelDownloader, nodenorm: NodeNorm = None): + """ + :param downloader: A configured ``BabelDownloader`` that provides local paths + to the required Parquet files, downloading them on first access. + :param nodenorm: Optional ``NodeNorm`` client. Required only when callers pass + ``label_curies=True``; may be ``None`` for label-free queries. + """ self.downloader = downloader self.nodenorm = nodenorm @@ -101,6 +125,16 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): + """Return all cross-references in Concord.parquet where *curie* is the subject or object. + + Results are LRU-cached per ``(curie, label_curies)`` pair. + + :param curie: The CURIE to look up. + :param label_curies: If ``True``, annotate each result with NodeNorm labels and + Biolink types. Requires a NodeNorm instance to have been passed to ``__init__``. + :raises ValueError: If ``label_curies=True`` but no NodeNorm instance is available. + :return: A list of ``CrossReference`` (or ``LabeledCrossReference``) objects. + """ if label_curies and self.nodenorm is None: raise ValueError( "label_curies=True requires a configured NodeNorm instance (nodenorm was None)." 
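A usage sketch implied by the docstrings above (the base URL and local
directory are illustrative placeholders, not project defaults):

    from babel_explorer.core.downloader import BabelDownloader
    from babel_explorer.core.nodenorm import NodeNorm
    from babel_explorer.core.babel_xrefs import BabelXRefs

    dl = BabelDownloader(
        url_base="https://example.org/babel/", local_path="data/example"
    )
    bx = BabelXRefs(dl, nodenorm=NodeNorm("https://nodenormalization-sri.renci.org/"))

    for xref in bx.get_curie_xref("MONDO:0004979", label_curies=True):
        print(xref)

    # Label-free queries need no NodeNorm instance; only label_curies=True
    # raises ValueError when nodenorm is None.
    BabelXRefs(dl).get_curie_xref("MONDO:0004979")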
diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 6ba9a38..f314c3d 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -1,3 +1,5 @@ +"""HTTP downloader for Babel Parquet files with ETag-based freshness checking.""" + import functools import json import os @@ -23,6 +25,17 @@ def __init__( freshness_seconds=3 * 3600, timeout: int = 30, ): + """ + :param url_base: Base URL of the Babel server (must end with ``/``). + :param local_path: Directory for cached downloads. Defaults to + ``tempfile.gettempdir()`` if ``None``; created automatically if it + does not exist. + :param retries: Maximum number of download retry attempts on failure. + :param freshness_seconds: How long a local file is considered fresh without + re-checking the server. Use ``float('inf')`` to never re-check, or ``0`` + to always issue a HEAD request. Defaults to 3 hours. + :param timeout: HTTP request timeout in seconds. + """ # We assume the URL base is correct (if not, we can fix it later). self.url_base = url_base self.retries = retries @@ -46,6 +59,7 @@ def __init__( @functools.lru_cache(maxsize=None) def get_output_file(self, filename): + """Return (and create) the local filesystem path for a given relative filename.""" filepath = os.path.join(self.local_path, filename) os.makedirs(os.path.dirname(filepath), exist_ok=True) return filepath diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index f4ead30..fae0a57 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -1,3 +1,5 @@ +"""NodeNorm API client for identifier normalisation and label enrichment.""" + import dataclasses import functools import requests @@ -6,6 +8,8 @@ @dataclasses.dataclass class Identifier: + """Normalised identifier record returned by the NodeNorm API.""" + curie: str label: str = "" biolink_type: list[str] = dataclasses.field(default_factory=list) @@ -17,6 +21,7 @@ def __lt__(self, other): @staticmethod def from_dict(d: dict): + """Parse an identifier entry from a NodeNorm API response dict.""" identifier = Identifier(curie=d["identifier"]) if "label" in d: identifier.label = d["label"] @@ -30,14 +35,29 @@ def from_dict(d: dict): class NodeNorm: + """Client for the NodeNormalization API (https://nodenormalization-sri.renci.org/).""" + def __init__(self, nodenorm_url: str = "", timeout: int = 30): + """ + :param nodenorm_url: Base URL of the NodeNorm service. Pass an empty string (default) + to skip all network calls and have every lookup return a bare ``Identifier``. + :param timeout: HTTP request timeout in seconds. + """ self.nodenorm_url = nodenorm_url self.timeout = timeout if self.nodenorm_url and not self.nodenorm_url.endswith("/"): self.nodenorm_url += "/" @functools.lru_cache(maxsize=None) - def get_identifier(self, curie: str): + def get_identifier(self, curie: str) -> "Identifier": + """Return the ``Identifier`` for *curie* by looking it up in its NodeNorm clique. + + Searches ``equivalent_identifiers`` for an entry whose ``identifier`` field matches + *curie* exactly. Falls back to a bare ``Identifier(curie=curie)`` (empty label and + type) if NodeNorm does not recognise the CURIE or it is not listed in the clique. + + Results are LRU-cached so repeated calls for the same CURIE are free. 
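+
+        Illustrative only (CURIE/label pair borrowed from this repo's test
+        fixtures)::
+
+            >>> nn.get_identifier("MONDO:0004979").label  # doctest: +SKIP
+            'asthma'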
+ """ result = self.normalize_curie(curie) logging.debug(f"Normalizing {curie} with NodeNorm to result: {result}") if not result: @@ -62,6 +82,12 @@ def normalize_curie( individual_types=True, include_taxa=True, ): + """Call ``get_normalized_nodes`` and return the per-CURIE result dict. + + :return: The normalisation dict for *curie* (contains ``id``, ``equivalent_identifiers``, + ``type``, etc.), or ``None`` if the CURIE is not recognised by NodeNorm. + :raises requests.HTTPError: If the API returns a non-2xx status code. + """ response = requests.get( f"{self.nodenorm_url}get_normalized_nodes", params={ @@ -87,6 +113,11 @@ def normalize_curie( @functools.lru_cache(maxsize=None) def get_clique_identifiers(self, curie, **kwargs) -> list[Identifier]: + """Return all ``Identifier`` objects in the NodeNorm clique for *curie*. + + :return: A list of ``Identifier`` objects (one per entry in ``equivalent_identifiers``), + or an empty list if the CURIE is unknown or has no equivalents. + """ result = self.normalize_curie(curie, **kwargs) if not result: return [] diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index d67f81b..7c48935 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -27,6 +27,8 @@ class TestCrossReference: + """Tests for the CrossReference frozen dataclass.""" + def test_creation(self): xr = CrossReference( filename="f.txt", subj="A:1", pred="skos:exactMatch", obj="B:2" @@ -85,6 +87,8 @@ def test_sorting(self): class TestLabeledCrossReference: + """Tests for the LabeledCrossReference frozen dataclass.""" + def test_creation(self): lxr = LabeledCrossReference( subj="A:1", @@ -149,6 +153,8 @@ def test_str(self): class TestIdentifierRecord: + """Tests for the IdentifierRecord frozen dataclass.""" + def test_creation(self): rec = IdentifierRecord(curie="MONDO:0004979") assert rec.curie == "MONDO:0004979" @@ -181,6 +187,8 @@ def test_str(self): class TestBabelXRefsInit: + """Tests for BabelXRefs constructor.""" + def test_init_without_nodenorm(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) bx = BabelXRefs(dl) diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py index 71459e7..5fb088d 100644 --- a/tests/test_nodenorm.py +++ b/tests/test_nodenorm.py @@ -22,6 +22,8 @@ class TestIdentifier: + """Tests for the Identifier dataclass.""" + def test_creation_with_defaults(self): ident = Identifier(curie="MONDO:0004979") assert ident.curie == "MONDO:0004979" @@ -90,6 +92,8 @@ def test_sorting(self): class TestNodeNormInit: + """Tests for NodeNorm constructor and URL normalisation.""" + def test_default_url(self): nn = NodeNorm() assert nn.nodenorm_url == "" @@ -100,6 +104,8 @@ def test_custom_url(self): class TestNormalizeCurieMocked: + """Unit tests for NodeNorm.normalize_curie() with mocked HTTP responses.""" + def _make_nn(self): nn = NodeNorm(nodenorm_url="https://example.com/") nn.normalize_curie.cache_clear() @@ -156,6 +162,8 @@ def test_http_error_raises(self): class TestGetIdentifierMocked: + """Unit tests for NodeNorm.get_identifier() with mocked normalize_curie.""" + def _make_nn(self): nn = NodeNorm(nodenorm_url="https://example.com/") nn.normalize_curie.cache_clear() @@ -208,6 +216,8 @@ def test_caching(self): class TestGetCliqueIdentifiersMocked: + """Unit tests for NodeNorm.get_clique_identifiers() with mocked normalize_curie.""" + def _make_nn(self): nn = NodeNorm(nodenorm_url="https://example.com/") nn.normalize_curie.cache_clear() From 8216b0b4007bde67dcf27a3a7a3b5feb44cfc1cc Mon 
Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 1 Apr 2026 01:40:43 -0400 Subject: [PATCH 60/66] Fix LabeledCrossReference biolink_type fields to list[str]; simplify map to listcomp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - babel_xrefs.py: subj_biolink_type/obj_biolink_type str→list[str] to match Identifier.biolink_type after the nodenorm.py type change - nodenorm.py: replace map(lambda) with list comprehension in get_clique_identifiers - tests: update LabeledCrossReference construction to use list values Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/babel_xrefs.py | 4 ++-- src/babel_explorer/core/nodenorm.py | 4 +--- tests/test_babel_xrefs.py | 18 +++++++++--------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 53dfb28..2df9ac6 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -39,9 +39,9 @@ def __lt__(self, other): @dataclasses.dataclass(frozen=True) class LabeledCrossReference(CrossReference): subj_label: str - subj_biolink_type: str + subj_biolink_type: list[str] obj_label: str - obj_biolink_type: str + obj_biolink_type: list[str] def __str__(self): return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", subj_biolink_type="{self.subj_biolink_type}", obj_label="{self.obj_label}", obj_biolink_type="{self.obj_biolink_type}")""" diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index f4ead30..f83a5d4 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -92,6 +92,4 @@ def get_clique_identifiers(self, curie, **kwargs) -> list[Identifier]: return [] if "equivalent_identifiers" not in result: return [] - return list( - map(lambda x: Identifier.from_dict(x), result["equivalent_identifiers"]) - ) + return [Identifier.from_dict(x) for x in result["equivalent_identifiers"]] diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index d67f81b..409c6fe 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -92,13 +92,13 @@ def test_creation(self): obj="B:2", filename="f", subj_label="Alpha", - subj_biolink_type="biolink:Disease", + subj_biolink_type=["biolink:Disease"], obj_label="Beta", - obj_biolink_type="biolink:Gene", + obj_biolink_type=["biolink:Gene"], ) assert lxr.subj == "A:1" assert lxr.subj_label == "Alpha" - assert lxr.obj_biolink_type == "biolink:Gene" + assert lxr.obj_biolink_type == ["biolink:Gene"] def test_inherits_from_cross_reference(self): lxr = LabeledCrossReference( @@ -107,9 +107,9 @@ def test_inherits_from_cross_reference(self): obj="B:2", filename="f", subj_label="", - subj_biolink_type="", + subj_biolink_type=[], obj_label="", - obj_biolink_type="", + obj_biolink_type=[], ) assert isinstance(lxr, CrossReference) @@ -120,9 +120,9 @@ def test_curies_property(self): obj="B:2", filename="f", subj_label="", - subj_biolink_type="", + subj_biolink_type=[], obj_label="", - obj_biolink_type="", + obj_biolink_type=[], ) assert lxr.curies == frozenset({"A:1", "B:2"}) @@ -133,9 +133,9 @@ def test_str(self): obj="B:2", filename="f", subj_label="Alpha", - subj_biolink_type="biolink:Disease", + subj_biolink_type=["biolink:Disease"], obj_label="Beta", - obj_biolink_type="biolink:Gene", + obj_biolink_type=["biolink:Gene"], ) s = str(lxr) assert "A:1" in s From 
ac418ff3d87440c77ad61b12c411cc6142a46dc1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 1 Apr 2026 02:52:54 -0400 Subject: [PATCH 61/66] Address PR #1 review: frozen Identifier, atomic rename, fail-open HEAD, type fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - nodenorm.py: Identifier is now frozen=True; rewrite from_dict as one-shot constructor to avoid post-construction mutation of lru_cache'd objects - nodenorm.py: remove **kwargs from get_clique_identifiers — unhashable and unused, would raise TypeError if any kwarg was ever passed - downloader.py: download to .tmp then os.replace() so the final file is never partially written; clean up .tmp on failure - downloader.py: _etag_matches returns True (fail open) on HEAD network error instead of False, avoiding spurious 2GB re-downloads on transient failures - cli.py: add nodenorm_url: str annotation in xrefs and test_concord; move test_concord inline comment to docstring - tests: update test_returns_false_on_request_error → test_returns_true_on_request_error - FUTURE.md: track CLI option deduplication refactor Co-Authored-By: Claude Sonnet 4.6 --- FUTURE.md | 7 +++++++ src/babel_explorer/cli.py | 10 ++++++---- src/babel_explorer/core/downloader.py | 25 +++++++++++++++++-------- src/babel_explorer/core/nodenorm.py | 25 +++++++++++-------------- tests/test_downloader.py | 5 +++-- 5 files changed, 44 insertions(+), 28 deletions(-) create mode 100644 FUTURE.md diff --git a/FUTURE.md b/FUTURE.md new file mode 100644 index 0000000..a2bb3fb --- /dev/null +++ b/FUTURE.md @@ -0,0 +1,7 @@ +# Future Work + +## Deduplicate CLI option blocks + +`--local-dir`, `--babel-url`, and `--check-download` are copy-pasted between the +`xrefs` and `ids` commands in `cli.py`. Extract a `@common_babel_options` Click +decorator so defaults are defined in one place and can't drift. diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index e6e3fab..207968f 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -76,7 +76,7 @@ def cli(): def xrefs( curies: list[str], babel_url: str, - nodenorm_url, + nodenorm_url: str, local_dir: str, recurse: bool, labels: bool, @@ -162,10 +162,12 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes", ) -def test_concord(curies, nodenorm_url): - # We're trying to answer a simple question here: if the CURIEs we mention were combined, how would the cliques change in NodeNorm? - # By definition, this can only combine all the cliques mentioned in the CURIEs. +def test_concord(curies: tuple[str, ...], nodenorm_url: str): + """ + For each input CURIE, show what clique NodeNorm currently maps it to. + Answers: if these CURIEs were merged in Babel, which NodeNorm cliques would combine? 
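+
+    Output is one tab-separated row per clique member: input CURIE, member
+    CURIE, label (blank when none), and comma-joined Biolink types.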
+ """ nodenorm = NodeNorm(nodenorm_url) for curie in curies: identifiers = nodenorm.get_clique_identifiers(curie) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 6ba9a38..63e6826 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -130,8 +130,10 @@ def _etag_matches(self, url, meta): response = requests.head(url, timeout=self.timeout) response.raise_for_status() except requests.RequestException as e: - self.logger.warning(f"HEAD request failed for {url}: {e}") - return False + self.logger.warning( + f"HEAD request failed for {url}: {e}; assuming file is current" + ) + return True remote_headers = response.headers @@ -321,20 +323,27 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024 * 1024): ) return local_path_to_download_to - # Tier 3: ETag changed — delete and re-download + # Tier 3: ETag changed — re-download self.logger.warning( f"Remote file changed, re-downloading: {local_path_to_download_to}" ) - os.remove(local_path_to_download_to) self.logger.info( f"Downloading {url_to_download} to {local_path_to_download_to}" ) - # Download with retry logic; get response headers back - response_headers = self._download_with_retry( - url_to_download, local_path_to_download_to, chunk_size - ) + # Download to a sibling .tmp file, then atomically replace the final destination. + # This ensures the final file is never partially written. + tmp_path = local_path_to_download_to + ".tmp" + try: + response_headers = self._download_with_retry( + url_to_download, tmp_path, chunk_size + ) + os.replace(tmp_path, local_path_to_download_to) + except Exception: + if os.path.exists(tmp_path): + os.remove(tmp_path) + raise # Save sidecar metadata if response_headers is not None: diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index f83a5d4..42845fb 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -4,7 +4,7 @@ import logging -@dataclasses.dataclass +@dataclasses.dataclass(frozen=True) class Identifier: curie: str label: str = "" @@ -16,17 +16,14 @@ def __lt__(self, other): return self.curie < other.curie @staticmethod - def from_dict(d: dict): - identifier = Identifier(curie=d["identifier"]) - if "label" in d: - identifier.label = d["label"] - if "taxa" in d: - identifier.taxa = d["taxa"] - if "description" in d: - identifier.description = d["description"] - if "type" in d: - identifier.biolink_type = d["type"] - return identifier + def from_dict(d: dict) -> "Identifier": + return Identifier( + curie=d["identifier"], + label=d.get("label", ""), + biolink_type=d.get("type", []), + taxa=d.get("taxa", []), + description=d.get("description", []), + ) class NodeNorm: @@ -86,8 +83,8 @@ def normalize_curie( return None @functools.lru_cache(maxsize=None) - def get_clique_identifiers(self, curie, **kwargs) -> list[Identifier]: - result = self.normalize_curie(curie, **kwargs) + def get_clique_identifiers(self, curie: str) -> list[Identifier]: + result = self.normalize_curie(curie) if not result: return [] if "equivalent_identifiers" not in result: diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 9b33e7a..d23b323 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -267,14 +267,15 @@ def test_fallback_last_modified_match(self, tmp_path): ): assert dl._etag_matches("https://example.com/f.parquet", meta) is True - def test_returns_false_on_request_error(self, tmp_path): + def 
test_returns_true_on_request_error(self, tmp_path): + """Network errors are treated as 'assume still fresh' to avoid triggering large re-downloads.""" dl = self._make_dl(tmp_path) meta = {"etag": '"abc"'} with patch( "babel_explorer.core.downloader.requests.head", side_effect=requests.ConnectionError("fail"), ): - assert dl._etag_matches("https://example.com/f.parquet", meta) is False + assert dl._etag_matches("https://example.com/f.parquet", meta) is True class TestGetDownloadedFileTiers: From 06cd300786b69e5f4437f57897b704ad7452171b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 1 Apr 2026 02:58:46 -0400 Subject: [PATCH 62/66] Sync CLAUDE.md with current code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix --expand → --recurse (the actual flag name) in Data Flow and Key Design Patterns - BabelXRefs: remove false claim about writing DuckDB databases to disk; all connections are in-memory (duckdb.connect() with no path) - Remove 'Generated DuckDB databases' entry from File Locations (nothing on disk) - Update test count table: numbers were stale and test_cli.py was missing entirely - Add Identifier to Key Dataclasses (now frozen=True as of recent fix) Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 7ad79fb..8ecea72 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -79,9 +79,9 @@ uv run ruff format 2. **BabelXRefs** (`src/babel_explorer/core/babel_xrefs.py`): - Main query engine for cross-references - - Uses DuckDB to query Parquet files (`Concord.parquet`, `Identifiers.parquet`, `Metadata.parquet`) - - Supports recursive expansion of cross-references - - Creates ephemeral DuckDB databases in `data//output/duckdbs/` + - Uses DuckDB to query Parquet files (`Concord.parquet`, `Identifiers.parquet`) + - Supports recursive expansion of cross-references via a single `WITH RECURSIVE` query + - Uses ephemeral in-memory DuckDB connections (nothing written to disk) 3. **NodeNorm** (`src/babel_explorer/core/nodenorm.py`): - Integration with NodeNormalization API (https://nodenormalization-sri.renci.org/) @@ -98,14 +98,14 @@ uv run ruff format 1. User provides CURIEs via CLI 2. BabelDownloader ensures required Parquet files are downloaded 3. BabelXRefs queries files using DuckDB -4. If `--labels` or `--expand` flags are set, NodeNorm is queried for additional metadata +4. If `--labels` or `--recurse` flags are set, NodeNorm is queried for additional metadata 5. 
Results are printed to stdout ### Key Design Patterns - **Lazy downloading**: Files are only downloaded when first accessed - **LRU caching**: Heavy use of `@functools.lru_cache` to avoid redundant downloads and API calls -- **Recursive expansion**: The `--expand` flag recursively follows all cross-references to build complete graphs +- **Recursive expansion**: The `--recurse` flag recursively follows all cross-references to build complete graphs - **DuckDB for querying**: In-memory SQL queries against Parquet files for fast lookups ## Testing @@ -119,9 +119,10 @@ Tests live in `tests/` and are split into fast **unit tests** (mocked, no networ | File | Unit | Integration | Slow | Total | |------|------|-------------|------|-------| -| `tests/test_downloader.py` | 22 | 3 | 1 | 26 | -| `tests/test_babel_xrefs.py` | 22 | 8 | 1 | 31 | -| `tests/test_nodenorm.py` | 18 | 5 | 0 | 23 | +| `tests/test_downloader.py` | 41 | 4 | 1 | 46 | +| `tests/test_babel_xrefs.py` | 23 | 20 | 3 | 46 | +| `tests/test_nodenorm.py` | 20 | 13 | 0 | 33 | +| `tests/test_cli.py` | 24 | 0 | 0 | 24 | ### Test Infrastructure @@ -131,6 +132,7 @@ Tests live in `tests/` and are split into fast **unit tests** (mocked, no networ ### Key Dataclasses +- **`Identifier`** — Frozen dataclass for a normalized NodeNorm entry (curie, label, biolink_type, taxa, description). Returned by `NodeNorm.get_identifier()` and `get_clique_identifiers()`. - **`CrossReference`** — Frozen dataclass for Concord.parquet rows (filename, subj, pred, obj) - **`LabeledCrossReference`** — Extends CrossReference with labels and biolink types from NodeNorm - **`IdentifierRecord`** — Frozen dataclass for Identifiers.parquet rows (curie + dynamic extra fields). Returned by `BabelXRefs.get_curie_ids()`. @@ -146,5 +148,4 @@ Tests live in `tests/` and are split into fast **unit tests** (mocked, no networ - Tests: `tests/` - Test CURIEs: `tests/data/valid_curies.txt` - Downloaded Babel files: `data//duckdb/*.parquet` -- Generated DuckDB databases: `data//output/duckdbs/` - Entry point: `src/babel_explorer/cli.py` From bf1c48c06ff7fa1c3adcd458f71684da9bb10b7b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 1 Apr 2026 18:49:09 -0400 Subject: [PATCH 63/66] Address PR #1 review: fix six correctness and quality issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - list→tuple on Identifier and LabeledCrossReference fields so frozen dataclasses are hashable (was a TypeError crash in get_curie_xrefs) - NodeNorm(''): add early return in normalize_curie so empty URL truly skips all network calls as documented - BabelDownloader: auto-append trailing slash to url_base so urljoin can't silently drop path segments - CI: fix push trigger branch master → main - Remove dead get_downloaded_dir method (lru_cache + NotImplementedError) - parse_duration: reject negative values with a clear BadParameter error Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 2 +- src/babel_explorer/cli.py | 11 ++++++++- src/babel_explorer/core/babel_xrefs.py | 4 ++-- src/babel_explorer/core/downloader.py | 19 ++-------------- src/babel_explorer/core/nodenorm.py | 14 +++++++----- tests/test_babel_xrefs.py | 18 +++++++-------- tests/test_cli.py | 2 ++ tests/test_downloader.py | 23 ++++++++++--------- tests/test_nodenorm.py | 31 +++++++++++++++++--------- 9 files changed, 67 insertions(+), 57 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f712ae..c3ef4c8 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -3,7 +3,7 @@ name: CI on: pull_request: push: - branches: [master] + branches: [main] schedule: - cron: "0 17 * * 2" # Tuesdays at 12pm EST (17:00 UTC); 1pm during EDT workflow_dispatch: diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 1854cc4..bcd8787 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -27,15 +27,24 @@ def parse_duration(value: str) -> int | float: f"Invalid duration {value!r}: expected an integer followed by an optional unit " "('s', 'm', 'h', or 'd'), or 'never'." ) + if amount < 0: + raise click.BadParameter( + f"Invalid duration {value!r}: duration must be non-negative." + ) return amount * units[lower[-1]] # Bare integer seconds try: - return int(lower) + result = int(lower) except ValueError: raise click.BadParameter( f"Invalid duration {value!r}: expected an integer number of seconds, optionally " "followed by 's', 'm', 'h', or 'd', or 'never'." ) + if result < 0: + raise click.BadParameter( + f"Invalid duration {value!r}: duration must be non-negative." + ) + return result @click.group() diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index a8b94b3..0c94074 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -49,9 +49,9 @@ class LabeledCrossReference(CrossReference): """A CrossReference enriched with human-readable labels and Biolink types from NodeNorm.""" subj_label: str - subj_biolink_type: list[str] + subj_biolink_type: tuple[str, ...] obj_label: str - obj_biolink_type: list[str] + obj_biolink_type: tuple[str, ...] def __str__(self): return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", subj_biolink_type="{self.subj_biolink_type}", obj_label="{self.obj_label}", obj_biolink_type="{self.obj_biolink_type}")""" diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index c5dcada..4d2f9a2 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -36,7 +36,8 @@ def __init__( to always issue a HEAD request. Defaults to 3 hours. :param timeout: HTTP request timeout in seconds. """ - # We assume the URL base is correct (if not, we can fix it later). + if not url_base.endswith("/"): + url_base += "/" self.url_base = url_base self.retries = retries self.freshness_seconds = freshness_seconds @@ -368,19 +369,3 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024 * 1024): f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes" ) return local_path_to_download_to - - @functools.lru_cache(maxsize=None) - def get_downloaded_dir(self, dirpath: str): - """ - Download a directory recursively. - - NOTE: This method is not implemented in the Python-based downloader. - Use get_downloaded_file() for individual files instead. - - Raises: - NotImplementedError: This method is not implemented - """ - raise NotImplementedError( - "Recursive directory downloads are not supported. " - "Use get_downloaded_file() for individual files." 
- ) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index 11a7da9..9ce916d 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -12,9 +12,9 @@ class Identifier: curie: str label: str = "" - biolink_type: list[str] = dataclasses.field(default_factory=list) - taxa: list[str] = dataclasses.field(default_factory=list) - description: list[str] = dataclasses.field(default_factory=list) + biolink_type: tuple[str, ...] = () + taxa: tuple[str, ...] = () + description: tuple[str, ...] = () def __lt__(self, other): return self.curie < other.curie @@ -24,9 +24,9 @@ def from_dict(d: dict) -> "Identifier": return Identifier( curie=d["identifier"], label=d.get("label", ""), - biolink_type=d.get("type", []), - taxa=d.get("taxa", []), - description=d.get("description", []), + biolink_type=tuple(d.get("type", [])), + taxa=tuple(d.get("taxa", [])), + description=tuple(d.get("description", [])), ) @@ -84,6 +84,8 @@ def normalize_curie( ``type``, etc.), or ``None`` if the CURIE is not recognised by NodeNorm. :raises requests.HTTPError: If the API returns a non-2xx status code. """ + if not self.nodenorm_url: + return None response = requests.get( f"{self.nodenorm_url}get_normalized_nodes", params={ diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 9fdb5da..114d269 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -96,13 +96,13 @@ def test_creation(self): obj="B:2", filename="f", subj_label="Alpha", - subj_biolink_type=["biolink:Disease"], + subj_biolink_type=("biolink:Disease",), obj_label="Beta", - obj_biolink_type=["biolink:Gene"], + obj_biolink_type=("biolink:Gene",), ) assert lxr.subj == "A:1" assert lxr.subj_label == "Alpha" - assert lxr.obj_biolink_type == ["biolink:Gene"] + assert lxr.obj_biolink_type == ("biolink:Gene",) def test_inherits_from_cross_reference(self): lxr = LabeledCrossReference( @@ -111,9 +111,9 @@ def test_inherits_from_cross_reference(self): obj="B:2", filename="f", subj_label="", - subj_biolink_type=[], + subj_biolink_type=(), obj_label="", - obj_biolink_type=[], + obj_biolink_type=(), ) assert isinstance(lxr, CrossReference) @@ -124,9 +124,9 @@ def test_curies_property(self): obj="B:2", filename="f", subj_label="", - subj_biolink_type=[], + subj_biolink_type=(), obj_label="", - obj_biolink_type=[], + obj_biolink_type=(), ) assert lxr.curies == frozenset({"A:1", "B:2"}) @@ -137,9 +137,9 @@ def test_str(self): obj="B:2", filename="f", subj_label="Alpha", - subj_biolink_type=["biolink:Disease"], + subj_biolink_type=("biolink:Disease",), obj_label="Beta", - obj_biolink_type=["biolink:Gene"], + obj_biolink_type=("biolink:Gene",), ) s = str(lxr) assert "A:1" in s diff --git a/tests/test_cli.py b/tests/test_cli.py index ac75fe6..3d73e55 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -47,6 +47,8 @@ def test_valid_inputs(self, value, expected): "3.5h", "1.5", "3x", + "-5", + "-5h", ], ) def test_invalid_inputs_raise_bad_parameter(self, value): diff --git a/tests/test_downloader.py b/tests/test_downloader.py index d23b323..7fe8609 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -60,6 +60,19 @@ def test_custom_freshness_seconds(self, tmp_path): ) assert dl.freshness_seconds == 0 + def test_url_base_trailing_slash_added(self, tmp_path): + """url_base without trailing slash gets one appended automatically.""" + dl = BabelDownloader( + url_base="https://example.com/path", local_path=str(tmp_path) + ) + assert dl.url_base == 
"https://example.com/path/" + + def test_url_base_with_trailing_slash_unchanged(self, tmp_path): + dl = BabelDownloader( + url_base="https://example.com/path/", local_path=str(tmp_path) + ) + assert dl.url_base == "https://example.com/path/" + def test_invalid_path_raises_value_error(self): """Using a file path (not a directory) should raise ValueError.""" with tempfile.NamedTemporaryFile(delete=False) as f: @@ -610,16 +623,6 @@ def test_append_mode_on_resume(self, tmp_path): assert out_path.read_bytes() == b"startend" -class TestGetDownloadedDir: - """Tests for get_downloaded_dir.""" - - def test_raises_not_implemented(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - dl.get_downloaded_dir.cache_clear() - with pytest.raises(NotImplementedError): - dl.get_downloaded_dir("some/dir") - - # ========================================================================== # Integration Tests — require network access # ========================================================================== diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py index 5fb088d..8b30fcd 100644 --- a/tests/test_nodenorm.py +++ b/tests/test_nodenorm.py @@ -28,21 +28,21 @@ def test_creation_with_defaults(self): ident = Identifier(curie="MONDO:0004979") assert ident.curie == "MONDO:0004979" assert ident.label == "" - assert ident.biolink_type == [] - assert ident.taxa == [] - assert ident.description == [] + assert ident.biolink_type == () + assert ident.taxa == () + assert ident.description == () def test_full_creation(self): ident = Identifier( curie="MONDO:0004979", label="asthma", - biolink_type=["biolink:Disease"], - taxa=["NCBITaxon:9606"], - description=["A chronic respiratory disease"], + biolink_type=("biolink:Disease",), + taxa=("NCBITaxon:9606",), + description=("A chronic respiratory disease",), ) assert ident.label == "asthma" - assert ident.biolink_type == ["biolink:Disease"] - assert ident.taxa == ["NCBITaxon:9606"] + assert ident.biolink_type == ("biolink:Disease",) + assert ident.taxa == ("NCBITaxon:9606",) def test_from_dict_minimal(self): d = {"identifier": "X:1"} @@ -61,15 +61,15 @@ def test_from_dict_full(self): ident = Identifier.from_dict(d) assert ident.curie == "X:1" assert ident.label == "Alpha" - assert ident.biolink_type == ["biolink:NamedThing"] - assert ident.taxa == ["NCBITaxon:9606"] + assert ident.biolink_type == ("biolink:NamedThing",) + assert ident.taxa == ("NCBITaxon:9606",) def test_from_dict_partial(self): d = {"identifier": "X:1", "label": "Beta"} ident = Identifier.from_dict(d) assert ident.curie == "X:1" assert ident.label == "Beta" - assert ident.biolink_type == [] + assert ident.biolink_type == () def test_lt_ordering(self): a = Identifier(curie="A:1") @@ -102,6 +102,15 @@ def test_custom_url(self): nn = NodeNorm(nodenorm_url="https://custom.api/") assert nn.nodenorm_url == "https://custom.api/" + def test_empty_url_normalize_curie_returns_none_without_network(self): + """NodeNorm('') must not make any HTTP calls and must return None.""" + nn = NodeNorm("") + nn.normalize_curie.cache_clear() + with patch("babel_explorer.core.nodenorm.requests.get") as mock_get: + result = nn.normalize_curie("MONDO:0004979") + mock_get.assert_not_called() + assert result is None + class TestNormalizeCurieMocked: """Unit tests for NodeNorm.normalize_curie() with mocked HTTP responses.""" From 49e27c01a3a0f10916f1b99d13e412caa1ef4d43 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 9 Apr 2026 11:45:33 -0600 Subject: [PATCH 64/66] Add 
--format [text|json|tsv|csv] option to all CLI commands Introduces a central formatting.py module (write_records + _record_to_dict) that serialises any dataclass to text, JSON, TSV, or CSV without touching domain objects. A format_option decorator wires --format and --json-indent onto xrefs, ids, and test-concord. test-concord injects a query_curie column for non-text formats. 30 new unit tests in test_formatting.py; 7 CLI format tests added to test_cli.py. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/cli.py | 56 ++++++-- src/babel_explorer/formatting.py | 71 ++++++++++ tests/test_cli.py | 138 +++++++++++++++++- tests/test_formatting.py | 233 +++++++++++++++++++++++++++++++ 4 files changed, 483 insertions(+), 15 deletions(-) create mode 100644 src/babel_explorer/formatting.py create mode 100644 tests/test_formatting.py diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index bcd8787..098e831 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -4,6 +4,26 @@ from babel_explorer.core.downloader import BabelDownloader from babel_explorer.core.babel_xrefs import BabelXRefs from babel_explorer.core.nodenorm import NodeNorm +from babel_explorer.formatting import write_records, _record_to_dict + + +def format_option(f): + """Decorator adding --format and --json-indent options to a command.""" + f = click.option( + "--format", + "fmt", + default="text", + type=click.Choice(["text", "json", "tsv", "csv"]), + show_default=True, + help="Output format", + )(f) + f = click.option( + "--json-indent", + default=2, + show_default=True, + help="Indentation depth for JSON output", + )(f) + return f def parse_duration(value: str) -> int | float: @@ -83,6 +103,7 @@ def cli(): help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.", ) +@format_option def xrefs( curies: list[str], babel_url: str, @@ -91,6 +112,8 @@ def xrefs( recurse: bool, labels: bool, check_download: str, + fmt: str, + json_indent: int, ): """ Fetches and prints the cross-references (xrefs) for the given CURIEs. @@ -113,8 +136,7 @@ def xrefs( NodeNorm(nodenorm_url), ) xrefs = bxref.get_curie_xrefs(curies, recurse, label_curies=labels) - for xref in xrefs: - print(xref) + write_records(xrefs, fmt=fmt, indent=json_indent) @cli.command("ids") @@ -139,7 +161,8 @@ def xrefs( help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.", ) -def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): +@format_option +def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str, fmt: str, json_indent: int): """ Fetches and prints the ID records for the given CURIEs, along with Biolink type if provided. 
@@ -160,8 +183,7 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness) ) xrefs = bxref.get_curie_ids(curies) - for xref in xrefs: - print(xref) + write_records(xrefs, fmt=fmt, indent=json_indent) @cli.command("test-concord") @@ -172,21 +194,27 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes", ) -def test_concord(curies, nodenorm_url): +@format_option +def test_concord(curies, nodenorm_url, fmt, json_indent): """For each CURIE, print the current NodeNorm clique (all equivalent identifiers, labels, and Biolink types). Useful for inspecting how a potential Babel concordance change would affect NodeNorm: run before and after a Babel rebuild to see how cliques would shift. """ nodenorm = NodeNorm(nodenorm_url) - for curie in curies: - identifiers = nodenorm.get_clique_identifiers(curie) - for identifier in identifiers: - biolink = ", ".join(identifier.biolink_type) - if identifier.label: - print(f"{curie}\t{identifier.curie}\t{identifier.label}\t{biolink}") - else: - print(f"{curie}\t{identifier.curie}\t\t{biolink}") + if fmt == "text": + for curie in curies: + for identifier in nodenorm.get_clique_identifiers(curie): + biolink = ", ".join(identifier.biolink_type) + label = identifier.label or "" + print(f"{curie}\t{identifier.curie}\t{label}\t{biolink}") + else: + rows = [ + {"query_curie": curie, **_record_to_dict(ident)} + for curie in curies + for ident in nodenorm.get_clique_identifiers(curie) + ] + write_records(rows, fmt=fmt, indent=json_indent) if __name__ == "__main__": diff --git a/src/babel_explorer/formatting.py b/src/babel_explorer/formatting.py new file mode 100644 index 0000000..7e7701e --- /dev/null +++ b/src/babel_explorer/formatting.py @@ -0,0 +1,71 @@ +"""Output formatting for babel-explorer CLI commands. + +Provides write_records() to render any list of dataclass records (or plain +dicts) as text, JSON, TSV, or CSV. +""" + +import csv +import dataclasses +import json +import sys +from typing import Any + + +def _record_to_dict(record) -> dict[str, Any]: + """Convert a dataclass (or plain dict) to a flat dict. + + Handles IdentifierRecord's extra_fields, which asdict() returns as a + list of [col, val] pairs rather than a nested dict. + """ + if isinstance(record, dict): + return record + d = dataclasses.asdict(record) + if "extra_fields" in d: + for col, val in d.pop("extra_fields"): + d[col] = val + return d + + +def _flatten_for_tabular(row: dict) -> dict: + """Convert list/tuple fields to pipe-joined strings for TSV/CSV output.""" + return {k: "|".join(v) if isinstance(v, (list, tuple)) else v for k, v in row.items()} + + +def write_records(records, fmt: str, indent: int = 2, file=None): + """Write an iterable of dataclass records (or dicts) in the requested format. + + :param records: Iterable of dataclass instances or plain dicts. + :param fmt: One of "text", "json", "tsv", "csv". + :param indent: JSON indentation depth (ignored for other formats). + :param file: Output file-like object; defaults to sys.stdout. + :raises ValueError: If fmt is not a recognised format. 
+ """ + if file is None: + file = sys.stdout + records = list(records) + + if fmt == "text": + for r in records: + print(r, file=file) + + elif fmt == "json": + rows = [_record_to_dict(r) for r in records] + json.dump(rows, file, indent=indent, default=str) + print(file=file) # trailing newline + + elif fmt in ("tsv", "csv"): + if not records: + return + rows = [_flatten_for_tabular(_record_to_dict(r)) for r in records] + delimiter = "\t" if fmt == "tsv" else "," + writer = csv.DictWriter( + file, + fieldnames=list(rows[0].keys()), + delimiter=delimiter, + lineterminator="\n", + ) + writer.writeheader() + writer.writerows(rows) + + else: + raise ValueError(f"Unknown format: {fmt!r}") diff --git a/tests/test_cli.py b/tests/test_cli.py index 3d73e55..62112d4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,12 +4,16 @@ Unit tests — no network required. """ -import pytest +import json + import click +import pytest from click.testing import CliRunner from unittest.mock import patch, MagicMock from babel_explorer.cli import parse_duration, cli +from babel_explorer.core.babel_xrefs import CrossReference, IdentifierRecord +from babel_explorer.core.nodenorm import Identifier # ========================================================================== @@ -193,3 +197,135 @@ def test_test_concord_multiple_curies(self): assert mock_nn.return_value.get_clique_identifiers.call_count == 2 assert "Alpha" in result.output assert "Beta" in result.output + + +class TestOutputFormats: + """Tests for --format option on all commands.""" + + # Shared real dataclass instances (no mocking needed for formatting logic) + _xref = CrossReference(filename="Concord.parquet", subj="A:1", pred="skos:exactMatch", obj="B:2") + _id_record = IdentifierRecord(curie="A:1", extra_fields=(("type", "gene"), ("label", "Alpha"))) + _identifier = Identifier( + curie="MONDO:0004979", label="asthma", + biolink_type=("biolink:Disease",), taxa=(), description=(), + ) + + # -- xrefs -- + + def test_xrefs_format_json(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): + mock_bx.return_value.get_curie_xrefs.return_value = [self._xref] + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "json"]) + + assert result.exit_code == 0 + data = json.loads(result.output) + assert isinstance(data, list) + assert data[0]["subj"] == "A:1" + assert data[0]["obj"] == "B:2" + + def test_xrefs_format_tsv(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): + mock_bx.return_value.get_curie_xrefs.return_value = [self._xref] + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "tsv"]) + + assert result.exit_code == 0 + lines = result.output.splitlines() + assert lines[0] == "filename\tsubj\tpred\tobj" + assert "A:1" in lines[1] + + def test_xrefs_format_csv(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): + mock_bx.return_value.get_curie_xrefs.return_value = [self._xref] + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "csv"]) + + assert result.exit_code == 0 + lines = result.output.splitlines() + assert lines[0] == "filename,subj,pred,obj" + assert "A:1" in lines[1] + + # -- ids -- + + def 
test_ids_format_json_expands_extra_fields(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + ): + mock_bx.return_value.get_curie_ids.return_value = [self._id_record] + result = runner.invoke(cli, ["ids", "A:1", "--format", "json"]) + + assert result.exit_code == 0 + data = json.loads(result.output) + assert data[0]["curie"] == "A:1" + assert data[0]["type"] == "gene" + assert data[0]["label"] == "Alpha" + assert "extra_fields" not in data[0] + + def test_ids_format_tsv_expands_extra_fields(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + ): + mock_bx.return_value.get_curie_ids.return_value = [self._id_record] + result = runner.invoke(cli, ["ids", "A:1", "--format", "tsv"]) + + assert result.exit_code == 0 + lines = result.output.splitlines() + assert "type" in lines[0] + assert "label" in lines[0] + assert "gene" in lines[1] + + # -- test-concord -- + + def test_test_concord_format_json_includes_query_curie(self): + runner = CliRunner() + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [self._identifier] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979", "--format", "json"]) + + assert result.exit_code == 0 + data = json.loads(result.output) + assert data[0]["query_curie"] == "MONDO:0004979" + assert data[0]["curie"] == "MONDO:0004979" + assert data[0]["label"] == "asthma" + assert data[0]["biolink_type"] == ["biolink:Disease"] + + def test_test_concord_format_tsv(self): + runner = CliRunner() + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [self._identifier] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979", "--format", "tsv"]) + + assert result.exit_code == 0 + lines = result.output.splitlines() + assert "query_curie" in lines[0] + assert "MONDO:0004979" in lines[1] + + # -- format validation -- + + def test_invalid_format_rejected_by_click(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs"), + patch("babel_explorer.cli.NodeNorm"), + ): + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "xml"]) + + assert result.exit_code != 0 diff --git a/tests/test_formatting.py b/tests/test_formatting.py new file mode 100644 index 0000000..9b9aa90 --- /dev/null +++ b/tests/test_formatting.py @@ -0,0 +1,233 @@ +""" +Unit tests for formatting.py — no network, no mocking required. 
+""" + +import io +import json + +import pytest + +from babel_explorer.core.babel_xrefs import CrossReference, LabeledCrossReference, IdentifierRecord +from babel_explorer.core.nodenorm import Identifier +from babel_explorer.formatting import _record_to_dict, write_records + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def xref(): + return CrossReference(filename="Concord.parquet", subj="A:1", pred="skos:exactMatch", obj="B:2") + + +@pytest.fixture +def labeled_xref(): + return LabeledCrossReference( + filename="Concord.parquet", + subj="A:1", + pred="skos:exactMatch", + obj="B:2", + subj_label="Alpha", + subj_biolink_type=("biolink:Disease",), + obj_label="Beta", + obj_biolink_type=("biolink:Gene", "biolink:NamedThing"), + ) + + +@pytest.fixture +def id_record(): + return IdentifierRecord( + curie="A:1", + extra_fields=(("type", "gene"), ("label", "Alpha")), + ) + + +@pytest.fixture +def identifier(): + return Identifier( + curie="MONDO:0004979", + label="asthma", + biolink_type=("biolink:Disease",), + taxa=("NCBITaxon:9606",), + description=("A chronic inflammatory disease",), + ) + + +# --------------------------------------------------------------------------- +# Tests for _record_to_dict +# --------------------------------------------------------------------------- + + +class TestRecordToDict: + def test_cross_reference(self, xref): + d = _record_to_dict(xref) + assert d == {"filename": "Concord.parquet", "subj": "A:1", "pred": "skos:exactMatch", "obj": "B:2"} + + def test_labeled_cross_reference_has_all_eight_fields(self, labeled_xref): + d = _record_to_dict(labeled_xref) + assert set(d.keys()) == { + "filename", "subj", "pred", "obj", + "subj_label", "subj_biolink_type", "obj_label", "obj_biolink_type", + } + # dataclasses.asdict() preserves tuple types + assert d["subj_biolink_type"] == ("biolink:Disease",) + assert d["obj_biolink_type"] == ("biolink:Gene", "biolink:NamedThing") + + def test_identifier_record_extra_fields_expanded(self, id_record): + d = _record_to_dict(id_record) + assert "extra_fields" not in d + assert d["curie"] == "A:1" + assert d["type"] == "gene" + assert d["label"] == "Alpha" + + def test_identifier_record_no_extra_fields(self): + rec = IdentifierRecord(curie="X:1") + d = _record_to_dict(rec) + assert d == {"curie": "X:1"} + + def test_plain_dict_passthrough(self): + data = {"a": 1, "b": "hello"} + assert _record_to_dict(data) is data + + def test_identifier_dataclass(self, identifier): + d = _record_to_dict(identifier) + assert d["curie"] == "MONDO:0004979" + assert d["label"] == "asthma" + # dataclasses.asdict() preserves tuple types + assert d["biolink_type"] == ("biolink:Disease",) + assert d["taxa"] == ("NCBITaxon:9606",) + + +# --------------------------------------------------------------------------- +# Tests for write_records +# --------------------------------------------------------------------------- + + +class TestWriteRecords: + + # -- text format -- + + def test_text_uses_str(self, xref): + out = io.StringIO() + write_records([xref], "text", file=out) + assert out.getvalue().strip() == str(xref) + + def test_text_empty_no_output(self): + out = io.StringIO() + write_records([], "text", file=out) + assert out.getvalue() == "" + + def test_text_multiple_records(self, xref): + out = io.StringIO() + write_records([xref, xref], "text", file=out) + lines = out.getvalue().strip().splitlines() + assert 
len(lines) == 2 + + # -- json format -- + + def test_json_is_valid_list(self, xref): + out = io.StringIO() + write_records([xref], "json", file=out) + data = json.loads(out.getvalue()) + assert isinstance(data, list) + assert len(data) == 1 + assert data[0]["subj"] == "A:1" + + def test_json_empty_list(self): + out = io.StringIO() + write_records([], "json", file=out) + assert json.loads(out.getvalue()) == [] + + def test_json_indent_controls_formatting(self, xref): + out_pretty = io.StringIO() + write_records([xref], "json", indent=2, file=out_pretty) + + out_compact = io.StringIO() + write_records([xref], "json", indent=None, file=out_compact) + + # Pretty-printed output has more lines (has newlines per field) + assert out_pretty.getvalue().count("\n") > out_compact.getvalue().count("\n") + + def test_json_tuple_fields_serialized_as_arrays(self, labeled_xref): + # json.dump converts tuples to JSON arrays, so json.loads gives back lists + out = io.StringIO() + write_records([labeled_xref], "json", file=out) + data = json.loads(out.getvalue()) + assert isinstance(data[0]["subj_biolink_type"], list) + assert data[0]["obj_biolink_type"] == ["biolink:Gene", "biolink:NamedThing"] + + def test_json_plain_dict(self): + out = io.StringIO() + write_records([{"a": 1, "b": "x"}], "json", file=out) + assert json.loads(out.getvalue()) == [{"a": 1, "b": "x"}] + + # -- tsv format -- + + def test_tsv_has_header_row(self, xref): + out = io.StringIO() + write_records([xref], "tsv", file=out) + lines = out.getvalue().splitlines() + assert lines[0] == "filename\tsubj\tpred\tobj" + + def test_tsv_data_row(self, xref): + out = io.StringIO() + write_records([xref], "tsv", file=out) + lines = out.getvalue().splitlines() + assert lines[1] == "Concord.parquet\tA:1\tskos:exactMatch\tB:2" + + def test_tsv_tuple_fields_pipe_joined(self, labeled_xref): + out = io.StringIO() + write_records([labeled_xref], "tsv", file=out) + lines = out.getvalue().splitlines() + # Header row + assert "subj_biolink_type" in lines[0] + # Data row: multi-value tuple joined with pipe + assert "biolink:Gene|biolink:NamedThing" in lines[1] + + def test_tsv_empty_no_output(self): + out = io.StringIO() + write_records([], "tsv", file=out) + assert out.getvalue() == "" + + def test_tsv_identifier_record_extra_fields_expanded(self, id_record): + out = io.StringIO() + write_records([id_record], "tsv", file=out) + lines = out.getvalue().splitlines() + assert "curie" in lines[0] + assert "type" in lines[0] + assert "label" in lines[0] + assert "A:1" in lines[1] + + # -- csv format -- + + def test_csv_has_header_row(self, xref): + out = io.StringIO() + write_records([xref], "csv", file=out) + lines = out.getvalue().splitlines() + assert lines[0] == "filename,subj,pred,obj" + + def test_csv_data_row(self, xref): + out = io.StringIO() + write_records([xref], "csv", file=out) + lines = out.getvalue().splitlines() + assert lines[1] == "Concord.parquet,A:1,skos:exactMatch,B:2" + + def test_csv_empty_no_output(self): + out = io.StringIO() + write_records([], "csv", file=out) + assert out.getvalue() == "" + + def test_csv_tuple_fields_pipe_joined(self, labeled_xref): + out = io.StringIO() + write_records([labeled_xref], "csv", file=out) + lines = out.getvalue().splitlines() + assert "biolink:Gene|biolink:NamedThing" in lines[1] + + # -- invalid format -- + + def test_invalid_format_raises_value_error(self, xref): + out = io.StringIO() + with pytest.raises(ValueError, match="Unknown format"): + write_records([xref], "xml", file=out) From 
aecae5085acb623085b5894835e95dfb6fd4ce52 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 9 Apr 2026 12:10:12 -0600 Subject: [PATCH 65/66] Add console format with rich color highlighting; replace text default Replaces the 'text' default format with 'console', backed by the rich library. xrefs and test-concord highlight query CURIEs in bold cyan wherever they appear as subject or object; rich auto-strips markup when output is piped. ids uses console.print(str(record)) for TTY-aware plain output. formatting.py gains make_console() and hl_curie() utilities for new commands to reuse. LabeledCrossReference labels appear in parentheses next to CURIEs in console output. Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 1 + src/babel_explorer/cli.py | 51 ++++++++++++++++---- src/babel_explorer/formatting.py | 32 +++++++++--- tests/test_cli.py | 83 ++++++++++++++++++++++++++++++-- tests/test_formatting.py | 80 ++++++++++++++++++++++-------- uv.lock | 36 ++++++++++++++ 6 files changed, 241 insertions(+), 42 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 59c1b68..34a0be3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "click>=8.3.1", "duckdb>=1.4.2", "requests>=2.32.5", + "rich>=13", "tqdm>=4.67.0", ] diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 098e831..db0d46f 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -4,7 +4,9 @@ from babel_explorer.core.downloader import BabelDownloader from babel_explorer.core.babel_xrefs import BabelXRefs from babel_explorer.core.nodenorm import NodeNorm -from babel_explorer.formatting import write_records, _record_to_dict +from babel_explorer.core.babel_xrefs import LabeledCrossReference +from babel_explorer.formatting import write_records, _record_to_dict, make_console, hl_curie +from rich.markup import escape def format_option(f): @@ -12,8 +14,8 @@ def format_option(f): f = click.option( "--format", "fmt", - default="text", - type=click.Choice(["text", "json", "tsv", "csv"]), + default="console", + type=click.Choice(["console", "json", "tsv", "csv"]), show_default=True, help="Output format", )(f) @@ -136,7 +138,24 @@ def xrefs( NodeNorm(nodenorm_url), ) xrefs = bxref.get_curie_xrefs(curies, recurse, label_curies=labels) - write_records(xrefs, fmt=fmt, indent=json_indent) + + if fmt == "console": + console = make_console() + query_set = set(curies) + for xref in xrefs: + subj_str = hl_curie(xref.subj, xref.subj in query_set) + obj_str = hl_curie(xref.obj, xref.obj in query_set) + if isinstance(xref, LabeledCrossReference): + if xref.subj_label: + subj_str += f" ({escape(xref.subj_label)})" + if xref.obj_label: + obj_str += f" ({escape(xref.obj_label)})" + console.print( + f"{subj_str} [dim]{escape(xref.pred)}[/dim] " + f"{obj_str} [dim italic]{escape(xref.filename)}[/dim italic]" + ) + else: + write_records(xrefs, fmt=fmt, indent=json_indent) @cli.command("ids") @@ -183,7 +202,13 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str, BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness) ) xrefs = bxref.get_curie_ids(curies) - write_records(xrefs, fmt=fmt, indent=json_indent) + + if fmt == "console": + console = make_console() + for record in xrefs: + console.print(str(record)) + else: + write_records(xrefs, fmt=fmt, indent=json_indent) @cli.command("test-concord") @@ -202,12 +227,18 @@ def test_concord(curies, nodenorm_url, fmt, json_indent): run before and after a Babel rebuild to see how cliques would 
shift. """ nodenorm = NodeNorm(nodenorm_url) - if fmt == "text": + if fmt == "console": + console = make_console() + query_set = set(curies) for curie in curies: - for identifier in nodenorm.get_clique_identifiers(curie): - biolink = ", ".join(identifier.biolink_type) - label = identifier.label or "" - print(f"{curie}\t{identifier.curie}\t{label}\t{biolink}") + for ident in nodenorm.get_clique_identifiers(curie): + biolink = ", ".join(ident.biolink_type) + console.print( + f"{hl_curie(curie, True)} " + f"{hl_curie(ident.curie, ident.curie in query_set)} " + f"{escape(ident.label or '-')} " + f"[dim]{escape(biolink)}[/dim]" + ) else: rows = [ {"query_curie": curie, **_record_to_dict(ident)} diff --git a/src/babel_explorer/formatting.py b/src/babel_explorer/formatting.py index 7e7701e..191f1cd 100644 --- a/src/babel_explorer/formatting.py +++ b/src/babel_explorer/formatting.py @@ -1,7 +1,8 @@ """Output formatting for babel-explorer CLI commands. -Provides write_records() to render any list of dataclass records (or plain -dicts) as text, JSON, TSV, or CSV. +Provides: +- write_records() for machine-readable output (json, tsv, csv) +- make_console() and hl_curie() for rich console output """ import csv @@ -10,6 +11,9 @@ import sys from typing import Any +from rich.console import Console +from rich.markup import escape + def _record_to_dict(record) -> dict[str, Any]: """Convert a dataclass (or plain dict) to a flat dict. @@ -31,11 +35,27 @@ def _flatten_for_tabular(row: dict) -> dict: return {k: "|".join(v) if isinstance(v, (list, tuple)) else v for k, v in row.items()} +def make_console(file=None) -> Console: + """Create a rich Console with babel-explorer defaults. + + Auto-detects TTY and NO_COLOR; strips markup when output is piped. + highlight=False prevents rich from auto-highlighting numbers and strings. + """ + return Console(file=file, highlight=False) + + +def hl_curie(curie: str, highlight: bool) -> str: + """Return rich markup for a CURIE — bold cyan if it is a query CURIE.""" + escaped = escape(curie) + return f"[bold cyan]{escaped}[/bold cyan]" if highlight else escaped + + def write_records(records, fmt: str, indent: int = 2, file=None): """Write an iterable of dataclass records (or dicts) in the requested format. :param records: Iterable of dataclass instances or plain dicts. - :param fmt: One of "text", "json", "tsv", "csv". + :param fmt: One of "json", "tsv", "csv". (Console output is handled by + make_console/hl_curie in the CLI layer.) :param indent: JSON indentation depth (ignored for other formats). :param file: Output file-like object; defaults to sys.stdout. :raises ValueError: If fmt is not a recognised format. 
@@ -44,11 +64,7 @@ def write_records(records, fmt: str, indent: int = 2, file=None): file = sys.stdout records = list(records) - if fmt == "text": - for r in records: - print(r, file=file) - - elif fmt == "json": + if fmt == "json": rows = [_record_to_dict(r) for r in records] json.dump(rows, file, indent=indent, default=str) print(file=file) # trailing newline diff --git a/tests/test_cli.py b/tests/test_cli.py index 62112d4..f4f6dbb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -67,6 +67,10 @@ def test_xrefs_happy_path(self): runner = CliRunner() mock_xref = MagicMock() mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" + mock_xref.subj = "A:1" + mock_xref.obj = "B:2" + mock_xref.pred = "skos:exactMatch" + mock_xref.filename = "test.parquet" with ( patch("babel_explorer.cli.BabelDownloader"), @@ -84,7 +88,10 @@ def test_xrefs_happy_path(self): def test_xrefs_recurse_and_labels_flags(self): runner = CliRunner() mock_xref = MagicMock() - mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" + mock_xref.subj = "A:1" + mock_xref.obj = "B:2" + mock_xref.pred = "skos:exactMatch" + mock_xref.filename = "test.parquet" with ( patch("babel_explorer.cli.BabelDownloader"), @@ -210,7 +217,65 @@ class TestOutputFormats: biolink_type=("biolink:Disease",), taxa=(), description=(), ) - # -- xrefs -- + # -- console format (default) -- + + def test_xrefs_default_format_is_console(self): + """Default format is console — output contains the CURIEs as plain text (no TTY in runner).""" + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): + mock_bx.return_value.get_curie_xrefs.return_value = [self._xref] + result = runner.invoke(cli, ["xrefs", "A:1"]) + + assert result.exit_code == 0 + # Rich strips markup on non-TTY; plain CURIEs and predicate appear + assert "A:1" in result.output + assert "B:2" in result.output + assert "skos:exactMatch" in result.output + + def test_xrefs_console_shows_query_curie(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): + mock_bx.return_value.get_curie_xrefs.return_value = [self._xref] + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "console"]) + + assert result.exit_code == 0 + assert "A:1" in result.output + + def test_test_concord_console_format(self): + runner = CliRunner() + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [self._identifier] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979", "--format", "console"]) + + assert result.exit_code == 0 + assert "MONDO:0004979" in result.output + assert "asthma" in result.output + assert "biolink:Disease" in result.output + + def test_test_concord_console_no_label_shows_dash(self): + """Identifiers with no label display '-' in console format.""" + runner = CliRunner() + mock_ident = MagicMock() + mock_ident.curie = "MONDO:0004979" + mock_ident.label = None + mock_ident.biolink_type = ["biolink:Disease"] + + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [mock_ident] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979", "--format", "console"]) + + assert result.exit_code == 0 + assert "-" in result.output + + # -- json format -- def test_xrefs_format_json(self): runner = CliRunner() @@ 
-291,7 +356,7 @@ def test_ids_format_tsv_expands_extra_fields(self): assert "label" in lines[0] assert "gene" in lines[1] - # -- test-concord -- + # -- test-concord structured formats -- def test_test_concord_format_json_includes_query_curie(self): runner = CliRunner() @@ -329,3 +394,15 @@ def test_invalid_format_rejected_by_click(self): result = runner.invoke(cli, ["xrefs", "A:1", "--format", "xml"]) assert result.exit_code != 0 + + def test_text_format_rejected_by_click(self): + """'text' was removed; it is no longer a valid choice.""" + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs"), + patch("babel_explorer.cli.NodeNorm"), + ): + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "text"]) + + assert result.exit_code != 0 diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 9b9aa90..2d402db 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -6,10 +6,11 @@ import json import pytest +from rich.console import Console from babel_explorer.core.babel_xrefs import CrossReference, LabeledCrossReference, IdentifierRecord from babel_explorer.core.nodenorm import Identifier -from babel_explorer.formatting import _record_to_dict, write_records +from babel_explorer.formatting import _record_to_dict, write_records, make_console, hl_curie # --------------------------------------------------------------------------- @@ -55,6 +56,50 @@ def identifier(): ) +# --------------------------------------------------------------------------- +# Tests for make_console and hl_curie +# --------------------------------------------------------------------------- + + +class TestConsoleUtilities: + def test_make_console_returns_console(self): + console = make_console() + assert isinstance(console, Console) + + def test_make_console_accepts_file(self): + out = io.StringIO() + console = make_console(file=out) + assert isinstance(console, Console) + console.print("hello") + assert "hello" in out.getvalue() + + def test_hl_curie_highlighted_contains_markup(self): + result = hl_curie("HGNC:1100", highlight=True) + assert "bold cyan" in result + assert "HGNC:1100" in result + + def test_hl_curie_not_highlighted_is_plain(self): + result = hl_curie("HGNC:1100", highlight=False) + assert result == "HGNC:1100" + assert "[" not in result + + def test_hl_curie_highlighted_renders_correctly(self): + """Markup renders to plain text on a non-TTY console.""" + out = io.StringIO() + console = Console(file=out, highlight=False, no_color=True) + console.print(hl_curie("HGNC:1100", highlight=True)) + assert "HGNC:1100" in out.getvalue() + + def test_hl_curie_highlighted_renders_with_color(self): + """On a forced-TTY console, ANSI codes are emitted.""" + out = io.StringIO() + console = Console(file=out, highlight=False, force_terminal=True) + console.print(hl_curie("HGNC:1100", highlight=True)) + output = out.getvalue() + assert "HGNC:1100" in output + assert "\x1b[" in output # ANSI escape present + + # --------------------------------------------------------------------------- # Tests for _record_to_dict # --------------------------------------------------------------------------- @@ -107,24 +152,6 @@ def test_identifier_dataclass(self, identifier): class TestWriteRecords: - # -- text format -- - - def test_text_uses_str(self, xref): - out = io.StringIO() - write_records([xref], "text", file=out) - assert out.getvalue().strip() == str(xref) - - def test_text_empty_no_output(self): - out = io.StringIO() - write_records([], 
"text", file=out) - assert out.getvalue() == "" - - def test_text_multiple_records(self, xref): - out = io.StringIO() - write_records([xref, xref], "text", file=out) - lines = out.getvalue().strip().splitlines() - assert len(lines) == 2 - # -- json format -- def test_json_is_valid_list(self, xref): @@ -225,9 +252,20 @@ def test_csv_tuple_fields_pipe_joined(self, labeled_xref): lines = out.getvalue().splitlines() assert "biolink:Gene|biolink:NamedThing" in lines[1] - # -- invalid format -- + # -- invalid formats (including console, which is handled at CLI layer) -- + + def test_text_format_raises_value_error(self, xref): + out = io.StringIO() + with pytest.raises(ValueError, match="Unknown format"): + write_records([xref], "text", file=out) + + def test_console_format_raises_value_error(self, xref): + """Console format is handled by the CLI, not write_records.""" + out = io.StringIO() + with pytest.raises(ValueError, match="Unknown format"): + write_records([xref], "console", file=out) - def test_invalid_format_raises_value_error(self, xref): + def test_unknown_format_raises_value_error(self, xref): out = io.StringIO() with pytest.raises(ValueError, match="Unknown format"): write_records([xref], "xml", file=out) diff --git a/uv.lock b/uv.lock index b8496b5..7b201e4 100644 --- a/uv.lock +++ b/uv.lock @@ -10,6 +10,7 @@ dependencies = [ { name = "click" }, { name = "duckdb" }, { name = "requests" }, + { name = "rich" }, { name = "tqdm" }, ] @@ -26,6 +27,7 @@ requires-dist = [ { name = "click", specifier = ">=8.3.1" }, { name = "duckdb", specifier = ">=1.4.2" }, { name = "requests", specifier = ">=2.32.5" }, + { name = "rich", specifier = ">=13" }, { name = "tqdm", specifier = ">=4.67.0" }, ] @@ -212,6 +214,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + [[package]] name = "packaging" version = "26.0" @@ -316,6 +339,19 
@@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, ] +[[package]] +name = "rich" +version = "14.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, +] + [[package]] name = "ruff" version = "0.15.2" From f7cde3a77d72aa70bc22b3aa3d29165e124d5fe6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 9 Apr 2026 13:41:39 -0600 Subject: [PATCH 66/66] Fix Identifier.from_dict splitting string fields into characters tuple() on a bare string iterates its characters, so biolink_type, taxa, and description would become ('b','i','o',...) when NodeNorm returns them as strings rather than lists. _to_tuple() now wraps a bare string in a 1-tuple. Four new unit tests cover the string case for each field. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/nodenorm.py | 12 ++++++--- tests/test_nodenorm.py | 38 +++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index 9ce916d..4f0f6d1 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -21,12 +21,18 @@ def __lt__(self, other): @staticmethod def from_dict(d: dict) -> "Identifier": + def _to_tuple(val) -> tuple[str, ...]: + """Coerce a string or list to a tuple — guards against iterating string chars.""" + if not val: + return () + return (val,) if isinstance(val, str) else tuple(val) + return Identifier( curie=d["identifier"], label=d.get("label", ""), - biolink_type=tuple(d.get("type", [])), - taxa=tuple(d.get("taxa", [])), - description=tuple(d.get("description", [])), + biolink_type=_to_tuple(d.get("type")), + taxa=_to_tuple(d.get("taxa")), + description=_to_tuple(d.get("description")), ) diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py index 8b30fcd..57b6dab 100644 --- a/tests/test_nodenorm.py +++ b/tests/test_nodenorm.py @@ -71,6 +71,44 @@ def test_from_dict_partial(self): assert ident.label == "Beta" assert ident.biolink_type == () + def test_from_dict_type_as_string(self): + """NodeNorm may return 'type' as a bare string for individual identifiers.""" + d = {"identifier": "X:1", "type": "biolink:Disease"} + ident = Identifier.from_dict(d) + assert ident.biolink_type == ("biolink:Disease",), ( + "biolink_type should be a 1-tuple, not a tuple of characters" + ) + + def test_from_dict_description_as_string(self): + """NodeNorm may return 'description' as a bare string.""" + d = {"identifier": "X:1", "description": "A chronic disease"} + ident = Identifier.from_dict(d) + assert ident.description == ("A chronic disease",), ( + "description should be a 1-tuple, not a tuple of 
characters" + ) + + def test_from_dict_taxa_as_string(self): + """NodeNorm may return 'taxa' as a bare string.""" + d = {"identifier": "X:1", "taxa": "NCBITaxon:9606"} + ident = Identifier.from_dict(d) + assert ident.taxa == ("NCBITaxon:9606",), ( + "taxa should be a 1-tuple, not a tuple of characters" + ) + + def test_from_dict_all_fields_as_strings(self): + """All three tuple fields as strings produce correct single-element tuples.""" + d = { + "identifier": "X:1", + "label": "Alpha", + "type": "biolink:NamedThing", + "taxa": "NCBITaxon:9606", + "description": "Some description", + } + ident = Identifier.from_dict(d) + assert ident.biolink_type == ("biolink:NamedThing",) + assert ident.taxa == ("NCBITaxon:9606",) + assert ident.description == ("Some description",) + def test_lt_ordering(self): a = Identifier(curie="A:1") b = Identifier(curie="B:2")