From fcc27c21f59dbd0578eb744511bf072912869ad5 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 2 Dec 2025 15:38:25 -0500 Subject: [PATCH 01/66] This initializes a uv package in this repository. --- .python-version | 1 + main.py | 6 ++++++ pyproject.toml | 7 +++++++ 3 files changed, 14 insertions(+) create mode 100644 .python-version create mode 100644 main.py create mode 100644 pyproject.toml diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..2c07333 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/main.py b/main.py new file mode 100644 index 0000000..55226a0 --- /dev/null +++ b/main.py @@ -0,0 +1,6 @@ +def main(): + print("Hello from babel-xrefs!") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..826dbc8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "babel-xrefs" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [] From 876353dae4b62b7f6ea9872cbfb29cb2fe221bc7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 2 Dec 2025 15:42:12 -0500 Subject: [PATCH 02/66] Added basic CLI. --- main.py | 6 ------ pyproject.toml | 17 ++++++++++++++++- src/babel_xrefs/__init__.py | 0 src/babel_xrefs/cli.py | 9 +++++++++ 4 files changed, 25 insertions(+), 7 deletions(-) delete mode 100644 main.py create mode 100644 src/babel_xrefs/__init__.py create mode 100644 src/babel_xrefs/cli.py diff --git a/main.py b/main.py deleted file mode 100644 index 55226a0..0000000 --- a/main.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - print("Hello from babel-xrefs!") - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml index 826dbc8..7c84773 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,4 +4,19 @@ version = "0.1.0" description = "Add your description here" readme = "README.md" requires-python = ">=3.11" -dependencies = [] +dependencies = [ + "click>=8.3.1", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[dependency-groups] +dev = [ + "pytest>=8.3.5", + "ruff>=0.11.0", +] + +[project.scripts] +babel-xrefs = "babel_xrefs.cli:main" diff --git a/src/babel_xrefs/__init__.py b/src/babel_xrefs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py new file mode 100644 index 0000000..537fe60 --- /dev/null +++ b/src/babel_xrefs/cli.py @@ -0,0 +1,9 @@ +# Command line interface for babel-xrefs +import click + +@click.command() +def main(): + pass + +if __name__ == "__main__": + main() From ec1d1f09b11f45b5bc31466f32d52b442b1423ac Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 2 Dec 2025 16:09:07 -0500 Subject: [PATCH 03/66] Add /data to the .gitignore. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index b7faf40..67d8b31 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Ignore data files. +/data + # Byte-compiled / optimized / DLL files __pycache__/ *.py[codz] From eff8f26988981601c578e54ee5f47a867c77fdd0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 3 Dec 2025 02:27:10 -0500 Subject: [PATCH 04/66] Initial implementation of a basic xref query-er. 
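The core pattern this patch introduces is: download a Babel Parquet file, then point DuckDB at it and filter rows by CURIE. A minimal sketch of that query pattern, assuming only the subj/pred/obj columns used in the code below (the CURIE and local file path are illustrative):

```python
import duckdb

# Query a Babel Concord Parquet file for one CURIE, on either side of the
# cross-reference. DuckDB can read the Parquet file directly from SQL.
db = duckdb.connect()  # in-memory database
rows = db.execute(
    "SELECT subj, pred, obj FROM read_parquet('data/duckdb/Concord.parquet') "
    "WHERE subj IN $1 OR obj IN $1",
    [["MONDO:0004979"]],
).fetchall()
for subj, pred, obj in rows:
    print(subj, pred, obj)
```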
--- pyproject.toml | 4 +- src/__init__.py | 0 src/babel_xrefs/babel_xrefs.py | 33 ++++++++++ src/babel_xrefs/cli.py | 38 +++++++++++- src/babel_xrefs/core/__init__.py | 0 src/babel_xrefs/core/downloader.py | 98 ++++++++++++++++++++++++++++++ src/babel_xrefs/core/model.py | 2 + 7 files changed, 171 insertions(+), 4 deletions(-) create mode 100644 src/__init__.py create mode 100644 src/babel_xrefs/babel_xrefs.py create mode 100644 src/babel_xrefs/core/__init__.py create mode 100644 src/babel_xrefs/core/downloader.py create mode 100644 src/babel_xrefs/core/model.py diff --git a/pyproject.toml b/pyproject.toml index 7c84773..5696f67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,8 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "click>=8.3.1", + "duckdb>=1.4.2", + "requests>=2.32.5", ] [build-system] @@ -19,4 +21,4 @@ dev = [ ] [project.scripts] -babel-xrefs = "babel_xrefs.cli:main" +babel-xrefs = "babel_xrefs.cli:cli" diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py new file mode 100644 index 0000000..3ec44ad --- /dev/null +++ b/src/babel_xrefs/babel_xrefs.py @@ -0,0 +1,33 @@ +# Babel XRefs is a tool for accessing and querying the intermediate files +# that we make available with Babel builds. This allows you to find out +# why we consider two identifiers to be identical. +import logging +import duckdb + +from babel_xrefs.core.downloader import BabelDownloader + + +class BabelXRefs: + def __init__(self, downloader: BabelDownloader): + self.downloader = downloader + + def get_curie_xrefs(self, curies: list[str]): + """ + Search for all identifiers that are cross-referenced to the given CURIE. + + :param curie: A CURIE to search for. + :return: A list of cross-references containing that CURIE. + """ + + concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') + concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/ConcordMetadata.parquet') + + # Query the Parquet files using DuckDB. + duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') + db = duckdb.connect(duckdb_path) + concord_table = db.read_parquet(concord_parquet) + xrefs = db.execute(f"SELECT * FROM concord_table WHERE subj IN $1 OR obj in $1", [curies]) + + # TODO: convert into case classes. + + return xrefs.fetchall() diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index 537fe60..691fcd6 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -1,9 +1,41 @@ # Command line interface for babel-xrefs import click +import logging +import babel_xrefs +from babel_xrefs.core.downloader import BabelDownloader +from babel_xrefs.babel_xrefs import BabelXRefs -@click.command() -def main(): + +@click.group() +def cli(): pass +@cli.command("xrefs") +@click.argument("curies", type=str, required=True, nargs=-1) +@click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") +@click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") +def xrefs(curies: list[str], babel_url: str, local_dir: str): + """ + Fetches and prints the cross-references (xrefs) for the given CURIEs. + + This function searches for xrefs associated with the provided CURIEs. + + \f + + :param curies: A list of CURIEs (Compact URI) for which cross-references need + to be retrieved. 
+ :type curies: list[str] + :param babel_url: Base URL of the Babel server + :type babel_url: str + + :return: None + """ + logging.basicConfig(level=logging.INFO) + + bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir)) + xrefs = bxref.get_curie_xrefs(curies) + for xref in xrefs: + print(xref) + if __name__ == "__main__": - main() + cli() diff --git a/src/babel_xrefs/core/__init__.py b/src/babel_xrefs/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/babel_xrefs/core/downloader.py b/src/babel_xrefs/core/downloader.py new file mode 100644 index 0000000..34e4cb0 --- /dev/null +++ b/src/babel_xrefs/core/downloader.py @@ -0,0 +1,98 @@ +import os +import urllib.parse +import subprocess +import requests +import logging + +class BabelDownloader: + """ + Class for downloading Babel cross-reference files to a local directory as needed. + """ + + def __init__(self, url_base, local_path=None, retries=10): + # We assume the URL base is correct (if not, we can fix it later). + self.url_base = url_base + self.retries = retries + self.logger = logging.getLogger(BabelDownloader.__name__) + + if local_path is None: + # Default to using TMPDIR. + # TODO: replace with a real temporary directory. + tmpdir = os.environ.get("TMPDIR") + if tmpdir: + local_path = tmpdir + + # Make sure the local path is an existing directory or that we can create it. + if not os.path.exists(local_path): + os.makedirs(local_path, exist_ok=True) + self.local_path = local_path + elif os.path.exists(local_path) and os.path.isdir(local_path): + self.local_path = local_path + else: + raise ValueError(f"Invalid local_path (must be an existing directory): '{local_path}'") + + def get_output_file(self, filename): + filepath = os.path.join(self.local_path, filename) + os.makedirs(os.path.dirname(filepath), exist_ok=True) + return filepath + + def get_downloaded_file(self, dirpath: str, chunk_size:int=1024*1024): + local_path_to_download_to = os.path.join(self.local_path, dirpath) + os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) + + url_to_download = urllib.parse.urljoin(self.url_base, dirpath) + bytes_downloaded = 0 + + wget_command_line = [ + "wget", + "--progress=bar:force:noscroll", # Display progress bar. + "--compression=auto", # Compress files if available. + "--continue", # Continue downloading in case of interruption. + f"--tries={self.retries}", + "-O" + local_path_to_download_to, + ] + + # Add URL and output file. + wget_command_line.append(url_to_download) + + # Execute wget. + self.logger.info(f"Downloading {url_to_download} using wget: {wget_command_line}") + process = subprocess.run(wget_command_line) + if process.returncode != 0: + raise RuntimeError(f"Could not execute wget {wget_command_line}: {process.stderr}") + + bytes_downloaded = os.path.getsize(local_path_to_download_to) + self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") + return local_path_to_download_to + + + def get_downloaded_dir(self, dirpath: str): + local_path_to_download_to = os.path.join(self.local_path, dirpath) + os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) + + url_to_download_recursively = urllib.parse.urljoin(self.url_base, dirpath) + + wget_command_line = [ + "wget", + "--progress=bar:force:noscroll", # Display progress bar. + "--compression=auto", # Compress files if available. + "--continue", # Continue downloading in case of interruption. 
+ f"--tries={self.retries}", + "--recursive", + "--no-parent", + "--no-host-directories", + "--directory-prefix=" + local_path_to_download_to, + ] + + # Add URL and output file. + if url_to_download_recursively[-1] != "/": + url_to_download_recursively += "/" + wget_command_line.append(url_to_download_recursively) + + # Execute wget. + self.logger.info(f"Downloading {url_to_download_recursively} using wget: {wget_command_line}") + process = subprocess.run(wget_command_line) + if process.returncode != 0: + raise RuntimeError(f"Could not execute wget {wget_command_line}: {process.stderr}") + + return local_path_to_download_to diff --git a/src/babel_xrefs/core/model.py b/src/babel_xrefs/core/model.py new file mode 100644 index 0000000..139597f --- /dev/null +++ b/src/babel_xrefs/core/model.py @@ -0,0 +1,2 @@ + + From 4d04e2ae01a768b358a5f6d918edc6d8d442aca1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 3 Dec 2025 23:02:21 -0500 Subject: [PATCH 05/66] Added a method to look up a particular identifier. --- src/babel_xrefs/babel_xrefs.py | 24 +++++++++++++++++++++++- src/babel_xrefs/cli.py | 26 +++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py index 3ec44ad..6e171fa 100644 --- a/src/babel_xrefs/babel_xrefs.py +++ b/src/babel_xrefs/babel_xrefs.py @@ -11,6 +11,28 @@ class BabelXRefs: def __init__(self, downloader: BabelDownloader): self.downloader = downloader + def get_curie_ids(self, curies: list[str]): + """ + Search for all identifiers in the /ids/ files for a particular CURIE. + + :param curie: A CURIE to search for. + :return: A list of cross-references containing that CURIE. + """ + + identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') + concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') + + # Query the Parquet files using DuckDB. + duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') + db = duckdb.connect(duckdb_path) + identifier_table = db.read_parquet(identifier_parquet) + xrefs = db.execute(f"SELECT * FROM identifier_table WHERE curie IN $1", [curies]) + + # TODO: convert into case classes. + + return xrefs.fetchall() + + def get_curie_xrefs(self, curies: list[str]): """ Search for all identifiers that are cross-referenced to the given CURIE. @@ -20,7 +42,7 @@ def get_curie_xrefs(self, curies: list[str]): """ concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/ConcordMetadata.parquet') + concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') # Query the Parquet files using DuckDB. 
duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index 691fcd6..c77be3b 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -1,7 +1,6 @@ # Command line interface for babel-xrefs import click import logging -import babel_xrefs from babel_xrefs.core.downloader import BabelDownloader from babel_xrefs.babel_xrefs import BabelXRefs @@ -37,5 +36,30 @@ def xrefs(curies: list[str], babel_url: str, local_dir: str): for xref in xrefs: print(xref) +@cli.command("ids") +@click.argument("curies", type=str, required=True, nargs=-1) +@click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") +@click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") +def ids(curies: list[str], babel_url: str, local_dir: str): + """ + Fetches and prints the ID records for the given CURIEs, along with Biolink type if provided. + + \f + + :param curies: A list of CURIEs (Compact URIs) for which ID records need + to be retrieved. + :type curies: list[str] + :param babel_url: Base URL of the Babel server + :type babel_url: str + + :return: None + """ + logging.basicConfig(level=logging.INFO) + + bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir)) + xrefs = bxref.get_curie_ids(curies) + for xref in xrefs: + print(xref) + if __name__ == "__main__": cli() From 8531cb7a9e4c65ea5459a2e56d75cb09cb30bf9d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 3 Dec 2025 23:29:17 -0500 Subject: [PATCH 06/66] Added CURIE expansion/recursive lookup. --- src/babel_xrefs/babel_xrefs.py | 54 ++++++++++++++++++++++++------ src/babel_xrefs/cli.py | 5 +-- src/babel_xrefs/core/downloader.py | 6 +++- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py index 6e171fa..f321e5e 100644 --- a/src/babel_xrefs/babel_xrefs.py +++ b/src/babel_xrefs/babel_xrefs.py @@ -1,11 +1,30 @@ # Babel XRefs is a tool for accessing and querying the intermediate files # that we make available with Babel builds. This allows you to find out # why we consider two identifiers to be identical.
+import dataclasses import logging import duckdb +import functools from babel_xrefs.core.downloader import BabelDownloader +@dataclasses.dataclass(frozen=True) +class CrossReference: + filename: str + subj: str + pred: str + obj: str + + @staticmethod + def from_tuple(tuple: tuple[str, str, str, str]): + return CrossReference(filename=tuple[0], subj=tuple[1], pred=tuple[2], obj=tuple[3]) + + @property + def curies(self): + return frozenset([self.subj, self.obj]) + + def __lt__(self, other): + return (self.filename, self.subj, self.obj, self.pred) < (other.filename, other.subj, other.obj, other.pred) class BabelXRefs: def __init__(self, downloader: BabelDownloader): @@ -32,24 +51,37 @@ def get_curie_ids(self, curies: list[str]): return xrefs.fetchall() + @functools.lru_cache(maxsize=None) + def get_curie_xref(self, curie: str): + concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') + concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') + + duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') + db = duckdb.connect(duckdb_path) + concord_table = db.read_parquet(concord_parquet) + xref_tuples = db.execute(f"SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() + xrefs = list(map(lambda rec: CrossReference.from_tuple(rec), xref_tuples)) + return xrefs - def get_curie_xrefs(self, curies: list[str]): + def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies_in_expansion: set = set()): """ Search for all identifiers that are cross-referenced to the given CURIE. :param curie: A CURIE to search for. + :param expand: Whether to expand the cross-references (i.e. recursively follow all identifiers). :return: A list of cross-references containing that CURIE. """ - concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') + xrefs = set() + for curie in curies: + logging.info(f"Searching for cross-references for {curie}") + xrefs.update(self.get_curie_xref(curie)) - # Query the Parquet files using DuckDB. - duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') - db = duckdb.connect(duckdb_path) - concord_table = db.read_parquet(concord_parquet) - xrefs = db.execute(f"SELECT * FROM concord_table WHERE subj IN $1 OR obj in $1", [curies]) - - # TODO: convert into case classes. + if expand: + # Get a unique set of referenced curies, not including the ones currently queried. 
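+            # Subtracting both the CURIEs we just queried and the already-expanded set means each identifier is expanded at most once, so the recursion terminates.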
+ new_curies = list(set([curie for xref in xrefs for curie in xref.curies]) - set(curies) - ignore_curies_in_expansion) + if new_curies: + logging.info(f"Expanding cross-references to {new_curies}") + xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(new_curies))) - return xrefs.fetchall() + return sorted(xrefs) diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index c77be3b..c73d3db 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -13,7 +13,8 @@ def cli(): @click.argument("curies", type=str, required=True, nargs=-1) @click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") @click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") -def xrefs(curies: list[str], babel_url: str, local_dir: str): +@click.option("--expand", is_flag=True, help="Also display xrefs for returned CURIEs") +def xrefs(curies: list[str], babel_url: str, local_dir: str, expand: bool): """ Fetches and prints the cross-references (xrefs) for the given CURIEs. @@ -32,7 +33,7 @@ def xrefs(curies: list[str], babel_url: str, local_dir: str): logging.basicConfig(level=logging.INFO) bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir)) - xrefs = bxref.get_curie_xrefs(curies) + xrefs = bxref.get_curie_xrefs(curies, expand) for xref in xrefs: print(xref) diff --git a/src/babel_xrefs/core/downloader.py b/src/babel_xrefs/core/downloader.py index 34e4cb0..9313685 100644 --- a/src/babel_xrefs/core/downloader.py +++ b/src/babel_xrefs/core/downloader.py @@ -1,8 +1,10 @@ +import functools import os import urllib.parse import subprocess import requests import logging +import functools class BabelDownloader: """ @@ -31,11 +33,13 @@ def __init__(self, url_base, local_path=None, retries=10): else: raise ValueError(f"Invalid local_path (must be an existing directory): '{local_path}'") + @functools.lru_cache(maxsize=None) def get_output_file(self, filename): filepath = os.path.join(self.local_path, filename) os.makedirs(os.path.dirname(filepath), exist_ok=True) return filepath + @functools.lru_cache(maxsize=None) def get_downloaded_file(self, dirpath: str, chunk_size:int=1024*1024): local_path_to_download_to = os.path.join(self.local_path, dirpath) os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) @@ -65,7 +69,7 @@ def get_downloaded_file(self, dirpath: str, chunk_size:int=1024*1024): self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") return local_path_to_download_to - + @functools.lru_cache(maxsize=None) def get_downloaded_dir(self, dirpath: str): local_path_to_download_to = os.path.join(self.local_path, dirpath) os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) From a1aeec6be4ad11a3720c666d59483d34a77e0560 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 4 Dec 2025 00:56:06 -0500 Subject: [PATCH 07/66] Added a basic ConcordTester. 
--- src/babel_xrefs/cli.py | 18 ++++++++++++ src/babel_xrefs/core/nodenorm.py | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 src/babel_xrefs/core/nodenorm.py diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index c73d3db..4c65391 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -3,6 +3,7 @@ import logging from babel_xrefs.core.downloader import BabelDownloader from babel_xrefs.babel_xrefs import BabelXRefs +from babel_xrefs.core.nodenorm import NodeNorm @click.group() @@ -62,5 +63,22 @@ def ids(curies: list[str], babel_url: str, local_dir: str): for xref in xrefs: print(xref) +@cli.command("test-concord") +@click.argument("curies", type=str, required=True, nargs=-1) +@click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes") +def test_concord(curies, nodenorm_url): + # We're trying to answer a simple question here: if the CURIEs we mention were combined, how would the cliques change in NodeNorm? + # By definition, this can only combine all the cliques mentioned in the CURIEs. + + nodenorm = NodeNorm(nodenorm_url) + for curie in curies: + identifiers = nodenorm.get_clique_identifiers(curie) + for identifier in identifiers: + if identifier.label: + print(f"{curie}\t{identifier.curie}\t{identifier.label}") + else: + print(f"{curie}\t{identifier.curie}\t") + + if __name__ == "__main__": cli() diff --git a/src/babel_xrefs/core/nodenorm.py b/src/babel_xrefs/core/nodenorm.py new file mode 100644 index 0000000..6c45e02 --- /dev/null +++ b/src/babel_xrefs/core/nodenorm.py @@ -0,0 +1,50 @@ +import dataclasses +import functools +import requests + +@dataclasses.dataclass +class Identifier: + curie: str + label: str = "" + taxa: list[str] = dataclasses.field(default_factory=list) + description: list[str] = dataclasses.field(default_factory=list) + + def __lt__(self, other): + return self.curie < other.curie + + @staticmethod + def from_dict(d: dict): + identifier = Identifier(curie=d['identifier']) + if 'label' in d: + identifier.label = d['label'] + if 'taxa' in d: + identifier.taxa = d['taxa'] + if 'description' in d: + identifier.description = d['description'] + return identifier + +class NodeNorm: + def __init__(self, nodenorm_url: str=""): + self.nodenorm_url = nodenorm_url + + @functools.lru_cache(maxsize=None) + def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=False, description=False, individual_types=None, include_taxa=None): + response = requests.get(f"{self.nodenorm_url}get_normalized_nodes", params={ + "curie": curie, + "conflate": conflate, + "drug_chemical_conflate": drug_chemical_conflate, + "description": description, + "individual_types": individual_types, + "include_taxa": include_taxa, + }) + response.raise_for_status() + result = response.json() + + return result[curie] + + @functools.lru_cache(maxsize=None) + def get_clique_identifiers(self, curie, **kwargs): + result = self.normalize_curie(curie, **kwargs) + if 'equivalent_identifiers' not in result: + return None + return list(map(lambda x: Identifier.from_dict(x), result['equivalent_identifiers'])) From bb1eb996a851010726576c298e0bce54eeeb6414 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 4 Dec 2025 01:58:24 -0500 Subject: [PATCH 08/66] Added labels via NodeNorm. 
--- src/babel_xrefs/babel_xrefs.py | 42 ++++++++++++++++++++++++++++---- src/babel_xrefs/cli.py | 12 +++++---- src/babel_xrefs/core/nodenorm.py | 17 ++++++++++++- 3 files changed, 60 insertions(+), 11 deletions(-) diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py index f321e5e..e953c0f 100644 --- a/src/babel_xrefs/babel_xrefs.py +++ b/src/babel_xrefs/babel_xrefs.py @@ -7,6 +7,8 @@ import functools from babel_xrefs.core.downloader import BabelDownloader +from babel_xrefs.core.nodenorm import NodeNorm + @dataclasses.dataclass(frozen=True) class CrossReference: @@ -26,9 +28,26 @@ def curies(self): def __lt__(self, other): return (self.filename, self.subj, self.obj, self.pred) < (other.filename, other.subj, other.obj, other.pred) +class LabeledCrossReference(CrossReference): + subj_label: str + subj_biolink_type: str + obj_label: str + obj_biolink_type: str + + def __init__(self, subj: str, pred: str, obj: str, filename: str, subj_label: str, subj_biolink_type: str, obj_label: str, obj_biolink_type: str): + super().__init__(subj=subj, obj=obj, filename=filename, pred=pred) + # The parent dataclass is frozen, so set the extra fields with object.__setattr__. + object.__setattr__(self, "subj_label", subj_label) + object.__setattr__(self, "subj_biolink_type", subj_biolink_type) + object.__setattr__(self, "obj_label", obj_label) + object.__setattr__(self, "obj_biolink_type", obj_biolink_type) + + def __str__(self): + return f"""LabeledCrossReference(subj="{self.subj}", obj="{self.obj}", subj_label="{self.subj_label}", obj_label="{self.obj_label}")""" + class BabelXRefs: - def __init__(self, downloader: BabelDownloader): + def __init__(self, downloader: BabelDownloader, nodenorm: NodeNorm = None): self.downloader = downloader + self.nodenorm = nodenorm def get_curie_ids(self, curies: list[str]): """ @@ -52,7 +71,7 @@ def get_curie_ids(self, curies: list[str]): return xrefs.fetchall() @functools.lru_cache(maxsize=None) - def get_curie_xref(self, curie: str): + def get_curie_xref(self, curie: str, label_curies: bool = False): concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') @@ -61,9 +80,22 @@ def get_curie_xref(self, curie: str): concord_table = db.read_parquet(concord_parquet) xref_tuples = db.execute(f"SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() xrefs = list(map(lambda rec: CrossReference.from_tuple(rec), xref_tuples)) + + if label_curies: + xrefs = list(map(lambda xref: LabeledCrossReference( + subj=xref.subj, + obj=xref.obj, + filename=xref.filename, + pred=xref.pred, + subj_label=self.nodenorm.get_identifier(xref.subj).label, + subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, + obj_label=self.nodenorm.get_identifier(xref.obj).label, + obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, + ), xrefs)) + return xrefs - def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies_in_expansion: set = set()): + def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies_in_expansion: set = set(), label_curies: bool = False): """ Search for all identifiers that are cross-referenced to the given CURIE.
@@ -75,13 +107,13 @@ def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies xrefs = set() for curie in curies: logging.info(f"Searching for cross-references for {curie}") - xrefs.update(self.get_curie_xref(curie)) + xrefs.update(self.get_curie_xref(curie, label_curies)) if expand: # Get a unique set of referenced curies, not including the ones currently queried. new_curies = list(set([curie for xref in xrefs for curie in xref.curies]) - set(curies) - ignore_curies_in_expansion) if new_curies: logging.info(f"Expanding cross-references to {new_curies}") - xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(new_curies))) + xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(new_curies), label_curies=label_curies)) return sorted(xrefs) diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index 4c65391..673500d 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -14,8 +14,10 @@ def cli(): @click.argument("curies", type=str, required=True, nargs=-1) @click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") @click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") +@click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes") @click.option("--expand", is_flag=True, help="Also display xrefs for returned CURIEs") -def xrefs(curies: list[str], babel_url: str, local_dir: str, expand: bool): +@click.option("--labels", is_flag=True, help="Include labels for CURIEs") +def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expand: bool, labels: bool): """ Fetches and prints the cross-references (xrefs) for the given CURIEs. 
@@ -33,8 +35,8 @@ def xrefs(curies: list[str], babel_url: str, local_dir: str, expand: bool): """ logging.basicConfig(level=logging.INFO) - bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir)) - xrefs = bxref.get_curie_xrefs(curies, expand) + bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir), NodeNorm(nodenorm_url)) + xrefs = bxref.get_curie_xrefs(curies, expand, label_curies=labels) for xref in xrefs: print(xref) @@ -75,9 +77,9 @@ def test_concord(curies, nodenorm_url): identifiers = nodenorm.get_clique_identifiers(curie) for identifier in identifiers: if identifier.label: - print(f"{curie}\t{identifier.curie}\t{identifier.label}") + print(f"{curie}\t{identifier.curie}\t{identifier.label}\t{identifier.biolink_type}") else: - print(f"{curie}\t{identifier.curie}\t") + print(f"{curie}\t{identifier.curie}\t\t{identifier.biolink_type}") if __name__ == "__main__": diff --git a/src/babel_xrefs/core/nodenorm.py b/src/babel_xrefs/core/nodenorm.py index 6c45e02..5c8e0c9 100644 --- a/src/babel_xrefs/core/nodenorm.py +++ b/src/babel_xrefs/core/nodenorm.py @@ -1,11 +1,13 @@ import dataclasses import functools import requests +import logging @dataclasses.dataclass class Identifier: curie: str label: str = "" + biolink_type: str = "" taxa: list[str] = dataclasses.field(default_factory=list) description: list[str] = dataclasses.field(default_factory=list) @@ -21,6 +23,8 @@ def from_dict(d: dict): identifier.taxa = d['taxa'] if 'description' in d: identifier.description = d['description'] + if 'type' in d: + identifier.biolink_type = d['type'] return identifier class NodeNorm: @@ -28,7 +32,18 @@ def __init__(self, nodenorm_url: str=""): self.nodenorm_url = nodenorm_url @functools.lru_cache(maxsize=None) - def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=False, description=False, individual_types=None, include_taxa=None): + def get_identifier(self, curie): + result = self.normalize_curie(curie) + logging.debug(f"Normalizing {curie} with NodeNorm to result: {result}") + for identifier in result.get('equivalent_identifiers', []): + if identifier['identifier'] == curie: + logging.debug(f"Found exact match for {curie}: {identifier}") + return Identifier.from_dict(identifier) + + return Identifier(curie=curie) + + @functools.lru_cache(maxsize=None) + def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=True, description=True, individual_types=True, include_taxa=True): response = requests.get(f"{self.nodenorm_url}get_normalized_nodes", params={ "curie": curie, "conflate": conflate, From 40c3338d2d72b651e6f07da0dfbb87e226437b1c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 8 Jan 2026 14:58:30 -0500 Subject: [PATCH 09/66] Midnight commit: attempting to improve expansion. --- src/babel_xrefs/babel_xrefs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py index e953c0f..32829c1 100644 --- a/src/babel_xrefs/babel_xrefs.py +++ b/src/babel_xrefs/babel_xrefs.py @@ -104,6 +104,9 @@ def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies :return: A list of cross-references containing that CURIE. 
""" + if ignore_curies_in_expansion: + logging.info(f"Ignoring {len(ignore_curies_in_expansion)}: {ignore_curies_in_expansion}") + xrefs = set() for curie in curies: logging.info(f"Searching for cross-references for {curie}") @@ -114,6 +117,6 @@ def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies new_curies = list(set([curie for xref in xrefs for curie in xref.curies]) - set(curies) - ignore_curies_in_expansion) if new_curies: logging.info(f"Expanding cross-references to {new_curies}") - xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(new_curies), label_curies=label_curies)) + xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(curies) | set(new_curies), label_curies=label_curies)) return sorted(xrefs) From 8c41112940b8e60fa186c775e2d3f138c3b2a935 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 8 Jan 2026 16:04:56 -0500 Subject: [PATCH 10/66] Added some improvements. --- src/babel_xrefs/babel_xrefs.py | 2 +- src/babel_xrefs/core/nodenorm.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/babel_xrefs.py index 32829c1..bf32521 100644 --- a/src/babel_xrefs/babel_xrefs.py +++ b/src/babel_xrefs/babel_xrefs.py @@ -42,7 +42,7 @@ def __init__(self, subj: str, pred: str, obj: str, filename: str, subj_label: st self.obj_biolink_type = obj_biolink_type def __str__(self): - return f"""LabeledCrossReference(subj="{self.subj}", obj="{self.obj}", subj_label="{self.subj_label}", obj_label="{self.obj_label}", subj_label="{self.subj_label}", obj_label="{self.obj_label}")""" + return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", obj_label="{self.obj_label}", subj_label="{self.subj_label}", obj_label="{self.obj_label}")""" class BabelXRefs: def __init__(self, downloader: BabelDownloader, nodenorm: NodeNorm = None): diff --git a/src/babel_xrefs/core/nodenorm.py b/src/babel_xrefs/core/nodenorm.py index 5c8e0c9..018f106 100644 --- a/src/babel_xrefs/core/nodenorm.py +++ b/src/babel_xrefs/core/nodenorm.py @@ -35,6 +35,8 @@ def __init__(self, nodenorm_url: str=""): def get_identifier(self, curie): result = self.normalize_curie(curie) logging.debug(f"Normalizing {curie} with NodeNorm to result: {result}") + if not result: + return Identifier(curie=curie) for identifier in result.get('equivalent_identifiers', []): if identifier['identifier'] == curie: logging.debug(f"Found exact match for {curie}: {identifier}") From 239c89f7bf6e9853cac75452234d0f2fce265c9c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Feb 2026 01:25:40 -0500 Subject: [PATCH 11/66] Added a CLAUDE.md by Claude.ai. --- CLAUDE.md | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..536af9b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,118 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +babel-xrefs is a tool for querying and exploring Babel intermediate files. It allows users to discover why two biological/chemical identifiers are considered identical by the Babel system, which handles cross-references between different ontology and database identifiers (e.g., MONDO, HP, UMLS, HGNC). 
## Development Setup + +This project uses **uv** for package management: + +```bash +# Install dependencies +uv sync + +# Install with dev dependencies +uv sync --group dev + +# Activate virtual environment (if needed) +source .venv/bin/activate + +# Run the CLI +uv run babel-xrefs --help +``` + +## Commands + +### Running the Application + +```bash +# Get cross-references for one or more CURIEs +uv run babel-xrefs xrefs MONDO:0004979 + +# Get cross-references with expansion (recursive lookup) +uv run babel-xrefs xrefs MONDO:0004979 --expand + +# Get cross-references with labels from NodeNorm +uv run babel-xrefs xrefs MONDO:0004979 --labels + +# Get ID records for CURIEs +uv run babel-xrefs ids MONDO:0004979 + +# Test concordance changes with NodeNorm +uv run babel-xrefs test-concord MONDO:0004979 HP:0000001 + +# Use custom Babel server or local directory +uv run babel-xrefs xrefs MONDO:0004979 --local-dir data/2025nov19 --babel-url https://stars.renci.org:443/var/babel_outputs/2025nov19/ +``` + +### Development Commands + +```bash +# Run tests +uv run pytest + +# Run linter +uv run ruff check + +# Format code +uv run ruff format +``` + +## Architecture + +### Core Components + +1. **BabelDownloader** (`src/babel_xrefs/core/downloader.py`): + - Downloads Babel intermediate files from a remote server using `wget` + - Caches files locally in configurable directory (default: `data/2025nov19/`) + - Uses `@functools.lru_cache` to avoid re-downloading + - **Important**: Requires `wget` to be installed on the system + +2. **BabelXRefs** (`src/babel_xrefs/babel_xrefs.py`): + - Main query engine for cross-references + - Uses DuckDB to query Parquet files (`Concord.parquet`, `Identifiers.parquet`, `Metadata.parquet`) + - Supports recursive expansion of cross-references + - Creates ephemeral DuckDB databases in `data/<version>/output/duckdbs/` + +3. **NodeNorm** (`src/babel_xrefs/core/nodenorm.py`): + - Integration with NodeNormalization API (https://nodenormalization-sri.renci.org/) + - Fetches labels, biolink types, and equivalent identifiers for CURIEs + - Uses `@functools.lru_cache` for performance + - Optional component for label enrichment + +4. **CLI** (`src/babel_xrefs/cli.py`): + - Click-based command-line interface + - Three main commands: `xrefs`, `ids`, `test-concord` + +### Data Flow + +1. User provides CURIEs via CLI +2. BabelDownloader ensures required Parquet files are downloaded +3. BabelXRefs queries files using DuckDB +4. If the `--labels` flag is set, NodeNorm is queried for additional metadata +5.
Results are printed to stdout + +### Key Design Patterns + +- **Lazy downloading**: Files are only downloaded when first accessed +- **LRU caching**: Heavy use of `@functools.lru_cache` to avoid redundant downloads and API calls +- **Recursive expansion**: The `--expand` flag recursively follows all cross-references to build complete graphs +- **DuckDB for querying**: In-memory SQL queries against Parquet files for fast lookups + +## Important Notes + +- **System dependency**: This project requires `wget` to be installed (used by BabelDownloader) +- **Data directory**: The `data/` directory is gitignored and contains downloaded Parquet files and generated DuckDB databases +- **Babel versions**: The default Babel version is `2025nov19`, but this can be customized via `--local-dir` and `--babel-url` +- **No tests yet**: The project currently has pytest configured but no test files exist +- **Empty model.py**: The `src/babel_xrefs/core/model.py` file exists but is currently empty; data classes are defined in `babel_xrefs.py` and `nodenorm.py` instead + +## File Locations + +- Source code: `src/babel_xrefs/` +- Downloaded Babel files: `data/<version>/duckdb/*.parquet` +- Generated DuckDB databases: `data/<version>/output/duckdbs/` +- Entry point: `src/babel_xrefs/cli.py` From 8132fe1f64d455557929059ec7981435e6d82a63 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Feb 2026 01:52:15 -0500 Subject: [PATCH 12/66] Reorganized file slightly. --- src/babel_xrefs/cli.py | 2 +- src/babel_xrefs/{ => core}/babel_xrefs.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/babel_xrefs/{ => core}/babel_xrefs.py (100%) diff --git a/src/babel_xrefs/cli.py b/src/babel_xrefs/cli.py index 673500d..ab2e283 100644 --- a/src/babel_xrefs/cli.py +++ b/src/babel_xrefs/cli.py @@ -2,7 +2,7 @@ import click import logging from babel_xrefs.core.downloader import BabelDownloader -from babel_xrefs.babel_xrefs import BabelXRefs +from babel_xrefs.core.babel_xrefs import BabelXRefs from babel_xrefs.core.nodenorm import NodeNorm diff --git a/src/babel_xrefs/babel_xrefs.py b/src/babel_xrefs/core/babel_xrefs.py similarity index 100% rename from src/babel_xrefs/babel_xrefs.py rename to src/babel_xrefs/core/babel_xrefs.py From bd009721c06dee9e482d2947c16afacaaf265a0e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Feb 2026 02:02:02 -0500 Subject: [PATCH 13/66] Claude wrote some tests. --- tests/__init__.py | 1 + tests/test_downloader.py | 194 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/test_downloader.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..61c04ac --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Tests for babel-xrefs diff --git a/tests/test_downloader.py b/tests/test_downloader.py new file mode 100644 index 0000000..461854c --- /dev/null +++ b/tests/test_downloader.py @@ -0,0 +1,194 @@ +""" +Tests for the BabelDownloader class. + +These tests verify that the downloader can successfully fetch large Parquet files +from the Babel server using wget and properly manage local file caching.
+""" + +import os +import shutil +import pytest +from babel_xrefs.core.downloader import BabelDownloader + + +# Constants for test configuration +BABEL_URL = "https://stars.renci.org/var/babel_outputs/2025nov19/" +TEST_DATA_DIR = "data/test" +IDENTIFIERS_FILE = "duckdb/Identifiers.parquet" +MINIMUM_FILE_SIZE_GB = 2 +MINIMUM_FILE_SIZE_BYTES = MINIMUM_FILE_SIZE_GB * 1024 * 1024 * 1024 # 2GB in bytes + + +@pytest.fixture(scope="module") +def test_data_dir(): + """ + Fixture that provides a clean test data directory. + + This fixture: + - Creates the test data directory before tests run + - Yields the directory path to tests + - Cleans up (removes) the directory after all tests complete + + Scope is 'module' so the directory persists across all tests in this file, + allowing downloaded files to be reused by multiple tests. + """ + # Setup: ensure clean test directory + if os.path.exists(TEST_DATA_DIR): + shutil.rmtree(TEST_DATA_DIR) + os.makedirs(TEST_DATA_DIR, exist_ok=True) + + yield TEST_DATA_DIR + + # Teardown: remove test directory and all contents + if os.path.exists(TEST_DATA_DIR): + shutil.rmtree(TEST_DATA_DIR) + + +@pytest.fixture(scope="module") +def downloader(test_data_dir): + """ + Fixture that provides a BabelDownloader instance configured for testing. + + Args: + test_data_dir: The test data directory fixture + + Returns: + BabelDownloader: Configured downloader instance + """ + return BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + +def test_downloader_initialization(test_data_dir): + """ + Test that BabelDownloader initializes correctly with custom parameters. + + Verifies: + - Downloader accepts URL and local path + - Local path is stored correctly + - Directory is created if it doesn't exist + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + assert downloader.url_base == BABEL_URL + assert downloader.local_path == test_data_dir + assert os.path.exists(test_data_dir) + assert os.path.isdir(test_data_dir) + + +def test_download_large_parquet_file(downloader): + """ + Test downloading a large Parquet file from the Babel server. + + This test: + 1. Downloads the Identifiers.parquet file (2GB+) from the real Babel server + 2. Verifies the file was downloaded successfully + 3. Confirms the file size is at least 2GB + + Note: This test takes several minutes to complete due to the large file size. + + Args: + downloader: BabelDownloader fixture + """ + # Download the Identifiers.parquet file + downloaded_path = downloader.get_downloaded_file(IDENTIFIERS_FILE) + + # Verify the file exists + assert os.path.exists(downloaded_path), \ + f"Downloaded file does not exist at {downloaded_path}" + + # Verify it's a file, not a directory + assert os.path.isfile(downloaded_path), \ + f"Downloaded path is not a file: {downloaded_path}" + + # Get the file size in bytes + file_size_bytes = os.path.getsize(downloaded_path) + file_size_gb = file_size_bytes / (1024 * 1024 * 1024) + + # Verify the file is at least 2GB + assert file_size_bytes >= MINIMUM_FILE_SIZE_BYTES, \ + f"Downloaded file is too small: {file_size_gb:.2f}GB (expected at least {MINIMUM_FILE_SIZE_GB}GB)" + + print(f"\n✓ Successfully downloaded {IDENTIFIERS_FILE}") + print(f" Size: {file_size_gb:.2f}GB ({file_size_bytes:,} bytes)") + print(f" Path: {downloaded_path}") + + +def test_download_caching(downloader): + """ + Test that the downloader uses LRU caching to avoid re-downloading files. + + This test: + 1. Downloads the same file twice + 2. 
Verifies both calls return the same path + 3. Confirms the file is only downloaded once (via caching) + + Args: + downloader: BabelDownloader fixture + """ + # First download + path1 = downloader.get_downloaded_file(IDENTIFIERS_FILE) + initial_mtime = os.path.getmtime(path1) + + # Second download - should use cache + path2 = downloader.get_downloaded_file(IDENTIFIERS_FILE) + second_mtime = os.path.getmtime(path2) + + # Verify same path returned + assert path1 == path2, "Cached download returned different path" + + # Verify file wasn't modified (i.e., wasn't re-downloaded) + assert initial_mtime == second_mtime, \ + "File was modified, suggesting it was re-downloaded instead of cached" + + print(f"\n✓ Caching works correctly - file not re-downloaded") + + +def test_get_output_file(downloader): + """ + Test the get_output_file method for creating output file paths. + + This test: + 1. Creates an output file path + 2. Verifies the directory structure is created + 3. Confirms the path is in the correct location + + Args: + downloader: BabelDownloader fixture + """ + output_filename = "output/duckdbs/test.duckdb" + output_path = downloader.get_output_file(output_filename) + + # Verify the path is correct + expected_path = os.path.join(TEST_DATA_DIR, output_filename) + assert output_path == expected_path, \ + f"Output path mismatch: expected {expected_path}, got {output_path}" + + # Verify the parent directory was created + assert os.path.exists(os.path.dirname(output_path)), \ + "Parent directory for output file was not created" + + print(f"\n✓ Output file path created correctly: {output_path}") + + +def test_invalid_local_path(): + """ + Test that BabelDownloader raises an error for invalid local paths. + + This test verifies error handling when attempting to use a file path + as the local directory (should be a directory, not a file). + """ + # Create a temporary file + invalid_path = "/tmp/test_babel_invalid_file.txt" + with open(invalid_path, 'w') as f: + f.write("test") + + try: + # Attempt to create downloader with a file path instead of directory + with pytest.raises(ValueError, match="Invalid local_path"): + BabelDownloader(url_base=BABEL_URL, local_path=invalid_path) + + print("\n✓ Correctly raised ValueError for invalid local path") + finally: + # Clean up + if os.path.exists(invalid_path): + os.remove(invalid_path) From 9cc06bc28fe29077bea689e847c5a946898bcd2b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Feb 2026 02:13:52 -0500 Subject: [PATCH 14/66] Improved downloader using Claude. 
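The wget subprocess is replaced here with a pure-requests streaming download. The resume behaviour at the heart of the new code (shown in full in the diff below) reduces to the following sketch; the function and variable names are illustrative:

```python
import os
import requests

def download_with_resume(url: str, path: str, chunk_size: int = 1024 * 1024) -> None:
    # Resume from however many bytes are already on disk.
    start = os.path.getsize(path) if os.path.exists(path) else 0
    headers = {"Range": f"bytes={start}-"} if start else {}
    with requests.get(url, headers=headers, stream=True, timeout=30) as response:
        if response.status_code == 416:  # Range Not Satisfiable: file already complete
            return
        # 206 Partial Content means the server honoured the Range header;
        # a plain 200 means it did not, so start again from byte zero.
        mode = "ab" if response.status_code == 206 else "wb"
        response.raise_for_status()
        with open(path, mode) as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
```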
--- pyproject.toml | 1 + src/babel_xrefs/core/downloader.py | 182 +++++++++++++++++++++-------- 2 files changed, 132 insertions(+), 51 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5696f67..ddc0a95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "click>=8.3.1", "duckdb>=1.4.2", "requests>=2.32.5", + "tqdm>=4.67.0", ] [build-system] diff --git a/src/babel_xrefs/core/downloader.py b/src/babel_xrefs/core/downloader.py index 9313685..8e9a07e 100644 --- a/src/babel_xrefs/core/downloader.py +++ b/src/babel_xrefs/core/downloader.py @@ -1,10 +1,11 @@ import functools import os import urllib.parse -import subprocess +import time import requests +from tqdm import tqdm import logging -import functools + class BabelDownloader: """ @@ -39,31 +40,126 @@ def get_output_file(self, filename): os.makedirs(os.path.dirname(filepath), exist_ok=True) return filepath + def _stream_download(self, response, local_path, resume_byte_pos, chunk_size): + """ + Stream download from response to file with progress bar. + + Args: + response: requests.Response object with stream=True + local_path: Local file path to write to + resume_byte_pos: Starting byte position (for resume) + chunk_size: Size of chunks to read/write + """ + # Get total size from Content-Length header (may not be present) + content_length = response.headers.get('Content-Length') + if content_length: + total_size = int(content_length) + resume_byte_pos + else: + total_size = None + + # Open file in append mode if resuming, write mode otherwise + mode = 'ab' if resume_byte_pos > 0 else 'wb' + + with open(local_path, mode) as f: + with tqdm( + total=total_size, + initial=resume_byte_pos, + unit='B', + unit_scale=True, + unit_divisor=1024, + desc=os.path.basename(local_path) + ) as progress_bar: + for chunk in response.iter_content(chunk_size=chunk_size): + if chunk: + f.write(chunk) + progress_bar.update(len(chunk)) + + def _download_with_retry(self, url, local_path, chunk_size): + """ + Download a file with retry logic and resume capability. 
+ + Args: + url: URL to download from + local_path: Local file path to save to + chunk_size: Size of chunks to read/write + + Raises: + RuntimeError: If all retry attempts fail + """ + for attempt in range(1, self.retries + 1): + try: + # Check if we're resuming a partial download + resume_byte_pos = 0 + if os.path.exists(local_path): + resume_byte_pos = os.path.getsize(local_path) + + # Prepare headers for resume + headers = {} + if resume_byte_pos > 0: + headers['Range'] = f'bytes={resume_byte_pos}-' + self.logger.info(f"Resuming download from byte {resume_byte_pos}") + + # Make streaming request with timeout for connection (not total time) + response = requests.get(url, headers=headers, stream=True, timeout=30) + + # Handle different response codes + if response.status_code == 416: + # Range Not Satisfiable - file already complete + self.logger.info(f"File already complete: {local_path}") + return + elif response.status_code == 206: + # Partial Content - resume successful + self.logger.info(f"Resuming download (HTTP 206)") + elif response.status_code == 200: + # OK - server doesn't support resume or no Range header was sent + if resume_byte_pos > 0: + self.logger.warning(f"Server doesn't support resume, restarting from beginning") + resume_byte_pos = 0 + # Remove partial file + if os.path.exists(local_path): + os.remove(local_path) + else: + response.raise_for_status() + + # Stream download with progress bar + self._stream_download(response, local_path, resume_byte_pos, chunk_size) + + # Success - exit retry loop + return + + except (requests.RequestException, IOError) as e: + self.logger.warning(f"Download attempt {attempt}/{self.retries} failed: {e}") + + if attempt < self.retries: + # Calculate exponential backoff with max of 60 seconds + wait_time = min(2 ** attempt, 60) + self.logger.info(f"Retrying in {wait_time} seconds...") + time.sleep(wait_time) + else: + # All retries exhausted + raise RuntimeError(f"Failed to download {url} after {self.retries} attempts: {e}") + @functools.lru_cache(maxsize=None) - def get_downloaded_file(self, dirpath: str, chunk_size:int=1024*1024): + def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): + """ + Download a file from the Babel server to local storage. + + Args: + dirpath: Relative path from url_base to the file + chunk_size: Size of chunks to download (default 1MB) + + Returns: + str: Local path to the downloaded file + """ local_path_to_download_to = os.path.join(self.local_path, dirpath) os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) url_to_download = urllib.parse.urljoin(self.url_base, dirpath) - bytes_downloaded = 0 - - wget_command_line = [ - "wget", - "--progress=bar:force:noscroll", # Display progress bar. - "--compression=auto", # Compress files if available. - "--continue", # Continue downloading in case of interruption. - f"--tries={self.retries}", - "-O" + local_path_to_download_to, - ] - - # Add URL and output file. - wget_command_line.append(url_to_download) - - # Execute wget. 
- self.logger.info(f"Downloading {url_to_download} using wget: {wget_command_line}") - process = subprocess.run(wget_command_line) - if process.returncode != 0: - raise RuntimeError(f"Could not execute wget {wget_command_line}: {process.stderr}") + + self.logger.info(f"Downloading {url_to_download} to {local_path_to_download_to}") + + # Download with retry logic + self._download_with_retry(url_to_download, local_path_to_download_to, chunk_size) bytes_downloaded = os.path.getsize(local_path_to_download_to) self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") @@ -71,32 +167,16 @@ def get_downloaded_file(self, dirpath: str, chunk_size:int=1024*1024): @functools.lru_cache(maxsize=None) def get_downloaded_dir(self, dirpath: str): - local_path_to_download_to = os.path.join(self.local_path, dirpath) - os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) - - url_to_download_recursively = urllib.parse.urljoin(self.url_base, dirpath) - - wget_command_line = [ - "wget", - "--progress=bar:force:noscroll", # Display progress bar. - "--compression=auto", # Compress files if available. - "--continue", # Continue downloading in case of interruption. - f"--tries={self.retries}", - "--recursive", - "--no-parent", - "--no-host-directories", - "--directory-prefix=" + local_path_to_download_to, - ] - - # Add URL and output file. - if url_to_download_recursively[-1] != "/": - url_to_download_recursively += "/" - wget_command_line.append(url_to_download_recursively) - - # Execute wget. - self.logger.info(f"Downloading {url_to_download_recursively} using wget: {wget_command_line}") - process = subprocess.run(wget_command_line) - if process.returncode != 0: - raise RuntimeError(f"Could not execute wget {wget_command_line}: {process.stderr}") - - return local_path_to_download_to + """ + Download a directory recursively. + + NOTE: This method is not implemented in the Python-based downloader. + Use get_downloaded_file() for individual files instead. + + Raises: + NotImplementedError: This method is not implemented + """ + raise NotImplementedError( + "Recursive directory downloads are not supported. " + "Use get_downloaded_file() for individual files." + ) From da8bb0cfa8493990faf934aa70f72ab0de434bc0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Feb 2026 02:19:09 -0500 Subject: [PATCH 15/66] Added MD5 download functionality. --- src/babel_xrefs/core/downloader.py | 96 +++++++++++++- tests/test_downloader.py | 204 ++++++++++++++++++++++++++++- 2 files changed, 298 insertions(+), 2 deletions(-) diff --git a/src/babel_xrefs/core/downloader.py b/src/babel_xrefs/core/downloader.py index 8e9a07e..93081c6 100644 --- a/src/babel_xrefs/core/downloader.py +++ b/src/babel_xrefs/core/downloader.py @@ -2,6 +2,7 @@ import os import urllib.parse import time +import hashlib import requests from tqdm import tqdm import logging @@ -40,6 +41,56 @@ def get_output_file(self, filename): os.makedirs(os.path.dirname(filepath), exist_ok=True) return filepath + def _calculate_md5(self, file_path, chunk_size=1024*1024): + """ + Calculate MD5 checksum of a file. 
+ + Args: + file_path: Path to the file to checksum + chunk_size: Size of chunks to read (default 1MB) + + Returns: + str: Hexadecimal MD5 checksum + """ + md5_hash = hashlib.md5() + with open(file_path, 'rb') as f: + for chunk in iter(lambda: f.read(chunk_size), b''): + md5_hash.update(chunk) + return md5_hash.hexdigest() + + def _fetch_remote_md5(self, url): + """ + Fetch MD5 checksum from remote .md5 file. + + Args: + url: URL to the .md5 file + + Returns: + str: MD5 checksum if found, None if file doesn't exist or is malformed + """ + try: + response = requests.get(url, timeout=10) + if response.status_code == 404: + self.logger.debug(f"No .md5 file found at {url}") + return None + response.raise_for_status() + + # Parse MD5 file content + # Format is typically: "md5hash filename" or just "md5hash" + content = response.text.strip() + md5_match = content.split()[0] # Take first token + + # Validate it's a valid MD5 (32 hex characters) + if len(md5_match) == 32 and all(c in '0123456789abcdef' for c in md5_match.lower()): + return md5_match.lower() + else: + self.logger.warning(f"Malformed .md5 file at {url}: {content}") + return None + + except requests.RequestException as e: + self.logger.debug(f"Could not fetch .md5 file from {url}: {e}") + return None + def _stream_download(self, response, local_path, resume_byte_pos, chunk_size): """ Stream download from response to file with progress bar. @@ -142,7 +193,13 @@ def _download_with_retry(self, url, local_path, chunk_size): @functools.lru_cache(maxsize=None) def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): """ - Download a file from the Babel server to local storage. + Download a file from the Babel server to local storage with MD5 validation. + + If a .md5 file exists on the server, this method will: + 1. Check if the local file exists + 2. Verify its MD5 checksum matches the expected value + 3. Delete and re-download if checksums don't match + 4. Skip download if checksums match Args: dirpath: Relative path from url_base to the file @@ -155,12 +212,49 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) url_to_download = urllib.parse.urljoin(self.url_base, dirpath) + md5_url = url_to_download + '.md5' + + # Check if file already exists and validate with MD5 if available + if os.path.exists(local_path_to_download_to): + self.logger.info(f"Local file exists: {local_path_to_download_to}") + + # Try to fetch remote MD5 checksum + expected_md5 = self._fetch_remote_md5(md5_url) + + if expected_md5: + self.logger.info(f"Validating MD5 checksum (expected: {expected_md5})") + + # Calculate local file's MD5 + actual_md5 = self._calculate_md5(local_path_to_download_to, chunk_size) + self.logger.info(f"Local file MD5: {actual_md5}") + + if actual_md5 == expected_md5: + # File is valid, skip download + self.logger.info(f"MD5 checksum matches - file is valid, skipping download") + bytes_downloaded = os.path.getsize(local_path_to_download_to) + self.logger.info(f"Using existing file: {local_path_to_download_to} ({bytes_downloaded} bytes)") + return local_path_to_download_to + else: + # Checksums don't match - delete and re-download + self.logger.warning(f"MD5 checksum mismatch! 
Expected {expected_md5}, got {actual_md5}") + self.logger.warning(f"Deleting corrupted file and re-downloading: {local_path_to_download_to}") + os.remove(local_path_to_download_to) self.logger.info(f"Downloading {url_to_download} to {local_path_to_download_to}") # Download with retry logic self._download_with_retry(url_to_download, local_path_to_download_to, chunk_size) + # Verify MD5 after download if available + expected_md5 = self._fetch_remote_md5(md5_url) + if expected_md5: + actual_md5 = self._calculate_md5(local_path_to_download_to, chunk_size) + if actual_md5 == expected_md5: + self.logger.info(f"Post-download MD5 verification passed: {actual_md5}") + else: + self.logger.error(f"Post-download MD5 verification failed! Expected {expected_md5}, got {actual_md5}") + raise RuntimeError(f"Downloaded file has incorrect MD5 checksum") + bytes_downloaded = os.path.getsize(local_path_to_download_to) self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") return local_path_to_download_to diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 461854c..11132f3 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -2,12 +2,14 @@ Tests for the BabelDownloader class. These tests verify that the downloader can successfully fetch large Parquet files -from the Babel server using wget and properly manage local file caching. +from the Babel server and properly manage local file caching with MD5 validation. """ import os import shutil +import hashlib import pytest +from unittest.mock import Mock, patch, MagicMock from babel_xrefs.core.downloader import BabelDownloader @@ -192,3 +194,203 @@ def test_invalid_local_path(): # Clean up if os.path.exists(invalid_path): os.remove(invalid_path) + + +def test_md5_validation_matching_checksum(test_data_dir): + """ + Test that MD5 validation skips download when checksums match. + + This test: + 1. Creates a local file with known content + 2. Mocks the .md5 file to return the correct checksum + 3. Verifies the download is skipped (no actual HTTP download occurs) + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + # Create a test file with known content + test_file = "test_file.txt" + local_path = os.path.join(test_data_dir, test_file) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + + test_content = b"This is test content for MD5 validation" + with open(local_path, 'wb') as f: + f.write(test_content) + + # Calculate the expected MD5 + expected_md5 = hashlib.md5(test_content).hexdigest() + + # Mock the _fetch_remote_md5 to return the matching checksum + with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): + # Mock _download_with_retry to ensure it's NOT called + with patch.object(downloader, '_download_with_retry') as mock_download: + # Clear the cache before testing + downloader.get_downloaded_file.cache_clear() + + result_path = downloader.get_downloaded_file(test_file) + + # Verify the download was skipped + mock_download.assert_not_called() + assert result_path == local_path + assert os.path.exists(result_path) + + print(f"\n✓ MD5 validation correctly skipped download for matching checksum: {expected_md5}") + + +def test_md5_validation_mismatched_checksum(test_data_dir): + """ + Test that MD5 validation deletes and re-downloads file when checksums don't match. + + This test: + 1. Creates a local file with wrong content + 2. Mocks the .md5 file to return a different checksum + 3. 
Verifies the file is deleted and re-downloaded + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + # Create a test file with incorrect content + test_file = "test_file_mismatch.txt" + local_path = os.path.join(test_data_dir, test_file) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + + wrong_content = b"This is WRONG content" + with open(local_path, 'wb') as f: + f.write(wrong_content) + + # Use a different MD5 (this is MD5 of "correct content") + correct_content = b"This is CORRECT content" + expected_md5 = hashlib.md5(correct_content).hexdigest() + + # Track whether file was deleted + original_exists = os.path.exists(local_path) + + # Mock the _fetch_remote_md5 to return the mismatched checksum + with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): + # Mock _download_with_retry to create the "correct" file + def mock_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(correct_content) + + with patch.object(downloader, '_download_with_retry', side_effect=mock_download): + # Clear the cache before testing + downloader.get_downloaded_file.cache_clear() + + result_path = downloader.get_downloaded_file(test_file) + + # Verify the file exists and has correct content + assert os.path.exists(result_path) + with open(result_path, 'rb') as f: + assert f.read() == correct_content + + print(f"\n✓ MD5 validation correctly deleted and re-downloaded file with mismatched checksum") + + +def test_md5_validation_no_md5_file(test_data_dir): + """ + Test that download proceeds normally when no .md5 file exists. + + This test: + 1. Mocks the .md5 file fetch to return None (404) + 2. Verifies the download proceeds normally + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + test_file = "test_file_no_md5.txt" + local_path = os.path.join(test_data_dir, test_file) + + test_content = b"Test content without MD5 file" + + # Mock the _fetch_remote_md5 to return None (no .md5 file) + with patch.object(downloader, '_fetch_remote_md5', return_value=None): + # Mock _download_with_retry to create the file + def mock_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(test_content) + + with patch.object(downloader, '_download_with_retry', side_effect=mock_download) as mock_download_method: + # Clear the cache before testing + downloader.get_downloaded_file.cache_clear() + + result_path = downloader.get_downloaded_file(test_file) + + # Verify download was called (normal download path) + mock_download_method.assert_called_once() + assert os.path.exists(result_path) + with open(result_path, 'rb') as f: + assert f.read() == test_content + + print(f"\n✓ Download proceeded normally when no .md5 file exists") + + +def test_md5_validation_malformed_md5_file(test_data_dir): + """ + Test that download proceeds normally when .md5 file is malformed. + + This test: + 1. Mocks the .md5 file fetch to return None (malformed content) + 2. 
Verifies the download proceeds normally with a warning + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + test_file = "test_file_malformed_md5.txt" + local_path = os.path.join(test_data_dir, test_file) + + test_content = b"Test content with malformed MD5 file" + + # Mock the _fetch_remote_md5 to return None (malformed .md5 file) + with patch.object(downloader, '_fetch_remote_md5', return_value=None): + # Mock _download_with_retry to create the file + def mock_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(test_content) + + with patch.object(downloader, '_download_with_retry', side_effect=mock_download) as mock_download_method: + # Clear the cache before testing + downloader.get_downloaded_file.cache_clear() + + result_path = downloader.get_downloaded_file(test_file) + + # Verify download was called (normal download path) + mock_download_method.assert_called_once() + assert os.path.exists(result_path) + + print(f"\n✓ Download proceeded normally when .md5 file is malformed") + + +def test_md5_post_download_validation(test_data_dir): + """ + Test that MD5 validation occurs after download and fails if checksum is wrong. + + This test: + 1. Downloads a new file + 2. Mocks the .md5 file to return a checksum + 3. Mocks the download to create a file with WRONG content + 4. Verifies a RuntimeError is raised for checksum mismatch + """ + downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + test_file = "test_file_post_validation.txt" + local_path = os.path.join(test_data_dir, test_file) + + # Expected content and MD5 + correct_content = b"Expected content" + expected_md5 = hashlib.md5(correct_content).hexdigest() + + # Wrong content that will be downloaded + wrong_content = b"Wrong content downloaded" + + # Mock the _fetch_remote_md5 to return the expected checksum + with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): + # Mock _download_with_retry to create a file with WRONG content + def mock_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(wrong_content) + + with patch.object(downloader, '_download_with_retry', side_effect=mock_download): + # Clear the cache before testing + downloader.get_downloaded_file.cache_clear() + + # Should raise RuntimeError due to post-download MD5 mismatch + with pytest.raises(RuntimeError, match="incorrect MD5 checksum"): + downloader.get_downloaded_file(test_file) + + print(f"\n✓ Post-download MD5 validation correctly detected checksum mismatch") From 8f36b74f9563f6e48e6cc929b88be6e3e2efbd72 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 15 Feb 2026 02:11:32 -0500 Subject: [PATCH 16/66] Removed empty model file. --- src/babel_xrefs/core/model.py | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 src/babel_xrefs/core/model.py diff --git a/src/babel_xrefs/core/model.py b/src/babel_xrefs/core/model.py deleted file mode 100644 index 139597f..0000000 --- a/src/babel_xrefs/core/model.py +++ /dev/null @@ -1,2 +0,0 @@ - - From 0534fd876c4b81374f88f76b4ed3561a2abe624d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 15 Feb 2026 02:17:13 -0500 Subject: [PATCH 17/66] Attempted to rename this package to babel-explorer. 
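
The import paths change with the package rename; for example, the CLI
imports (see the cli.py diff below) become:

    from babel_explorer.core.downloader import BabelDownloader
    from babel_explorer.core.babel_xrefs import BabelXRefs
    from babel_explorer.core.nodenorm import NodeNorm

The console script is renamed as well, so `uv run babel-xrefs ...`
becomes `uv run babel-explorer ...`.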
--- CLAUDE.md | 33 +++++++++---------- README.md | 2 +- pyproject.toml | 4 +-- .../__init__.py | 0 src/{babel_xrefs => babel_explorer}/cli.py | 8 ++--- .../core/__init__.py | 0 .../core/babel_xrefs.py | 4 +-- .../core/downloader.py | 0 .../core/nodenorm.py | 0 tests/__init__.py | 2 +- tests/test_downloader.py | 2 +- 11 files changed, 26 insertions(+), 29 deletions(-) rename src/{babel_xrefs => babel_explorer}/__init__.py (100%) rename src/{babel_xrefs => babel_explorer}/cli.py (94%) rename src/{babel_xrefs => babel_explorer}/core/__init__.py (100%) rename src/{babel_xrefs => babel_explorer}/core/babel_xrefs.py (97%) rename src/{babel_xrefs => babel_explorer}/core/downloader.py (100%) rename src/{babel_xrefs => babel_explorer}/core/nodenorm.py (100%) diff --git a/CLAUDE.md b/CLAUDE.md index 536af9b..1fc596f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -babel-xrefs is a tool for querying and exploring Babel intermediate files. It allows users to discover why two biological/chemical identifiers are considered identical by the Babel system, which handles cross-references between different ontology and database identifiers (e.g., MONDO, HP, UMLS, HGNC). +babel-explorer is a tool for querying and exploring Babel intermediate files. It allows users to discover why two biological/chemical identifiers are considered identical by the Babel system, which handles cross-references between different ontology and database identifiers (e.g., MONDO, HP, UMLS, HGNC). ## Development Setup @@ -17,11 +17,8 @@ uv sync # Install with dev dependencies uv sync --group dev -# Activate virtual environment (if needed) -source .venv/bin/activate - # Run the CLI -uv run babel-xrefs --help +uv run babel-explorer --help ``` ## Commands @@ -30,22 +27,22 @@ uv run babel-xrefs --help ```bash # Get cross-references for one or more CURIEs -uv run babel-xrefs xrefs MONDO:0004979 +uv run babel-explorer xrefs MONDO:0004979 # Get cross-references with expansion (recursive lookup) -uv run babel-xrefs xrefs MONDO:0004979 --expand +uv run babel-explorer xrefs MONDO:0004979 --expand # Get cross-references with labels from NodeNorm -uv run babel-xrefs xrefs MONDO:0004979 --labels +uv run babel-explorer xrefs MONDO:0004979 --labels # Get ID records for CURIEs -uv run babel-xrefs ids MONDO:0004979 +uv run babel-explorer ids MONDO:0004979 # Test concordance changes with NodeNorm -uv run babel-xrefs test-concord MONDO:0004979 HP:0000001 +uv run babel-explorer test-concord MONDO:0004979 HP:0000001 # Use custom Babel server or local directory -uv run babel-xrefs xrefs MONDO:0004979 --local-dir data/2025nov19 --babel-url https://stars.renci.org:443/var/babel_outputs/2025nov19/ +uv run babel-explorer xrefs MONDO:0004979 --local-dir data/2025nov19 --babel-url https://stars.renci.org:443/var/babel_outputs/2025nov19/ ``` ### Development Commands @@ -65,25 +62,25 @@ uv run ruff format ### Core Components -1. **BabelDownloader** (`src/babel_xrefs/core/downloader.py`): +1. **BabelDownloader** (`src/babel_explorer/core/downloader.py`): - Downloads Babel intermediate files from a remote server using `wget` - Caches files locally in configurable directory (default: `data/2025nov19/`) - Uses `@functools.lru_cache` to avoid re-downloading - **Important**: Requires `wget` to be installed on the system -2. **BabelXRefs** (`src/babel_xrefs/babel_xrefs.py`): +2. 
**BabelXRefs** (`src/babel_explorer/babel_xrefs.py`):
   - Main query engine for cross-references
   - Uses DuckDB to query Parquet files (`Concord.parquet`, `Identifiers.parquet`, `Metadata.parquet`)
   - Supports recursive expansion of cross-references
   - Creates ephemeral DuckDB databases in `data/<version>/output/duckdbs/`
 
-3. **NodeNorm** (`src/babel_xrefs/core/nodenorm.py`):
+3. **NodeNorm** (`src/babel_explorer/core/nodenorm.py`):
   - Integration with NodeNormalization API (https://nodenormalization-sri.renci.org/)
   - Fetches labels, biolink types, and equivalent identifiers for CURIEs
   - Uses `@functools.lru_cache` for performance
   - Optional component for label enrichment
 
-4. **CLI** (`src/babel_xrefs/cli.py`):
+4. **CLI** (`src/babel_explorer/cli.py`):
   - Click-based command-line interface
   - Three main commands: `xrefs`, `ids`, `test-concord`
 
@@ -108,11 +105,11 @@ uv run ruff format
 - **Data directory**: The `data/` directory is gitignored and contains downloaded Parquet files and generated DuckDB databases
 - **Babel versions**: The default Babel version is `2025nov19`, but this can be customized via `--local-dir` and `--babel-url`
 - **No tests yet**: The project currently has pytest configured but no test files exist
-- **Empty model.py**: The `src/babel_xrefs/core/model.py` file exists but is currently empty; data classes are defined in `babel_xrefs.py` and `nodenorm.py` instead
+- **Empty model.py**: The `src/babel_explorer/core/model.py` file exists but is currently empty; data classes are defined in `babel_explorer.py` and `nodenorm.py` instead
 
 ## File Locations
 
-- Source code: `src/babel_xrefs/`
+- Source code: `src/babel_explorer/`
 - Downloaded Babel files: `data/<version>/duckdb/*.parquet`
 - Generated DuckDB databases: `data/<version>/output/duckdbs/`
-- Entry point: `src/babel_xrefs/cli.py`
+- Entry point: `src/babel_explorer/cli.py`
diff --git a/README.md b/README.md
index 7e78ca5..d17c739 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,2 @@
-# babel-xrefs
+# babel-explorer
 Software for querying and exporting Babel intermediate files
diff --git a/pyproject.toml b/pyproject.toml
index ddc0a95..0fb8f09 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [project]
-name = "babel-xrefs"
+name = "babel-explorer"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
@@ -22,4 +22,4 @@ dev = [
 ]
 
 [project.scripts]
-babel-xrefs = "babel_xrefs.cli:cli"
+babel-explorer = "babel_explorer.cli:cli"
diff --git a/src/babel_xrefs/__init__.py b/src/babel_explorer/__init__.py
similarity index 100%
rename from src/babel_xrefs/__init__.py
rename to src/babel_explorer/__init__.py
diff --git a/src/babel_xrefs/cli.py b/src/babel_explorer/cli.py
similarity index 94%
rename from src/babel_xrefs/cli.py
rename to src/babel_explorer/cli.py
index ab2e283..8dd5fc4 100644
--- a/src/babel_xrefs/cli.py
+++ b/src/babel_explorer/cli.py
@@ -1,9 +1,9 @@
-# Command line interface for babel-xrefs
+# Command line interface for babel-explorer
 import click
 import logging
-from babel_xrefs.core.downloader import BabelDownloader
-from babel_xrefs.core.babel_xrefs import BabelXRefs
-from babel_xrefs.core.nodenorm import NodeNorm
+from babel_explorer.core.downloader import BabelDownloader
+from babel_explorer.core.babel_xrefs import BabelXRefs
+from babel_explorer.core.nodenorm import NodeNorm
 
 
 @click.group()
diff --git a/src/babel_xrefs/core/__init__.py b/src/babel_explorer/core/__init__.py
similarity index 100%
rename from src/babel_xrefs/core/__init__.py
rename to src/babel_explorer/core/__init__.py
diff --git a/src/babel_xrefs/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py
similarity index 97%
rename from src/babel_xrefs/core/babel_xrefs.py
rename to src/babel_explorer/core/babel_xrefs.py
index bf32521..6776a98 100644
--- a/src/babel_xrefs/core/babel_xrefs.py
+++ b/src/babel_explorer/core/babel_xrefs.py
@@ -6,8 +6,8 @@
 import duckdb
 import functools
 
-from babel_xrefs.core.downloader import BabelDownloader
-from babel_xrefs.core.nodenorm import NodeNorm
+from babel_explorer.core.downloader import BabelDownloader
+from babel_explorer.core.nodenorm import NodeNorm
 
 
 @dataclasses.dataclass(frozen=True)
diff --git a/src/babel_xrefs/core/downloader.py b/src/babel_explorer/core/downloader.py
similarity index 100%
rename from src/babel_xrefs/core/downloader.py
rename to src/babel_explorer/core/downloader.py
diff --git a/src/babel_xrefs/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py
similarity index 100%
rename from src/babel_xrefs/core/nodenorm.py
rename to src/babel_explorer/core/nodenorm.py
diff --git a/tests/__init__.py b/tests/__init__.py
index 61c04ac..588fec0 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1 +1 @@
-# Tests for babel-xrefs
+# Tests for babel-explorer
diff --git a/tests/test_downloader.py b/tests/test_downloader.py
index 11132f3..c16bf74 100644
--- a/tests/test_downloader.py
+++ b/tests/test_downloader.py
@@ -10,7 +10,7 @@
 import hashlib
 import pytest
 from unittest.mock import Mock, patch, MagicMock
-from babel_xrefs.core.downloader import BabelDownloader
+from babel_explorer.core.downloader import BabelDownloader
 
 
 # Constants for test configuration

From 0b3a9f512cf5e695fe51376e3f7d0b7557f19a58 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Sun, 15 Feb 2026 02:44:54 -0500
Subject: [PATCH 18/66] Add comprehensive pytest suite for all core modules

- Add IdentifierRecord dataclass to babel_xrefs.py (resolves TODO)
- Add 80 tests across 3 files: test_downloader (26), test_babel_xrefs (31), test_nodenorm (23)
- Unit tests (62) use mocks and run without network; integration tests (18) use real downloads/APIs
- Add session-scoped fixtures in conftest.py for shared Parquet file downloads
- Parametrize integration tests over tests/data/valid_curies.txt for easy expansion
- Add integration and slow pytest markers to pyproject.toml
- Update CLAUDE.md and README.md with testing documentation

Co-Authored-By: Claude Opus 4.6
---
 CLAUDE.md                              |  47 +-
 README.md                              |  56 ++-
 pyproject.toml                         |   6 +
 src/babel_explorer/core/babel_xrefs.py |  35 +-
 tests/conftest.py                      | 106 ++++
 tests/constants.py                     |  26 +
 tests/data/valid_curies.txt            |   3 +
 tests/test_babel_xrefs.py              | 333 ++++++++++++
 tests/test_downloader.py               | 669 +++++++++++++------------
 tests/test_nodenorm.py                 | 296 +++++++++++
 10 files changed, 1230 insertions(+), 347 deletions(-)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/constants.py
 create mode 100644 tests/data/valid_curies.txt
 create mode 100644 tests/test_babel_xrefs.py
 create mode 100644 tests/test_nodenorm.py
diff --git a/CLAUDE.md b/CLAUDE.md
index 1fc596f..ae2e78f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -48,8 +48,17 @@ uv run babel-explorer xrefs MONDO:0004979 --local-dir data/2025nov19 --babel-url https://stars.renci.org:443/var/babel_outputs/2025nov19/
 ### Development Commands
 
 ```bash
-# Run tests
-uv run pytest
+# Run all tests (includes large file downloads)
+uv run pytest -v
+
+# Run unit tests only (fast, no network)
+uv run pytest -v -m "not integration"
+
+# Run integration tests without 2GB+ downloads
+uv run pytest -v -m "integration and not slow"
+
+# Run a single test file
+uv run pytest -v tests/test_nodenorm.py
 
 # Run linter
 uv run ruff check
@@ -68,7 +77,7 @@ uv run ruff format
 - Uses `@functools.lru_cache` to avoid re-downloading
 - **Important**: Requires `wget` to be installed on the system
 
-2. **BabelXRefs** (`src/babel_explorer/babel_xrefs.py`):
+2. **BabelXRefs** (`src/babel_explorer/core/babel_xrefs.py`):
   - Main query engine for cross-references
   - Uses DuckDB to query Parquet files (`Concord.parquet`, `Identifiers.parquet`, `Metadata.parquet`)
   - Supports recursive expansion of cross-references
@@ -99,17 +108,43 @@ uv run ruff format
 - **Recursive expansion**: The `--expand` flag recursively follows all cross-references to build complete graphs
 - **DuckDB for querying**: In-memory SQL queries against Parquet files for fast lookups
 
+## Testing
+
+### Test Structure
+
+Tests live in `tests/` and are split into fast **unit tests** (mocked, no network) and slower **integration tests** (real downloads and API calls). Pytest markers control which tests run:
+
+- **`@pytest.mark.integration`** — requires network access (downloads Parquet files or calls NodeNorm API)
+- **`@pytest.mark.slow`** — downloads very large files (2 GB+)
+
+| File | Unit | Integration | Slow | Total |
+|------|------|-------------|------|-------|
+| `tests/test_downloader.py` | 22 | 3 | 1 | 26 |
+| `tests/test_babel_xrefs.py` | 22 | 8 | 1 | 31 |
+| `tests/test_nodenorm.py` | 18 | 5 | 0 | 23 |
+
+### Test Infrastructure
+
+- **`tests/conftest.py`** — Session-scoped fixtures that download Parquet files once and share them across all integration tests. Teardown removes the `data/test/` directory so the next run starts fresh.
+- **`tests/constants.py`** — Shared constants (URLs, file paths) and `load_curies()` helper.
+- **`tests/data/valid_curies.txt`** — One CURIE per line (`#` comments allowed). Integration tests are parametrized over this list — adding a new line automatically expands test coverage.
+
+### Key Dataclasses
+
+- **`CrossReference`** — Frozen dataclass for Concord.parquet rows (filename, subj, pred, obj)
+- **`LabeledCrossReference`** — Extends CrossReference with labels and biolink types from NodeNorm
+- **`IdentifierRecord`** — Frozen dataclass for Identifiers.parquet rows (curie + dynamic extra fields). Returned by `BabelXRefs.get_curie_ids()`.
+
 ## Important Notes
 
-- **System dependency**: This project requires `wget` to be installed (used by BabelDownloader)
 - **Data directory**: The `data/` directory is gitignored and contains downloaded Parquet files and generated DuckDB databases
 - **Babel versions**: The default Babel version is `2025nov19`, but this can be customized via `--local-dir` and `--babel-url`
-- **No tests yet**: The project currently has pytest configured but no test files exist
-- **Empty model.py**: The `src/babel_explorer/core/model.py` file exists but is currently empty; data classes are defined in `babel_explorer.py` and `nodenorm.py` instead
 
 ## File Locations
 
 - Source code: `src/babel_explorer/`
+- Tests: `tests/`
+- Test CURIEs: `tests/data/valid_curies.txt`
 - Downloaded Babel files: `data/<version>/duckdb/*.parquet`
 - Generated DuckDB databases: `data/<version>/output/duckdbs/`
 - Entry point: `src/babel_explorer/cli.py`
diff --git a/README.md b/README.md
index d17c739..b545c8c 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,56 @@
 # babel-explorer
-Software for querying and exporting Babel intermediate files
+Software for querying and exploring Babel intermediate files.
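+
+It can also be driven from Python directly. A minimal sketch (this wiring
+mirrors `cli.py`; the URL and local directory are the defaults used in this
+repository, and the first query downloads large Parquet files):
+
+```python
+from babel_explorer.core.downloader import BabelDownloader
+from babel_explorer.core.babel_xrefs import BabelXRefs
+
+downloader = BabelDownloader(
+    url_base="https://stars.renci.org/var/babel_outputs/2025nov19/",
+    local_path="data/2025nov19",
+)
+xrefs = BabelXRefs(downloader)
+
+# Print every cross-reference that mentions this CURIE.
+for xref in xrefs.get_curie_xrefs(["MONDO:0004979"], expand=False):
+    print(xref)
+```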
+ +babel-explorer allows you to discover why two biological/chemical identifiers are considered identical by the [Babel](https://github.com/TranslatorSRI/Babel) system, which handles cross-references between different ontology and database identifiers (e.g., MONDO, HP, UMLS, HGNC). + +## Setup + +This project uses [uv](https://docs.astral.sh/uv/) for package management: + +```bash +uv sync --group dev +``` + +## Usage + +```bash +# Get cross-references for one or more CURIEs +uv run babel-explorer xrefs MONDO:0004979 + +# Get cross-references with expansion (recursive lookup) +uv run babel-explorer xrefs MONDO:0004979 --expand + +# Get cross-references with labels from NodeNorm +uv run babel-explorer xrefs MONDO:0004979 --labels + +# Get ID records for CURIEs +uv run babel-explorer ids MONDO:0004979 + +# Test concordance changes with NodeNorm +uv run babel-explorer test-concord MONDO:0004979 HP:0000001 +``` + +## Testing + +Tests are split into fast **unit tests** (mocked, no network) and slower **integration tests** (real file downloads and API calls), controlled by pytest markers. + +```bash +# Unit tests only — fast, no network required +uv run pytest -v -m "not integration" + +# Integration tests without 2GB+ downloads +uv run pytest -v -m "integration and not slow" + +# Full suite including large file downloads +uv run pytest -v +``` + +### Adding Test CURIEs + +Integration tests are parametrized over the CURIEs listed in `tests/data/valid_curies.txt`. Add a new CURIE on its own line to automatically expand test coverage: + +``` +# tests/data/valid_curies.txt +MONDO:0004979 +HP:0000001 +``` diff --git a/pyproject.toml b/pyproject.toml index 0fb8f09..922fa1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,3 +23,9 @@ dev = [ [project.scripts] babel-explorer = "babel_explorer.cli:cli" + +[tool.pytest.ini_options] +markers = [ + "integration: tests requiring network access (deselect with '-m \"not integration\"')", + "slow: tests downloading very large files 2GB+ (deselect with '-m \"not slow\"')", +] diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 6776a98..1e82125 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -44,17 +44,39 @@ def __init__(self, subj: str, pred: str, obj: str, filename: str, subj_label: st def __str__(self): return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", obj_label="{self.obj_label}", subj_label="{self.subj_label}", obj_label="{self.obj_label}")""" +@dataclasses.dataclass(frozen=True) +class IdentifierRecord: + """A record from the Identifiers.parquet file.""" + curie: str + extra_fields: tuple = () + + @staticmethod + def from_row(row: tuple, column_names: list[str]): + """Create an IdentifierRecord from a DuckDB result row and its column names.""" + curie_idx = column_names.index('curie') + extra = tuple( + (col, row[i]) for i, col in enumerate(column_names) if i != curie_idx + ) + return IdentifierRecord(curie=row[curie_idx], extra_fields=extra) + + def __str__(self): + parts = [f"curie={self.curie!r}"] + for name, value in self.extra_fields: + parts.append(f"{name}={value!r}") + return f"IdentifierRecord({', '.join(parts)})" + + class BabelXRefs: def __init__(self, downloader: BabelDownloader, nodenorm: NodeNorm = None): self.downloader = downloader self.nodenorm = nodenorm - def get_curie_ids(self, curies: list[str]): + def get_curie_ids(self, curies: list[str]) -> 
list[IdentifierRecord]: """ Search for all identifiers in the /ids/ files for a particular CURIE. - :param curie: A CURIE to search for. - :return: A list of cross-references containing that CURIE. + :param curies: A list of CURIEs to search for. + :return: A list of IdentifierRecords containing those CURIEs. """ identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') @@ -64,11 +86,10 @@ def get_curie_ids(self, curies: list[str]): duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') db = duckdb.connect(duckdb_path) identifier_table = db.read_parquet(identifier_parquet) - xrefs = db.execute(f"SELECT * FROM identifier_table WHERE curie IN $1", [curies]) - - # TODO: convert into case classes. + result = db.execute(f"SELECT * FROM identifier_table WHERE curie IN $1", [curies]) - return xrefs.fetchall() + column_names = [desc[0] for desc in result.description] + return [IdentifierRecord.from_row(row, column_names) for row in result.fetchall()] @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..f3df2fe --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,106 @@ +""" +Shared fixtures for babel-explorer tests. + +Session-scoped fixtures download Babel files once and share them across all test modules. +Teardown removes the test data directory so the next run starts fresh. +""" + +import os +import shutil + +import pytest + +from babel_explorer.core.downloader import BabelDownloader +from babel_explorer.core.babel_xrefs import BabelXRefs +from babel_explorer.core.nodenorm import NodeNorm + +from tests.constants import ( + BABEL_URL, + NODENORM_URL, + TEST_DATA_DIR, + CONCORD_FILE, + METADATA_FILE, + IDENTIFIERS_FILE, + load_curies, +) + + +# --------------------------------------------------------------------------- +# Session-scoped fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def valid_curies() -> list[str]: + """Load test CURIEs from tests/data/valid_curies.txt.""" + curies = load_curies() + assert len(curies) > 0, "No CURIEs found in valid_curies.txt" + return curies + + +@pytest.fixture(scope="session") +def test_data_dir(): + """ + Provide a clean test data directory for the entire session. + + Creates the directory before tests, removes it after all tests complete. + """ + if os.path.exists(TEST_DATA_DIR): + shutil.rmtree(TEST_DATA_DIR) + os.makedirs(TEST_DATA_DIR, exist_ok=True) + + yield TEST_DATA_DIR + + if os.path.exists(TEST_DATA_DIR): + shutil.rmtree(TEST_DATA_DIR) + + +@pytest.fixture(scope="session") +def shared_downloader(test_data_dir) -> BabelDownloader: + """A BabelDownloader pointed at the test data directory.""" + return BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) + + +@pytest.fixture(scope="session") +def downloaded_concord(shared_downloader) -> str: + """Download duckdb/Concord.parquet (~626 MB). Returns the local path.""" + return shared_downloader.get_downloaded_file(CONCORD_FILE) + + +@pytest.fixture(scope="session") +def downloaded_metadata(shared_downloader) -> str: + """Download duckdb/Metadata.parquet (small). 
Returns the local path.""" + return shared_downloader.get_downloaded_file(METADATA_FILE) + + +@pytest.fixture(scope="session") +def downloaded_parquet_files(downloaded_concord, downloaded_metadata) -> dict[str, str]: + """Dict of {relative_name: local_path} for Concord and Metadata files.""" + return { + CONCORD_FILE: downloaded_concord, + METADATA_FILE: downloaded_metadata, + } + + +@pytest.fixture(scope="session") +def downloaded_identifiers(shared_downloader) -> str: + """Download duckdb/Identifiers.parquet (2 GB+). Returns the local path.""" + return shared_downloader.get_downloaded_file(IDENTIFIERS_FILE) + + +@pytest.fixture(scope="session") +def nodenorm() -> NodeNorm: + """A NodeNorm client pointed at the public API.""" + return NodeNorm(nodenorm_url=NODENORM_URL) + + +@pytest.fixture(scope="session") +def babel_xrefs(shared_downloader, downloaded_parquet_files) -> BabelXRefs: + """A BabelXRefs instance (no NodeNorm) with Concord + Metadata already downloaded.""" + return BabelXRefs(shared_downloader) + + +@pytest.fixture(scope="session") +def babel_xrefs_with_nodenorm(shared_downloader, nodenorm, downloaded_parquet_files) -> BabelXRefs: + """A BabelXRefs instance with NodeNorm, Concord + Metadata already downloaded.""" + return BabelXRefs(shared_downloader, nodenorm) diff --git a/tests/constants.py b/tests/constants.py new file mode 100644 index 0000000..01b75fa --- /dev/null +++ b/tests/constants.py @@ -0,0 +1,26 @@ +"""Shared constants for babel-explorer tests.""" + +import pathlib + +BABEL_URL = "https://stars.renci.org/var/babel_outputs/2025nov19/" +NODENORM_URL = "https://nodenormalization-sri.renci.org/" +TEST_DATA_DIR = "data/test" + +# Parquet file paths (relative to the Babel server / local data dir) +CONCORD_FILE = "duckdb/Concord.parquet" +METADATA_FILE = "duckdb/Metadata.parquet" +IDENTIFIERS_FILE = "duckdb/Identifiers.parquet" + +# Path to the valid CURIEs file +VALID_CURIES_PATH = pathlib.Path(__file__).parent / "data" / "valid_curies.txt" + + +def load_curies(path: pathlib.Path = VALID_CURIES_PATH) -> list[str]: + """Read CURIEs from a text file, skipping comments and blank lines.""" + curies = [] + with open(path) as f: + for line in f: + stripped = line.strip() + if stripped and not stripped.startswith("#"): + curies.append(stripped) + return curies diff --git a/tests/data/valid_curies.txt b/tests/data/valid_curies.txt new file mode 100644 index 0000000..9f2f87c --- /dev/null +++ b/tests/data/valid_curies.txt @@ -0,0 +1,3 @@ +# Valid CURIEs for integration tests. +# Add new CURIEs here to expand test coverage — tests are parametrized over this list. +MONDO:0004979 diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py new file mode 100644 index 0000000..052d09c --- /dev/null +++ b/tests/test_babel_xrefs.py @@ -0,0 +1,333 @@ +""" +Tests for BabelXRefs, CrossReference, LabeledCrossReference, and IdentifierRecord. + +Unit tests use mocks; integration tests query real Parquet files via DuckDB. 
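+
+Run just the unit tests here (no network needed) with, for example:
+
+    uv run pytest -m "not integration" tests/test_babel_xrefs.py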
+""" + +import pytest +from unittest.mock import patch, MagicMock + +from babel_explorer.core.babel_xrefs import ( + BabelXRefs, + CrossReference, + LabeledCrossReference, + IdentifierRecord, +) +from babel_explorer.core.downloader import BabelDownloader +from babel_explorer.core.nodenorm import NodeNorm + +from tests.constants import load_curies + +VALID_CURIES = load_curies() + + +# ========================================================================== +# Unit Tests — CrossReference +# ========================================================================== + + +class TestCrossReference: + def test_creation(self): + xr = CrossReference(filename="f.txt", subj="A:1", pred="skos:exactMatch", obj="B:2") + assert xr.filename == "f.txt" + assert xr.subj == "A:1" + assert xr.pred == "skos:exactMatch" + assert xr.obj == "B:2" + + def test_from_tuple(self): + t = ("file.tsv", "MONDO:1", "owl:sameAs", "HP:2") + xr = CrossReference.from_tuple(t) + assert xr.filename == "file.tsv" + assert xr.subj == "MONDO:1" + assert xr.pred == "owl:sameAs" + assert xr.obj == "HP:2" + + def test_curies_property(self): + xr = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + assert xr.curies == frozenset({"A:1", "B:2"}) + + def test_frozen_immutability(self): + xr = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + with pytest.raises(AttributeError): + xr.subj = "changed" + + def test_equality(self): + a = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + b = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + assert a == b + + def test_hashability(self): + a = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + b = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + assert hash(a) == hash(b) + assert len({a, b}) == 1 + + def test_lt_ordering(self): + a = CrossReference(filename="a.tsv", subj="A:1", pred="p", obj="B:2") + b = CrossReference(filename="b.tsv", subj="A:1", pred="p", obj="B:2") + assert a < b + + def test_sorting(self): + items = [ + CrossReference(filename="c", subj="C:1", pred="p", obj="D:1"), + CrossReference(filename="a", subj="A:1", pred="p", obj="B:1"), + CrossReference(filename="b", subj="B:1", pred="p", obj="C:1"), + ] + result = sorted(items) + assert [x.filename for x in result] == ["a", "b", "c"] + + +# ========================================================================== +# Unit Tests — LabeledCrossReference +# ========================================================================== + + +class TestLabeledCrossReference: + def test_creation(self): + lxr = LabeledCrossReference( + subj="A:1", pred="p", obj="B:2", filename="f", + subj_label="Alpha", subj_biolink_type="biolink:Disease", + obj_label="Beta", obj_biolink_type="biolink:Gene", + ) + assert lxr.subj == "A:1" + assert lxr.subj_label == "Alpha" + assert lxr.obj_biolink_type == "biolink:Gene" + + def test_inherits_from_cross_reference(self): + lxr = LabeledCrossReference( + subj="A:1", pred="p", obj="B:2", filename="f", + subj_label="", subj_biolink_type="", obj_label="", obj_biolink_type="", + ) + assert isinstance(lxr, CrossReference) + + def test_curies_property(self): + lxr = LabeledCrossReference( + subj="A:1", pred="p", obj="B:2", filename="f", + subj_label="", subj_biolink_type="", obj_label="", obj_biolink_type="", + ) + assert lxr.curies == frozenset({"A:1", "B:2"}) + + def test_str(self): + lxr = LabeledCrossReference( + subj="A:1", pred="p", obj="B:2", filename="f", + subj_label="Alpha", subj_biolink_type="biolink:Disease", + 
obj_label="Beta", obj_biolink_type="biolink:Gene", + ) + s = str(lxr) + assert "A:1" in s + assert "B:2" in s + assert "Alpha" in s + + +# ========================================================================== +# Unit Tests — IdentifierRecord +# ========================================================================== + + +class TestIdentifierRecord: + def test_creation(self): + rec = IdentifierRecord(curie="MONDO:0004979") + assert rec.curie == "MONDO:0004979" + assert rec.extra_fields == () + + def test_from_row(self): + row = ("MONDO:0004979", "Disease", "asthma") + cols = ["curie", "category", "label"] + rec = IdentifierRecord.from_row(row, cols) + assert rec.curie == "MONDO:0004979" + assert ("category", "Disease") in rec.extra_fields + assert ("label", "asthma") in rec.extra_fields + + def test_frozen(self): + rec = IdentifierRecord(curie="X:1") + with pytest.raises(AttributeError): + rec.curie = "changed" + + def test_str(self): + rec = IdentifierRecord(curie="X:1", extra_fields=(("type", "Gene"),)) + s = str(rec) + assert "X:1" in s + assert "type" in s + assert "Gene" in s + + +# ========================================================================== +# Unit Tests — BabelXRefs (mocked) +# ========================================================================== + + +class TestBabelXRefsInit: + def test_init_without_nodenorm(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + bx = BabelXRefs(dl) + assert bx.downloader is dl + assert bx.nodenorm is None + + def test_init_with_nodenorm(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + nn = NodeNorm("https://example.com/") + bx = BabelXRefs(dl, nn) + assert bx.nodenorm is nn + + +class TestBabelXRefsMocked: + """Mocked query tests — no DuckDB or Parquet files needed.""" + + def _make_bx(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + return BabelXRefs(dl) + + def test_get_curie_xref_calls_downloader(self, tmp_path): + bx = self._make_bx(tmp_path) + mock_result = MagicMock() + mock_result.fetchall.return_value = [ + ("concord.tsv", "A:1", "skos:exactMatch", "B:2"), + ] + mock_db = MagicMock() + mock_db.read_parquet.return_value = "table" + mock_db.execute.return_value = mock_result + + with patch.object(bx.downloader, 'get_downloaded_file', return_value="/fake/path") as mock_dl: + with patch.object(bx.downloader, 'get_output_file', return_value="/fake/db"): + with patch("babel_explorer.core.babel_xrefs.duckdb.connect", return_value=mock_db): + bx.get_curie_xref.cache_clear() + result = bx.get_curie_xref("A:1") + # Downloader should be called for Concord and Metadata + assert mock_dl.call_count == 2 + result_list = list(result) + assert len(result_list) == 1 + assert isinstance(result_list[0], CrossReference) + + def test_get_curie_xrefs_no_expand(self, tmp_path): + bx = self._make_bx(tmp_path) + xr = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + with patch.object(bx, 'get_curie_xref', return_value=[xr]): + bx.get_curie_xref.cache_clear() + result = bx.get_curie_xrefs(["A:1"], expand=False) + assert len(result) == 1 + assert result[0] == xr + + def test_get_curie_xrefs_with_expand(self, tmp_path): + bx = self._make_bx(tmp_path) + xr1 = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") + xr2 = CrossReference(filename="f", subj="B:2", pred="p", obj="C:3") + + def mock_get_curie_xref(curie, label_curies=False): + if curie == "A:1": + return [xr1] + 
elif curie == "B:2": + return [xr2] + return [] + + with patch.object(bx, 'get_curie_xref', side_effect=mock_get_curie_xref): + result = bx.get_curie_xrefs(["A:1"], expand=True) + assert xr1 in result + assert xr2 in result + + def test_results_are_sorted(self, tmp_path): + bx = self._make_bx(tmp_path) + xr_b = CrossReference(filename="b", subj="B:1", pred="p", obj="C:1") + xr_a = CrossReference(filename="a", subj="A:1", pred="p", obj="B:1") + + with patch.object(bx, 'get_curie_xref', return_value=[xr_b, xr_a]): + result = bx.get_curie_xrefs(["X:1"], expand=False) + assert result == [xr_a, xr_b] + + +# ========================================================================== +# Integration Tests — require downloaded Parquet files +# ========================================================================== + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xref(babel_xrefs, curie): + """get_curie_xref returns non-empty CrossReferences with the queried CURIE.""" + babel_xrefs.get_curie_xref.cache_clear() + results = list(babel_xrefs.get_curie_xref(curie)) + assert len(results) > 0, f"No cross-references found for {curie}" + for xr in results: + assert isinstance(xr, CrossReference) + assert curie in (xr.subj, xr.obj) + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xref_returns_known_xrefs(babel_xrefs, curie): + """At least one cross-reference is found.""" + babel_xrefs.get_curie_xref.cache_clear() + results = list(babel_xrefs.get_curie_xref(curie)) + assert len(results) >= 1 + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xrefs_single_no_expand(babel_xrefs, curie): + """get_curie_xrefs without expansion returns sorted, non-empty results.""" + babel_xrefs.get_curie_xref.cache_clear() + results = babel_xrefs.get_curie_xrefs([curie], expand=False) + assert len(results) > 0 + assert results == sorted(results) + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xrefs_expansion_finds_more(babel_xrefs, curie): + """Expanded results are at least as many as non-expanded.""" + babel_xrefs.get_curie_xref.cache_clear() + non_expanded = babel_xrefs.get_curie_xrefs([curie], expand=False) + babel_xrefs.get_curie_xref.cache_clear() + expanded = babel_xrefs.get_curie_xrefs([curie], expand=True) + assert len(expanded) >= len(non_expanded) + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xrefs_expanded_includes_original(babel_xrefs, curie): + """Non-expanded results are a subset of expanded results.""" + babel_xrefs.get_curie_xref.cache_clear() + non_expanded = set(babel_xrefs.get_curie_xrefs([curie], expand=False)) + babel_xrefs.get_curie_xref.cache_clear() + expanded = set(babel_xrefs.get_curie_xrefs([curie], expand=True)) + assert non_expanded.issubset(expanded) + + +@pytest.mark.integration +def test_get_curie_xref_caching(babel_xrefs): + """Cached calls return the same object.""" + curie = VALID_CURIES[0] + babel_xrefs.get_curie_xref.cache_clear() + r1 = babel_xrefs.get_curie_xref(curie) + r2 = babel_xrefs.get_curie_xref(curie) + assert r1 is r2 + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_xref_with_labels(babel_xrefs_with_nodenorm, curie): + """With labels, returns LabeledCrossReference objects.""" + babel_xrefs_with_nodenorm.get_curie_xref.cache_clear() + results = 
list(babel_xrefs_with_nodenorm.get_curie_xref(curie, label_curies=True)) + assert len(results) > 0 + for xr in results: + assert isinstance(xr, LabeledCrossReference) + + +@pytest.mark.integration +def test_get_curie_xref_nonexistent_curie(babel_xrefs): + """A made-up CURIE returns an empty list.""" + babel_xrefs.get_curie_xref.cache_clear() + results = list(babel_xrefs.get_curie_xref("FAKE:9999999999")) + assert results == [] + + +@pytest.mark.integration +@pytest.mark.slow +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_curie_ids(babel_xrefs, downloaded_identifiers, curie): + """get_curie_ids returns non-empty IdentifierRecord objects.""" + results = babel_xrefs.get_curie_ids([curie]) + assert len(results) > 0 + for rec in results: + assert isinstance(rec, IdentifierRecord) + assert rec.curie == curie diff --git a/tests/test_downloader.py b/tests/test_downloader.py index c16bf74..912cd0a 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -1,396 +1,399 @@ """ Tests for the BabelDownloader class. -These tests verify that the downloader can successfully fetch large Parquet files -from the Babel server and properly manage local file caching with MD5 validation. +Unit tests use mocks and run without network access. +Integration tests download real files from the Babel server. """ -import os -import shutil import hashlib -import pytest -from unittest.mock import Mock, patch, MagicMock -from babel_explorer.core.downloader import BabelDownloader - - -# Constants for test configuration -BABEL_URL = "https://stars.renci.org/var/babel_outputs/2025nov19/" -TEST_DATA_DIR = "data/test" -IDENTIFIERS_FILE = "duckdb/Identifiers.parquet" -MINIMUM_FILE_SIZE_GB = 2 -MINIMUM_FILE_SIZE_BYTES = MINIMUM_FILE_SIZE_GB * 1024 * 1024 * 1024 # 2GB in bytes - - -@pytest.fixture(scope="module") -def test_data_dir(): - """ - Fixture that provides a clean test data directory. - - This fixture: - - Creates the test data directory before tests run - - Yields the directory path to tests - - Cleans up (removes) the directory after all tests complete - - Scope is 'module' so the directory persists across all tests in this file, - allowing downloaded files to be reused by multiple tests. - """ - # Setup: ensure clean test directory - if os.path.exists(TEST_DATA_DIR): - shutil.rmtree(TEST_DATA_DIR) - os.makedirs(TEST_DATA_DIR, exist_ok=True) - - yield TEST_DATA_DIR - - # Teardown: remove test directory and all contents - if os.path.exists(TEST_DATA_DIR): - shutil.rmtree(TEST_DATA_DIR) - - -@pytest.fixture(scope="module") -def downloader(test_data_dir): - """ - Fixture that provides a BabelDownloader instance configured for testing. - - Args: - test_data_dir: The test data directory fixture - - Returns: - BabelDownloader: Configured downloader instance - """ - return BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - - -def test_downloader_initialization(test_data_dir): - """ - Test that BabelDownloader initializes correctly with custom parameters. - - Verifies: - - Downloader accepts URL and local path - - Local path is stored correctly - - Directory is created if it doesn't exist - """ - downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - - assert downloader.url_base == BABEL_URL - assert downloader.local_path == test_data_dir - assert os.path.exists(test_data_dir) - assert os.path.isdir(test_data_dir) - - -def test_download_large_parquet_file(downloader): - """ - Test downloading a large Parquet file from the Babel server. - - This test: - 1. 
Downloads the Identifiers.parquet file (2GB+) from the real Babel server - 2. Verifies the file was downloaded successfully - 3. Confirms the file size is at least 2GB +import os +import tempfile - Note: This test takes several minutes to complete due to the large file size. +import pytest +import requests +from unittest.mock import Mock, patch - Args: - downloader: BabelDownloader fixture - """ - # Download the Identifiers.parquet file - downloaded_path = downloader.get_downloaded_file(IDENTIFIERS_FILE) +from babel_explorer.core.downloader import BabelDownloader - # Verify the file exists - assert os.path.exists(downloaded_path), \ - f"Downloaded file does not exist at {downloaded_path}" +from tests.constants import CONCORD_FILE - # Verify it's a file, not a directory - assert os.path.isfile(downloaded_path), \ - f"Downloaded path is not a file: {downloaded_path}" - # Get the file size in bytes - file_size_bytes = os.path.getsize(downloaded_path) - file_size_gb = file_size_bytes / (1024 * 1024 * 1024) +# ========================================================================== +# Unit Tests — no network required +# ========================================================================== + + +class TestBabelDownloaderInit: + """Tests for BabelDownloader constructor.""" + + def test_constructor_stores_url_and_path(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + assert dl.url_base == "https://example.com/" + assert dl.local_path == str(tmp_path) + + def test_creates_directory_if_missing(self, tmp_path): + new_dir = str(tmp_path / "nested" / "dir") + dl = BabelDownloader(url_base="https://example.com/", local_path=new_dir) + assert os.path.isdir(new_dir) + assert dl.local_path == new_dir + + def test_custom_retries(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=3) + assert dl.retries == 3 + + def test_default_retries(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + assert dl.retries == 10 + + def test_invalid_path_raises_value_error(self): + """Using a file path (not a directory) should raise ValueError.""" + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write(b"not a directory") + f.flush() + try: + with pytest.raises(ValueError, match="Invalid local_path"): + BabelDownloader(url_base="https://example.com/", local_path=f.name) + finally: + os.unlink(f.name) + + +class TestGetOutputFile: + """Tests for get_output_file.""" + + def test_returns_correct_path(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + result = dl.get_output_file("output/duckdbs/test.duckdb") + assert result == os.path.join(str(tmp_path), "output/duckdbs/test.duckdb") + + def test_creates_parent_directories(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + result = dl.get_output_file("deep/nested/dir/file.txt") + assert os.path.isdir(os.path.dirname(result)) + + def test_lru_caching(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + result1 = dl.get_output_file("some/file.txt") + result2 = dl.get_output_file("some/file.txt") + assert result1 is result2 # identity check — same cached object + + +class TestCalculateMd5: + """Tests for _calculate_md5.""" + + def test_correct_hash(self, tmp_path): + content = b"Hello, world!" 
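+        # Reference digest computed directly with hashlib, independently of the downloader.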
+ expected = hashlib.md5(content).hexdigest() + file_path = tmp_path / "test.bin" + file_path.write_bytes(content) + + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + assert dl._calculate_md5(str(file_path)) == expected + + def test_different_chunk_sizes_same_result(self, tmp_path): + content = b"A" * 5000 + expected = hashlib.md5(content).hexdigest() + file_path = tmp_path / "chunks.bin" + file_path.write_bytes(content) + + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + assert dl._calculate_md5(str(file_path), chunk_size=100) == expected + assert dl._calculate_md5(str(file_path), chunk_size=4096) == expected + + +class TestFetchRemoteMd5: + """Tests for _fetch_remote_md5.""" + + def _make_dl(self, tmp_path): + return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + + def test_valid_md5_response(self, tmp_path): + dl = self._make_dl(tmp_path) + mock_resp = Mock() + mock_resp.status_code = 200 + mock_resp.text = "d41d8cd98f00b204e9800998ecf8427e filename.parquet\n" + mock_resp.raise_for_status = Mock() + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): + result = dl._fetch_remote_md5("https://example.com/file.md5") + assert result == "d41d8cd98f00b204e9800998ecf8427e" + + def test_hash_only_format(self, tmp_path): + dl = self._make_dl(tmp_path) + mock_resp = Mock() + mock_resp.status_code = 200 + mock_resp.text = "d41d8cd98f00b204e9800998ecf8427e\n" + mock_resp.raise_for_status = Mock() + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): + result = dl._fetch_remote_md5("https://example.com/file.md5") + assert result == "d41d8cd98f00b204e9800998ecf8427e" + + def test_404_returns_none(self, tmp_path): + dl = self._make_dl(tmp_path) + mock_resp = Mock() + mock_resp.status_code = 404 + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): + assert dl._fetch_remote_md5("https://example.com/missing.md5") is None + + def test_malformed_returns_none(self, tmp_path): + dl = self._make_dl(tmp_path) + mock_resp = Mock() + mock_resp.status_code = 200 + mock_resp.text = "not-a-valid-md5-hash\n" + mock_resp.raise_for_status = Mock() + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): + assert dl._fetch_remote_md5("https://example.com/bad.md5") is None + + def test_network_error_returns_none(self, tmp_path): + dl = self._make_dl(tmp_path) + with patch("babel_explorer.core.downloader.requests.get", side_effect=requests.ConnectionError("fail")): + assert dl._fetch_remote_md5("https://example.com/err.md5") is None + + +class TestMd5ValidationFlow: + """Tests for the MD5 validation logic inside get_downloaded_file.""" + + def test_matching_checksum_skips_download(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + test_file = "test.txt" + content = b"test content" + local_path = tmp_path / test_file + local_path.write_bytes(content) + expected_md5 = hashlib.md5(content).hexdigest() + + with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): + with patch.object(dl, '_download_with_retry') as mock_dl: + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + mock_dl.assert_not_called() + assert result == str(local_path) + + def test_mismatched_checksum_triggers_redownload(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + test_file = "mismatch.txt" + 
local_path = tmp_path / test_file + local_path.write_bytes(b"wrong content") + correct_content = b"correct content" + expected_md5 = hashlib.md5(correct_content).hexdigest() + + def fake_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(correct_content) - # Verify the file is at least 2GB - assert file_size_bytes >= MINIMUM_FILE_SIZE_BYTES, \ - f"Downloaded file is too small: {file_size_gb:.2f}GB (expected at least {MINIMUM_FILE_SIZE_GB}GB)" + with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): + with patch.object(dl, '_download_with_retry', side_effect=fake_download): + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + assert os.path.exists(result) + with open(result, 'rb') as f: + assert f.read() == correct_content - print(f"\n✓ Successfully downloaded {IDENTIFIERS_FILE}") - print(f" Size: {file_size_gb:.2f}GB ({file_size_bytes:,} bytes)") - print(f" Path: {downloaded_path}") - - -def test_download_caching(downloader): - """ - Test that the downloader uses LRU caching to avoid re-downloading files. + def test_no_md5_proceeds_normally(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + test_file = "no_md5.txt" + content = b"downloaded content" - This test: - 1. Downloads the same file twice - 2. Verifies both calls return the same path - 3. Confirms the file is only downloaded once (via caching) - - Args: - downloader: BabelDownloader fixture - """ - # First download - path1 = downloader.get_downloaded_file(IDENTIFIERS_FILE) - initial_mtime = os.path.getmtime(path1) + def fake_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(content) - # Second download - should use cache - path2 = downloader.get_downloaded_file(IDENTIFIERS_FILE) - second_mtime = os.path.getmtime(path2) + with patch.object(dl, '_fetch_remote_md5', return_value=None): + with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + mock_dl.assert_called_once() + assert os.path.exists(result) - # Verify same path returned - assert path1 == path2, "Cached download returned different path" + def test_post_download_validation_fail_raises(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + test_file = "post_fail.txt" + correct_md5 = hashlib.md5(b"expected").hexdigest() - # Verify file wasn't modified (i.e., wasn't re-downloaded) - assert initial_mtime == second_mtime, \ - "File was modified, suggesting it was re-downloaded instead of cached" + def fake_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(b"wrong data after download") - print(f"\n✓ Caching works correctly - file not re-downloaded") + with patch.object(dl, '_fetch_remote_md5', return_value=correct_md5): + with patch.object(dl, '_download_with_retry', side_effect=fake_download): + dl.get_downloaded_file.cache_clear() + with pytest.raises(RuntimeError, match="incorrect MD5 checksum"): + dl.get_downloaded_file(test_file) + def test_post_download_validation_pass(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + test_file = "post_pass.txt" + content = b"correct content" + expected_md5 = hashlib.md5(content).hexdigest() -def test_get_output_file(downloader): - """ - Test the get_output_file method for creating output file paths. 
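+        # Stand-in download that writes bytes whose MD5 matches expected_md5.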
+ def fake_download(url, path, chunk_size): + with open(path, 'wb') as f: + f.write(content) + + with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): + with patch.object(dl, '_download_with_retry', side_effect=fake_download): + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + assert os.path.exists(result) - This test: - 1. Creates an output file path - 2. Verifies the directory structure is created - 3. Confirms the path is in the correct location - Args: - downloader: BabelDownloader fixture - """ - output_filename = "output/duckdbs/test.duckdb" - output_path = downloader.get_output_file(output_filename) +class TestDownloadWithRetry: + """Tests for _download_with_retry.""" - # Verify the path is correct - expected_path = os.path.join(TEST_DATA_DIR, output_filename) - assert output_path == expected_path, \ - f"Output path mismatch: expected {expected_path}, got {output_path}" - - # Verify the parent directory was created - assert os.path.exists(os.path.dirname(output_path)), \ - "Parent directory for output file was not created" + def test_retries_exhausted_raises_runtime_error(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=2) + with patch("babel_explorer.core.downloader.requests.get", side_effect=requests.ConnectionError("fail")): + with patch("babel_explorer.core.downloader.time.sleep"): # skip waiting + with pytest.raises(RuntimeError, match="Failed to download"): + dl._download_with_retry("https://example.com/file", str(tmp_path / "f"), 1024) - print(f"\n✓ Output file path created correctly: {output_path}") + def test_succeeds_on_second_attempt(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=3) + out_path = str(tmp_path / "retry_success.bin") + mock_response = Mock() + mock_response.status_code = 200 + mock_response.headers = {'Content-Length': '5'} + mock_response.iter_content = Mock(return_value=[b"hello"]) -def test_invalid_local_path(): - """ - Test that BabelDownloader raises an error for invalid local paths. - - This test verifies error handling when attempting to use a file path - as the local directory (should be a directory, not a file). - """ - # Create a temporary file - invalid_path = "/tmp/test_babel_invalid_file.txt" - with open(invalid_path, 'w') as f: - f.write("test") - - try: - # Attempt to create downloader with a file path instead of directory - with pytest.raises(ValueError, match="Invalid local_path"): - BabelDownloader(url_base=BABEL_URL, local_path=invalid_path) - - print("\n✓ Correctly raised ValueError for invalid local path") - finally: - # Clean up - if os.path.exists(invalid_path): - os.remove(invalid_path) - - -def test_md5_validation_matching_checksum(test_data_dir): - """ - Test that MD5 validation skips download when checksums match. - - This test: - 1. Creates a local file with known content - 2. Mocks the .md5 file to return the correct checksum - 3. 
Verifies the download is skipped (no actual HTTP download occurs) - """ - downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - - # Create a test file with known content - test_file = "test_file.txt" - local_path = os.path.join(test_data_dir, test_file) - os.makedirs(os.path.dirname(local_path), exist_ok=True) - - test_content = b"This is test content for MD5 validation" - with open(local_path, 'wb') as f: - f.write(test_content) - - # Calculate the expected MD5 - expected_md5 = hashlib.md5(test_content).hexdigest() - - # Mock the _fetch_remote_md5 to return the matching checksum - with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): - # Mock _download_with_retry to ensure it's NOT called - with patch.object(downloader, '_download_with_retry') as mock_download: - # Clear the cache before testing - downloader.get_downloaded_file.cache_clear() - - result_path = downloader.get_downloaded_file(test_file) - - # Verify the download was skipped - mock_download.assert_not_called() - assert result_path == local_path - assert os.path.exists(result_path) + side_effects = [requests.ConnectionError("first fail"), mock_response] - print(f"\n✓ MD5 validation correctly skipped download for matching checksum: {expected_md5}") + with patch("babel_explorer.core.downloader.requests.get", side_effect=side_effects): + with patch("babel_explorer.core.downloader.time.sleep"): + dl._download_with_retry("https://example.com/file", out_path, 1024) + assert os.path.exists(out_path) + def test_resume_sends_range_header(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + out_path = tmp_path / "partial.bin" + out_path.write_bytes(b"partial") # 7 bytes -def test_md5_validation_mismatched_checksum(test_data_dir): - """ - Test that MD5 validation deletes and re-downloads file when checksums don't match. - - This test: - 1. Creates a local file with wrong content - 2. Mocks the .md5 file to return a different checksum - 3. 
Verifies the file is deleted and re-downloaded - """ - downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - - # Create a test file with incorrect content - test_file = "test_file_mismatch.txt" - local_path = os.path.join(test_data_dir, test_file) - os.makedirs(os.path.dirname(local_path), exist_ok=True) - - wrong_content = b"This is WRONG content" - with open(local_path, 'wb') as f: - f.write(wrong_content) - - # Use a different MD5 (this is MD5 of "correct content") - correct_content = b"This is CORRECT content" - expected_md5 = hashlib.md5(correct_content).hexdigest() - - # Track whether file was deleted - original_exists = os.path.exists(local_path) - - # Mock the _fetch_remote_md5 to return the mismatched checksum - with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): - # Mock _download_with_retry to create the "correct" file - def mock_download(url, path, chunk_size): - with open(path, 'wb') as f: - f.write(correct_content) - - with patch.object(downloader, '_download_with_retry', side_effect=mock_download): - # Clear the cache before testing - downloader.get_downloaded_file.cache_clear() + mock_response = Mock() + mock_response.status_code = 206 + mock_response.headers = {'Content-Length': '3'} + mock_response.iter_content = Mock(return_value=[b"end"]) - result_path = downloader.get_downloaded_file(test_file) + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response) as mock_get: + dl._download_with_retry("https://example.com/file", str(out_path), 1024) + _, kwargs = mock_get.call_args + assert kwargs['headers'] == {'Range': 'bytes=7-'} - # Verify the file exists and has correct content - assert os.path.exists(result_path) - with open(result_path, 'rb') as f: - assert f.read() == correct_content + def test_http_416_file_already_complete(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + out_path = tmp_path / "complete.bin" + out_path.write_bytes(b"full file") - print(f"\n✓ MD5 validation correctly deleted and re-downloaded file with mismatched checksum") + mock_response = Mock() + mock_response.status_code = 416 + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): + dl._download_with_retry("https://example.com/file", str(out_path), 1024) + # Should return without error + assert out_path.read_bytes() == b"full file" -def test_md5_validation_no_md5_file(test_data_dir): - """ - Test that download proceeds normally when no .md5 file exists. + def test_server_no_resume_restarts_download(self, tmp_path): + """When server responds 200 (instead of 206), partial file is removed and download restarts.""" + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + out_path = tmp_path / "no_resume.bin" + out_path.write_bytes(b"partial") - This test: - 1. Mocks the .md5 file fetch to return None (404) - 2. 
Verifies the download proceeds normally
-    """
-    downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir)

+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.headers = {'Content-Length': '12'}
+        mock_response.iter_content = Mock(return_value=[b"full content"])

-    test_file = "test_file_no_md5.txt"
-    local_path = os.path.join(test_data_dir, test_file)

+        with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response):
+            dl._download_with_retry("https://example.com/file", str(out_path), 1024)
+        assert out_path.read_bytes() == b"full content"

-    test_content = b"Test content without MD5 file"

-    # Mock the _fetch_remote_md5 to return None (no .md5 file)
-    with patch.object(downloader, '_fetch_remote_md5', return_value=None):
-        # Mock _download_with_retry to create the file
-        def mock_download(url, path, chunk_size):
-            with open(path, 'wb') as f:
-                f.write(test_content)

+class TestStreamDownload:
+    """Tests for _stream_download."""

-        with patch.object(downloader, '_download_with_retry', side_effect=mock_download) as mock_download_method:
-            # Clear the cache before testing
-            downloader.get_downloaded_file.cache_clear()

+    def test_writes_chunks(self, tmp_path):
+        dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path))
+        out_path = str(tmp_path / "stream.bin")

-            result_path = downloader.get_downloaded_file(test_file)

+        mock_response = Mock()
+        mock_response.headers = {'Content-Length': '10'}
+        mock_response.iter_content = Mock(return_value=[b"hello", b"world"])

-            # Verify download was called (normal download path)
-            mock_download_method.assert_called_once()
-            assert os.path.exists(result_path)
-            with open(result_path, 'rb') as f:
-                assert f.read() == test_content

+        dl._stream_download(mock_response, out_path, resume_byte_pos=0, chunk_size=1024)
+        with open(out_path, 'rb') as f:
+            assert f.read() == b"helloworld"

-    print(f"\n✓ Download proceeded normally when no .md5 file exists")

+    def test_append_mode_on_resume(self, tmp_path):
+        dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path))
+        out_path = tmp_path / "append.bin"
+        out_path.write_bytes(b"start")

+        mock_response = Mock()
+        mock_response.headers = {'Content-Length': '3'}
+        mock_response.iter_content = Mock(return_value=[b"end"])

-def test_md5_validation_malformed_md5_file(test_data_dir):
-    """
-    Test that download proceeds normally when .md5 file is malformed.

+        dl._stream_download(mock_response, str(out_path), resume_byte_pos=5, chunk_size=1024)
+        assert out_path.read_bytes() == b"startend"
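
Taken together, TestDownloadWithRetry and TestStreamDownload fix the resume semantics: a partial local file produces a Range: bytes=N- header, HTTP 416 means the file is already complete, a plain 200 on a resume attempt means the server ignored the range and the partial copy must be discarded, and connection errors are retried after a backoff sleep. Here is a minimal sketch of a loop with those semantics, written against the requests API; the function shape, timeout, and backoff policy are assumptions, only the observable behaviour is taken from the tests.

    # Sketch of the behaviour the tests above describe; not the real
    # _download_with_retry implementation.
    import os
    import time

    import requests


    def download_with_retry_sketch(url: str, path: str, chunk_size: int, retries: int = 3) -> None:
        for attempt in range(retries):
            try:
                resume_pos = os.path.getsize(path) if os.path.exists(path) else 0
                headers = {'Range': f'bytes={resume_pos}-'} if resume_pos else {}
                response = requests.get(url, headers=headers, stream=True, timeout=60)

                if response.status_code == 416:
                    return              # range unsatisfiable: file already complete
                if response.status_code == 200 and resume_pos:
                    os.remove(path)     # server ignored Range: restart from scratch
                    resume_pos = 0
                mode = 'ab' if resume_pos else 'wb'  # 206 Partial Content appends
                with open(path, mode) as f:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                return
            except requests.RequestException:
                time.sleep(2 ** attempt)  # back off, then retry
        raise RuntimeError(f"Failed to download {url} after {retries} attempts")

The append mode on resume is exactly what test_append_mode_on_resume checks at the _stream_download level.

-    This test:
-    1. Mocks the .md5 file fetch to return None (malformed content)
-    2. 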
Verifies the download proceeds normally with a warning - """ - downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - test_file = "test_file_malformed_md5.txt" - local_path = os.path.join(test_data_dir, test_file) +class TestGetDownloadedFileCaching: + """Tests for get_downloaded_file LRU caching.""" - test_content = b"Test content with malformed MD5 file" + def test_cache_returns_same_result(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + content = b"cached content" - # Mock the _fetch_remote_md5 to return None (malformed .md5 file) - with patch.object(downloader, '_fetch_remote_md5', return_value=None): - # Mock _download_with_retry to create the file - def mock_download(url, path, chunk_size): + def fake_download(url, path, chunk_size): with open(path, 'wb') as f: - f.write(test_content) + f.write(content) - with patch.object(downloader, '_download_with_retry', side_effect=mock_download) as mock_download_method: - # Clear the cache before testing - downloader.get_downloaded_file.cache_clear() + with patch.object(dl, '_fetch_remote_md5', return_value=None): + with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + dl.get_downloaded_file.cache_clear() + r1 = dl.get_downloaded_file("cached.txt") + r2 = dl.get_downloaded_file("cached.txt") + assert r1 == r2 + mock_dl.assert_called_once() # only one actual download - result_path = downloader.get_downloaded_file(test_file) - # Verify download was called (normal download path) - mock_download_method.assert_called_once() - assert os.path.exists(result_path) +class TestGetDownloadedDir: + """Tests for get_downloaded_dir.""" - print(f"\n✓ Download proceeded normally when .md5 file is malformed") + def test_raises_not_implemented(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + dl.get_downloaded_dir.cache_clear() + with pytest.raises(NotImplementedError): + dl.get_downloaded_dir("some/dir") -def test_md5_post_download_validation(test_data_dir): - """ - Test that MD5 validation occurs after download and fails if checksum is wrong. +# ========================================================================== +# Integration Tests — require network access +# ========================================================================== - This test: - 1. Downloads a new file - 2. Mocks the .md5 file to return a checksum - 3. Mocks the download to create a file with WRONG content - 4. 
Verifies a RuntimeError is raised for checksum mismatch - """ - downloader = BabelDownloader(url_base=BABEL_URL, local_path=test_data_dir) - test_file = "test_file_post_validation.txt" - local_path = os.path.join(test_data_dir, test_file) +@pytest.mark.integration +def test_download_concord_parquet(downloaded_concord): + """Verify Concord.parquet downloads and is > 100 MB.""" + assert os.path.isfile(downloaded_concord) + size = os.path.getsize(downloaded_concord) + assert size > 100 * 1024 * 1024, f"Concord.parquet too small: {size} bytes" - # Expected content and MD5 - correct_content = b"Expected content" - expected_md5 = hashlib.md5(correct_content).hexdigest() - # Wrong content that will be downloaded - wrong_content = b"Wrong content downloaded" +@pytest.mark.integration +def test_download_metadata_parquet(downloaded_metadata): + """Verify Metadata.parquet downloads and is non-empty.""" + assert os.path.isfile(downloaded_metadata) + assert os.path.getsize(downloaded_metadata) > 0 - # Mock the _fetch_remote_md5 to return the expected checksum - with patch.object(downloader, '_fetch_remote_md5', return_value=expected_md5): - # Mock _download_with_retry to create a file with WRONG content - def mock_download(url, path, chunk_size): - with open(path, 'wb') as f: - f.write(wrong_content) - with patch.object(downloader, '_download_with_retry', side_effect=mock_download): - # Clear the cache before testing - downloader.get_downloaded_file.cache_clear() +@pytest.mark.integration +def test_download_caching_real_files(shared_downloader, downloaded_concord): + """Second call returns same path and file is not re-downloaded.""" + path2 = shared_downloader.get_downloaded_file(CONCORD_FILE) + assert path2 == downloaded_concord + assert os.path.getmtime(downloaded_concord) == os.path.getmtime(path2) - # Should raise RuntimeError due to post-download MD5 mismatch - with pytest.raises(RuntimeError, match="incorrect MD5 checksum"): - downloader.get_downloaded_file(test_file) - print(f"\n✓ Post-download MD5 validation correctly detected checksum mismatch") +@pytest.mark.integration +@pytest.mark.slow +def test_download_identifiers_parquet(downloaded_identifiers): + """Verify Identifiers.parquet downloads and is > 2 GB.""" + assert os.path.isfile(downloaded_identifiers) + size = os.path.getsize(downloaded_identifiers) + assert size > 2 * 1024 * 1024 * 1024, f"Identifiers.parquet too small: {size} bytes" diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py new file mode 100644 index 0000000..2322eef --- /dev/null +++ b/tests/test_nodenorm.py @@ -0,0 +1,296 @@ +""" +Tests for NodeNorm and Identifier classes. + +Unit tests use mocks; integration tests call the real NodeNorm API. 
+""" + +import pytest +from unittest.mock import Mock, patch + +import requests + +from babel_explorer.core.nodenorm import NodeNorm, Identifier + +from tests.constants import load_curies + +VALID_CURIES = load_curies() + + +# ========================================================================== +# Unit Tests — Identifier +# ========================================================================== + + +class TestIdentifier: + def test_creation_with_defaults(self): + ident = Identifier(curie="MONDO:0004979") + assert ident.curie == "MONDO:0004979" + assert ident.label == "" + assert ident.biolink_type == "" + assert ident.taxa == [] + assert ident.description == [] + + def test_full_creation(self): + ident = Identifier( + curie="MONDO:0004979", + label="asthma", + biolink_type="biolink:Disease", + taxa=["NCBITaxon:9606"], + description=["A chronic respiratory disease"], + ) + assert ident.label == "asthma" + assert ident.biolink_type == "biolink:Disease" + assert ident.taxa == ["NCBITaxon:9606"] + + def test_from_dict_minimal(self): + d = {"identifier": "X:1"} + ident = Identifier.from_dict(d) + assert ident.curie == "X:1" + assert ident.label == "" + + def test_from_dict_full(self): + d = { + "identifier": "X:1", + "label": "Alpha", + "type": ["biolink:NamedThing"], + "taxa": ["NCBITaxon:9606"], + "description": ["Some thing"], + } + ident = Identifier.from_dict(d) + assert ident.curie == "X:1" + assert ident.label == "Alpha" + assert ident.biolink_type == ["biolink:NamedThing"] + assert ident.taxa == ["NCBITaxon:9606"] + + def test_from_dict_partial(self): + d = {"identifier": "X:1", "label": "Beta"} + ident = Identifier.from_dict(d) + assert ident.curie == "X:1" + assert ident.label == "Beta" + assert ident.biolink_type == "" + + def test_lt_ordering(self): + a = Identifier(curie="A:1") + b = Identifier(curie="B:2") + assert a < b + + def test_sorting(self): + items = [Identifier(curie="C:3"), Identifier(curie="A:1"), Identifier(curie="B:2")] + result = sorted(items) + assert [x.curie for x in result] == ["A:1", "B:2", "C:3"] + + +# ========================================================================== +# Unit Tests — NodeNorm (mocked) +# ========================================================================== + + +class TestNodeNormInit: + def test_default_url(self): + nn = NodeNorm() + assert nn.nodenorm_url == "" + + def test_custom_url(self): + nn = NodeNorm(nodenorm_url="https://custom.api/") + assert nn.nodenorm_url == "https://custom.api/" + + +class TestNormalizeCurieMocked: + def _make_nn(self): + nn = NodeNorm(nodenorm_url="https://example.com/") + nn.normalize_curie.cache_clear() + return nn + + def test_correct_api_endpoint_and_params(self): + nn = self._make_nn() + mock_resp = Mock() + mock_resp.status_code = 200 + mock_resp.json.return_value = {"X:1": {"id": {"identifier": "X:1"}}} + mock_resp.raise_for_status = Mock() + + with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp) as mock_get: + nn.normalize_curie("X:1") + mock_get.assert_called_once() + args, kwargs = mock_get.call_args + assert args[0] == "https://example.com/get_normalized_nodes" + assert kwargs["params"]["curie"] == "X:1" + + def test_returns_result_for_curie(self): + nn = self._make_nn() + expected = {"id": {"identifier": "X:1"}, "equivalent_identifiers": []} + mock_resp = Mock() + mock_resp.json.return_value = {"X:1": expected} + mock_resp.raise_for_status = Mock() + + with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp): + result = 
nn.normalize_curie("X:1")
+            assert result == expected
+
+    def test_lru_caching(self):
+        nn = self._make_nn()
+        mock_resp = Mock()
+        mock_resp.json.return_value = {"X:1": {"id": "X:1"}}
+        mock_resp.raise_for_status = Mock()
+
+        with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp) as mock_get:
+            nn.normalize_curie("X:1")
+            nn.normalize_curie("X:1")
+            mock_get.assert_called_once()
+
+    def test_http_error_raises(self):
+        nn = self._make_nn()
+        mock_resp = Mock()
+        mock_resp.raise_for_status.side_effect = requests.HTTPError("500 Server Error")
+
+        with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp):
+            with pytest.raises(requests.HTTPError):
+                nn.normalize_curie("BAD:1")
+
+
+class TestGetIdentifierMocked:
+    def _make_nn(self):
+        nn = NodeNorm(nodenorm_url="https://example.com/")
+        nn.normalize_curie.cache_clear()
+        nn.get_identifier.cache_clear()
+        return nn
+
+    def test_exact_match_found(self):
+        nn = self._make_nn()
+        api_result = {
+            "equivalent_identifiers": [
+                {"identifier": "X:1", "label": "Alpha", "type": ["biolink:Disease"]},
+                {"identifier": "X:2", "label": "Beta"},
+            ],
+        }
+        with patch.object(nn, 'normalize_curie', return_value=api_result):
+            ident = nn.get_identifier("X:1")
+            assert ident.curie == "X:1"
+            assert ident.label == "Alpha"
+
+    def test_no_match_returns_bare_identifier(self):
+        nn = self._make_nn()
+        api_result = {
+            "equivalent_identifiers": [
+                {"identifier": "X:2", "label": "Beta"},
+            ],
+        }
+        with patch.object(nn, 'normalize_curie', return_value=api_result):
+            ident = nn.get_identifier("X:1")
+            assert ident.curie == "X:1"
+            assert ident.label == ""
+
+    def test_falsy_result_returns_bare_identifier(self):
+        nn = self._make_nn()
+        with patch.object(nn, 'normalize_curie', return_value=None):
+            ident = nn.get_identifier("X:1")
+            assert ident.curie == "X:1"
+            assert ident.label == ""
+
+    def test_caching(self):
+        nn = self._make_nn()
+        api_result = {
+            "equivalent_identifiers": [
+                {"identifier": "X:1", "label": "Alpha"},
+            ],
+        }
+        with patch.object(nn, 'normalize_curie', return_value=api_result) as mock_norm:
+            nn.get_identifier("X:1")
+            nn.get_identifier("X:1")
+            mock_norm.assert_called_once()
+
+
+class TestGetCliqueIdentifiersMocked:
+    def _make_nn(self):
+        nn = NodeNorm(nodenorm_url="https://example.com/")
+        nn.normalize_curie.cache_clear()
+        nn.get_clique_identifiers.cache_clear()
+        return nn
+
+    def test_success_returns_list(self):
+        nn = self._make_nn()
+        api_result = {
+            "equivalent_identifiers": [
+                {"identifier": "X:1", "label": "Alpha"},
+                {"identifier": "X:2", "label": "Beta"},
+            ],
+        }
+        with patch.object(nn, 'normalize_curie', return_value=api_result):
+            result = nn.get_clique_identifiers("X:1")
+            assert len(result) == 2
+            assert all(isinstance(x, Identifier) for x in result)
+
+    def test_missing_key_returns_none(self):
+        nn = self._make_nn()
+        api_result = {"id": {"identifier": "X:1"}}  # no equivalent_identifiers
+        with patch.object(nn, 'normalize_curie', return_value=api_result):
+            result = nn.get_clique_identifiers("X:1")
+            assert result is None
+
+    def test_caching(self):
+        nn = self._make_nn()
+        api_result = {
+            "equivalent_identifiers": [{"identifier": "X:1"}],
+        }
+        with patch.object(nn, 'normalize_curie', return_value=api_result) as mock_norm:
+            nn.get_clique_identifiers("X:1")
+            nn.get_clique_identifiers("X:1")
+            mock_norm.assert_called_once()
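
Before the live-API cases, here is the client contract that the mocked tests above collectively specify, as a compact sketch. The get_normalized_nodes endpoint, the curie parameter, and the equivalent_identifiers key are asserted by the tests; the class name and method bodies are assumptions rather than the real babel_explorer.core.nodenorm code.

    # Sketch of a client that would satisfy the mocked tests above.
    from functools import lru_cache

    import requests

    from babel_explorer.core.nodenorm import Identifier


    class NodeNormSketch:
        def __init__(self, nodenorm_url: str):
            self.nodenorm_url = nodenorm_url

        @lru_cache(maxsize=None)
        def normalize_curie(self, curie: str):
            # The API returns a JSON object keyed by the query CURIE; the
            # value is None for CURIEs that NodeNorm does not recognise.
            response = requests.get(
                self.nodenorm_url + "get_normalized_nodes",
                params={"curie": curie},
            )
            response.raise_for_status()
            return response.json().get(curie)

        @lru_cache(maxsize=None)
        def get_clique_identifiers(self, curie: str):
            result = self.normalize_curie(curie)
            if not result or "equivalent_identifiers" not in result:
                return None
            return [Identifier.from_dict(d) for d in result["equivalent_identifiers"]]

        @lru_cache(maxsize=None)
        def get_identifier(self, curie: str):
            # Prefer the exact entry for this CURIE within its clique;
            # otherwise fall back to a bare Identifier carrying only the CURIE.
            for ident in self.get_clique_identifiers(curie) or []:
                if ident.curie == curie:
                    return ident
            return Identifier(curie=curie)

The lru_cache layer is what lets the tests assert mock_get.assert_called_once() after two identical calls, and it is why every _make_nn helper clears the caches first.

+
+
+# ==========================================================================
+# Integration Tests — require real NodeNorm API
+# 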
========================================================================== + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_normalize_curie_real_api(nodenorm, curie): + """normalize_curie returns a dict with expected keys.""" + nodenorm.normalize_curie.cache_clear() + result = nodenorm.normalize_curie(curie) + assert isinstance(result, dict) + assert "id" in result + assert "equivalent_identifiers" in result + assert "type" in result + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_identifier_real_api(nodenorm, curie): + """get_identifier returns an Identifier with non-empty label and biolink_type.""" + nodenorm.normalize_curie.cache_clear() + nodenorm.get_identifier.cache_clear() + ident = nodenorm.get_identifier(curie) + assert isinstance(ident, Identifier) + assert ident.curie == curie + assert ident.label != "" + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_clique_identifiers_real_api(nodenorm, curie): + """get_clique_identifiers returns a non-empty list of Identifiers.""" + nodenorm.normalize_curie.cache_clear() + nodenorm.get_clique_identifiers.cache_clear() + result = nodenorm.get_clique_identifiers(curie) + assert result is not None + assert len(result) > 0 + assert all(isinstance(x, Identifier) for x in result) + + +@pytest.mark.integration +@pytest.mark.parametrize("curie", VALID_CURIES) +def test_get_clique_identifiers_has_known_ids(nodenorm, curie): + """At least one equivalent identifier is returned.""" + nodenorm.normalize_curie.cache_clear() + nodenorm.get_clique_identifiers.cache_clear() + result = nodenorm.get_clique_identifiers(curie) + assert len(result) >= 1 + + +@pytest.mark.integration +def test_normalize_curie_nonexistent(nodenorm): + """A made-up CURIE returns None.""" + nodenorm.normalize_curie.cache_clear() + result = nodenorm.normalize_curie("FAKENS:9999999999") + assert result is None From ff0dacc744102f01e541f7b45d83f6ff7834fb69 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 17:35:29 -0500 Subject: [PATCH 19/66] Added uv.lock (not sure why it wasn't added previously). --- uv.lock | 295 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 uv.lock diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..56af50a --- /dev/null +++ b/uv.lock @@ -0,0 +1,295 @@ +version = 1 +revision = 3 +requires-python = ">=3.11" + +[[package]] +name = "babel-explorer" +version = "0.1.0" +source = { editable = "." 
} +dependencies = [ + { name = "click" }, + { name = "duckdb" }, + { name = "requests" }, + { name = "tqdm" }, +] + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, + { name = "ruff" }, +] + +[package.metadata] +requires-dist = [ + { name = "click", specifier = ">=8.3.1" }, + { name = "duckdb", specifier = ">=1.4.2" }, + { name = "requests", specifier = ">=2.32.5" }, + { name = "tqdm", specifier = ">=4.67.0" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "pytest", specifier = ">=8.3.5" }, + { name = "ruff", specifier = ">=0.11.0" }, +] + +[[package]] +name = "certifi" +version = "2026.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/2d/a891ca51311197f6ad14a7ef42e2399f36cf2f9bd44752b3dc4eab60fdc5/certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120", size = 154268, upload-time = "2026-01-04T02:42:41.825Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c", size = 152900, upload-time = "2026-01-04T02:42:40.15Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/13/69/33ddede1939fdd074bce5434295f38fae7136463422fe4fd3e0e89b98062/charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a", size = 129418, upload-time = "2025-10-14T04:42:32.879Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/27/c6491ff4954e58a10f69ad90aca8a1b6fe9c5d3c6f380907af3c37435b59/charset_normalizer-3.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6e1fcf0720908f200cd21aa4e6750a48ff6ce4afe7ff5a79a90d5ed8a08296f8", size = 206988, upload-time = "2025-10-14T04:40:33.79Z" }, + { url = "https://files.pythonhosted.org/packages/94/59/2e87300fe67ab820b5428580a53cad894272dbb97f38a7a814a2a1ac1011/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f819d5fe9234f9f82d75bdfa9aef3a3d72c4d24a6e57aeaebba32a704553aa0", size = 147324, upload-time = "2025-10-14T04:40:34.961Z" }, + { url = "https://files.pythonhosted.org/packages/07/fb/0cf61dc84b2b088391830f6274cb57c82e4da8bbc2efeac8c025edb88772/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:a59cb51917aa591b1c4e6a43c132f0cdc3c76dbad6155df4e28ee626cc77a0a3", size = 142742, upload-time = "2025-10-14T04:40:36.105Z" }, + { url = "https://files.pythonhosted.org/packages/62/8b/171935adf2312cd745d290ed93cf16cf0dfe320863ab7cbeeae1dcd6535f/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8ef3c867360f88ac904fd3f5e1f902f13307af9052646963ee08ff4f131adafc", size = 160863, upload-time = "2025-10-14T04:40:37.188Z" }, + { url = "https://files.pythonhosted.org/packages/09/73/ad875b192bda14f2173bfc1bc9a55e009808484a4b256748d931b6948442/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d9e45d7faa48ee908174d8fe84854479ef838fc6a705c9315372eacbc2f02897", size = 157837, upload-time = "2025-10-14T04:40:38.435Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/fc/de9cce525b2c5b94b47c70a4b4fb19f871b24995c728e957ee68ab1671ea/charset_normalizer-3.4.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:840c25fb618a231545cbab0564a799f101b63b9901f2569faecd6b222ac72381", size = 151550, upload-time = "2025-10-14T04:40:40.053Z" }, + { url = "https://files.pythonhosted.org/packages/55/c2/43edd615fdfba8c6f2dfbd459b25a6b3b551f24ea21981e23fb768503ce1/charset_normalizer-3.4.4-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ca5862d5b3928c4940729dacc329aa9102900382fea192fc5e52eb69d6093815", size = 149162, upload-time = "2025-10-14T04:40:41.163Z" }, + { url = "https://files.pythonhosted.org/packages/03/86/bde4ad8b4d0e9429a4e82c1e8f5c659993a9a863ad62c7df05cf7b678d75/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9c7f57c3d666a53421049053eaacdd14bbd0a528e2186fcb2e672effd053bb0", size = 150019, upload-time = "2025-10-14T04:40:42.276Z" }, + { url = "https://files.pythonhosted.org/packages/1f/86/a151eb2af293a7e7bac3a739b81072585ce36ccfb4493039f49f1d3cae8c/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:277e970e750505ed74c832b4bf75dac7476262ee2a013f5574dd49075879e161", size = 143310, upload-time = "2025-10-14T04:40:43.439Z" }, + { url = "https://files.pythonhosted.org/packages/b5/fe/43dae6144a7e07b87478fdfc4dbe9efd5defb0e7ec29f5f58a55aeef7bf7/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:31fd66405eaf47bb62e8cd575dc621c56c668f27d46a61d975a249930dd5e2a4", size = 162022, upload-time = "2025-10-14T04:40:44.547Z" }, + { url = "https://files.pythonhosted.org/packages/80/e6/7aab83774f5d2bca81f42ac58d04caf44f0cc2b65fc6db2b3b2e8a05f3b3/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:0d3d8f15c07f86e9ff82319b3d9ef6f4bf907608f53fe9d92b28ea9ae3d1fd89", size = 149383, upload-time = "2025-10-14T04:40:46.018Z" }, + { url = "https://files.pythonhosted.org/packages/4f/e8/b289173b4edae05c0dde07f69f8db476a0b511eac556dfe0d6bda3c43384/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:9f7fcd74d410a36883701fafa2482a6af2ff5ba96b9a620e9e0721e28ead5569", size = 159098, upload-time = "2025-10-14T04:40:47.081Z" }, + { url = "https://files.pythonhosted.org/packages/d8/df/fe699727754cae3f8478493c7f45f777b17c3ef0600e28abfec8619eb49c/charset_normalizer-3.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ebf3e58c7ec8a8bed6d66a75d7fb37b55e5015b03ceae72a8e7c74495551e224", size = 152991, upload-time = "2025-10-14T04:40:48.246Z" }, + { url = "https://files.pythonhosted.org/packages/1a/86/584869fe4ddb6ffa3bd9f491b87a01568797fb9bd8933f557dba9771beaf/charset_normalizer-3.4.4-cp311-cp311-win32.whl", hash = "sha256:eecbc200c7fd5ddb9a7f16c7decb07b566c29fa2161a16cf67b8d068bd21690a", size = 99456, upload-time = "2025-10-14T04:40:49.376Z" }, + { url = "https://files.pythonhosted.org/packages/65/f6/62fdd5feb60530f50f7e38b4f6a1d5203f4d16ff4f9f0952962c044e919a/charset_normalizer-3.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:5ae497466c7901d54b639cf42d5b8c1b6a4fead55215500d2f486d34db48d016", size = 106978, upload-time = "2025-10-14T04:40:50.844Z" }, + { url = "https://files.pythonhosted.org/packages/7a/9d/0710916e6c82948b3be62d9d398cb4fcf4e97b56d6a6aeccd66c4b2f2bd5/charset_normalizer-3.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:65e2befcd84bc6f37095f5961e68a6f077bf44946771354a28ad434c2cce0ae1", size = 99969, upload-time = 
"2025-10-14T04:40:52.272Z" }, + { url = "https://files.pythonhosted.org/packages/f3/85/1637cd4af66fa687396e757dec650f28025f2a2f5a5531a3208dc0ec43f2/charset_normalizer-3.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0a98e6759f854bd25a58a73fa88833fba3b7c491169f86ce1180c948ab3fd394", size = 208425, upload-time = "2025-10-14T04:40:53.353Z" }, + { url = "https://files.pythonhosted.org/packages/9d/6a/04130023fef2a0d9c62d0bae2649b69f7b7d8d24ea5536feef50551029df/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b5b290ccc2a263e8d185130284f8501e3e36c5e02750fc6b6bdeb2e9e96f1e25", size = 148162, upload-time = "2025-10-14T04:40:54.558Z" }, + { url = "https://files.pythonhosted.org/packages/78/29/62328d79aa60da22c9e0b9a66539feae06ca0f5a4171ac4f7dc285b83688/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74bb723680f9f7a6234dcf67aea57e708ec1fbdf5699fb91dfd6f511b0a320ef", size = 144558, upload-time = "2025-10-14T04:40:55.677Z" }, + { url = "https://files.pythonhosted.org/packages/86/bb/b32194a4bf15b88403537c2e120b817c61cd4ecffa9b6876e941c3ee38fe/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f1e34719c6ed0b92f418c7c780480b26b5d9c50349e9a9af7d76bf757530350d", size = 161497, upload-time = "2025-10-14T04:40:57.217Z" }, + { url = "https://files.pythonhosted.org/packages/19/89/a54c82b253d5b9b111dc74aca196ba5ccfcca8242d0fb64146d4d3183ff1/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2437418e20515acec67d86e12bf70056a33abdacb5cb1655042f6538d6b085a8", size = 159240, upload-time = "2025-10-14T04:40:58.358Z" }, + { url = "https://files.pythonhosted.org/packages/c0/10/d20b513afe03acc89ec33948320a5544d31f21b05368436d580dec4e234d/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11d694519d7f29d6cd09f6ac70028dba10f92f6cdd059096db198c283794ac86", size = 153471, upload-time = "2025-10-14T04:40:59.468Z" }, + { url = "https://files.pythonhosted.org/packages/61/fa/fbf177b55bdd727010f9c0a3c49eefa1d10f960e5f09d1d887bf93c2e698/charset_normalizer-3.4.4-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ac1c4a689edcc530fc9d9aa11f5774b9e2f33f9a0c6a57864e90908f5208d30a", size = 150864, upload-time = "2025-10-14T04:41:00.623Z" }, + { url = "https://files.pythonhosted.org/packages/05/12/9fbc6a4d39c0198adeebbde20b619790e9236557ca59fc40e0e3cebe6f40/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:21d142cc6c0ec30d2efee5068ca36c128a30b0f2c53c1c07bd78cb6bc1d3be5f", size = 150647, upload-time = "2025-10-14T04:41:01.754Z" }, + { url = "https://files.pythonhosted.org/packages/ad/1f/6a9a593d52e3e8c5d2b167daf8c6b968808efb57ef4c210acb907c365bc4/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:5dbe56a36425d26d6cfb40ce79c314a2e4dd6211d51d6d2191c00bed34f354cc", size = 145110, upload-time = "2025-10-14T04:41:03.231Z" }, + { url = "https://files.pythonhosted.org/packages/30/42/9a52c609e72471b0fc54386dc63c3781a387bb4fe61c20231a4ebcd58bdd/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5bfbb1b9acf3334612667b61bd3002196fe2a1eb4dd74d247e0f2a4d50ec9bbf", size = 162839, upload-time = "2025-10-14T04:41:04.715Z" }, + { url = 
"https://files.pythonhosted.org/packages/c4/5b/c0682bbf9f11597073052628ddd38344a3d673fda35a36773f7d19344b23/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:d055ec1e26e441f6187acf818b73564e6e6282709e9bcb5b63f5b23068356a15", size = 150667, upload-time = "2025-10-14T04:41:05.827Z" }, + { url = "https://files.pythonhosted.org/packages/e4/24/a41afeab6f990cf2daf6cb8c67419b63b48cf518e4f56022230840c9bfb2/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:af2d8c67d8e573d6de5bc30cdb27e9b95e49115cd9baad5ddbd1a6207aaa82a9", size = 160535, upload-time = "2025-10-14T04:41:06.938Z" }, + { url = "https://files.pythonhosted.org/packages/2a/e5/6a4ce77ed243c4a50a1fecca6aaaab419628c818a49434be428fe24c9957/charset_normalizer-3.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:780236ac706e66881f3b7f2f32dfe90507a09e67d1d454c762cf642e6e1586e0", size = 154816, upload-time = "2025-10-14T04:41:08.101Z" }, + { url = "https://files.pythonhosted.org/packages/a8/ef/89297262b8092b312d29cdb2517cb1237e51db8ecef2e9af5edbe7b683b1/charset_normalizer-3.4.4-cp312-cp312-win32.whl", hash = "sha256:5833d2c39d8896e4e19b689ffc198f08ea58116bee26dea51e362ecc7cd3ed26", size = 99694, upload-time = "2025-10-14T04:41:09.23Z" }, + { url = "https://files.pythonhosted.org/packages/3d/2d/1e5ed9dd3b3803994c155cd9aacb60c82c331bad84daf75bcb9c91b3295e/charset_normalizer-3.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:a79cfe37875f822425b89a82333404539ae63dbdddf97f84dcbc3d339aae9525", size = 107131, upload-time = "2025-10-14T04:41:10.467Z" }, + { url = "https://files.pythonhosted.org/packages/d0/d9/0ed4c7098a861482a7b6a95603edce4c0d9db2311af23da1fb2b75ec26fc/charset_normalizer-3.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:376bec83a63b8021bb5c8ea75e21c4ccb86e7e45ca4eb81146091b56599b80c3", size = 100390, upload-time = "2025-10-14T04:41:11.915Z" }, + { url = "https://files.pythonhosted.org/packages/97/45/4b3a1239bbacd321068ea6e7ac28875b03ab8bc0aa0966452db17cd36714/charset_normalizer-3.4.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e1f185f86a6f3403aa2420e815904c67b2f9ebc443f045edd0de921108345794", size = 208091, upload-time = "2025-10-14T04:41:13.346Z" }, + { url = "https://files.pythonhosted.org/packages/7d/62/73a6d7450829655a35bb88a88fca7d736f9882a27eacdca2c6d505b57e2e/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b39f987ae8ccdf0d2642338faf2abb1862340facc796048b604ef14919e55ed", size = 147936, upload-time = "2025-10-14T04:41:14.461Z" }, + { url = "https://files.pythonhosted.org/packages/89/c5/adb8c8b3d6625bef6d88b251bbb0d95f8205831b987631ab0c8bb5d937c2/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3162d5d8ce1bb98dd51af660f2121c55d0fa541b46dff7bb9b9f86ea1d87de72", size = 144180, upload-time = "2025-10-14T04:41:15.588Z" }, + { url = "https://files.pythonhosted.org/packages/91/ed/9706e4070682d1cc219050b6048bfd293ccf67b3d4f5a4f39207453d4b99/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:81d5eb2a312700f4ecaa977a8235b634ce853200e828fbadf3a9c50bab278328", size = 161346, upload-time = "2025-10-14T04:41:16.738Z" }, + { url = "https://files.pythonhosted.org/packages/d5/0d/031f0d95e4972901a2f6f09ef055751805ff541511dc1252ba3ca1f80cf5/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:5bd2293095d766545ec1a8f612559f6b40abc0eb18bb2f5d1171872d34036ede", size = 158874, upload-time = "2025-10-14T04:41:17.923Z" }, + { url = "https://files.pythonhosted.org/packages/f5/83/6ab5883f57c9c801ce5e5677242328aa45592be8a00644310a008d04f922/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8a8b89589086a25749f471e6a900d3f662d1d3b6e2e59dcecf787b1cc3a1894", size = 153076, upload-time = "2025-10-14T04:41:19.106Z" }, + { url = "https://files.pythonhosted.org/packages/75/1e/5ff781ddf5260e387d6419959ee89ef13878229732732ee73cdae01800f2/charset_normalizer-3.4.4-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc7637e2f80d8530ee4a78e878bce464f70087ce73cf7c1caf142416923b98f1", size = 150601, upload-time = "2025-10-14T04:41:20.245Z" }, + { url = "https://files.pythonhosted.org/packages/d7/57/71be810965493d3510a6ca79b90c19e48696fb1ff964da319334b12677f0/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f8bf04158c6b607d747e93949aa60618b61312fe647a6369f88ce2ff16043490", size = 150376, upload-time = "2025-10-14T04:41:21.398Z" }, + { url = "https://files.pythonhosted.org/packages/e5/d5/c3d057a78c181d007014feb7e9f2e65905a6c4ef182c0ddf0de2924edd65/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:554af85e960429cf30784dd47447d5125aaa3b99a6f0683589dbd27e2f45da44", size = 144825, upload-time = "2025-10-14T04:41:22.583Z" }, + { url = "https://files.pythonhosted.org/packages/e6/8c/d0406294828d4976f275ffbe66f00266c4b3136b7506941d87c00cab5272/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:74018750915ee7ad843a774364e13a3db91682f26142baddf775342c3f5b1133", size = 162583, upload-time = "2025-10-14T04:41:23.754Z" }, + { url = "https://files.pythonhosted.org/packages/d7/24/e2aa1f18c8f15c4c0e932d9287b8609dd30ad56dbe41d926bd846e22fb8d/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:c0463276121fdee9c49b98908b3a89c39be45d86d1dbaa22957e38f6321d4ce3", size = 150366, upload-time = "2025-10-14T04:41:25.27Z" }, + { url = "https://files.pythonhosted.org/packages/e4/5b/1e6160c7739aad1e2df054300cc618b06bf784a7a164b0f238360721ab86/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:362d61fd13843997c1c446760ef36f240cf81d3ebf74ac62652aebaf7838561e", size = 160300, upload-time = "2025-10-14T04:41:26.725Z" }, + { url = "https://files.pythonhosted.org/packages/7a/10/f882167cd207fbdd743e55534d5d9620e095089d176d55cb22d5322f2afd/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9a26f18905b8dd5d685d6d07b0cdf98a79f3c7a918906af7cc143ea2e164c8bc", size = 154465, upload-time = "2025-10-14T04:41:28.322Z" }, + { url = "https://files.pythonhosted.org/packages/89/66/c7a9e1b7429be72123441bfdbaf2bc13faab3f90b933f664db506dea5915/charset_normalizer-3.4.4-cp313-cp313-win32.whl", hash = "sha256:9b35f4c90079ff2e2edc5b26c0c77925e5d2d255c42c74fdb70fb49b172726ac", size = 99404, upload-time = "2025-10-14T04:41:29.95Z" }, + { url = "https://files.pythonhosted.org/packages/c4/26/b9924fa27db384bdcd97ab83b4f0a8058d96ad9626ead570674d5e737d90/charset_normalizer-3.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:b435cba5f4f750aa6c0a0d92c541fb79f69a387c91e61f1795227e4ed9cece14", size = 107092, upload-time = "2025-10-14T04:41:31.188Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/8f/3ed4bfa0c0c72a7ca17f0380cd9e4dd842b09f664e780c13cff1dcf2ef1b/charset_normalizer-3.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:542d2cee80be6f80247095cc36c418f7bddd14f4a6de45af91dfad36d817bba2", size = 100408, upload-time = "2025-10-14T04:41:32.624Z" }, + { url = "https://files.pythonhosted.org/packages/2a/35/7051599bd493e62411d6ede36fd5af83a38f37c4767b92884df7301db25d/charset_normalizer-3.4.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:da3326d9e65ef63a817ecbcc0df6e94463713b754fe293eaa03da99befb9a5bd", size = 207746, upload-time = "2025-10-14T04:41:33.773Z" }, + { url = "https://files.pythonhosted.org/packages/10/9a/97c8d48ef10d6cd4fcead2415523221624bf58bcf68a802721a6bc807c8f/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8af65f14dc14a79b924524b1e7fffe304517b2bff5a58bf64f30b98bbc5079eb", size = 147889, upload-time = "2025-10-14T04:41:34.897Z" }, + { url = "https://files.pythonhosted.org/packages/10/bf/979224a919a1b606c82bd2c5fa49b5c6d5727aa47b4312bb27b1734f53cd/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74664978bb272435107de04e36db5a9735e78232b85b77d45cfb38f758efd33e", size = 143641, upload-time = "2025-10-14T04:41:36.116Z" }, + { url = "https://files.pythonhosted.org/packages/ba/33/0ad65587441fc730dc7bd90e9716b30b4702dc7b617e6ba4997dc8651495/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:752944c7ffbfdd10c074dc58ec2d5a8a4cd9493b314d367c14d24c17684ddd14", size = 160779, upload-time = "2025-10-14T04:41:37.229Z" }, + { url = "https://files.pythonhosted.org/packages/67/ed/331d6b249259ee71ddea93f6f2f0a56cfebd46938bde6fcc6f7b9a3d0e09/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d1f13550535ad8cff21b8d757a3257963e951d96e20ec82ab44bc64aeb62a191", size = 159035, upload-time = "2025-10-14T04:41:38.368Z" }, + { url = "https://files.pythonhosted.org/packages/67/ff/f6b948ca32e4f2a4576aa129d8bed61f2e0543bf9f5f2b7fc3758ed005c9/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ecaae4149d99b1c9e7b88bb03e3221956f68fd6d50be2ef061b2381b61d20838", size = 152542, upload-time = "2025-10-14T04:41:39.862Z" }, + { url = "https://files.pythonhosted.org/packages/16/85/276033dcbcc369eb176594de22728541a925b2632f9716428c851b149e83/charset_normalizer-3.4.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cb6254dc36b47a990e59e1068afacdcd02958bdcce30bb50cc1700a8b9d624a6", size = 149524, upload-time = "2025-10-14T04:41:41.319Z" }, + { url = "https://files.pythonhosted.org/packages/9e/f2/6a2a1f722b6aba37050e626530a46a68f74e63683947a8acff92569f979a/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c8ae8a0f02f57a6e61203a31428fa1d677cbe50c93622b4149d5c0f319c1d19e", size = 150395, upload-time = "2025-10-14T04:41:42.539Z" }, + { url = "https://files.pythonhosted.org/packages/60/bb/2186cb2f2bbaea6338cad15ce23a67f9b0672929744381e28b0592676824/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:47cc91b2f4dd2833fddaedd2893006b0106129d4b94fdb6af1f4ce5a9965577c", size = 143680, upload-time = "2025-10-14T04:41:43.661Z" }, + { url = 
"https://files.pythonhosted.org/packages/7d/a5/bf6f13b772fbb2a90360eb620d52ed8f796f3c5caee8398c3b2eb7b1c60d/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:82004af6c302b5d3ab2cfc4cc5f29db16123b1a8417f2e25f9066f91d4411090", size = 162045, upload-time = "2025-10-14T04:41:44.821Z" }, + { url = "https://files.pythonhosted.org/packages/df/c5/d1be898bf0dc3ef9030c3825e5d3b83f2c528d207d246cbabe245966808d/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2b7d8f6c26245217bd2ad053761201e9f9680f8ce52f0fcd8d0755aeae5b2152", size = 149687, upload-time = "2025-10-14T04:41:46.442Z" }, + { url = "https://files.pythonhosted.org/packages/a5/42/90c1f7b9341eef50c8a1cb3f098ac43b0508413f33affd762855f67a410e/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:799a7a5e4fb2d5898c60b640fd4981d6a25f1c11790935a44ce38c54e985f828", size = 160014, upload-time = "2025-10-14T04:41:47.631Z" }, + { url = "https://files.pythonhosted.org/packages/76/be/4d3ee471e8145d12795ab655ece37baed0929462a86e72372fd25859047c/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:99ae2cffebb06e6c22bdc25801d7b30f503cc87dbd283479e7b606f70aff57ec", size = 154044, upload-time = "2025-10-14T04:41:48.81Z" }, + { url = "https://files.pythonhosted.org/packages/b0/6f/8f7af07237c34a1defe7defc565a9bc1807762f672c0fde711a4b22bf9c0/charset_normalizer-3.4.4-cp314-cp314-win32.whl", hash = "sha256:f9d332f8c2a2fcbffe1378594431458ddbef721c1769d78e2cbc06280d8155f9", size = 99940, upload-time = "2025-10-14T04:41:49.946Z" }, + { url = "https://files.pythonhosted.org/packages/4b/51/8ade005e5ca5b0d80fb4aff72a3775b325bdc3d27408c8113811a7cbe640/charset_normalizer-3.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:8a6562c3700cce886c5be75ade4a5db4214fda19fede41d9792d100288d8f94c", size = 107104, upload-time = "2025-10-14T04:41:51.051Z" }, + { url = "https://files.pythonhosted.org/packages/da/5f/6b8f83a55bb8278772c5ae54a577f3099025f9ade59d0136ac24a0df4bde/charset_normalizer-3.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:de00632ca48df9daf77a2c65a484531649261ec9f25489917f09e455cb09ddb2", size = 100743, upload-time = "2025-10-14T04:41:52.122Z" }, + { url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" }, +] + +[[package]] +name = "click" +version = "8.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = 
"sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "duckdb" +version = "1.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/36/9d/ab66a06e416d71b7bdcb9904cdf8d4db3379ef632bb8e9495646702d9718/duckdb-1.4.4.tar.gz", hash = "sha256:8bba52fd2acb67668a4615ee17ee51814124223de836d9e2fdcbc4c9021b3d3c", size = 18419763, upload-time = "2026-01-26T11:50:37.68Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/68/19233412033a2bc5a144a3f531f64e3548d4487251e3f16b56c31411a06f/duckdb-1.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5ba684f498d4e924c7e8f30dd157da8da34c8479746c5011b6c0e037e9c60ad2", size = 28883816, upload-time = "2026-01-26T11:49:01.009Z" }, + { url = "https://files.pythonhosted.org/packages/b3/3e/cec70e546c298ab76d80b990109e111068d82cca67942c42328eaa7d6fdb/duckdb-1.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5536eb952a8aa6ae56469362e344d4e6403cc945a80bc8c5c2ebdd85d85eb64b", size = 15339662, upload-time = "2026-01-26T11:49:04.058Z" }, + { url = "https://files.pythonhosted.org/packages/d3/f0/cf4241a040ec4f571859a738007ec773b642fbc27df4cbcf34b0c32ea559/duckdb-1.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:47dd4162da6a2be59a0aef640eb08d6360df1cf83c317dcc127836daaf3b7f7c", size = 13670044, upload-time = "2026-01-26T11:49:06.627Z" }, + { url = "https://files.pythonhosted.org/packages/11/64/de2bb4ec1e35ec9ebf6090a95b930fc56934a0ad6f34a24c5972a14a77ef/duckdb-1.4.4-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6cb357cfa3403910e79e2eb46c8e445bb1ee2fd62e9e9588c6b999df4256abc1", size = 18409951, upload-time = "2026-01-26T11:49:09.808Z" }, + { url = "https://files.pythonhosted.org/packages/79/a2/ac0f5ee16df890d141304bcd48733516b7202c0de34cd3555634d6eb4551/duckdb-1.4.4-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c25d5b0febda02b7944e94fdae95aecf952797afc8cb920f677b46a7c251955", size = 20411739, upload-time = "2026-01-26T11:49:12.652Z" }, + { url = "https://files.pythonhosted.org/packages/37/a2/9a3402edeedaecf72de05fe9ff7f0303d701b8dfc136aea4a4be1a5f7eee/duckdb-1.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:6703dd1bb650025b3771552333d305d62ddd7ff182de121483d4e042ea6e2e00", size = 12256972, upload-time = "2026-01-26T11:49:15.468Z" }, + { url = "https://files.pythonhosted.org/packages/f6/e6/052ea6dcdf35b259fd182eff3efd8d75a071de4010c9807556098df137b9/duckdb-1.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:bf138201f56e5d6fc276a25138341b3523e2f84733613fc43f02c54465619a95", size = 13006696, upload-time = "2026-01-26T11:49:18.054Z" }, + { url = "https://files.pythonhosted.org/packages/58/33/beadaa69f8458afe466126f2c5ee48c4759cc9d5d784f8703d44e0b52c3c/duckdb-1.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ddcfd9c6ff234da603a1edd5fd8ae6107f4d042f74951b65f91bc5e2643856b3", size = 28896535, upload-time = "2026-01-26T11:49:21.232Z" }, + { url = "https://files.pythonhosted.org/packages/76/66/82413f386df10467affc87f65bac095b7c88dbd9c767584164d5f4dc4cb8/duckdb-1.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:6792ca647216bd5c4ff16396e4591cfa9b4a72e5ad7cdd312cec6d67e8431a7c", size = 15349716, upload-time = "2026-01-26T11:49:23.989Z" }, + { url = "https://files.pythonhosted.org/packages/5d/8c/c13d396fd4e9bf970916dc5b4fea410c1b10fe531069aea65f1dcf849a71/duckdb-1.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1f8d55843cc940e36261689054f7dfb6ce35b1f5b0953b0d355b6adb654b0d52", size = 13672403, upload-time = "2026-01-26T11:49:26.741Z" }, + { url = "https://files.pythonhosted.org/packages/db/77/2446a0b44226bb95217748d911c7ca66a66ca10f6481d5178d9370819631/duckdb-1.4.4-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c65d15c440c31e06baaebfd2c06d71ce877e132779d309f1edf0a85d23c07e92", size = 18419001, upload-time = "2026-01-26T11:49:29.353Z" }, + { url = "https://files.pythonhosted.org/packages/2e/a3/97715bba30040572fb15d02c26f36be988d48bc00501e7ac02b1d65ef9d0/duckdb-1.4.4-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b297eff642503fd435a9de5a9cb7db4eccb6f61d61a55b30d2636023f149855f", size = 20437385, upload-time = "2026-01-26T11:49:32.302Z" }, + { url = "https://files.pythonhosted.org/packages/8b/0a/18b9167adf528cbe3867ef8a84a5f19f37bedccb606a8a9e59cfea1880c8/duckdb-1.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:d525de5f282b03aa8be6db86b1abffdceae5f1055113a03d5b50cd2fb8cf2ef8", size = 12267343, upload-time = "2026-01-26T11:49:34.985Z" }, + { url = "https://files.pythonhosted.org/packages/f8/15/37af97f5717818f3d82d57414299c293b321ac83e048c0a90bb8b6a09072/duckdb-1.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:50f2eb173c573811b44aba51176da7a4e5c487113982be6a6a1c37337ec5fa57", size = 13007490, upload-time = "2026-01-26T11:49:37.413Z" }, + { url = "https://files.pythonhosted.org/packages/7f/fe/64810fee20030f2bf96ce28b527060564864ce5b934b50888eda2cbf99dd/duckdb-1.4.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:337f8b24e89bc2e12dadcfe87b4eb1c00fd920f68ab07bc9b70960d6523b8bc3", size = 28899349, upload-time = "2026-01-26T11:49:40.294Z" }, + { url = "https://files.pythonhosted.org/packages/9c/9b/3c7c5e48456b69365d952ac201666053de2700f5b0144a699a4dc6854507/duckdb-1.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0509b39ea7af8cff0198a99d206dca753c62844adab54e545984c2e2c1381616", size = 15350691, upload-time = "2026-01-26T11:49:43.242Z" }, + { url = "https://files.pythonhosted.org/packages/a6/7b/64e68a7b857ed0340045501535a0da99ea5d9d5ea3708fec0afb8663eb27/duckdb-1.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fb94de6d023de9d79b7edc1ae07ee1d0b4f5fa8a9dcec799650b5befdf7aafec", size = 13672311, upload-time = "2026-01-26T11:49:46.069Z" }, + { url = "https://files.pythonhosted.org/packages/09/5b/3e7aa490841784d223de61beb2ae64e82331501bf5a415dc87a0e27b4663/duckdb-1.4.4-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0d636ceda422e7babd5e2f7275f6a0d1a3405e6a01873f00d38b72118d30c10b", size = 18422740, upload-time = "2026-01-26T11:49:49.034Z" }, + { url = "https://files.pythonhosted.org/packages/53/32/256df3dbaa198c58539ad94f9a41e98c2c8ff23f126b8f5f52c7dcd0a738/duckdb-1.4.4-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7df7351328ffb812a4a289732f500d621e7de9942a3a2c9b6d4afcf4c0e72526", size = 20435578, upload-time = "2026-01-26T11:49:51.946Z" }, + { url = "https://files.pythonhosted.org/packages/a4/f0/620323fd87062ea43e527a2d5ed9e55b525e0847c17d3b307094ddab98a2/duckdb-1.4.4-cp313-cp313-win_amd64.whl", hash = 
"sha256:6fb1225a9ea5877421481d59a6c556a9532c32c16c7ae6ca8d127e2b878c9389", size = 12268083, upload-time = "2026-01-26T11:49:54.615Z" }, + { url = "https://files.pythonhosted.org/packages/e5/07/a397fdb7c95388ba9c055b9a3d38dfee92093f4427bc6946cf9543b1d216/duckdb-1.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:f28a18cc790217e5b347bb91b2cab27aafc557c58d3d8382e04b4fe55d0c3f66", size = 13006123, upload-time = "2026-01-26T11:49:57.092Z" }, + { url = "https://files.pythonhosted.org/packages/97/a6/f19e2864e651b0bd8e4db2b0c455e7e0d71e0d4cd2cd9cc052f518e43eb3/duckdb-1.4.4-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:25874f8b1355e96178079e37312c3ba6d61a2354f51319dae860cf21335c3a20", size = 28909554, upload-time = "2026-01-26T11:50:00.107Z" }, + { url = "https://files.pythonhosted.org/packages/0e/93/8a24e932c67414fd2c45bed83218e62b73348996bf859eda020c224774b2/duckdb-1.4.4-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:452c5b5d6c349dc5d1154eb2062ee547296fcbd0c20e9df1ed00b5e1809089da", size = 15353804, upload-time = "2026-01-26T11:50:03.382Z" }, + { url = "https://files.pythonhosted.org/packages/62/13/e5378ff5bb1d4397655d840b34b642b1b23cdd82ae19599e62dc4b9461c9/duckdb-1.4.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8e5c2d8a0452df55e092959c0bfc8ab8897ac3ea0f754cb3b0ab3e165cd79aff", size = 13676157, upload-time = "2026-01-26T11:50:06.232Z" }, + { url = "https://files.pythonhosted.org/packages/2d/94/24364da564b27aeebe44481f15bd0197a0b535ec93f188a6b1b98c22f082/duckdb-1.4.4-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1af6e76fe8bd24875dc56dd8e38300d64dc708cd2e772f67b9fbc635cc3066a3", size = 18426882, upload-time = "2026-01-26T11:50:08.97Z" }, + { url = "https://files.pythonhosted.org/packages/26/0a/6ae31b2914b4dc34243279b2301554bcbc5f1a09ccc82600486c49ab71d1/duckdb-1.4.4-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0440f59e0cd9936a9ebfcf7a13312eda480c79214ffed3878d75947fc3b7d6d", size = 20435641, upload-time = "2026-01-26T11:50:12.188Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b1/fd5c37c53d45efe979f67e9bd49aaceef640147bb18f0699a19edd1874d6/duckdb-1.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:59c8d76016dde854beab844935b1ec31de358d4053e792988108e995b18c08e7", size = 12762360, upload-time = "2026-01-26T11:50:14.76Z" }, + { url = "https://files.pythonhosted.org/packages/dd/2d/13e6024e613679d8a489dd922f199ef4b1d08a456a58eadd96dc2f05171f/duckdb-1.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:53cd6423136ab44383ec9955aefe7599b3fb3dd1fe006161e6396d8167e0e0d4", size = 13458633, upload-time = "2026-01-26T11:50:17.657Z" }, +] + +[[package]] +name = "idna" +version = "3.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "pygments" +version = "2.19.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, 
+] + +[[package]] +name = "requests" +version = "2.32.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, +] + +[[package]] +name = "ruff" +version = "0.15.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/06/04/eab13a954e763b0606f460443fcbf6bb5a0faf06890ea3754ff16523dce5/ruff-0.15.2.tar.gz", hash = "sha256:14b965afee0969e68bb871eba625343b8673375f457af4abe98553e8bbb98342", size = 4558148, upload-time = "2026-02-19T22:32:20.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/70/3a4dc6d09b13cb3e695f28307e5d889b2e1a66b7af9c5e257e796695b0e6/ruff-0.15.2-py3-none-linux_armv6l.whl", hash = "sha256:120691a6fdae2f16d65435648160f5b81a9625288f75544dc40637436b5d3c0d", size = 10430565, upload-time = "2026-02-19T22:32:41.824Z" }, + { url = "https://files.pythonhosted.org/packages/71/0b/bb8457b56185ece1305c666dc895832946d24055be90692381c31d57466d/ruff-0.15.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:a89056d831256099658b6bba4037ac6dd06f49d194199215befe2bb10457ea5e", size = 10820354, upload-time = "2026-02-19T22:32:07.366Z" }, + { url = "https://files.pythonhosted.org/packages/2d/c1/e0532d7f9c9e0b14c46f61b14afd563298b8b83f337b6789ddd987e46121/ruff-0.15.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:e36dee3a64be0ebd23c86ffa3aa3fd3ac9a712ff295e192243f814a830b6bd87", size = 10170767, upload-time = "2026-02-19T22:32:13.188Z" }, + { url = "https://files.pythonhosted.org/packages/47/e8/da1aa341d3af017a21c7a62fb5ec31d4e7ad0a93ab80e3a508316efbcb23/ruff-0.15.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9fb47b6d9764677f8c0a193c0943ce9a05d6763523f132325af8a858eadc2b9", size = 10529591, upload-time = "2026-02-19T22:32:02.547Z" }, + { url = "https://files.pythonhosted.org/packages/93/74/184fbf38e9f3510231fbc5e437e808f0b48c42d1df9434b208821efcd8d6/ruff-0.15.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f376990f9d0d6442ea9014b19621d8f2aaf2b8e39fdbfc79220b7f0c596c9b80", size = 10260771, upload-time = "2026-02-19T22:32:36.938Z" }, + { url = "https://files.pythonhosted.org/packages/05/ac/605c20b8e059a0bc4b42360414baa4892ff278cec1c91fff4be0dceedefd/ruff-0.15.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2dcc987551952d73cbf5c88d9fdee815618d497e4df86cd4c4824cc59d5dd75f", size = 11045791, upload-time = "2026-02-19T22:32:31.642Z" }, + { url = "https://files.pythonhosted.org/packages/fd/52/db6e419908f45a894924d410ac77d64bdd98ff86901d833364251bd08e22/ruff-0.15.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:42a47fd785cbe8c01b9ff45031af875d101b040ad8f4de7bbb716487c74c9a77", size = 11879271, upload-time = "2026-02-19T22:32:29.305Z" }, + { url = 
"https://files.pythonhosted.org/packages/3e/d8/7992b18f2008bdc9231d0f10b16df7dda964dbf639e2b8b4c1b4e91b83af/ruff-0.15.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbe9f49354866e575b4c6943856989f966421870e85cd2ac94dccb0a9dcb2fea", size = 11303707, upload-time = "2026-02-19T22:32:22.492Z" }, + { url = "https://files.pythonhosted.org/packages/d7/02/849b46184bcfdd4b64cde61752cc9a146c54759ed036edd11857e9b8443b/ruff-0.15.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7a672c82b5f9887576087d97be5ce439f04bbaf548ee987b92d3a7dede41d3a", size = 11149151, upload-time = "2026-02-19T22:32:44.234Z" }, + { url = "https://files.pythonhosted.org/packages/70/04/f5284e388bab60d1d3b99614a5a9aeb03e0f333847e2429bebd2aaa1feec/ruff-0.15.2-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:72ecc64f46f7019e2bcc3cdc05d4a7da958b629a5ab7033195e11a438403d956", size = 11091132, upload-time = "2026-02-19T22:32:24.691Z" }, + { url = "https://files.pythonhosted.org/packages/fa/ae/88d844a21110e14d92cf73d57363fab59b727ebeabe78009b9ccb23500af/ruff-0.15.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:8dcf243b15b561c655c1ef2f2b0050e5d50db37fe90115507f6ff37d865dc8b4", size = 10504717, upload-time = "2026-02-19T22:32:26.75Z" }, + { url = "https://files.pythonhosted.org/packages/64/27/867076a6ada7f2b9c8292884ab44d08fd2ba71bd2b5364d4136f3cd537e1/ruff-0.15.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dab6941c862c05739774677c6273166d2510d254dac0695c0e3f5efa1b5585de", size = 10263122, upload-time = "2026-02-19T22:32:10.036Z" }, + { url = "https://files.pythonhosted.org/packages/e7/ef/faf9321d550f8ebf0c6373696e70d1758e20ccdc3951ad7af00c0956be7c/ruff-0.15.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1b9164f57fc36058e9a6806eb92af185b0697c9fe4c7c52caa431c6554521e5c", size = 10735295, upload-time = "2026-02-19T22:32:39.227Z" }, + { url = "https://files.pythonhosted.org/packages/2f/55/e8089fec62e050ba84d71b70e7834b97709ca9b7aba10c1a0b196e493f97/ruff-0.15.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:80d24fcae24d42659db7e335b9e1531697a7102c19185b8dc4a028b952865fd8", size = 11241641, upload-time = "2026-02-19T22:32:34.617Z" }, + { url = "https://files.pythonhosted.org/packages/23/01/1c30526460f4d23222d0fabd5888868262fd0e2b71a00570ca26483cd993/ruff-0.15.2-py3-none-win32.whl", hash = "sha256:fd5ff9e5f519a7e1bd99cbe8daa324010a74f5e2ebc97c6242c08f26f3714f6f", size = 10507885, upload-time = "2026-02-19T22:32:15.635Z" }, + { url = "https://files.pythonhosted.org/packages/5c/10/3d18e3bbdf8fc50bbb4ac3cc45970aa5a9753c5cb51bf9ed9a3cd8b79fa3/ruff-0.15.2-py3-none-win_amd64.whl", hash = "sha256:d20014e3dfa400f3ff84830dfb5755ece2de45ab62ecea4af6b7262d0fb4f7c5", size = 11623725, upload-time = "2026-02-19T22:32:04.947Z" }, + { url = "https://files.pythonhosted.org/packages/6d/78/097c0798b1dab9f8affe73da9642bb4500e098cb27fd8dc9724816ac747b/ruff-0.15.2-py3-none-win_arm64.whl", hash = "sha256:cabddc5822acdc8f7b5527b36ceac55cc51eec7b1946e60181de8fe83ca8876e", size = 10941649, upload-time = "2026-02-19T22:32:18.108Z" }, +] + +[[package]] +name = "tqdm" +version = "4.67.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" 
} +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, +] + +[[package]] +name = "urllib3" +version = "2.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, +] From bacc72de3a370e7d539162dbeb2cfbe3900a7d3e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 17:36:52 -0500 Subject: [PATCH 20/66] Update CLAUDE.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- CLAUDE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index ae2e78f..3cb238c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -72,10 +72,10 @@ uv run ruff format ### Core Components 1. **BabelDownloader** (`src/babel_explorer/core/downloader.py`): - - Downloads Babel intermediate files from a remote server using `wget` + - Downloads Babel intermediate files from a remote HTTP(S) server using Python's `requests` library (streaming downloads) - Caches files locally in configurable directory (default: `data/2025nov19/`) - Uses `@functools.lru_cache` to avoid re-downloading - - **Important**: Requires `wget` to be installed on the system + - **Important**: Requires network access but no external tools like `wget` 2. 
**BabelXRefs** (`src/babel_explorer/core/babel_xrefs.py`): - Main query engine for cross-references From 96d9609c9a499047f8e2efdcef3fd149090a0b06 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 17:38:01 -0500 Subject: [PATCH 21/66] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 922fa1b..eafcfc6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "babel-explorer" version = "0.1.0" -description = "Add your description here" +description = "Tool for querying and exploring Babel APIs and intermediate files" readme = "README.md" requires-python = ">=3.11" dependencies = [ From 0c33e7ea60e1ad110a272f1ea94d9ed8b1aec75c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 17:46:17 -0500 Subject: [PATCH 22/66] Update src/babel_explorer/core/babel_xrefs.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/babel_xrefs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 1e82125..95fda85 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -42,7 +42,7 @@ def __init__(self, subj: str, pred: str, obj: str, filename: str, subj_label: st self.obj_biolink_type = obj_biolink_type def __str__(self): - return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", obj_label="{self.obj_label}", subj_label="{self.subj_label}", obj_label="{self.obj_label}")""" + return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", subj_biolink_type="{self.subj_biolink_type}", obj_label="{self.obj_label}", obj_biolink_type="{self.obj_biolink_type}")""" @dataclasses.dataclass(frozen=True) class IdentifierRecord: From 1aff01367ec6ea1be038e41352bf108b469b8c9d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 17:47:47 -0500 Subject: [PATCH 23/66] Update src/babel_explorer/core/nodenorm.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/nodenorm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index 018f106..a9c6752 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -32,7 +32,7 @@ def __init__(self, nodenorm_url: str=""): self.nodenorm_url = nodenorm_url @functools.lru_cache(maxsize=None) - def get_identifier(self, curie): + def get_identifier(self, curie: str): result = self.normalize_curie(curie) logging.debug(f"Normalizing {curie} with NodeNorm to result: {result}") if not result: From af76c151689429c517766f53adbc42bb43366bea Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 2 Mar 2026 18:26:05 -0500 Subject: [PATCH 24/66] Replace MD5 checksumming with HTTP header caching and freshness window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove _calculate_md5/_fetch_remote_md5 (too slow on 2.5-3.9 GB files) - Add sidecar .meta JSON files (ETag, Last-Modified, Content-Length, last_checked) - Three-tier logic: freshness window → HEAD/ETag check → full re-download - Add freshness_seconds param to BabelDownloader (default 3h) - Add --check-download CLI option to xrefs and ids commands (e.g. 
3h, never)
- Update tests: replace MD5 test classes with meta/ETag/tier coverage

Co-Authored-By: Claude Sonnet 4.6
---
 src/babel_explorer/cli.py             |  28 +-
 src/babel_explorer/core/downloader.py | 203 ++++++-----
 tests/test_downloader.py              | 463 +++++++++++++++++++-------
 3 files changed, 490 insertions(+), 204 deletions(-)

diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py
index 8dd5fc4..251ca0f 100644
--- a/src/babel_explorer/cli.py
+++ b/src/babel_explorer/cli.py
@@ -6,6 +6,17 @@
 from babel_explorer.core.nodenorm import NodeNorm
 
 
+def parse_duration(value: str) -> float:
+    """Parse a duration string like '3h', '30m', '1d', '7200', or 'never' → seconds."""
+    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
+    lower = value.lower()
+    if lower == "never":
+        return float("inf")
+    if lower[-1] in units:
+        return int(lower[:-1]) * units[lower[-1]]
+    return int(lower)  # bare seconds
+
+
 @click.group()
 def cli():
     pass
@@ -17,7 +28,11 @@ def cli():
 @click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes")
 @click.option("--expand", is_flag=True, help="Also display xrefs for returned CURIEs")
 @click.option("--labels", is_flag=True, help="Include labels for CURIEs")
-def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expand: bool, labels: bool):
+@click.option("--check-download", type=str, default="3h", show_default=True,
+              help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). "
+                   "'never' disables re-checking entirely; '0' re-checks via HTTP HEAD on every run.")
+def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expand: bool, labels: bool,
+          check_download: str):
     """
     Fetches and prints the cross-references (xrefs) for the given CURIEs.
 
@@ -35,7 +50,8 @@ def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expan
     """
     logging.basicConfig(level=logging.INFO)
 
-    bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir), NodeNorm(nodenorm_url))
+    freshness = parse_duration(check_download)
+    bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness), NodeNorm(nodenorm_url))
     xrefs = bxref.get_curie_xrefs(curies, expand, label_curies=labels)
     for xref in xrefs:
         print(xref)
@@ -44,7 +60,10 @@ def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expan
 @click.argument("curies", type=str, required=True, nargs=-1)
 @click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to")
 @click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server")
-def ids(curies: list[str], babel_url: str, local_dir: str):
+@click.option("--check-download", type=str, default="3h", show_default=True,
+              help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). "
+                   "'never' disables re-checking entirely; '0' re-checks via HTTP HEAD on every run.")
+def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str):
    """
    Fetches and prints the ID records for the given CURIEs, along with Biolink type if provided.
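
# (Illustrative aside, not part of the diff itself.) Given parse_duration()
# as defined in this patch, --check-download values map to freshness_seconds
# as follows; the .meta sketch further down uses made-up header values.
#
#     parse_duration("3h")     # 10800 (3 * 3600 seconds)
#     parse_duration("30m")    # 1800
#     parse_duration("7200")   # 7200 (bare seconds)
#     parse_duration("0")      # 0: re-check via HTTP HEAD on every run
#     parse_duration("never")  # float("inf"): never re-check
#
# A sidecar file such as data/2025nov19/duckdb/Concord.parquet.meta is plain
# JSON written by _save_meta() in the downloader changes below, e.g.:
#
#     {
#       "etag": "\"68b3297-6254d2015f5ee\"",
#       "last_modified": "Wed, 19 Nov 2025 15:54:19 GMT",
#       "content_length": 2684354560,
#       "last_checked": "2026-03-02T22:15:04+00:00"
#     }
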
@@ -60,7 +79,8 @@ def ids(curies: list[str], babel_url: str, local_dir: str): """ logging.basicConfig(level=logging.INFO) - bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir)) + freshness = parse_duration(check_download) + bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness)) xrefs = bxref.get_curie_ids(curies) for xref in xrefs: print(xref) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 93081c6..43c3daf 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -1,9 +1,10 @@ import functools +import json import os import urllib.parse import time -import hashlib import requests +from datetime import datetime, timezone from tqdm import tqdm import logging @@ -13,10 +14,11 @@ class BabelDownloader: Class for downloading Babel cross-reference files to a local directory as needed. """ - def __init__(self, url_base, local_path=None, retries=10): + def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * 3600): # We assume the URL base is correct (if not, we can fix it later). self.url_base = url_base self.retries = retries + self.freshness_seconds = freshness_seconds self.logger = logging.getLogger(BabelDownloader.__name__) if local_path is None: @@ -41,55 +43,115 @@ def get_output_file(self, filename): os.makedirs(os.path.dirname(filepath), exist_ok=True) return filepath - def _calculate_md5(self, file_path, chunk_size=1024*1024): + def _get_meta_path(self, local_path): + """Return the sidecar metadata file path for a given local file.""" + return local_path + ".meta" + + def _load_meta(self, local_path): + """Load sidecar metadata JSON, or return None if not found/invalid.""" + meta_path = self._get_meta_path(local_path) + if not os.path.exists(meta_path): + return None + try: + with open(meta_path, "r") as f: + return json.load(f) + except (json.JSONDecodeError, OSError): + return None + + def _save_meta(self, local_path, headers, update_last_checked=True): + """ + Write a sidecar .meta JSON file next to local_path. + + Args: + local_path: Path to the downloaded file + headers: Response headers dict (or requests.structures.CaseInsensitiveDict) + update_last_checked: If True, set last_checked to now + """ + meta = {} + if "ETag" in headers: + meta["etag"] = headers["ETag"] + if "Last-Modified" in headers: + meta["last_modified"] = headers["Last-Modified"] + if "Content-Length" in headers: + meta["content_length"] = int(headers["Content-Length"]) + if update_last_checked: + meta["last_checked"] = datetime.now(timezone.utc).isoformat() + + meta_path = self._get_meta_path(local_path) + with open(meta_path, "w") as f: + json.dump(meta, f, indent=2) + + def _is_within_freshness(self, meta, freshness_seconds): """ - Calculate MD5 checksum of a file. + Return True if last_checked is within freshness_seconds of now. 
        Args:
-            file_path: Path to the file to checksum
-            chunk_size: Size of chunks to read (default 1MB)
+            meta: dict loaded from .meta file
+            freshness_seconds: Number of seconds; float('inf') means always fresh
 
         Returns:
-            str: Hexadecimal MD5 checksum
+            bool
         """
-        md5_hash = hashlib.md5()
-        with open(file_path, 'rb') as f:
-            for chunk in iter(lambda: f.read(chunk_size), b''):
-                md5_hash.update(chunk)
-        return md5_hash.hexdigest()
+        if freshness_seconds == float("inf"):
+            return True
+        last_checked_str = meta.get("last_checked")
+        if not last_checked_str:
+            return False
+        try:
+            last_checked = datetime.fromisoformat(last_checked_str)
+            age = (datetime.now(timezone.utc) - last_checked).total_seconds()
+            return age < freshness_seconds
+        except (ValueError, TypeError):
+            return False
 
-    def _fetch_remote_md5(self, url):
+    def _etag_matches(self, url, meta):
         """
-        Fetch MD5 checksum from remote .md5 file.
+        Do a HEAD request and check if the ETag (or Last-Modified + Content-Length)
+        matches the stored metadata; on a match, the caller refreshes last_checked in the .meta file.
 
         Args:
-            url: URL to the .md5 file
+            url: URL to HEAD
+            meta: dict loaded from .meta file (may have etag, last_modified, content_length)
 
         Returns:
-            str: MD5 checksum if found, None if file doesn't exist or is malformed
+            bool: True if remote matches local meta (file is still current)
         """
         try:
-            response = requests.get(url, timeout=10)
-            if response.status_code == 404:
-                self.logger.debug(f"No .md5 file found at {url}")
-                return None
+            response = requests.head(url, timeout=30)
             response.raise_for_status()
+        except requests.RequestException as e:
+            self.logger.warning(f"HEAD request failed for {url}: {e}")
+            return False
+
+        remote_headers = response.headers
+
+        # Primary check: ETag
+        local_etag = meta.get("etag")
+        remote_etag = remote_headers.get("ETag")
+        if local_etag and remote_etag:
+            if local_etag == remote_etag:
+                self.logger.info(f"ETag matches ({remote_etag}), file is current")
+                # This method does not touch the sidecar .meta file itself;
+                # the caller updates last_checked there once we have confirmed
+                # that the remote copy is unchanged.
+                return True
+            else:
+                self.logger.info(f"ETag changed: {local_etag!r} → {remote_etag!r}, re-downloading")
+                return False
 
-        # Parse MD5 file content
-        # Format is typically: "md5hash filename" or just "md5hash"
-        content = response.text.strip()
-        md5_match = content.split()[0]  # Take first token
+        # Fallback: Last-Modified + Content-Length
+        local_lm = meta.get("last_modified")
+        remote_lm = remote_headers.get("Last-Modified")
+        local_cl = meta.get("content_length")
+        remote_cl = remote_headers.get("Content-Length")
 
-        # Validate it's a valid MD5 (32 hex characters)
-        if len(md5_match) == 32 and all(c in '0123456789abcdef' for c in md5_match.lower()):
-            return md5_match.lower()
-        else:
-            self.logger.warning(f"Malformed .md5 file at {url}: {content}")
-            return None
+        if local_lm and remote_lm and local_lm == remote_lm:
+            if local_cl is None or remote_cl is None or int(remote_cl) == local_cl:
+                self.logger.info(f"Last-Modified matches ({remote_lm}), file is current")
+                return True
 
-        except requests.RequestException as e:
-            self.logger.debug(f"Could not fetch .md5 file from {url}: {e}")
-            return None
+        self.logger.info("Cannot confirm file is current (no matching ETag or Last-Modified), will re-download")
+        return False
 
     def _stream_download(self, response, local_path, resume_byte_pos, chunk_size):
         """
@@ -134,6 +196,9 @@ def _download_with_retry(self, url, local_path, chunk_size):
         local_path: Local file path to save to
chunk_size: Size of chunks to read/write + Returns: + requests.structures.CaseInsensitiveDict: Response headers from the final request + Raises: RuntimeError: If all retry attempts fail """ @@ -157,7 +222,7 @@ def _download_with_retry(self, url, local_path, chunk_size): if response.status_code == 416: # Range Not Satisfiable - file already complete self.logger.info(f"File already complete: {local_path}") - return + return response.headers elif response.status_code == 206: # Partial Content - resume successful self.logger.info(f"Resuming download (HTTP 206)") @@ -176,7 +241,7 @@ def _download_with_retry(self, url, local_path, chunk_size): self._stream_download(response, local_path, resume_byte_pos, chunk_size) # Success - exit retry loop - return + return response.headers except (requests.RequestException, IOError) as e: self.logger.warning(f"Download attempt {attempt}/{self.retries} failed: {e}") @@ -193,13 +258,12 @@ def _download_with_retry(self, url, local_path, chunk_size): @functools.lru_cache(maxsize=None) def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): """ - Download a file from the Babel server to local storage with MD5 validation. + Download a file from the Babel server to local storage with ETag-based caching. - If a .md5 file exists on the server, this method will: - 1. Check if the local file exists - 2. Verify its MD5 checksum matches the expected value - 3. Delete and re-download if checksums don't match - 4. Skip download if checksums match + Three-tier freshness logic: + 1. If .meta exists and last_checked is within freshness window → return immediately + 2. If .meta exists but stale → HEAD request to compare ETag; return if unchanged + 3. If ETag changed or no .meta → full re-download Args: dirpath: Relative path from url_base to the file @@ -212,48 +276,37 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): os.makedirs(os.path.dirname(local_path_to_download_to), exist_ok=True) url_to_download = urllib.parse.urljoin(self.url_base, dirpath) - md5_url = url_to_download + '.md5' - # Check if file already exists and validate with MD5 if available if os.path.exists(local_path_to_download_to): - self.logger.info(f"Local file exists: {local_path_to_download_to}") - - # Try to fetch remote MD5 checksum - expected_md5 = self._fetch_remote_md5(md5_url) - - if expected_md5: - self.logger.info(f"Validating MD5 checksum (expected: {expected_md5})") - - # Calculate local file's MD5 - actual_md5 = self._calculate_md5(local_path_to_download_to, chunk_size) - self.logger.info(f"Local file MD5: {actual_md5}") + meta = self._load_meta(local_path_to_download_to) + if meta is not None: + # Tier 1: within freshness window — skip all network calls + if self._is_within_freshness(meta, self.freshness_seconds): + self.logger.info(f"File within freshness window, skipping check: {local_path_to_download_to}") + return local_path_to_download_to - if actual_md5 == expected_md5: - # File is valid, skip download - self.logger.info(f"MD5 checksum matches - file is valid, skipping download") - bytes_downloaded = os.path.getsize(local_path_to_download_to) - self.logger.info(f"Using existing file: {local_path_to_download_to} ({bytes_downloaded} bytes)") + # Tier 2: stale but maybe unchanged — HEAD request + if self._etag_matches(url_to_download, meta): + # Update last_checked timestamp + meta["last_checked"] = datetime.now(timezone.utc).isoformat() + meta_path = self._get_meta_path(local_path_to_download_to) + with open(meta_path, "w") as f: + json.dump(meta, 
f, indent=2) + self.logger.info(f"ETag matches, using existing file: {local_path_to_download_to}") return local_path_to_download_to - else: - # Checksums don't match - delete and re-download - self.logger.warning(f"MD5 checksum mismatch! Expected {expected_md5}, got {actual_md5}") - self.logger.warning(f"Deleting corrupted file and re-downloading: {local_path_to_download_to}") - os.remove(local_path_to_download_to) + + # Tier 3: ETag changed — delete and re-download + self.logger.warning(f"Remote file changed, re-downloading: {local_path_to_download_to}") + os.remove(local_path_to_download_to) self.logger.info(f"Downloading {url_to_download} to {local_path_to_download_to}") - # Download with retry logic - self._download_with_retry(url_to_download, local_path_to_download_to, chunk_size) + # Download with retry logic; get response headers back + response_headers = self._download_with_retry(url_to_download, local_path_to_download_to, chunk_size) - # Verify MD5 after download if available - expected_md5 = self._fetch_remote_md5(md5_url) - if expected_md5: - actual_md5 = self._calculate_md5(local_path_to_download_to, chunk_size) - if actual_md5 == expected_md5: - self.logger.info(f"Post-download MD5 verification passed: {actual_md5}") - else: - self.logger.error(f"Post-download MD5 verification failed! Expected {expected_md5}, got {actual_md5}") - raise RuntimeError(f"Downloaded file has incorrect MD5 checksum") + # Save sidecar metadata + if response_headers is not None: + self._save_meta(local_path_to_download_to, response_headers) bytes_downloaded = os.path.getsize(local_path_to_download_to) self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 912cd0a..045e402 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -5,13 +5,14 @@ Integration tests download real files from the Babel server. """ -import hashlib +import json import os import tempfile +from datetime import datetime, timezone, timedelta import pytest import requests -from unittest.mock import Mock, patch +from unittest.mock import Mock, patch, MagicMock from babel_explorer.core.downloader import BabelDownloader @@ -45,6 +46,14 @@ def test_default_retries(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) assert dl.retries == 10 + def test_default_freshness_seconds(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + assert dl.freshness_seconds == 3 * 3600 + + def test_custom_freshness_seconds(self, tmp_path): + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), freshness_seconds=0) + assert dl.freshness_seconds == 0 + def test_invalid_path_raises_value_error(self): """Using a file path (not a directory) should raise ValueError.""" with tempfile.NamedTemporaryFile(delete=False) as f: @@ -77,161 +86,360 @@ def test_lru_caching(self, tmp_path): assert result1 is result2 # identity check — same cached object -class TestCalculateMd5: - """Tests for _calculate_md5.""" +class TestSaveMeta: + """Tests for _save_meta.""" - def test_correct_hash(self, tmp_path): - content = b"Hello, world!" 
- expected = hashlib.md5(content).hexdigest() - file_path = tmp_path / "test.bin" - file_path.write_bytes(content) + def _make_dl(self, tmp_path): + return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - assert dl._calculate_md5(str(file_path)) == expected + def test_writes_all_fields(self, tmp_path): + dl = self._make_dl(tmp_path) + file_path = str(tmp_path / "test.parquet") + # Create the file so the path is valid + open(file_path, 'wb').close() + + headers = { + "ETag": '"abc123"', + "Last-Modified": "Wed, 03 Dec 2025 15:54:19 GMT", + "Content-Length": "12345", + } + dl._save_meta(file_path, headers) + + meta_path = file_path + ".meta" + assert os.path.exists(meta_path) + with open(meta_path) as f: + meta = json.load(f) + + assert meta["etag"] == '"abc123"' + assert meta["last_modified"] == "Wed, 03 Dec 2025 15:54:19 GMT" + assert meta["content_length"] == 12345 + assert "last_checked" in meta + + def test_last_checked_is_recent_utc(self, tmp_path): + dl = self._make_dl(tmp_path) + file_path = str(tmp_path / "f.parquet") + open(file_path, 'wb').close() - def test_different_chunk_sizes_same_result(self, tmp_path): - content = b"A" * 5000 - expected = hashlib.md5(content).hexdigest() - file_path = tmp_path / "chunks.bin" - file_path.write_bytes(content) + dl._save_meta(file_path, {"ETag": '"x"'}) - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - assert dl._calculate_md5(str(file_path), chunk_size=100) == expected - assert dl._calculate_md5(str(file_path), chunk_size=4096) == expected + with open(file_path + ".meta") as f: + meta = json.load(f) + + last_checked = datetime.fromisoformat(meta["last_checked"]) + age = (datetime.now(timezone.utc) - last_checked).total_seconds() + assert age < 5 # written less than 5 seconds ago + + def test_missing_headers_not_written(self, tmp_path): + """Headers not present in the response should not appear in .meta.""" + dl = self._make_dl(tmp_path) + file_path = str(tmp_path / "sparse.parquet") + open(file_path, 'wb').close() + + dl._save_meta(file_path, {}) + with open(file_path + ".meta") as f: + meta = json.load(f) -class TestFetchRemoteMd5: - """Tests for _fetch_remote_md5.""" + assert "etag" not in meta + assert "last_modified" not in meta + assert "content_length" not in meta + assert "last_checked" in meta + + +class TestLoadMeta: + """Tests for _load_meta.""" def _make_dl(self, tmp_path): return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - def test_valid_md5_response(self, tmp_path): + def test_returns_none_if_no_meta_file(self, tmp_path): dl = self._make_dl(tmp_path) - mock_resp = Mock() - mock_resp.status_code = 200 - mock_resp.text = "d41d8cd98f00b204e9800998ecf8427e filename.parquet\n" - mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): - result = dl._fetch_remote_md5("https://example.com/file.md5") - assert result == "d41d8cd98f00b204e9800998ecf8427e" + assert dl._load_meta(str(tmp_path / "nonexistent.parquet")) is None + + def test_returns_dict_for_valid_meta(self, tmp_path): + dl = self._make_dl(tmp_path) + file_path = str(tmp_path / "f.parquet") + open(file_path, 'wb').close() + meta_data = {"etag": '"abc"', "last_checked": "2026-01-01T00:00:00+00:00"} + with open(file_path + ".meta", "w") as f: + json.dump(meta_data, f) + + result = dl._load_meta(file_path) + assert result == meta_data + + def 
test_returns_none_for_corrupt_meta(self, tmp_path): + dl = self._make_dl(tmp_path) + file_path = str(tmp_path / "corrupt.parquet") + open(file_path, 'wb').close() + with open(file_path + ".meta", "w") as f: + f.write("not valid json {{{") + + assert dl._load_meta(file_path) is None + + +class TestIsWithinFreshness: + """Tests for _is_within_freshness.""" - def test_hash_only_format(self, tmp_path): + def _make_dl(self, tmp_path): + return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + + def test_returns_true_when_recent(self, tmp_path): + dl = self._make_dl(tmp_path) + recent = datetime.now(timezone.utc).isoformat() + meta = {"last_checked": recent} + assert dl._is_within_freshness(meta, 3600) is True + + def test_returns_false_when_stale(self, tmp_path): + dl = self._make_dl(tmp_path) + old = (datetime.now(timezone.utc) - timedelta(hours=5)).isoformat() + meta = {"last_checked": old} + assert dl._is_within_freshness(meta, 3600) is False + + def test_returns_false_when_missing_last_checked(self, tmp_path): dl = self._make_dl(tmp_path) + assert dl._is_within_freshness({}, 3600) is False + + def test_returns_true_when_freshness_is_inf(self, tmp_path): + dl = self._make_dl(tmp_path) + old = (datetime.now(timezone.utc) - timedelta(days=365)).isoformat() + meta = {"last_checked": old} + assert dl._is_within_freshness(meta, float("inf")) is True + + def test_returns_false_when_freshness_is_zero(self, tmp_path): + dl = self._make_dl(tmp_path) + just_now = datetime.now(timezone.utc).isoformat() + meta = {"last_checked": just_now} + # Even with freshness=0, age >= 0 so it's not < 0 + assert dl._is_within_freshness(meta, 0) is False + + +class TestEtagMatches: + """Tests for _etag_matches.""" + + def _make_dl(self, tmp_path): + return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + + def test_returns_true_on_matching_etag(self, tmp_path): + dl = self._make_dl(tmp_path) + meta = {"etag": '"abc123"'} mock_resp = Mock() - mock_resp.status_code = 200 - mock_resp.text = "d41d8cd98f00b204e9800998ecf8427e\n" + mock_resp.headers = {"ETag": '"abc123"'} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): - result = dl._fetch_remote_md5("https://example.com/file.md5") - assert result == "d41d8cd98f00b204e9800998ecf8427e" + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + assert dl._etag_matches("https://example.com/f.parquet", meta) is True - def test_404_returns_none(self, tmp_path): + def test_returns_false_on_different_etag(self, tmp_path): dl = self._make_dl(tmp_path) + meta = {"etag": '"old"'} mock_resp = Mock() - mock_resp.status_code = 404 - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): - assert dl._fetch_remote_md5("https://example.com/missing.md5") is None + mock_resp.headers = {"ETag": '"new"'} + mock_resp.raise_for_status = Mock() + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + assert dl._etag_matches("https://example.com/f.parquet", meta) is False - def test_malformed_returns_none(self, tmp_path): + def test_fallback_last_modified_match(self, tmp_path): dl = self._make_dl(tmp_path) + lm = "Wed, 03 Dec 2025 15:54:19 GMT" + meta = {"last_modified": lm, "content_length": 100} mock_resp = Mock() - mock_resp.status_code = 200 - mock_resp.text = "not-a-valid-md5-hash\n" + mock_resp.headers = {"Last-Modified": lm, "Content-Length": "100"} mock_resp.raise_for_status = 
Mock() - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_resp): - assert dl._fetch_remote_md5("https://example.com/bad.md5") is None + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + assert dl._etag_matches("https://example.com/f.parquet", meta) is True - def test_network_error_returns_none(self, tmp_path): + def test_returns_false_on_request_error(self, tmp_path): dl = self._make_dl(tmp_path) - with patch("babel_explorer.core.downloader.requests.get", side_effect=requests.ConnectionError("fail")): - assert dl._fetch_remote_md5("https://example.com/err.md5") is None + meta = {"etag": '"abc"'} + with patch("babel_explorer.core.downloader.requests.head", + side_effect=requests.ConnectionError("fail")): + assert dl._etag_matches("https://example.com/f.parquet", meta) is False -class TestMd5ValidationFlow: - """Tests for the MD5 validation logic inside get_downloaded_file.""" +class TestGetDownloadedFileTiers: + """Tests for the three-tier logic in get_downloaded_file.""" - def test_matching_checksum_skips_download(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - test_file = "test.txt" - content = b"test content" - local_path = tmp_path / test_file - local_path.write_bytes(content) - expected_md5 = hashlib.md5(content).hexdigest() - - with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): - with patch.object(dl, '_download_with_retry') as mock_dl: + def _make_dl(self, tmp_path, freshness=3600): + return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), + freshness_seconds=freshness) + + # --- Tier 1: within freshness window --- + + def test_tier1_returns_immediately_no_http(self, tmp_path): + """File + fresh .meta → no network calls at all.""" + dl = self._make_dl(tmp_path, freshness=3600) + test_file = "duckdb/test.parquet" + local = tmp_path / "duckdb" / "test.parquet" + local.parent.mkdir(parents=True) + local.write_bytes(b"data") + + meta = {"etag": '"abc"', "last_checked": datetime.now(timezone.utc).isoformat()} + with open(str(local) + ".meta", "w") as f: + json.dump(meta, f) + + with patch("babel_explorer.core.downloader.requests.head") as mock_head: + with patch("babel_explorer.core.downloader.requests.get") as mock_get: dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) - mock_dl.assert_not_called() - assert result == str(local_path) - - def test_mismatched_checksum_triggers_redownload(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - test_file = "mismatch.txt" - local_path = tmp_path / test_file - local_path.write_bytes(b"wrong content") - correct_content = b"correct content" - expected_md5 = hashlib.md5(correct_content).hexdigest() + mock_head.assert_not_called() + mock_get.assert_not_called() + assert result == str(local) + + # --- Tier 2: stale .meta, ETag matches --- + + def test_tier2_head_check_no_redownload(self, tmp_path): + """Stale .meta + matching ETag → HEAD only, no GET.""" + dl = self._make_dl(tmp_path, freshness=0) + test_file = "duckdb/test.parquet" + local = tmp_path / "duckdb" / "test.parquet" + local.parent.mkdir(parents=True) + local.write_bytes(b"data") + + old_ts = (datetime.now(timezone.utc) - timedelta(hours=5)).isoformat() + meta = {"etag": '"abc"', "last_checked": old_ts} + with open(str(local) + ".meta", "w") as f: + json.dump(meta, f) + + mock_head_resp = Mock() + mock_head_resp.headers = {"ETag": '"abc"'} + 
mock_head_resp.raise_for_status = Mock() + + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): + with patch("babel_explorer.core.downloader.requests.get") as mock_get: + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + mock_get.assert_not_called() + assert result == str(local) + + def test_tier2_updates_last_checked_after_head(self, tmp_path): + """After successful HEAD match, last_checked in .meta is updated.""" + dl = self._make_dl(tmp_path, freshness=0) + test_file = "duckdb/upd.parquet" + local = tmp_path / "duckdb" / "upd.parquet" + local.parent.mkdir(parents=True) + local.write_bytes(b"data") + + old_ts = (datetime.now(timezone.utc) - timedelta(hours=5)).isoformat() + meta = {"etag": '"abc"', "last_checked": old_ts} + with open(str(local) + ".meta", "w") as f: + json.dump(meta, f) + + mock_head_resp = Mock() + mock_head_resp.headers = {"ETag": '"abc"'} + mock_head_resp.raise_for_status = Mock() + + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): + dl.get_downloaded_file.cache_clear() + dl.get_downloaded_file(test_file) + + with open(str(local) + ".meta") as f: + updated_meta = json.load(f) + updated_ts = datetime.fromisoformat(updated_meta["last_checked"]) + assert (datetime.now(timezone.utc) - updated_ts).total_seconds() < 5 + + # --- Tier 3: ETag changed, re-download --- + + def test_tier3_redownloads_when_etag_changed(self, tmp_path): + """Changed ETag → file deleted and re-downloaded.""" + dl = self._make_dl(tmp_path, freshness=0) + test_file = "duckdb/changed.parquet" + local = tmp_path / "duckdb" / "changed.parquet" + local.parent.mkdir(parents=True) + local.write_bytes(b"old data") + + old_ts = (datetime.now(timezone.utc) - timedelta(hours=5)).isoformat() + meta = {"etag": '"old"', "last_checked": old_ts} + with open(str(local) + ".meta", "w") as f: + json.dump(meta, f) + + mock_head_resp = Mock() + mock_head_resp.headers = {"ETag": '"new"'} + mock_head_resp.raise_for_status = Mock() + + new_content = b"new data" def fake_download(url, path, chunk_size): with open(path, 'wb') as f: - f.write(correct_content) + f.write(new_content) + return {"ETag": '"new"', "Content-Length": str(len(new_content))} - with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): + with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): with patch.object(dl, '_download_with_retry', side_effect=fake_download): dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) - assert os.path.exists(result) - with open(result, 'rb') as f: - assert f.read() == correct_content - def test_no_md5_proceeds_normally(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - test_file = "no_md5.txt" - content = b"downloaded content" + assert open(result, 'rb').read() == new_content + + # --- No .meta: fresh download --- + + def test_downloads_when_no_meta(self, tmp_path): + """No file and no .meta → download happens, .meta is saved.""" + dl = self._make_dl(tmp_path) + test_file = "duckdb/new.parquet" + local_path = str(tmp_path / "duckdb" / "new.parquet") + content = b"fresh download" def fake_download(url, path, chunk_size): + os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, 'wb') as f: f.write(content) - - with patch.object(dl, '_fetch_remote_md5', return_value=None): - with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: - 
dl.get_downloaded_file.cache_clear() - result = dl.get_downloaded_file(test_file) - mock_dl.assert_called_once() - assert os.path.exists(result) - - def test_post_download_validation_fail_raises(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - test_file = "post_fail.txt" - correct_md5 = hashlib.md5(b"expected").hexdigest() + return {"ETag": '"fresh"', "Content-Length": str(len(content))} + + with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + mock_dl.assert_called_once() + + assert os.path.exists(result) + assert open(result, 'rb').read() == content + # .meta should be saved + meta_path = result + ".meta" + assert os.path.exists(meta_path) + with open(meta_path) as f: + saved_meta = json.load(f) + assert saved_meta["etag"] == '"fresh"' + + def test_downloads_when_file_exists_but_no_meta(self, tmp_path): + """File exists but no .meta → treats as unknown, triggers full download flow.""" + dl = self._make_dl(tmp_path, freshness=3600) + test_file = "duckdb/nometa.parquet" + local = tmp_path / "duckdb" / "nometa.parquet" + local.parent.mkdir(parents=True) + local.write_bytes(b"old content") + # No .meta file + + new_content = b"refreshed" def fake_download(url, path, chunk_size): with open(path, 'wb') as f: - f.write(b"wrong data after download") + f.write(new_content) + return {"ETag": '"new"'} - with patch.object(dl, '_fetch_remote_md5', return_value=correct_md5): - with patch.object(dl, '_download_with_retry', side_effect=fake_download): - dl.get_downloaded_file.cache_clear() - with pytest.raises(RuntimeError, match="incorrect MD5 checksum"): - dl.get_downloaded_file(test_file) + with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + dl.get_downloaded_file.cache_clear() + result = dl.get_downloaded_file(test_file) + mock_dl.assert_called_once() + + assert open(result, 'rb').read() == new_content + + +class TestGetDownloadedFileCaching: + """Tests for get_downloaded_file LRU caching.""" - def test_post_download_validation_pass(self, tmp_path): + def test_cache_returns_same_result(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - test_file = "post_pass.txt" - content = b"correct content" - expected_md5 = hashlib.md5(content).hexdigest() + content = b"cached content" def fake_download(url, path, chunk_size): with open(path, 'wb') as f: f.write(content) + return {} - with patch.object(dl, '_fetch_remote_md5', return_value=expected_md5): - with patch.object(dl, '_download_with_retry', side_effect=fake_download): - dl.get_downloaded_file.cache_clear() - result = dl.get_downloaded_file(test_file) - assert os.path.exists(result) + with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + dl.get_downloaded_file.cache_clear() + r1 = dl.get_downloaded_file("cached.txt") + r2 = dl.get_downloaded_file("cached.txt") + assert r1 == r2 + mock_dl.assert_called_once() # only one actual download class TestDownloadWithRetry: @@ -282,6 +490,7 @@ def test_http_416_file_already_complete(self, tmp_path): mock_response = Mock() mock_response.status_code = 416 + mock_response.headers = {} with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): dl._download_with_retry("https://example.com/file", str(out_path), 1024) @@ -303,6 +512,20 @@ def test_server_no_resume_restarts_download(self, tmp_path): 
dl._download_with_retry("https://example.com/file", str(out_path), 1024) assert out_path.read_bytes() == b"full content" + def test_returns_response_headers(self, tmp_path): + """_download_with_retry should return response headers.""" + dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + out_path = str(tmp_path / "headers.bin") + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.headers = {'Content-Length': '5', 'ETag': '"abc"'} + mock_response.iter_content = Mock(return_value=[b"hello"]) + + with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): + headers = dl._download_with_retry("https://example.com/file", out_path, 1024) + assert headers['ETag'] == '"abc"' + class TestStreamDownload: """Tests for _stream_download.""" @@ -332,26 +555,6 @@ def test_append_mode_on_resume(self, tmp_path): assert out_path.read_bytes() == b"startend" -class TestGetDownloadedFileCaching: - """Tests for get_downloaded_file LRU caching.""" - - def test_cache_returns_same_result(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - content = b"cached content" - - def fake_download(url, path, chunk_size): - with open(path, 'wb') as f: - f.write(content) - - with patch.object(dl, '_fetch_remote_md5', return_value=None): - with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: - dl.get_downloaded_file.cache_clear() - r1 = dl.get_downloaded_file("cached.txt") - r2 = dl.get_downloaded_file("cached.txt") - assert r1 == r2 - mock_dl.assert_called_once() # only one actual download - - class TestGetDownloadedDir: """Tests for get_downloaded_dir.""" @@ -382,6 +585,16 @@ def test_download_metadata_parquet(downloaded_metadata): assert os.path.getsize(downloaded_metadata) > 0 +@pytest.mark.integration +def test_download_creates_meta_file(downloaded_concord): + """After download, a .meta sidecar file should exist.""" + meta_path = downloaded_concord + ".meta" + assert os.path.isfile(meta_path), f"Missing .meta file: {meta_path}" + with open(meta_path) as f: + meta = json.load(f) + assert "last_checked" in meta + + @pytest.mark.integration def test_download_caching_real_files(shared_downloader, downloaded_concord): """Second call returns same path and file is not re-downloaded.""" From fb41da09ae1622740c43ed9cb82105e25306d0af Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 14:38:42 -0500 Subject: [PATCH 25/66] Added some CURIEs to test. --- tests/data/valid_curies.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/data/valid_curies.txt b/tests/data/valid_curies.txt index 9f2f87c..89a53b3 100644 --- a/tests/data/valid_curies.txt +++ b/tests/data/valid_curies.txt @@ -1,3 +1,5 @@ # Valid CURIEs for integration tests. # Add new CURIEs here to expand test coverage — tests are parametrized over this list. MONDO:0004979 +MONDO:0005044 +NCIT:C55060 From 5c544a2d43fddcad0fc9f07774a1b67e7338c261 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 14:49:07 -0500 Subject: [PATCH 26/66] Partially changed --expand to --recurse. 
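
For example (hypothetical xref data; real results depend on the Babel build
being queried, and bxref here stands for a BabelXRefs instance), a recursive
lookup now reads:

    bxref.get_curie_xrefs(["MONDO:0004979"], recurse=True)
    # pass 1: xrefs for MONDO:0004979 reference, say, DOID:2841
    # pass 2: recurse into ["DOID:2841"] with
    #         ignore_curies_in_expansion={"MONDO:0004979", "DOID:2841"}
    # done: a pass that yields no CURIEs outside the ignore set stops

The growing ignore set is what guarantees termination even when the
cross-references are cyclic.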
--- src/babel_explorer/core/babel_xrefs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 95fda85..012f009 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -116,12 +116,12 @@ def get_curie_xref(self, curie: str, label_curies: bool = False): return xrefs - def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies_in_expansion: set = set(), label_curies: bool = False): + def get_curie_xrefs(self, curies: list[str], recurse: bool = False, ignore_curies_in_expansion: set = set(), label_curies: bool = False): """ Search for all identifiers that are cross-referenced to the given CURIE. :param curie: A CURIE to search for. - :param expand: Whether to expand the cross-references (i.e. recursively follow all identifiers). + :param recurse: Whether to expand the cross-references (i.e. recursively follow all identifiers). :return: A list of cross-references containing that CURIE. """ @@ -133,11 +133,11 @@ def get_curie_xrefs(self, curies: list[str], expand: bool = False, ignore_curies logging.info(f"Searching for cross-references for {curie}") xrefs.update(self.get_curie_xref(curie, label_curies)) - if expand: + if recurse: # Get a unique set of referenced curies, not including the ones currently queried. new_curies = list(set([curie for xref in xrefs for curie in xref.curies]) - set(curies) - ignore_curies_in_expansion) if new_curies: logging.info(f"Expanding cross-references to {new_curies}") - xrefs.update(self.get_curie_xrefs(new_curies, expand=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(curies) | set(new_curies), label_curies=label_curies)) + xrefs.update(self.get_curie_xrefs(new_curies, recurse=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(curies) | set(new_curies), label_curies=label_curies)) return sorted(xrefs) From 280212aa6bbfdb9bfbd1dbdcff5655183317db6c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 14:49:47 -0500 Subject: [PATCH 27/66] More fully changed --expand to --recurse. --- src/babel_explorer/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 251ca0f..4af31a8 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -26,12 +26,12 @@ def cli(): @click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") @click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") @click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes") -@click.option("--expand", is_flag=True, help="Also display xrefs for returned CURIEs") +@click.option("--recurse", is_flag=True, help="Recursively query returned xrefs") @click.option("--labels", is_flag=True, help="Include labels for CURIEs") @click.option("--check-download", type=str, default="3h", show_default=True, help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). 
" "'never' always checks via HTTP HEAD; '0' same as 'never'.") -def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expand: bool, labels: bool, +def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, recurse: bool, labels: bool, check_download: str): """ Fetches and prints the cross-references (xrefs) for the given CURIEs. @@ -52,7 +52,7 @@ def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, expan freshness = parse_duration(check_download) bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness), NodeNorm(nodenorm_url)) - xrefs = bxref.get_curie_xrefs(curies, expand, label_curies=labels) + xrefs = bxref.get_curie_xrefs(curies, recurse, label_curies=labels) for xref in xrefs: print(xref) From b522e6e5e779f78879193e931ba070d59b4eb0a7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 14:52:07 -0500 Subject: [PATCH 28/66] Add pytest-xdist for parallel test execution - Add pytest-xdist[psutil] and filelock to dev dependencies - Enable parallel execution by default with addopts = "-n auto" - Switch DuckDB connections to in-memory mode (duckdb.connect()) to eliminate file locking that would deadlock parallel workers - Make test_data_dir teardown worker-aware (only gw0 cleans up) - Wrap download fixtures with FileLock to serialize concurrent downloads - Fix test_babel_xrefs.py: update expand= to recurse= to match renamed param Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 3 ++ src/babel_explorer/core/babel_xrefs.py | 8 ++- tests/conftest.py | 33 ++++++++----- tests/test_babel_xrefs.py | 16 +++--- uv.lock | 68 ++++++++++++++++++++++++++ 5 files changed, 103 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index eafcfc6..59c1b68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,9 @@ build-backend = "hatchling.build" [dependency-groups] dev = [ + "filelock>=3.16", "pytest>=8.3.5", + "pytest-xdist[psutil]>=3.6", "ruff>=0.11.0", ] @@ -25,6 +27,7 @@ dev = [ babel-explorer = "babel_explorer.cli:cli" [tool.pytest.ini_options] +addopts = "-n auto" markers = [ "integration: tests requiring network access (deselect with '-m \"not integration\"')", "slow: tests downloading very large files 2GB+ (deselect with '-m \"not slow\"')", diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 012f009..de8e661 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -82,9 +82,8 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') - # Query the Parquet files using DuckDB. - duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') - db = duckdb.connect(duckdb_path) + # Query the Parquet files using DuckDB (in-memory; nothing is persisted). 
+ db = duckdb.connect() identifier_table = db.read_parquet(identifier_parquet) result = db.execute(f"SELECT * FROM identifier_table WHERE curie IN $1", [curies]) @@ -96,8 +95,7 @@ def get_curie_xref(self, curie: str, label_curies: bool = False): concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') - duckdb_path = self.downloader.get_output_file('output/duckdbs/xrefs.duckdb') - db = duckdb.connect(duckdb_path) + db = duckdb.connect() concord_table = db.read_parquet(concord_parquet) xref_tuples = db.execute(f"SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() xrefs = list(map(lambda rec: CrossReference.from_tuple(rec), xref_tuples)) diff --git a/tests/conftest.py b/tests/conftest.py index f3df2fe..fc61599 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,6 +9,7 @@ import shutil import pytest +from filelock import FileLock from babel_explorer.core.downloader import BabelDownloader from babel_explorer.core.babel_xrefs import BabelXRefs @@ -39,20 +40,22 @@ def valid_curies() -> list[str]: @pytest.fixture(scope="session") -def test_data_dir(): +def test_data_dir(request): """ - Provide a clean test data directory for the entire session. + Provide a test data directory for the entire session. Creates the directory before tests, removes it after all tests complete. + When running under pytest-xdist, only the first worker (gw0) performs cleanup. """ - if os.path.exists(TEST_DATA_DIR): - shutil.rmtree(TEST_DATA_DIR) + worker_id = getattr(request.config, "workerinput", {}).get("workerid", "master") os.makedirs(TEST_DATA_DIR, exist_ok=True) yield TEST_DATA_DIR - if os.path.exists(TEST_DATA_DIR): - shutil.rmtree(TEST_DATA_DIR) + # Only the first xdist worker (or a non-xdist run) cleans up the directory. + if worker_id in ("master", "gw0"): + if os.path.exists(TEST_DATA_DIR): + shutil.rmtree(TEST_DATA_DIR) @pytest.fixture(scope="session") @@ -62,15 +65,19 @@ def shared_downloader(test_data_dir) -> BabelDownloader: @pytest.fixture(scope="session") -def downloaded_concord(shared_downloader) -> str: +def downloaded_concord(shared_downloader, test_data_dir) -> str: """Download duckdb/Concord.parquet (~626 MB). Returns the local path.""" - return shared_downloader.get_downloaded_file(CONCORD_FILE) + lock_path = os.path.join(test_data_dir, "concord.lock") + with FileLock(lock_path): + return shared_downloader.get_downloaded_file(CONCORD_FILE) @pytest.fixture(scope="session") -def downloaded_metadata(shared_downloader) -> str: +def downloaded_metadata(shared_downloader, test_data_dir) -> str: """Download duckdb/Metadata.parquet (small). Returns the local path.""" - return shared_downloader.get_downloaded_file(METADATA_FILE) + lock_path = os.path.join(test_data_dir, "metadata.lock") + with FileLock(lock_path): + return shared_downloader.get_downloaded_file(METADATA_FILE) @pytest.fixture(scope="session") @@ -83,9 +90,11 @@ def downloaded_parquet_files(downloaded_concord, downloaded_metadata) -> dict[st @pytest.fixture(scope="session") -def downloaded_identifiers(shared_downloader) -> str: +def downloaded_identifiers(shared_downloader, test_data_dir) -> str: """Download duckdb/Identifiers.parquet (2 GB+). 
Returns the local path.""" - return shared_downloader.get_downloaded_file(IDENTIFIERS_FILE) + lock_path = os.path.join(test_data_dir, "identifiers.lock") + with FileLock(lock_path): + return shared_downloader.get_downloaded_file(IDENTIFIERS_FILE) @pytest.fixture(scope="session") diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 052d09c..774ccae 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -203,7 +203,7 @@ def test_get_curie_xrefs_no_expand(self, tmp_path): xr = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") with patch.object(bx, 'get_curie_xref', return_value=[xr]): bx.get_curie_xref.cache_clear() - result = bx.get_curie_xrefs(["A:1"], expand=False) + result = bx.get_curie_xrefs(["A:1"], recurse=False) assert len(result) == 1 assert result[0] == xr @@ -220,7 +220,7 @@ def mock_get_curie_xref(curie, label_curies=False): return [] with patch.object(bx, 'get_curie_xref', side_effect=mock_get_curie_xref): - result = bx.get_curie_xrefs(["A:1"], expand=True) + result = bx.get_curie_xrefs(["A:1"], recurse=True) assert xr1 in result assert xr2 in result @@ -230,7 +230,7 @@ def test_results_are_sorted(self, tmp_path): xr_a = CrossReference(filename="a", subj="A:1", pred="p", obj="B:1") with patch.object(bx, 'get_curie_xref', return_value=[xr_b, xr_a]): - result = bx.get_curie_xrefs(["X:1"], expand=False) + result = bx.get_curie_xrefs(["X:1"], recurse=False) assert result == [xr_a, xr_b] @@ -265,7 +265,7 @@ def test_get_curie_xref_returns_known_xrefs(babel_xrefs, curie): def test_get_curie_xrefs_single_no_expand(babel_xrefs, curie): """get_curie_xrefs without expansion returns sorted, non-empty results.""" babel_xrefs.get_curie_xref.cache_clear() - results = babel_xrefs.get_curie_xrefs([curie], expand=False) + results = babel_xrefs.get_curie_xrefs([curie], recurse=False) assert len(results) > 0 assert results == sorted(results) @@ -275,9 +275,9 @@ def test_get_curie_xrefs_single_no_expand(babel_xrefs, curie): def test_get_curie_xrefs_expansion_finds_more(babel_xrefs, curie): """Expanded results are at least as many as non-expanded.""" babel_xrefs.get_curie_xref.cache_clear() - non_expanded = babel_xrefs.get_curie_xrefs([curie], expand=False) + non_expanded = babel_xrefs.get_curie_xrefs([curie], recurse=False) babel_xrefs.get_curie_xref.cache_clear() - expanded = babel_xrefs.get_curie_xrefs([curie], expand=True) + expanded = babel_xrefs.get_curie_xrefs([curie], recurse=True) assert len(expanded) >= len(non_expanded) @@ -286,9 +286,9 @@ def test_get_curie_xrefs_expansion_finds_more(babel_xrefs, curie): def test_get_curie_xrefs_expanded_includes_original(babel_xrefs, curie): """Non-expanded results are a subset of expanded results.""" babel_xrefs.get_curie_xref.cache_clear() - non_expanded = set(babel_xrefs.get_curie_xrefs([curie], expand=False)) + non_expanded = set(babel_xrefs.get_curie_xrefs([curie], recurse=False)) babel_xrefs.get_curie_xref.cache_clear() - expanded = set(babel_xrefs.get_curie_xrefs([curie], expand=True)) + expanded = set(babel_xrefs.get_curie_xrefs([curie], recurse=True)) assert non_expanded.issubset(expanded) diff --git a/uv.lock b/uv.lock index 56af50a..b8496b5 100644 --- a/uv.lock +++ b/uv.lock @@ -15,7 +15,9 @@ dependencies = [ [package.dev-dependencies] dev = [ + { name = "filelock" }, { name = "pytest" }, + { name = "pytest-xdist", extra = ["psutil"] }, { name = "ruff" }, ] @@ -29,7 +31,9 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ + { name = "filelock", specifier = ">=3.16" }, { name = 
"pytest", specifier = ">=8.3.5" }, + { name = "pytest-xdist", extras = ["psutil"], specifier = ">=3.6" }, { name = "ruff", specifier = ">=0.11.0" }, ] @@ -172,6 +176,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dd/2d/13e6024e613679d8a489dd922f199ef4b1d08a456a58eadd96dc2f05171f/duckdb-1.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:53cd6423136ab44383ec9955aefe7599b3fb3dd1fe006161e6396d8167e0e0d4", size = 13458633, upload-time = "2026-01-26T11:50:17.657Z" }, ] +[[package]] +name = "execnet" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bf/89/780e11f9588d9e7128a3f87788354c7946a9cbb1401ad38a48c4db9a4f07/execnet-2.1.2.tar.gz", hash = "sha256:63d83bfdd9a23e35b9c6a3261412324f964c2ec8dcd8d3c6916ee9373e0befcd", size = 166622, upload-time = "2025-11-12T09:56:37.75Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" }, +] + +[[package]] +name = "filelock" +version = "3.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/77/18/a1fd2231c679dcb9726204645721b12498aeac28e1ad0601038f94b42556/filelock-3.25.0.tar.gz", hash = "sha256:8f00faf3abf9dc730a1ffe9c354ae5c04e079ab7d3a683b7c32da5dd05f26af3", size = 40158, upload-time = "2026-03-01T15:08:45.916Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/0b/de6f54d4a8bedfe8645c41497f3c18d749f0bd3218170c667bf4b81d0cdd/filelock-3.25.0-py3-none-any.whl", hash = "sha256:5ccf8069f7948f494968fc0713c10e5c182a9c9d9eef3a636307a20c2490f047", size = 26427, upload-time = "2026-03-01T15:08:44.593Z" }, +] + [[package]] name = "idna" version = "3.11" @@ -208,6 +230,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "psutil" +version = "7.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740, upload-time = "2026-01-28T18:14:54.428Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/08/510cbdb69c25a96f4ae523f733cdc963ae654904e8db864c07585ef99875/psutil-7.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2edccc433cbfa046b980b0df0171cd25bcaeb3a68fe9022db0979e7aa74a826b", size = 130595, upload-time = "2026-01-28T18:14:57.293Z" }, + { url = "https://files.pythonhosted.org/packages/d6/f5/97baea3fe7a5a9af7436301f85490905379b1c6f2dd51fe3ecf24b4c5fbf/psutil-7.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e78c8603dcd9a04c7364f1a3e670cea95d51ee865e4efb3556a3a63adef958ea", size = 131082, upload-time = "2026-01-28T18:14:59.732Z" }, + { url = "https://files.pythonhosted.org/packages/37/d6/246513fbf9fa174af531f28412297dd05241d97a75911ac8febefa1a53c6/psutil-7.2.2-cp313-cp313t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:1a571f2330c966c62aeda00dd24620425d4b0cc86881c89861fbc04549e5dc63", size = 181476, upload-time = "2026-01-28T18:15:01.884Z" }, + { url = "https://files.pythonhosted.org/packages/b8/b5/9182c9af3836cca61696dabe4fd1304e17bc56cb62f17439e1154f225dd3/psutil-7.2.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:917e891983ca3c1887b4ef36447b1e0873e70c933afc831c6b6da078ba474312", size = 184062, upload-time = "2026-01-28T18:15:04.436Z" }, + { url = "https://files.pythonhosted.org/packages/16/ba/0756dca669f5a9300d0cbcbfae9a4c30e446dfc7440ffe43ded5724bfd93/psutil-7.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:ab486563df44c17f5173621c7b198955bd6b613fb87c71c161f827d3fb149a9b", size = 139893, upload-time = "2026-01-28T18:15:06.378Z" }, + { url = "https://files.pythonhosted.org/packages/1c/61/8fa0e26f33623b49949346de05ec1ddaad02ed8ba64af45f40a147dbfa97/psutil-7.2.2-cp313-cp313t-win_arm64.whl", hash = "sha256:ae0aefdd8796a7737eccea863f80f81e468a1e4cf14d926bd9b6f5f2d5f90ca9", size = 135589, upload-time = "2026-01-28T18:15:08.03Z" }, + { url = "https://files.pythonhosted.org/packages/81/69/ef179ab5ca24f32acc1dac0c247fd6a13b501fd5534dbae0e05a1c48b66d/psutil-7.2.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:eed63d3b4d62449571547b60578c5b2c4bcccc5387148db46e0c2313dad0ee00", size = 130664, upload-time = "2026-01-28T18:15:09.469Z" }, + { url = "https://files.pythonhosted.org/packages/7b/64/665248b557a236d3fa9efc378d60d95ef56dd0a490c2cd37dafc7660d4a9/psutil-7.2.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7b6d09433a10592ce39b13d7be5a54fbac1d1228ed29abc880fb23df7cb694c9", size = 131087, upload-time = "2026-01-28T18:15:11.724Z" }, + { url = "https://files.pythonhosted.org/packages/d5/2e/e6782744700d6759ebce3043dcfa661fb61e2fb752b91cdeae9af12c2178/psutil-7.2.2-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fa4ecf83bcdf6e6c8f4449aff98eefb5d0604bf88cb883d7da3d8d2d909546a", size = 182383, upload-time = "2026-01-28T18:15:13.445Z" }, + { url = "https://files.pythonhosted.org/packages/57/49/0a41cefd10cb7505cdc04dab3eacf24c0c2cb158a998b8c7b1d27ee2c1f5/psutil-7.2.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e452c464a02e7dc7822a05d25db4cde564444a67e58539a00f929c51eddda0cf", size = 185210, upload-time = "2026-01-28T18:15:16.002Z" }, + { url = "https://files.pythonhosted.org/packages/dd/2c/ff9bfb544f283ba5f83ba725a3c5fec6d6b10b8f27ac1dc641c473dc390d/psutil-7.2.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c7663d4e37f13e884d13994247449e9f8f574bc4655d509c3b95e9ec9e2b9dc1", size = 141228, upload-time = "2026-01-28T18:15:18.385Z" }, + { url = "https://files.pythonhosted.org/packages/f2/fc/f8d9c31db14fcec13748d373e668bc3bed94d9077dbc17fb0eebc073233c/psutil-7.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:11fe5a4f613759764e79c65cf11ebdf26e33d6dd34336f8a337aa2996d71c841", size = 136284, upload-time = "2026-01-28T18:15:19.912Z" }, + { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090, upload-time = "2026-01-28T18:15:22.168Z" }, + { url = "https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = 
"sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859, upload-time = "2026-01-28T18:15:23.795Z" }, + { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560, upload-time = "2026-01-28T18:15:25.976Z" }, + { url = "https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997, upload-time = "2026-01-28T18:15:27.794Z" }, + { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972, upload-time = "2026-01-28T18:15:29.342Z" }, + { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266, upload-time = "2026-01-28T18:15:31.597Z" }, + { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737, upload-time = "2026-01-28T18:15:33.849Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617, upload-time = "2026-01-28T18:15:36.514Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -233,6 +283,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, ] +[[package]] +name = "pytest-xdist" +version = "3.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "execnet" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069, upload-time = "2025-07-01T13:30:59.346Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" }, +] + +[package.optional-dependencies] +psutil = [ + { name = "psutil" }, +] + [[package]] name = "requests" version = "2.32.5" From e137c31c199f9da2a4c7fa55d8956d6f74faac3a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 15:06:18 -0500 Subject: [PATCH 29/66] Replace Python recursion in get_curie_xrefs with DuckDB WITH 
RECURSIVE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recurse=True path previously issued one DuckDB query per CURIE and called itself recursively (O(diameter) queries, Python stack growth). It now delegates to _get_curie_xrefs_recursive, which traverses the full connected component in a single SQL query using WITH RECURSIVE. A bidirectional `edges` CTE (subj→obj and obj→subj) collapses the two traversal directions into one recursive arm; UNION (not UNION ALL) provides automatic cycle detection. ignore_curies_in_expansion is now a no-op on the recurse=True path and emits a DeprecationWarning. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/babel_xrefs.py | 72 ++++++++++++++++++++++---- tests/test_babel_xrefs.py | 48 ++++++++++++++--- 2 files changed, 101 insertions(+), 19 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index de8e661..fba5b61 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -3,6 +3,7 @@ # why we consider two identifiers to be identical. import dataclasses import logging +import warnings import duckdb import functools @@ -114,28 +115,77 @@ def get_curie_xref(self, curie: str, label_curies: bool = False): return xrefs + def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = False): + """Traverse the cross-reference graph in one DuckDB WITH RECURSIVE query.""" + if not curies: + return [] + + concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') + concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') + + db = duckdb.connect() + concord_table = db.read_parquet(concord_parquet) + result = db.execute(""" + WITH RECURSIVE + edges(a, b) AS ( + SELECT subj, obj FROM concord_table + UNION ALL + SELECT obj, subj FROM concord_table + ), + frontier(curie) AS ( + SELECT unnest($1::VARCHAR[]) + UNION + SELECT e.b + FROM edges e + INNER JOIN frontier f ON e.a = f.curie + ) + SELECT DISTINCT c.filename, c.subj, c.pred, c.obj + FROM concord_table c + WHERE c.subj IN (SELECT curie FROM frontier) + OR c.obj IN (SELECT curie FROM frontier) + ORDER BY c.filename, c.subj, c.obj, c.pred + """, [curies]) + + xrefs = [CrossReference.from_tuple(row) for row in result.fetchall()] + + if label_curies: + xrefs = [LabeledCrossReference( + subj=xref.subj, + obj=xref.obj, + filename=xref.filename, + pred=xref.pred, + subj_label=self.nodenorm.get_identifier(xref.subj).label, + subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, + obj_label=self.nodenorm.get_identifier(xref.obj).label, + obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, + ) for xref in xrefs] + + return xrefs + def get_curie_xrefs(self, curies: list[str], recurse: bool = False, ignore_curies_in_expansion: set = set(), label_curies: bool = False): """ Search for all identifiers that are cross-referenced to the given CURIE. - :param curie: A CURIE to search for. + :param curies: A list of CURIEs to search for. :param recurse: Whether to expand the cross-references (i.e. recursively follow all identifiers). - :return: A list of cross-references containing that CURIE. + :param ignore_curies_in_expansion: Deprecated when recurse=True; has no effect. + :param label_curies: Whether to annotate results with labels from NodeNorm. + :return: A list of cross-references containing those CURIEs. 
""" - if ignore_curies_in_expansion: - logging.info(f"Ignoring {len(ignore_curies_in_expansion)}: {ignore_curies_in_expansion}") + if recurse: + if ignore_curies_in_expansion: + warnings.warn( + "ignore_curies_in_expansion has no effect when recurse=True; " + "cycle detection is handled automatically by the SQL query.", + DeprecationWarning, + stacklevel=2, + ) + return self._get_curie_xrefs_recursive(curies, label_curies) xrefs = set() for curie in curies: logging.info(f"Searching for cross-references for {curie}") xrefs.update(self.get_curie_xref(curie, label_curies)) - if recurse: - # Get a unique set of referenced curies, not including the ones currently queried. - new_curies = list(set([curie for xref in xrefs for curie in xref.curies]) - set(curies) - ignore_curies_in_expansion) - if new_curies: - logging.info(f"Expanding cross-references to {new_curies}") - xrefs.update(self.get_curie_xrefs(new_curies, recurse=True, ignore_curies_in_expansion=ignore_curies_in_expansion | set(curies) | set(new_curies), label_curies=label_curies)) - return sorted(xrefs) diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 774ccae..42fa6aa 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -212,18 +212,50 @@ def test_get_curie_xrefs_with_expand(self, tmp_path): xr1 = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") xr2 = CrossReference(filename="f", subj="B:2", pred="p", obj="C:3") - def mock_get_curie_xref(curie, label_curies=False): - if curie == "A:1": - return [xr1] - elif curie == "B:2": - return [xr2] - return [] - - with patch.object(bx, 'get_curie_xref', side_effect=mock_get_curie_xref): + with patch.object(bx, '_get_curie_xrefs_recursive', return_value=[xr1, xr2]) as mock_rec: result = bx.get_curie_xrefs(["A:1"], recurse=True) + mock_rec.assert_called_once_with(["A:1"], False) assert xr1 in result assert xr2 in result + def test_get_curie_xrefs_recursive_sql_traversal(self, tmp_path): + """_get_curie_xrefs_recursive uses SQL graph traversal, not Python recursion.""" + import duckdb as real_duckdb + + bx = self._make_bx(tmp_path) + + # Write a tiny Parquet file: graph A-B, B-C, D-E (disconnected from A-B-C) + parquet_path = str(tmp_path / "test_concord.parquet") + setup_db = real_duckdb.connect() + setup_db.execute(f""" + COPY ( + SELECT * FROM (VALUES + ('f1.tsv', 'A:1', 'skos:exactMatch', 'B:2'), + ('f1.tsv', 'B:2', 'skos:exactMatch', 'C:3'), + ('f2.tsv', 'D:4', 'skos:exactMatch', 'E:5') + ) AS t(filename, subj, pred, obj) + ) TO '{parquet_path}' (FORMAT PARQUET) + """) + setup_db.close() + + with patch.object(bx.downloader, 'get_downloaded_file', return_value=parquet_path): + # Starting from A:1 should reach B:2 and C:3 but not the D-E component + result = bx._get_curie_xrefs_recursive(["A:1"]) + pairs = {(xr.subj, xr.obj) for xr in result} + assert ("A:1", "B:2") in pairs + assert ("B:2", "C:3") in pairs + assert ("D:4", "E:5") not in pairs + + # Starting from D:4 should only reach E:5 + result = bx._get_curie_xrefs_recursive(["D:4"]) + pairs = {(xr.subj, xr.obj) for xr in result} + assert ("D:4", "E:5") in pairs + assert ("A:1", "B:2") not in pairs + + # Empty input returns empty list + result = bx._get_curie_xrefs_recursive([]) + assert result == [] + def test_results_are_sorted(self, tmp_path): bx = self._make_bx(tmp_path) xr_b = CrossReference(filename="b", subj="B:1", pred="p", obj="C:1") From b115d0293262d78f04db6c3dc78c3ec03fa543da Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 15:44:56 -0500 Subject: [PATCH 
30/66] Fix xdist race condition: skip test-data cleanup in parallel runs When pytest-xdist runs 8 workers, each worker session ends independently. gw0 was deleting data/test/ as soon as it finished its own tests, but other workers were still reading Concord.parquet. This caused sporadic IOException failures on any test that opened a fresh DuckDB connection (e.g. _get_curie_xrefs_recursive) after gw0's teardown deleted the file. Fix: only delete the shared test data directory in a sequential (non-xdist) run where worker_id == "master". In parallel runs the directory persists; BabelDownloader's freshness-window logic re-validates or re-downloads the files on the next run as needed. Co-Authored-By: Claude Sonnet 4.6 --- tests/conftest.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index fc61599..f1e0df6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -45,15 +45,21 @@ def test_data_dir(request): Provide a test data directory for the entire session. Creates the directory before tests, removes it after all tests complete. - When running under pytest-xdist, only the first worker (gw0) performs cleanup. + When running under pytest-xdist, cleanup is skipped: worker sessions end at + unpredictable times and deleting the shared directory from one worker while + others are still reading the same files causes flaky IO errors. The files + are re-used (or re-validated) on the next run via the freshness-window logic + in BabelDownloader.get_downloaded_file. """ worker_id = getattr(request.config, "workerinput", {}).get("workerid", "master") os.makedirs(TEST_DATA_DIR, exist_ok=True) yield TEST_DATA_DIR - # Only the first xdist worker (or a non-xdist run) cleans up the directory. - if worker_id in ("master", "gw0"): + # Only clean up when running without xdist (sequential run). In a parallel + # run each worker session may finish at a different time; gw0 cleaning up + # while gw5 is still reading Concord.parquet causes spurious failures. + if worker_id == "master": if os.path.exists(TEST_DATA_DIR): shutil.rmtree(TEST_DATA_DIR) From 5a0f75874cd4acd51be04b65fd4d0205312c2cd9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 3 Mar 2026 15:50:18 -0500 Subject: [PATCH 31/66] Made output a bit prettier. 
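The window reported in the updated message is the downloader's freshness_seconds
constructor argument. A sketch of how a caller sets it (the duration is
illustrative, not the default):

    # Within a 30-minute window, get_downloaded_file() skips all network
    # checks and returns the cached local path immediately (Tier 1).
    dl = BabelDownloader(
        "https://stars.renci.org:443/var/babel_outputs/2025nov19/",
        local_path="data/2025nov19",
        freshness_seconds=30 * 60,
    )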
--- src/babel_explorer/core/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 43c3daf..653ccc8 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -282,7 +282,7 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): if meta is not None: # Tier 1: within freshness window — skip all network calls if self._is_within_freshness(meta, self.freshness_seconds): - self.logger.info(f"File within freshness window, skipping check: {local_path_to_download_to}") + self.logger.info(f"File within freshness window ({self.freshness_seconds} seconds), skipping check: {local_path_to_download_to}") return local_path_to_download_to # Tier 2: stale but maybe unchanged — HEAD request From be2fa36ceae4cf4abe81fb59c390222243638d23 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 00:41:23 -0400 Subject: [PATCH 32/66] Update src/babel_explorer/core/nodenorm.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/nodenorm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index a9c6752..1e1e24e 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -57,11 +57,17 @@ def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=True response.raise_for_status() result = response.json() - return result[curie] + try: + return result[curie] + except KeyError: + logging.debug(f"NodeNorm response did not contain CURIE {curie!r}; returning None") + return None @functools.lru_cache(maxsize=None) def get_clique_identifiers(self, curie, **kwargs): result = self.normalize_curie(curie, **kwargs) + if not result: + return None if 'equivalent_identifiers' not in result: return None return list(map(lambda x: Identifier.from_dict(x), result['equivalent_identifiers'])) From 2b2aa7f20561e57ad811697bd33394a846a64bfc Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 01:07:11 -0400 Subject: [PATCH 33/66] Simplify babel_xrefs: extract helper, remove dead fetches, fix default arg - Extract _to_labeled_xref() to eliminate duplicated LabeledCrossReference construction in get_curie_xref and _get_curie_xrefs_recursive - Remove unused concord_metadata_parquet fetches from get_curie_ids, get_curie_xref, and _get_curie_xrefs_recursive (Metadata.parquet was downloaded but never queried in any of these methods) - Fix mutable default argument: ignore_curies_in_expansion: set = set() -> set | None = None - Return list (not lazy map) from get_curie_xref for consistency - Update test expectation to match single downloader call (Concord only) Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/babel_xrefs.py | 42 +++++++++++--------------- tests/test_babel_xrefs.py | 4 +-- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index fba5b61..c07c095 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -81,7 +81,6 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: """ identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') - concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') # Query the Parquet files using DuckDB 
(in-memory; nothing is persisted). db = duckdb.connect() @@ -94,34 +93,36 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') db = duckdb.connect() concord_table = db.read_parquet(concord_parquet) xref_tuples = db.execute(f"SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() - xrefs = list(map(lambda rec: CrossReference.from_tuple(rec), xref_tuples)) + xrefs = [CrossReference.from_tuple(rec) for rec in xref_tuples] if label_curies: - xrefs = map(lambda xref: LabeledCrossReference( - subj=xref.subj, - obj=xref.obj, - filename=xref.filename, - pred=xref.pred, - subj_label=self.nodenorm.get_identifier(xref.subj).label, - subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, - obj_label=self.nodenorm.get_identifier(xref.obj).label, - obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, - ), xrefs) + xrefs = [self._to_labeled_xref(xref) for xref in xrefs] return xrefs + def _to_labeled_xref(self, xref: CrossReference) -> LabeledCrossReference: + """Convert a CrossReference to a LabeledCrossReference using NodeNorm.""" + return LabeledCrossReference( + subj=xref.subj, + obj=xref.obj, + filename=xref.filename, + pred=xref.pred, + subj_label=self.nodenorm.get_identifier(xref.subj).label, + subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, + obj_label=self.nodenorm.get_identifier(xref.obj).label, + obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, + ) + def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = False): """Traverse the cross-reference graph in one DuckDB WITH RECURSIVE query.""" if not curies: return [] concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - concord_metadata_parquet = self.downloader.get_downloaded_file('duckdb/Metadata.parquet') db = duckdb.connect() concord_table = db.read_parquet(concord_parquet) @@ -149,20 +150,11 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal xrefs = [CrossReference.from_tuple(row) for row in result.fetchall()] if label_curies: - xrefs = [LabeledCrossReference( - subj=xref.subj, - obj=xref.obj, - filename=xref.filename, - pred=xref.pred, - subj_label=self.nodenorm.get_identifier(xref.subj).label, - subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, - obj_label=self.nodenorm.get_identifier(xref.obj).label, - obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, - ) for xref in xrefs] + xrefs = [self._to_labeled_xref(xref) for xref in xrefs] return xrefs - def get_curie_xrefs(self, curies: list[str], recurse: bool = False, ignore_curies_in_expansion: set = set(), label_curies: bool = False): + def get_curie_xrefs(self, curies: list[str], recurse: bool = False, ignore_curies_in_expansion: set | None = None, label_curies: bool = False): """ Search for all identifiers that are cross-referenced to the given CURIE. 
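The mutable-default fix above guards against a standard Python pitfall: a
default such as ignore_curies_in_expansion=set() is evaluated once, at function
definition time, so every call shares (and can mutate) the same set object. A
self-contained illustration of the failure mode and the fix (names are
hypothetical, not from this codebase):

    def bad(items, seen=set()):          # one set shared across ALL calls
        seen.update(items)
        return sorted(seen)

    def good(items, seen=None):          # fresh set per call
        if seen is None:
            seen = set()
        seen.update(items)
        return sorted(seen)

    assert bad(["a"]) == ["a"]
    assert bad(["b"]) == ["a", "b"]      # "a" leaked in from the first call
    assert good(["a"]) == ["a"]
    assert good(["b"]) == ["b"]          # no cross-call leakage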
diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 42fa6aa..41ad777 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -192,8 +192,8 @@ def test_get_curie_xref_calls_downloader(self, tmp_path): with patch("babel_explorer.core.babel_xrefs.duckdb.connect", return_value=mock_db): bx.get_curie_xref.cache_clear() result = bx.get_curie_xref("A:1") - # Downloader should be called for Concord and Metadata - assert mock_dl.call_count == 2 + # Downloader should be called for Concord only (Metadata unused here) + assert mock_dl.call_count == 1 result_list = list(result) assert len(result_list) == 1 assert isinstance(result_list[0], CrossReference) From 3cdd19c787517a2d4ec9ea6d683e598119901454 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 01:45:18 -0400 Subject: [PATCH 34/66] Fix LabeledCrossReference: make it a frozen dataclass subclass Hand-written __init__ with post-construction setattr raised FrozenInstanceError since CrossReference is frozen=True. Adding @dataclasses.dataclass(frozen=True) lets Python generate the correct __init__ using object.__setattr__ internally. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/babel_xrefs.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index c07c095..51727ab 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -29,19 +29,13 @@ def curies(self): def __lt__(self, other): return (self.filename, self.subj, self.obj, self.pred) < (other.filename, other.subj, other.obj, other.pred) +@dataclasses.dataclass(frozen=True) class LabeledCrossReference(CrossReference): subj_label: str subj_biolink_type: str obj_label: str obj_biolink_type: str - def __init__(self, subj: str, pred: str, obj: str, filename: str, subj_label: str, subj_biolink_type: str, obj_label: str, obj_biolink_type: str): - super().__init__(subj=subj, obj=obj, filename=filename, pred=pred) - self.subj_label = subj_label - self.subj_biolink_type = subj_biolink_type - self.obj_label = obj_label - self.obj_biolink_type = obj_biolink_type - def __str__(self): return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", subj_biolink_type="{self.subj_biolink_type}", obj_label="{self.obj_label}", obj_biolink_type="{self.obj_biolink_type}")""" From c7a3f16f9cf45c83b554f62544f7daebb67aa18d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 01:53:41 -0400 Subject: [PATCH 35/66] Fix BabelDownloader: use tempfile.gettempdir() when local_path is None Replace the fragile TMPDIR-only env var check with tempfile.gettempdir(), which has a cross-platform fallback chain and always returns a valid path. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/downloader.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 653ccc8..57a0911 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -1,6 +1,7 @@ import functools import json import os +import tempfile import urllib.parse import time import requests @@ -22,11 +23,7 @@ def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * self.logger = logging.getLogger(BabelDownloader.__name__) if local_path is None: - # Default to using TMPDIR. - # TODO: replace with a real temporary directory. 
- tmpdir = os.environ.get("TMPDIR") - if tmpdir: - local_path = tmpdir + local_path = tempfile.gettempdir() # Make sure the local path is an existing directory or that we can create it. if not os.path.exists(local_path): From c6635bc42c88240b128e074ab694eac7e2ff2ee0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 02:09:28 -0400 Subject: [PATCH 36/66] Fix test-concord: guard against None from get_clique_identifiers When NodeNorm doesn't recognise a CURIE, get_clique_identifiers returns None, causing a TypeError on iteration. Use (identifiers or []) to skip gracefully. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 4af31a8..fc5d314 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -95,7 +95,7 @@ def test_concord(curies, nodenorm_url): nodenorm = NodeNorm(nodenorm_url) for curie in curies: identifiers = nodenorm.get_clique_identifiers(curie) - for identifier in identifiers: + for identifier in (identifiers or []): if identifier.label: print(f"{curie}\t{identifier.curie}\t{identifier.label}\t{identifier.biolink_type}") else: From d74110ebb88d38897c1fd76f76652bff5ecb3629 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:10:07 -0400 Subject: [PATCH 37/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/nodenorm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index 1e1e24e..b48c4ec 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -30,6 +30,8 @@ def from_dict(d: dict): class NodeNorm: def __init__(self, nodenorm_url: str=""): self.nodenorm_url = nodenorm_url + if self.nodenorm_url and not self.nodenorm_url.endswith("/"): + self.nodenorm_url += "/" @functools.lru_cache(maxsize=None) def get_identifier(self, curie: str): From 4338bdcf12aecbd377941d73677d690e9abc9c9b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:10:46 -0400 Subject: [PATCH 38/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index fc5d314..56afde7 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -30,7 +30,7 @@ def cli(): @click.option("--labels", is_flag=True, help="Include labels for CURIEs") @click.option("--check-download", type=str, default="3h", show_default=True, help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " - "'never' always checks via HTTP HEAD; '0' same as 'never'.") + "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.") def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, recurse: bool, labels: bool, check_download: str): """ @@ -62,7 +62,7 @@ def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, recur @click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") @click.option("--check-download", type=str, default="3h", show_default=True, help="How often to re-check downloads (e.g. 
'3h', '30m', '1d', '0', 'never'). " - "'never' always checks via HTTP HEAD; '0' same as 'never'.") + "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.") def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): """ Fetches and prints the ID records for the given CURIEs, along with Biolink type if provided. From be3e42756dc014c7cb7997fc9d2f1b7eb7f6683e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:11:30 -0400 Subject: [PATCH 39/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 78fa8e9..00fff8c 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ uv sync --group dev uv run babel-explorer xrefs MONDO:0004979 # Get cross-references with expansion (recursive lookup) -uv run babel-explorer xrefs MONDO:0004979 --expand +uv run babel-explorer xrefs MONDO:0004979 --recurse # Get cross-references with labels from NodeNorm uv run babel-explorer xrefs MONDO:0004979 --labels From f8b718b5d346fa1aa590661e67559d4b8b9be3c2 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:13:26 -0400 Subject: [PATCH 40/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/cli.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 56afde7..23854c5 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -9,12 +9,33 @@ def parse_duration(value: str) -> float: """Parse a duration string like '3h', '30m', '1d', '7200', or 'never' → seconds.""" units = {"s": 1, "m": 60, "h": 3600, "d": 86400} - lower = value.lower() + lower = (value or "").strip().lower() + if not lower: + raise click.BadParameter( + "Invalid duration: value cannot be empty. " + "Use an integer number of seconds, optionally followed by 's', 'm', 'h', or 'd', " + "or 'never'." + ) if lower == "never": return float("inf") + # Value with unit suffix (e.g. '3h', '30m') if lower[-1] in units: - return int(lower[:-1]) * units[lower[-1]] - return int(lower) # bare seconds + try: + amount = int(lower[:-1]) + except ValueError: + raise click.BadParameter( + f"Invalid duration {value!r}: expected an integer followed by an optional unit " + "('s', 'm', 'h', or 'd'), or 'never'." + ) + return amount * units[lower[-1]] + # Bare integer seconds + try: + return int(lower) + except ValueError: + raise click.BadParameter( + f"Invalid duration {value!r}: expected an integer number of seconds, optionally " + "followed by 's', 'm', 'h', or 'd', or 'never'." 
+ ) @click.group() From 48c8e960cfb9f6e300945372e4109959ee63e42d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:13:40 -0400 Subject: [PATCH 41/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 3cb238c..7ad79fb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -30,7 +30,7 @@ uv run babel-explorer --help uv run babel-explorer xrefs MONDO:0004979 # Get cross-references with expansion (recursive lookup) -uv run babel-explorer xrefs MONDO:0004979 --expand +uv run babel-explorer xrefs MONDO:0004979 --recurse # Get cross-references with labels from NodeNorm uv run babel-explorer xrefs MONDO:0004979 --labels From 8fb37d65916b9d43aa7bde9526377292347ae38e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:15:10 -0400 Subject: [PATCH 42/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/downloader.py | 50 +++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 57a0911..39ea3fa 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -213,32 +213,32 @@ def _download_with_retry(self, url, local_path, chunk_size): self.logger.info(f"Resuming download from byte {resume_byte_pos}") # Make streaming request with timeout for connection (not total time) - response = requests.get(url, headers=headers, stream=True, timeout=30) - - # Handle different response codes - if response.status_code == 416: - # Range Not Satisfiable - file already complete - self.logger.info(f"File already complete: {local_path}") + with requests.get(url, headers=headers, stream=True, timeout=30) as response: + + # Handle different response codes + if response.status_code == 416: + # Range Not Satisfiable - file already complete + self.logger.info(f"File already complete: {local_path}") + return response.headers + elif response.status_code == 206: + # Partial Content - resume successful + self.logger.info(f"Resuming download (HTTP 206)") + elif response.status_code == 200: + # OK - server doesn't support resume or no Range header was sent + if resume_byte_pos > 0: + self.logger.warning(f"Server doesn't support resume, restarting from beginning") + resume_byte_pos = 0 + # Remove partial file + if os.path.exists(local_path): + os.remove(local_path) + else: + response.raise_for_status() + + # Stream download with progress bar + self._stream_download(response, local_path, resume_byte_pos, chunk_size) + + # Success - exit retry loop return response.headers - elif response.status_code == 206: - # Partial Content - resume successful - self.logger.info(f"Resuming download (HTTP 206)") - elif response.status_code == 200: - # OK - server doesn't support resume or no Range header was sent - if resume_byte_pos > 0: - self.logger.warning(f"Server doesn't support resume, restarting from beginning") - resume_byte_pos = 0 - # Remove partial file - if os.path.exists(local_path): - os.remove(local_path) - else: - response.raise_for_status() - - # Stream download with progress bar - self._stream_download(response, local_path, resume_byte_pos, chunk_size) - - # Success - exit retry loop - return response.headers except (requests.RequestException, IOError) as e: 
self.logger.warning(f"Download attempt {attempt}/{self.retries} failed: {e}") From 49f5c3ba6931a8454a3e7b73dda20a9aeee8c8b7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:16:56 -0400 Subject: [PATCH 43/66] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/core/babel_xrefs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 51727ab..725e1f7 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -86,6 +86,9 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): + if label_curies and self.nodenorm is None: + raise ValueError("label_curies=True requires a configured NodeNorm instance (nodenorm was None).") + concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') db = duckdb.connect() @@ -113,6 +116,8 @@ def _to_labeled_xref(self, xref: CrossReference) -> LabeledCrossReference: def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = False): """Traverse the cross-reference graph in one DuckDB WITH RECURSIVE query.""" + if label_curies and self.nodenorm is None: + raise ValueError("label_curies=True requires a configured NodeNorm instance (nodenorm was None).") if not curies: return [] From c952c1277da41b89c119e00c9fb170e64ac400eb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:21:57 -0400 Subject: [PATCH 44/66] Fix DuckDB connection leaks by using context managers Wrap all three duckdb.connect() calls in `with` statements so connections are deterministically closed after each query rather than relying on GC. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/babel_xrefs.py | 30 ++++++++++++-------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 725e1f7..f11f3f4 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -77,12 +77,11 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') # Query the Parquet files using DuckDB (in-memory; nothing is persisted). 
- db = duckdb.connect() - identifier_table = db.read_parquet(identifier_parquet) - result = db.execute(f"SELECT * FROM identifier_table WHERE curie IN $1", [curies]) - - column_names = [desc[0] for desc in result.description] - return [IdentifierRecord.from_row(row, column_names) for row in result.fetchall()] + with duckdb.connect() as db: + identifier_table = db.read_parquet(identifier_parquet) + result = db.execute("SELECT * FROM identifier_table WHERE curie IN $1", [curies]) + column_names = [desc[0] for desc in result.description] + return [IdentifierRecord.from_row(row, column_names) for row in result.fetchall()] @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): @@ -91,14 +90,13 @@ def get_curie_xref(self, curie: str, label_curies: bool = False): concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - db = duckdb.connect() - concord_table = db.read_parquet(concord_parquet) - xref_tuples = db.execute(f"SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() - xrefs = [CrossReference.from_tuple(rec) for rec in xref_tuples] + with duckdb.connect() as db: + concord_table = db.read_parquet(concord_parquet) + xref_tuples = db.execute("SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() + xrefs = [CrossReference.from_tuple(rec) for rec in xref_tuples] if label_curies: xrefs = [self._to_labeled_xref(xref) for xref in xrefs] - return xrefs def _to_labeled_xref(self, xref: CrossReference) -> LabeledCrossReference: @@ -123,9 +121,9 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') - db = duckdb.connect() - concord_table = db.read_parquet(concord_parquet) - result = db.execute(""" + with duckdb.connect() as db: + concord_table = db.read_parquet(concord_parquet) + rows = db.execute(""" WITH RECURSIVE edges(a, b) AS ( SELECT subj, obj FROM concord_table @@ -144,9 +142,9 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal WHERE c.subj IN (SELECT curie FROM frontier) OR c.obj IN (SELECT curie FROM frontier) ORDER BY c.filename, c.subj, c.obj, c.pred - """, [curies]) + """, [curies]).fetchall() - xrefs = [CrossReference.from_tuple(row) for row in result.fetchall()] + xrefs = [CrossReference.from_tuple(row) for row in rows] if label_curies: xrefs = [self._to_labeled_xref(xref) for xref in xrefs] From b0539bb44cb577ccfdeff8ffb3931b3c88071d48 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:36:53 -0400 Subject: [PATCH 45/66] Fix and simplify test mocks for context manager protocol After production code was updated to use `with duckdb.connect()` and `with requests.get()`, the test mocks (plain Mock()) no longer supported the context manager protocol. Updated affected mocks to MagicMock() with __enter__.return_value = self. Also extracted _make_response() helper in TestDownloadWithRetry to eliminate five near-identical 4-line mock setup blocks. 
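A minimal, standalone sketch of the mock pattern (independent of this test
suite; only unittest.mock is assumed):

    from unittest.mock import MagicMock

    m = MagicMock()
    m.__enter__.return_value = m    # `with m as x:` now binds x to m itself

    with m as x:
        assert x is m               # same shape as the duckdb/requests mocks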
Co-Authored-By: Claude Sonnet 4.6 --- tests/test_babel_xrefs.py | 1 + tests/test_downloader.py | 39 +++++++++++++++------------------------ 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 41ad777..75e33cb 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -184,6 +184,7 @@ def test_get_curie_xref_calls_downloader(self, tmp_path): ("concord.tsv", "A:1", "skos:exactMatch", "B:2"), ] mock_db = MagicMock() + mock_db.__enter__.return_value = mock_db mock_db.read_parquet.return_value = "table" mock_db.execute.return_value = mock_result diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 045e402..16a7e9b 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -445,6 +445,16 @@ def fake_download(url, path, chunk_size): class TestDownloadWithRetry: """Tests for _download_with_retry.""" + @staticmethod + def _make_response(status_code, headers=None, content=None): + m = MagicMock() + m.__enter__.return_value = m + m.status_code = status_code + m.headers = headers or {} + if content is not None: + m.iter_content = Mock(return_value=content) + return m + def test_retries_exhausted_raises_runtime_error(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=2) with patch("babel_explorer.core.downloader.requests.get", side_effect=requests.ConnectionError("fail")): @@ -456,11 +466,7 @@ def test_succeeds_on_second_attempt(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=3) out_path = str(tmp_path / "retry_success.bin") - mock_response = Mock() - mock_response.status_code = 200 - mock_response.headers = {'Content-Length': '5'} - mock_response.iter_content = Mock(return_value=[b"hello"]) - + mock_response = self._make_response(200, {'Content-Length': '5'}, [b"hello"]) side_effects = [requests.ConnectionError("first fail"), mock_response] with patch("babel_explorer.core.downloader.requests.get", side_effect=side_effects): @@ -473,11 +479,7 @@ def test_resume_sends_range_header(self, tmp_path): out_path = tmp_path / "partial.bin" out_path.write_bytes(b"partial") # 7 bytes - mock_response = Mock() - mock_response.status_code = 206 - mock_response.headers = {'Content-Length': '3'} - mock_response.iter_content = Mock(return_value=[b"end"]) - + mock_response = self._make_response(206, {'Content-Length': '3'}, [b"end"]) with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response) as mock_get: dl._download_with_retry("https://example.com/file", str(out_path), 1024) _, kwargs = mock_get.call_args @@ -488,10 +490,7 @@ def test_http_416_file_already_complete(self, tmp_path): out_path = tmp_path / "complete.bin" out_path.write_bytes(b"full file") - mock_response = Mock() - mock_response.status_code = 416 - mock_response.headers = {} - + mock_response = self._make_response(416) with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): dl._download_with_retry("https://example.com/file", str(out_path), 1024) # Should return without error @@ -503,11 +502,7 @@ def test_server_no_resume_restarts_download(self, tmp_path): out_path = tmp_path / "no_resume.bin" out_path.write_bytes(b"partial") - mock_response = Mock() - mock_response.status_code = 200 - mock_response.headers = {'Content-Length': '12'} - mock_response.iter_content = Mock(return_value=[b"full content"]) - + mock_response = self._make_response(200, {'Content-Length': '12'}, 
[b"full content"]) with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): dl._download_with_retry("https://example.com/file", str(out_path), 1024) assert out_path.read_bytes() == b"full content" @@ -517,11 +512,7 @@ def test_returns_response_headers(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) out_path = str(tmp_path / "headers.bin") - mock_response = Mock() - mock_response.status_code = 200 - mock_response.headers = {'Content-Length': '5', 'ETag': '"abc"'} - mock_response.iter_content = Mock(return_value=[b"hello"]) - + mock_response = self._make_response(200, {'Content-Length': '5', 'ETag': '"abc"'}, [b"hello"]) with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): headers = dl._download_with_retry("https://example.com/file", out_path, 1024) assert headers['ETag'] == '"abc"' From 6319212e9a606652350a70f6ad8d0cd3a568687a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:41:51 -0400 Subject: [PATCH 46/66] Add configurable HTTP timeout to NodeNorm and BabelDownloader normalize_curie() was calling requests.get() with no timeout, risking an indefinite hang if the NodeNorm service stalls. The downloader had timeout=30 hardcoded in two places with no way to override it. Add timeout: int = 30 to both constructors and thread self.timeout through all three request call sites, making the default consistent and the value overridable without patching. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/downloader.py | 7 ++++--- src/babel_explorer/core/nodenorm.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 39ea3fa..7b628d0 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -15,11 +15,12 @@ class BabelDownloader: Class for downloading Babel cross-reference files to a local directory as needed. """ - def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * 3600): + def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * 3600, timeout: int = 30): # We assume the URL base is correct (if not, we can fix it later). 
self.url_base = url_base self.retries = retries self.freshness_seconds = freshness_seconds + self.timeout = timeout self.logger = logging.getLogger(BabelDownloader.__name__) if local_path is None: @@ -114,7 +115,7 @@ def _etag_matches(self, url, meta): bool: True if remote matches local meta (file is still current) """ try: - response = requests.head(url, timeout=30) + response = requests.head(url, timeout=self.timeout) response.raise_for_status() except requests.RequestException as e: self.logger.warning(f"HEAD request failed for {url}: {e}") @@ -213,7 +214,7 @@ def _download_with_retry(self, url, local_path, chunk_size): self.logger.info(f"Resuming download from byte {resume_byte_pos}") # Make streaming request with timeout for connection (not total time) - with requests.get(url, headers=headers, stream=True, timeout=30) as response: + with requests.get(url, headers=headers, stream=True, timeout=self.timeout) as response: # Handle different response codes if response.status_code == 416: diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index b48c4ec..04a3629 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -28,8 +28,9 @@ def from_dict(d: dict): return identifier class NodeNorm: - def __init__(self, nodenorm_url: str=""): + def __init__(self, nodenorm_url: str = "", timeout: int = 30): self.nodenorm_url = nodenorm_url + self.timeout = timeout if self.nodenorm_url and not self.nodenorm_url.endswith("/"): self.nodenorm_url += "/" @@ -55,7 +56,7 @@ def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=True "description": description, "individual_types": individual_types, "include_taxa": include_taxa, - }) + }, timeout=self.timeout) response.raise_for_status() result = response.json() From b634d11bf4b0a4455aca920ad02746752bcf76bb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:45:18 -0400 Subject: [PATCH 47/66] Fix _etag_matches docstring to match actual behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docstring claimed the method updates last_checked in the .meta file, but it is a pure predicate — the caller (get_downloaded_file) owns that write. Updated the docstring and removed stale inline comments that said the same thing. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/downloader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 7b628d0..016fb01 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -105,7 +105,10 @@ def _is_within_freshness(self, meta, freshness_seconds): def _etag_matches(self, url, meta): """ Do a HEAD request and check if the ETag (or Last-Modified + Content-Length) - matches the stored metadata. If they match, update last_checked in the .meta file. + matches the stored metadata. + + Does not write to disk — the caller is responsible for updating last_checked + when this returns True. 
Args: url: URL to HEAD @@ -129,9 +132,6 @@ def _etag_matches(self, url, meta): if local_etag and remote_etag: if local_etag == remote_etag: self.logger.info(f"ETag matches ({remote_etag}), file is current") - # Update last_checked in the .meta file - # We need the local_path to update — derive it from URL - # Caller will handle updating; return True return True else: self.logger.info(f"ETag changed: {local_etag!r} → {remote_etag!r}, re-downloading") From a7eb8c1db94b63fbaf3eba3566c31a08640b1eb3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:51:16 -0400 Subject: [PATCH 48/66] Got rid of ignore_curies_in_expansion, which is no longer used. --- src/babel_explorer/core/babel_xrefs.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index f11f3f4..c017d6b 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -151,25 +151,17 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal return xrefs - def get_curie_xrefs(self, curies: list[str], recurse: bool = False, ignore_curies_in_expansion: set | None = None, label_curies: bool = False): + def get_curie_xrefs(self, curies: list[str], recurse: bool = False, label_curies: bool = False): """ Search for all identifiers that are cross-referenced to the given CURIE. :param curies: A list of CURIEs to search for. :param recurse: Whether to expand the cross-references (i.e. recursively follow all identifiers). - :param ignore_curies_in_expansion: Deprecated when recurse=True; has no effect. :param label_curies: Whether to annotate results with labels from NodeNorm. :return: A list of cross-references containing those CURIEs. """ if recurse: - if ignore_curies_in_expansion: - warnings.warn( - "ignore_curies_in_expansion has no effect when recurse=True; " - "cycle detection is handled automatically by the SQL query.", - DeprecationWarning, - stacklevel=2, - ) return self._get_curie_xrefs_recursive(curies, label_curies) xrefs = set() From 7163a643ca148f4da20f15c71e9ac30748264a81 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 16:56:11 -0400 Subject: [PATCH 49/66] Add ruff CI and fix all lint errors Add .github/workflows/lint.yml to run ruff check and ruff format --check on every pull request. 
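The F841 suppressions described below rely on DuckDB's replacement scans, which
let SQL refer to an in-scope Python variable by name. A minimal sketch of the
pattern (file name illustrative, not this repo's actual path):

    import duckdb

    with duckdb.connect() as db:
        # ruff flags this as unused, but the SQL below resolves
        # 'concord_table' through this Python variable.
        concord_table = db.read_parquet("Concord.parquet")  # noqa: F841
        rows = db.execute("SELECT * FROM concord_table").fetchall()
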
Fix the 7 errors ruff found, and apply ruff format to all files: - Remove unused `import warnings` in babel_xrefs.py (left over after ignore_curies_in_expansion was removed) - Add # noqa: F841 to the three read_parquet() assignments: ruff flags them as unused, but DuckDB resolves SQL table names by matching the Python variable name, so the assignments are load-bearing - Remove spurious f-prefix from two string literals in downloader.py - Drop unused `local_path` variable in test_downloader.py Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/lint.yml | 14 ++ src/babel_explorer/cli.py | 93 +++++++++++--- src/babel_explorer/core/babel_xrefs.py | 64 +++++++--- src/babel_explorer/core/downloader.py | 86 +++++++++---- src/babel_explorer/core/nodenorm.py | 64 ++++++---- tests/conftest.py | 4 +- tests/test_babel_xrefs.py | 71 ++++++++--- tests/test_downloader.py | 169 +++++++++++++++++-------- tests/test_nodenorm.py | 28 ++-- 9 files changed, 427 insertions(+), 166 deletions(-) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..f4771d4 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,14 @@ +name: Lint + +on: + pull_request: + +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - run: uv sync --group dev + - run: uv run ruff check src/ tests/ + - run: uv run ruff format --check src/ tests/ diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 23854c5..0e25ea3 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -42,18 +42,46 @@ def parse_duration(value: str) -> float: def cli(): pass + @cli.command("xrefs") @click.argument("curies", type=str, required=True, nargs=-1) -@click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") -@click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") -@click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes") +@click.option( + "--local-dir", + type=str, + default="data/2025nov19", + help="Local location to save Babel download files to", +) +@click.option( + "--babel-url", + type=str, + default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", + help="Base URL of the Babel server", +) +@click.option( + "--nodenorm-url", + type=str, + default="https://nodenormalization-sri.renci.org/", + help="NodeNorm URL to check for concord changes", +) @click.option("--recurse", is_flag=True, help="Recursively query returned xrefs") @click.option("--labels", is_flag=True, help="Include labels for CURIEs") -@click.option("--check-download", type=str, default="3h", show_default=True, - help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " - "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.") -def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, recurse: bool, labels: bool, - check_download: str): +@click.option( + "--check-download", + type=str, + default="3h", + show_default=True, + help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). 
" + "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.", +) +def xrefs( + curies: list[str], + babel_url: str, + nodenorm_url, + local_dir: str, + recurse: bool, + labels: bool, + check_download: str, +): """ Fetches and prints the cross-references (xrefs) for the given CURIEs. @@ -72,18 +100,37 @@ def xrefs(curies: list[str], babel_url: str, nodenorm_url, local_dir: str, recur logging.basicConfig(level=logging.INFO) freshness = parse_duration(check_download) - bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness), NodeNorm(nodenorm_url)) + bxref = BabelXRefs( + BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness), + NodeNorm(nodenorm_url), + ) xrefs = bxref.get_curie_xrefs(curies, recurse, label_curies=labels) for xref in xrefs: print(xref) + @cli.command("ids") @click.argument("curies", type=str, required=True, nargs=-1) -@click.option("--local-dir", type=str, default="data/2025nov19", help="Local location to save Babel download files to") -@click.option("--babel-url", type=str, default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", help="Base URL of the Babel server") -@click.option("--check-download", type=str, default="3h", show_default=True, - help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " - "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.") +@click.option( + "--local-dir", + type=str, + default="data/2025nov19", + help="Local location to save Babel download files to", +) +@click.option( + "--babel-url", + type=str, + default="https://stars.renci.org:443/var/babel_outputs/2025nov19/", + help="Base URL of the Babel server", +) +@click.option( + "--check-download", + type=str, + default="3h", + show_default=True, + help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " + "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.", +) def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): """ Fetches and prints the ID records for the given CURIEs, along with Biolink type if provided. @@ -101,14 +148,22 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): logging.basicConfig(level=logging.INFO) freshness = parse_duration(check_download) - bxref = BabelXRefs(BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness)) + bxref = BabelXRefs( + BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness) + ) xrefs = bxref.get_curie_ids(curies) for xref in xrefs: print(xref) + @cli.command("test-concord") @click.argument("curies", type=str, required=True, nargs=-1) -@click.option("--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes") +@click.option( + "--nodenorm-url", + type=str, + default="https://nodenormalization-sri.renci.org/", + help="NodeNorm URL to check for concord changes", +) def test_concord(curies, nodenorm_url): # We're trying to answer a simple question here: if the CURIEs we mention were combined, how would the cliques change in NodeNorm? # By definition, this can only combine all the cliques mentioned in the CURIEs. 
@@ -116,9 +171,11 @@ def test_concord(curies, nodenorm_url): nodenorm = NodeNorm(nodenorm_url) for curie in curies: identifiers = nodenorm.get_clique_identifiers(curie) - for identifier in (identifiers or []): + for identifier in identifiers or []: if identifier.label: - print(f"{curie}\t{identifier.curie}\t{identifier.label}\t{identifier.biolink_type}") + print( + f"{curie}\t{identifier.curie}\t{identifier.label}\t{identifier.biolink_type}" + ) else: print(f"{curie}\t{identifier.curie}\t\t{identifier.biolink_type}") diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index c017d6b..c218761 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -3,7 +3,6 @@ # why we consider two identifiers to be identical. import dataclasses import logging -import warnings import duckdb import functools @@ -20,14 +19,22 @@ class CrossReference: @staticmethod def from_tuple(tuple: tuple[str, str, str, str]): - return CrossReference(filename=tuple[0], subj=tuple[1], pred=tuple[2], obj=tuple[3]) + return CrossReference( + filename=tuple[0], subj=tuple[1], pred=tuple[2], obj=tuple[3] + ) @property def curies(self): return frozenset([self.subj, self.obj]) def __lt__(self, other): - return (self.filename, self.subj, self.obj, self.pred) < (other.filename, other.subj, other.obj, other.pred) + return (self.filename, self.subj, self.obj, self.pred) < ( + other.filename, + other.subj, + other.obj, + other.pred, + ) + @dataclasses.dataclass(frozen=True) class LabeledCrossReference(CrossReference): @@ -39,16 +46,18 @@ class LabeledCrossReference(CrossReference): def __str__(self): return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", subj_biolink_type="{self.subj_biolink_type}", obj_label="{self.obj_label}", obj_biolink_type="{self.obj_biolink_type}")""" + @dataclasses.dataclass(frozen=True) class IdentifierRecord: """A record from the Identifiers.parquet file.""" + curie: str extra_fields: tuple = () @staticmethod def from_row(row: tuple, column_names: list[str]): """Create an IdentifierRecord from a DuckDB result row and its column names.""" - curie_idx = column_names.index('curie') + curie_idx = column_names.index("curie") extra = tuple( (col, row[i]) for i, col in enumerate(column_names) if i != curie_idx ) @@ -74,25 +83,37 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: :return: A list of IdentifierRecords containing those CURIEs. """ - identifier_parquet = self.downloader.get_downloaded_file('duckdb/Identifiers.parquet') + identifier_parquet = self.downloader.get_downloaded_file( + "duckdb/Identifiers.parquet" + ) # Query the Parquet files using DuckDB (in-memory; nothing is persisted). 
with duckdb.connect() as db: - identifier_table = db.read_parquet(identifier_parquet) - result = db.execute("SELECT * FROM identifier_table WHERE curie IN $1", [curies]) + identifier_table = db.read_parquet(identifier_parquet) # noqa: F841 — DuckDB resolves 'identifier_table' by Python variable name in the SQL query + result = db.execute( + "SELECT * FROM identifier_table WHERE curie IN $1", [curies] + ) column_names = [desc[0] for desc in result.description] - return [IdentifierRecord.from_row(row, column_names) for row in result.fetchall()] + return [ + IdentifierRecord.from_row(row, column_names) + for row in result.fetchall() + ] @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): if label_curies and self.nodenorm is None: - raise ValueError("label_curies=True requires a configured NodeNorm instance (nodenorm was None).") + raise ValueError( + "label_curies=True requires a configured NodeNorm instance (nodenorm was None)." + ) - concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') + concord_parquet = self.downloader.get_downloaded_file("duckdb/Concord.parquet") with duckdb.connect() as db: - concord_table = db.read_parquet(concord_parquet) - xref_tuples = db.execute("SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", [curie]).fetchall() + concord_table = db.read_parquet(concord_parquet) # noqa: F841 — DuckDB resolves 'concord_table' by Python variable name in the SQL query + xref_tuples = db.execute( + "SELECT filename, subj, pred, obj FROM concord_table WHERE subj=$1 OR obj=$1", + [curie], + ).fetchall() xrefs = [CrossReference.from_tuple(rec) for rec in xref_tuples] if label_curies: @@ -115,15 +136,18 @@ def _to_labeled_xref(self, xref: CrossReference) -> LabeledCrossReference: def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = False): """Traverse the cross-reference graph in one DuckDB WITH RECURSIVE query.""" if label_curies and self.nodenorm is None: - raise ValueError("label_curies=True requires a configured NodeNorm instance (nodenorm was None).") + raise ValueError( + "label_curies=True requires a configured NodeNorm instance (nodenorm was None)." + ) if not curies: return [] - concord_parquet = self.downloader.get_downloaded_file('duckdb/Concord.parquet') + concord_parquet = self.downloader.get_downloaded_file("duckdb/Concord.parquet") with duckdb.connect() as db: - concord_table = db.read_parquet(concord_parquet) - rows = db.execute(""" + concord_table = db.read_parquet(concord_parquet) # noqa: F841 — DuckDB resolves 'concord_table' by Python variable name in the SQL query + rows = db.execute( + """ WITH RECURSIVE edges(a, b) AS ( SELECT subj, obj FROM concord_table @@ -142,7 +166,9 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal WHERE c.subj IN (SELECT curie FROM frontier) OR c.obj IN (SELECT curie FROM frontier) ORDER BY c.filename, c.subj, c.obj, c.pred - """, [curies]).fetchall() + """, + [curies], + ).fetchall() xrefs = [CrossReference.from_tuple(row) for row in rows] @@ -151,7 +177,9 @@ def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = Fal return xrefs - def get_curie_xrefs(self, curies: list[str], recurse: bool = False, label_curies: bool = False): + def get_curie_xrefs( + self, curies: list[str], recurse: bool = False, label_curies: bool = False + ): """ Search for all identifiers that are cross-referenced to the given CURIE. 
diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 016fb01..6ba9a38 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -15,7 +15,14 @@ class BabelDownloader: Class for downloading Babel cross-reference files to a local directory as needed. """ - def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * 3600, timeout: int = 30): + def __init__( + self, + url_base, + local_path=None, + retries=10, + freshness_seconds=3 * 3600, + timeout: int = 30, + ): # We assume the URL base is correct (if not, we can fix it later). self.url_base = url_base self.retries = retries @@ -33,7 +40,9 @@ def __init__(self, url_base, local_path=None, retries=10, freshness_seconds=3 * elif os.path.exists(local_path) and os.path.isdir(local_path): self.local_path = local_path else: - raise ValueError(f"Invalid local_path (must be an existing directory): '{local_path}'") + raise ValueError( + f"Invalid local_path (must be an existing directory): '{local_path}'" + ) @functools.lru_cache(maxsize=None) def get_output_file(self, filename): @@ -134,7 +143,9 @@ def _etag_matches(self, url, meta): self.logger.info(f"ETag matches ({remote_etag}), file is current") return True else: - self.logger.info(f"ETag changed: {local_etag!r} → {remote_etag!r}, re-downloading") + self.logger.info( + f"ETag changed: {local_etag!r} → {remote_etag!r}, re-downloading" + ) return False # Fallback: Last-Modified + Content-Length @@ -145,10 +156,14 @@ def _etag_matches(self, url, meta): if local_lm and remote_lm and local_lm == remote_lm: if local_cl is None or remote_cl is None or int(remote_cl) == local_cl: - self.logger.info(f"Last-Modified matches ({remote_lm}), file is current") + self.logger.info( + f"Last-Modified matches ({remote_lm}), file is current" + ) return True - self.logger.info("Cannot confirm file is current (no matching ETag or Last-Modified), will re-download") + self.logger.info( + "Cannot confirm file is current (no matching ETag or Last-Modified), will re-download" + ) return False def _stream_download(self, response, local_path, resume_byte_pos, chunk_size): @@ -162,23 +177,23 @@ def _stream_download(self, response, local_path, resume_byte_pos, chunk_size): chunk_size: Size of chunks to read/write """ # Get total size from Content-Length header (may not be present) - content_length = response.headers.get('Content-Length') + content_length = response.headers.get("Content-Length") if content_length: total_size = int(content_length) + resume_byte_pos else: total_size = None # Open file in append mode if resuming, write mode otherwise - mode = 'ab' if resume_byte_pos > 0 else 'wb' + mode = "ab" if resume_byte_pos > 0 else "wb" with open(local_path, mode) as f: with tqdm( total=total_size, initial=resume_byte_pos, - unit='B', + unit="B", unit_scale=True, unit_divisor=1024, - desc=os.path.basename(local_path) + desc=os.path.basename(local_path), ) as progress_bar: for chunk in response.iter_content(chunk_size=chunk_size): if chunk: @@ -210,12 +225,13 @@ def _download_with_retry(self, url, local_path, chunk_size): # Prepare headers for resume headers = {} if resume_byte_pos > 0: - headers['Range'] = f'bytes={resume_byte_pos}-' + headers["Range"] = f"bytes={resume_byte_pos}-" self.logger.info(f"Resuming download from byte {resume_byte_pos}") # Make streaming request with timeout for connection (not total time) - with requests.get(url, headers=headers, stream=True, timeout=self.timeout) as response: - + with 
requests.get( + url, headers=headers, stream=True, timeout=self.timeout + ) as response: # Handle different response codes if response.status_code == 416: # Range Not Satisfiable - file already complete @@ -223,11 +239,13 @@ def _download_with_retry(self, url, local_path, chunk_size): return response.headers elif response.status_code == 206: # Partial Content - resume successful - self.logger.info(f"Resuming download (HTTP 206)") + self.logger.info("Resuming download (HTTP 206)") elif response.status_code == 200: # OK - server doesn't support resume or no Range header was sent if resume_byte_pos > 0: - self.logger.warning(f"Server doesn't support resume, restarting from beginning") + self.logger.warning( + "Server doesn't support resume, restarting from beginning" + ) resume_byte_pos = 0 # Remove partial file if os.path.exists(local_path): @@ -236,25 +254,31 @@ def _download_with_retry(self, url, local_path, chunk_size): response.raise_for_status() # Stream download with progress bar - self._stream_download(response, local_path, resume_byte_pos, chunk_size) + self._stream_download( + response, local_path, resume_byte_pos, chunk_size + ) # Success - exit retry loop return response.headers except (requests.RequestException, IOError) as e: - self.logger.warning(f"Download attempt {attempt}/{self.retries} failed: {e}") + self.logger.warning( + f"Download attempt {attempt}/{self.retries} failed: {e}" + ) if attempt < self.retries: # Calculate exponential backoff with max of 60 seconds - wait_time = min(2 ** attempt, 60) + wait_time = min(2**attempt, 60) self.logger.info(f"Retrying in {wait_time} seconds...") time.sleep(wait_time) else: # All retries exhausted - raise RuntimeError(f"Failed to download {url} after {self.retries} attempts: {e}") + raise RuntimeError( + f"Failed to download {url} after {self.retries} attempts: {e}" + ) @functools.lru_cache(maxsize=None) - def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): + def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024 * 1024): """ Download a file from the Babel server to local storage with ETag-based caching. 
@@ -280,7 +304,9 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): if meta is not None: # Tier 1: within freshness window — skip all network calls if self._is_within_freshness(meta, self.freshness_seconds): - self.logger.info(f"File within freshness window ({self.freshness_seconds} seconds), skipping check: {local_path_to_download_to}") + self.logger.info( + f"File within freshness window ({self.freshness_seconds} seconds), skipping check: {local_path_to_download_to}" + ) return local_path_to_download_to # Tier 2: stale but maybe unchanged — HEAD request @@ -290,24 +316,34 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024*1024): meta_path = self._get_meta_path(local_path_to_download_to) with open(meta_path, "w") as f: json.dump(meta, f, indent=2) - self.logger.info(f"ETag matches, using existing file: {local_path_to_download_to}") + self.logger.info( + f"ETag matches, using existing file: {local_path_to_download_to}" + ) return local_path_to_download_to # Tier 3: ETag changed — delete and re-download - self.logger.warning(f"Remote file changed, re-downloading: {local_path_to_download_to}") + self.logger.warning( + f"Remote file changed, re-downloading: {local_path_to_download_to}" + ) os.remove(local_path_to_download_to) - self.logger.info(f"Downloading {url_to_download} to {local_path_to_download_to}") + self.logger.info( + f"Downloading {url_to_download} to {local_path_to_download_to}" + ) # Download with retry logic; get response headers back - response_headers = self._download_with_retry(url_to_download, local_path_to_download_to, chunk_size) + response_headers = self._download_with_retry( + url_to_download, local_path_to_download_to, chunk_size + ) # Save sidecar metadata if response_headers is not None: self._save_meta(local_path_to_download_to, response_headers) bytes_downloaded = os.path.getsize(local_path_to_download_to) - self.logger.info(f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes") + self.logger.info( + f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes" + ) return local_path_to_download_to @functools.lru_cache(maxsize=None) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index 04a3629..ec5b0a8 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -3,6 +3,7 @@ import requests import logging + @dataclasses.dataclass class Identifier: curie: str @@ -16,17 +17,18 @@ def __lt__(self, other): @staticmethod def from_dict(d: dict): - identifier = Identifier(curie=d['identifier']) - if 'label' in d: - identifier.label = d['label'] - if 'taxa' in d: - identifier.taxa = d['taxa'] - if 'description' in d: - identifier.description = d['description'] - if 'type' in d: - identifier.biolink_type = d['type'] + identifier = Identifier(curie=d["identifier"]) + if "label" in d: + identifier.label = d["label"] + if "taxa" in d: + identifier.taxa = d["taxa"] + if "description" in d: + identifier.description = d["description"] + if "type" in d: + identifier.biolink_type = d["type"] return identifier + class NodeNorm: def __init__(self, nodenorm_url: str = "", timeout: int = 30): self.nodenorm_url = nodenorm_url @@ -40,30 +42,44 @@ def get_identifier(self, curie: str): logging.debug(f"Normalizing {curie} with NodeNorm to result: {result}") if not result: return Identifier(curie=curie) - for identifier in result.get('equivalent_identifiers', []): - if identifier['identifier'] == curie: + for 
identifier in result.get("equivalent_identifiers", []): + if identifier["identifier"] == curie: logging.debug(f"Found exact match for {curie}: {identifier}") return Identifier.from_dict(identifier) return Identifier(curie=curie) @functools.lru_cache(maxsize=None) - def normalize_curie(self, curie: str, conflate=True, drug_chemical_conflate=True, description=True, individual_types=True, include_taxa=True): - response = requests.get(f"{self.nodenorm_url}get_normalized_nodes", params={ - "curie": curie, - "conflate": conflate, - "drug_chemical_conflate": drug_chemical_conflate, - "description": description, - "individual_types": individual_types, - "include_taxa": include_taxa, - }, timeout=self.timeout) + def normalize_curie( + self, + curie: str, + conflate=True, + drug_chemical_conflate=True, + description=True, + individual_types=True, + include_taxa=True, + ): + response = requests.get( + f"{self.nodenorm_url}get_normalized_nodes", + params={ + "curie": curie, + "conflate": conflate, + "drug_chemical_conflate": drug_chemical_conflate, + "description": description, + "individual_types": individual_types, + "include_taxa": include_taxa, + }, + timeout=self.timeout, + ) response.raise_for_status() result = response.json() try: return result[curie] except KeyError: - logging.debug(f"NodeNorm response did not contain CURIE {curie!r}; returning None") + logging.debug( + f"NodeNorm response did not contain CURIE {curie!r}; returning None" + ) return None @functools.lru_cache(maxsize=None) @@ -71,6 +87,8 @@ def get_clique_identifiers(self, curie, **kwargs): result = self.normalize_curie(curie, **kwargs) if not result: return None - if 'equivalent_identifiers' not in result: + if "equivalent_identifiers" not in result: return None - return list(map(lambda x: Identifier.from_dict(x), result['equivalent_identifiers'])) + return list( + map(lambda x: Identifier.from_dict(x), result["equivalent_identifiers"]) + ) diff --git a/tests/conftest.py b/tests/conftest.py index f1e0df6..92ecb06 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -116,6 +116,8 @@ def babel_xrefs(shared_downloader, downloaded_parquet_files) -> BabelXRefs: @pytest.fixture(scope="session") -def babel_xrefs_with_nodenorm(shared_downloader, nodenorm, downloaded_parquet_files) -> BabelXRefs: +def babel_xrefs_with_nodenorm( + shared_downloader, nodenorm, downloaded_parquet_files +) -> BabelXRefs: """A BabelXRefs instance with NodeNorm, Concord + Metadata already downloaded.""" return BabelXRefs(shared_downloader, nodenorm) diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 75e33cb..d67f81b 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -28,7 +28,9 @@ class TestCrossReference: def test_creation(self): - xr = CrossReference(filename="f.txt", subj="A:1", pred="skos:exactMatch", obj="B:2") + xr = CrossReference( + filename="f.txt", subj="A:1", pred="skos:exactMatch", obj="B:2" + ) assert xr.filename == "f.txt" assert xr.subj == "A:1" assert xr.pred == "skos:exactMatch" @@ -85,9 +87,14 @@ def test_sorting(self): class TestLabeledCrossReference: def test_creation(self): lxr = LabeledCrossReference( - subj="A:1", pred="p", obj="B:2", filename="f", - subj_label="Alpha", subj_biolink_type="biolink:Disease", - obj_label="Beta", obj_biolink_type="biolink:Gene", + subj="A:1", + pred="p", + obj="B:2", + filename="f", + subj_label="Alpha", + subj_biolink_type="biolink:Disease", + obj_label="Beta", + obj_biolink_type="biolink:Gene", ) assert lxr.subj == "A:1" assert lxr.subj_label == "Alpha" @@ 
-95,23 +102,40 @@ def test_creation(self): def test_inherits_from_cross_reference(self): lxr = LabeledCrossReference( - subj="A:1", pred="p", obj="B:2", filename="f", - subj_label="", subj_biolink_type="", obj_label="", obj_biolink_type="", + subj="A:1", + pred="p", + obj="B:2", + filename="f", + subj_label="", + subj_biolink_type="", + obj_label="", + obj_biolink_type="", ) assert isinstance(lxr, CrossReference) def test_curies_property(self): lxr = LabeledCrossReference( - subj="A:1", pred="p", obj="B:2", filename="f", - subj_label="", subj_biolink_type="", obj_label="", obj_biolink_type="", + subj="A:1", + pred="p", + obj="B:2", + filename="f", + subj_label="", + subj_biolink_type="", + obj_label="", + obj_biolink_type="", ) assert lxr.curies == frozenset({"A:1", "B:2"}) def test_str(self): lxr = LabeledCrossReference( - subj="A:1", pred="p", obj="B:2", filename="f", - subj_label="Alpha", subj_biolink_type="biolink:Disease", - obj_label="Beta", obj_biolink_type="biolink:Gene", + subj="A:1", + pred="p", + obj="B:2", + filename="f", + subj_label="Alpha", + subj_biolink_type="biolink:Disease", + obj_label="Beta", + obj_biolink_type="biolink:Gene", ) s = str(lxr) assert "A:1" in s @@ -188,9 +212,16 @@ def test_get_curie_xref_calls_downloader(self, tmp_path): mock_db.read_parquet.return_value = "table" mock_db.execute.return_value = mock_result - with patch.object(bx.downloader, 'get_downloaded_file', return_value="/fake/path") as mock_dl: - with patch.object(bx.downloader, 'get_output_file', return_value="/fake/db"): - with patch("babel_explorer.core.babel_xrefs.duckdb.connect", return_value=mock_db): + with patch.object( + bx.downloader, "get_downloaded_file", return_value="/fake/path" + ) as mock_dl: + with patch.object( + bx.downloader, "get_output_file", return_value="/fake/db" + ): + with patch( + "babel_explorer.core.babel_xrefs.duckdb.connect", + return_value=mock_db, + ): bx.get_curie_xref.cache_clear() result = bx.get_curie_xref("A:1") # Downloader should be called for Concord only (Metadata unused here) @@ -202,7 +233,7 @@ def test_get_curie_xref_calls_downloader(self, tmp_path): def test_get_curie_xrefs_no_expand(self, tmp_path): bx = self._make_bx(tmp_path) xr = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") - with patch.object(bx, 'get_curie_xref', return_value=[xr]): + with patch.object(bx, "get_curie_xref", return_value=[xr]): bx.get_curie_xref.cache_clear() result = bx.get_curie_xrefs(["A:1"], recurse=False) assert len(result) == 1 @@ -213,7 +244,9 @@ def test_get_curie_xrefs_with_expand(self, tmp_path): xr1 = CrossReference(filename="f", subj="A:1", pred="p", obj="B:2") xr2 = CrossReference(filename="f", subj="B:2", pred="p", obj="C:3") - with patch.object(bx, '_get_curie_xrefs_recursive', return_value=[xr1, xr2]) as mock_rec: + with patch.object( + bx, "_get_curie_xrefs_recursive", return_value=[xr1, xr2] + ) as mock_rec: result = bx.get_curie_xrefs(["A:1"], recurse=True) mock_rec.assert_called_once_with(["A:1"], False) assert xr1 in result @@ -239,7 +272,9 @@ def test_get_curie_xrefs_recursive_sql_traversal(self, tmp_path): """) setup_db.close() - with patch.object(bx.downloader, 'get_downloaded_file', return_value=parquet_path): + with patch.object( + bx.downloader, "get_downloaded_file", return_value=parquet_path + ): # Starting from A:1 should reach B:2 and C:3 but not the D-E component result = bx._get_curie_xrefs_recursive(["A:1"]) pairs = {(xr.subj, xr.obj) for xr in result} @@ -262,7 +297,7 @@ def test_results_are_sorted(self, tmp_path): xr_b = 
CrossReference(filename="b", subj="B:1", pred="p", obj="C:1") xr_a = CrossReference(filename="a", subj="A:1", pred="p", obj="B:1") - with patch.object(bx, 'get_curie_xref', return_value=[xr_b, xr_a]): + with patch.object(bx, "get_curie_xref", return_value=[xr_b, xr_a]): result = bx.get_curie_xrefs(["X:1"], recurse=False) assert result == [xr_a, xr_b] diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 16a7e9b..9b33e7a 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -39,7 +39,9 @@ def test_creates_directory_if_missing(self, tmp_path): assert dl.local_path == new_dir def test_custom_retries(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=3) + dl = BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path), retries=3 + ) assert dl.retries == 3 def test_default_retries(self, tmp_path): @@ -51,7 +53,11 @@ def test_default_freshness_seconds(self, tmp_path): assert dl.freshness_seconds == 3 * 3600 def test_custom_freshness_seconds(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), freshness_seconds=0) + dl = BabelDownloader( + url_base="https://example.com/", + local_path=str(tmp_path), + freshness_seconds=0, + ) assert dl.freshness_seconds == 0 def test_invalid_path_raises_value_error(self): @@ -90,13 +96,15 @@ class TestSaveMeta: """Tests for _save_meta.""" def _make_dl(self, tmp_path): - return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + return BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path) + ) def test_writes_all_fields(self, tmp_path): dl = self._make_dl(tmp_path) file_path = str(tmp_path / "test.parquet") # Create the file so the path is valid - open(file_path, 'wb').close() + open(file_path, "wb").close() headers = { "ETag": '"abc123"', @@ -118,7 +126,7 @@ def test_writes_all_fields(self, tmp_path): def test_last_checked_is_recent_utc(self, tmp_path): dl = self._make_dl(tmp_path) file_path = str(tmp_path / "f.parquet") - open(file_path, 'wb').close() + open(file_path, "wb").close() dl._save_meta(file_path, {"ETag": '"x"'}) @@ -133,7 +141,7 @@ def test_missing_headers_not_written(self, tmp_path): """Headers not present in the response should not appear in .meta.""" dl = self._make_dl(tmp_path) file_path = str(tmp_path / "sparse.parquet") - open(file_path, 'wb').close() + open(file_path, "wb").close() dl._save_meta(file_path, {}) @@ -150,7 +158,9 @@ class TestLoadMeta: """Tests for _load_meta.""" def _make_dl(self, tmp_path): - return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + return BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path) + ) def test_returns_none_if_no_meta_file(self, tmp_path): dl = self._make_dl(tmp_path) @@ -159,7 +169,7 @@ def test_returns_none_if_no_meta_file(self, tmp_path): def test_returns_dict_for_valid_meta(self, tmp_path): dl = self._make_dl(tmp_path) file_path = str(tmp_path / "f.parquet") - open(file_path, 'wb').close() + open(file_path, "wb").close() meta_data = {"etag": '"abc"', "last_checked": "2026-01-01T00:00:00+00:00"} with open(file_path + ".meta", "w") as f: json.dump(meta_data, f) @@ -170,7 +180,7 @@ def test_returns_dict_for_valid_meta(self, tmp_path): def test_returns_none_for_corrupt_meta(self, tmp_path): dl = self._make_dl(tmp_path) file_path = str(tmp_path / "corrupt.parquet") - open(file_path, 'wb').close() + open(file_path, "wb").close() with open(file_path + 
".meta", "w") as f: f.write("not valid json {{{") @@ -181,7 +191,9 @@ class TestIsWithinFreshness: """Tests for _is_within_freshness.""" def _make_dl(self, tmp_path): - return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + return BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path) + ) def test_returns_true_when_recent(self, tmp_path): dl = self._make_dl(tmp_path) @@ -217,7 +229,9 @@ class TestEtagMatches: """Tests for _etag_matches.""" def _make_dl(self, tmp_path): - return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) + return BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path) + ) def test_returns_true_on_matching_etag(self, tmp_path): dl = self._make_dl(tmp_path) @@ -225,7 +239,9 @@ def test_returns_true_on_matching_etag(self, tmp_path): mock_resp = Mock() mock_resp.headers = {"ETag": '"abc123"'} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_resp + ): assert dl._etag_matches("https://example.com/f.parquet", meta) is True def test_returns_false_on_different_etag(self, tmp_path): @@ -234,7 +250,9 @@ def test_returns_false_on_different_etag(self, tmp_path): mock_resp = Mock() mock_resp.headers = {"ETag": '"new"'} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_resp + ): assert dl._etag_matches("https://example.com/f.parquet", meta) is False def test_fallback_last_modified_match(self, tmp_path): @@ -244,14 +262,18 @@ def test_fallback_last_modified_match(self, tmp_path): mock_resp = Mock() mock_resp.headers = {"Last-Modified": lm, "Content-Length": "100"} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_resp): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_resp + ): assert dl._etag_matches("https://example.com/f.parquet", meta) is True def test_returns_false_on_request_error(self, tmp_path): dl = self._make_dl(tmp_path) meta = {"etag": '"abc"'} - with patch("babel_explorer.core.downloader.requests.head", - side_effect=requests.ConnectionError("fail")): + with patch( + "babel_explorer.core.downloader.requests.head", + side_effect=requests.ConnectionError("fail"), + ): assert dl._etag_matches("https://example.com/f.parquet", meta) is False @@ -259,8 +281,11 @@ class TestGetDownloadedFileTiers: """Tests for the three-tier logic in get_downloaded_file.""" def _make_dl(self, tmp_path, freshness=3600): - return BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), - freshness_seconds=freshness) + return BabelDownloader( + url_base="https://example.com/", + local_path=str(tmp_path), + freshness_seconds=freshness, + ) # --- Tier 1: within freshness window --- @@ -303,7 +328,9 @@ def test_tier2_head_check_no_redownload(self, tmp_path): mock_head_resp.headers = {"ETag": '"abc"'} mock_head_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_head_resp + ): with patch("babel_explorer.core.downloader.requests.get") as mock_get: dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) @@ -327,7 
+354,9 @@ def test_tier2_updates_last_checked_after_head(self, tmp_path): mock_head_resp.headers = {"ETag": '"abc"'} mock_head_resp.raise_for_status = Mock() - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_head_resp + ): dl.get_downloaded_file.cache_clear() dl.get_downloaded_file(test_file) @@ -358,16 +387,18 @@ def test_tier3_redownloads_when_etag_changed(self, tmp_path): new_content = b"new data" def fake_download(url, path, chunk_size): - with open(path, 'wb') as f: + with open(path, "wb") as f: f.write(new_content) return {"ETag": '"new"', "Content-Length": str(len(new_content))} - with patch("babel_explorer.core.downloader.requests.head", return_value=mock_head_resp): - with patch.object(dl, '_download_with_retry', side_effect=fake_download): + with patch( + "babel_explorer.core.downloader.requests.head", return_value=mock_head_resp + ): + with patch.object(dl, "_download_with_retry", side_effect=fake_download): dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) - assert open(result, 'rb').read() == new_content + assert open(result, "rb").read() == new_content # --- No .meta: fresh download --- @@ -375,22 +406,23 @@ def test_downloads_when_no_meta(self, tmp_path): """No file and no .meta → download happens, .meta is saved.""" dl = self._make_dl(tmp_path) test_file = "duckdb/new.parquet" - local_path = str(tmp_path / "duckdb" / "new.parquet") content = b"fresh download" def fake_download(url, path, chunk_size): os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, 'wb') as f: + with open(path, "wb") as f: f.write(content) return {"ETag": '"fresh"', "Content-Length": str(len(content))} - with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + with patch.object( + dl, "_download_with_retry", side_effect=fake_download + ) as mock_dl: dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) mock_dl.assert_called_once() assert os.path.exists(result) - assert open(result, 'rb').read() == content + assert open(result, "rb").read() == content # .meta should be saved meta_path = result + ".meta" assert os.path.exists(meta_path) @@ -410,16 +442,18 @@ def test_downloads_when_file_exists_but_no_meta(self, tmp_path): new_content = b"refreshed" def fake_download(url, path, chunk_size): - with open(path, 'wb') as f: + with open(path, "wb") as f: f.write(new_content) return {"ETag": '"new"'} - with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + with patch.object( + dl, "_download_with_retry", side_effect=fake_download + ) as mock_dl: dl.get_downloaded_file.cache_clear() result = dl.get_downloaded_file(test_file) mock_dl.assert_called_once() - assert open(result, 'rb').read() == new_content + assert open(result, "rb").read() == new_content class TestGetDownloadedFileCaching: @@ -430,11 +464,13 @@ def test_cache_returns_same_result(self, tmp_path): content = b"cached content" def fake_download(url, path, chunk_size): - with open(path, 'wb') as f: + with open(path, "wb") as f: f.write(content) return {} - with patch.object(dl, '_download_with_retry', side_effect=fake_download) as mock_dl: + with patch.object( + dl, "_download_with_retry", side_effect=fake_download + ) as mock_dl: dl.get_downloaded_file.cache_clear() r1 = dl.get_downloaded_file("cached.txt") r2 = dl.get_downloaded_file("cached.txt") @@ -456,20 +492,31 @@ def _make_response(status_code, 
headers=None, content=None): return m def test_retries_exhausted_raises_runtime_error(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=2) - with patch("babel_explorer.core.downloader.requests.get", side_effect=requests.ConnectionError("fail")): + dl = BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path), retries=2 + ) + with patch( + "babel_explorer.core.downloader.requests.get", + side_effect=requests.ConnectionError("fail"), + ): with patch("babel_explorer.core.downloader.time.sleep"): # skip waiting with pytest.raises(RuntimeError, match="Failed to download"): - dl._download_with_retry("https://example.com/file", str(tmp_path / "f"), 1024) + dl._download_with_retry( + "https://example.com/file", str(tmp_path / "f"), 1024 + ) def test_succeeds_on_second_attempt(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path), retries=3) + dl = BabelDownloader( + url_base="https://example.com/", local_path=str(tmp_path), retries=3 + ) out_path = str(tmp_path / "retry_success.bin") - mock_response = self._make_response(200, {'Content-Length': '5'}, [b"hello"]) + mock_response = self._make_response(200, {"Content-Length": "5"}, [b"hello"]) side_effects = [requests.ConnectionError("first fail"), mock_response] - with patch("babel_explorer.core.downloader.requests.get", side_effect=side_effects): + with patch( + "babel_explorer.core.downloader.requests.get", side_effect=side_effects + ): with patch("babel_explorer.core.downloader.time.sleep"): dl._download_with_retry("https://example.com/file", out_path, 1024) assert os.path.exists(out_path) @@ -479,11 +526,13 @@ def test_resume_sends_range_header(self, tmp_path): out_path = tmp_path / "partial.bin" out_path.write_bytes(b"partial") # 7 bytes - mock_response = self._make_response(206, {'Content-Length': '3'}, [b"end"]) - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response) as mock_get: + mock_response = self._make_response(206, {"Content-Length": "3"}, [b"end"]) + with patch( + "babel_explorer.core.downloader.requests.get", return_value=mock_response + ) as mock_get: dl._download_with_retry("https://example.com/file", str(out_path), 1024) _, kwargs = mock_get.call_args - assert kwargs['headers'] == {'Range': 'bytes=7-'} + assert kwargs["headers"] == {"Range": "bytes=7-"} def test_http_416_file_already_complete(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) @@ -491,7 +540,9 @@ def test_http_416_file_already_complete(self, tmp_path): out_path.write_bytes(b"full file") mock_response = self._make_response(416) - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): + with patch( + "babel_explorer.core.downloader.requests.get", return_value=mock_response + ): dl._download_with_retry("https://example.com/file", str(out_path), 1024) # Should return without error assert out_path.read_bytes() == b"full file" @@ -502,8 +553,12 @@ def test_server_no_resume_restarts_download(self, tmp_path): out_path = tmp_path / "no_resume.bin" out_path.write_bytes(b"partial") - mock_response = self._make_response(200, {'Content-Length': '12'}, [b"full content"]) - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): + mock_response = self._make_response( + 200, {"Content-Length": "12"}, [b"full content"] + ) + with patch( + "babel_explorer.core.downloader.requests.get", return_value=mock_response + ): 
dl._download_with_retry("https://example.com/file", str(out_path), 1024) assert out_path.read_bytes() == b"full content" @@ -512,10 +567,16 @@ def test_returns_response_headers(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) out_path = str(tmp_path / "headers.bin") - mock_response = self._make_response(200, {'Content-Length': '5', 'ETag': '"abc"'}, [b"hello"]) - with patch("babel_explorer.core.downloader.requests.get", return_value=mock_response): - headers = dl._download_with_retry("https://example.com/file", out_path, 1024) - assert headers['ETag'] == '"abc"' + mock_response = self._make_response( + 200, {"Content-Length": "5", "ETag": '"abc"'}, [b"hello"] + ) + with patch( + "babel_explorer.core.downloader.requests.get", return_value=mock_response + ): + headers = dl._download_with_retry( + "https://example.com/file", out_path, 1024 + ) + assert headers["ETag"] == '"abc"' class TestStreamDownload: @@ -526,11 +587,11 @@ def test_writes_chunks(self, tmp_path): out_path = str(tmp_path / "stream.bin") mock_response = Mock() - mock_response.headers = {'Content-Length': '10'} + mock_response.headers = {"Content-Length": "10"} mock_response.iter_content = Mock(return_value=[b"hello", b"world"]) dl._stream_download(mock_response, out_path, resume_byte_pos=0, chunk_size=1024) - with open(out_path, 'rb') as f: + with open(out_path, "rb") as f: assert f.read() == b"helloworld" def test_append_mode_on_resume(self, tmp_path): @@ -539,10 +600,12 @@ def test_append_mode_on_resume(self, tmp_path): out_path.write_bytes(b"start") mock_response = Mock() - mock_response.headers = {'Content-Length': '3'} + mock_response.headers = {"Content-Length": "3"} mock_response.iter_content = Mock(return_value=[b"end"]) - dl._stream_download(mock_response, str(out_path), resume_byte_pos=5, chunk_size=1024) + dl._stream_download( + mock_response, str(out_path), resume_byte_pos=5, chunk_size=1024 + ) assert out_path.read_bytes() == b"startend" diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py index 2322eef..363d077 100644 --- a/tests/test_nodenorm.py +++ b/tests/test_nodenorm.py @@ -75,7 +75,11 @@ def test_lt_ordering(self): assert a < b def test_sorting(self): - items = [Identifier(curie="C:3"), Identifier(curie="A:1"), Identifier(curie="B:2")] + items = [ + Identifier(curie="C:3"), + Identifier(curie="A:1"), + Identifier(curie="B:2"), + ] result = sorted(items) assert [x.curie for x in result] == ["A:1", "B:2", "C:3"] @@ -108,7 +112,9 @@ def test_correct_api_endpoint_and_params(self): mock_resp.json.return_value = {"X:1": {"id": {"identifier": "X:1"}}} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp) as mock_get: + with patch( + "babel_explorer.core.nodenorm.requests.get", return_value=mock_resp + ) as mock_get: nn.normalize_curie("X:1") mock_get.assert_called_once() args, kwargs = mock_get.call_args @@ -132,7 +138,9 @@ def test_lru_caching(self): mock_resp.json.return_value = {"X:1": {"id": "X:1"}} mock_resp.raise_for_status = Mock() - with patch("babel_explorer.core.nodenorm.requests.get", return_value=mock_resp) as mock_get: + with patch( + "babel_explorer.core.nodenorm.requests.get", return_value=mock_resp + ) as mock_get: nn.normalize_curie("X:1") nn.normalize_curie("X:1") mock_get.assert_called_once() @@ -162,7 +170,7 @@ def test_exact_match_found(self): {"identifier": "X:2", "label": "Beta"}, ], } - with patch.object(nn, 'normalize_curie', return_value=api_result): + with 
patch.object(nn, "normalize_curie", return_value=api_result): ident = nn.get_identifier("X:1") assert ident.curie == "X:1" assert ident.label == "Alpha" @@ -174,14 +182,14 @@ def test_no_match_returns_bare_identifier(self): {"identifier": "X:2", "label": "Beta"}, ], } - with patch.object(nn, 'normalize_curie', return_value=api_result): + with patch.object(nn, "normalize_curie", return_value=api_result): ident = nn.get_identifier("X:1") assert ident.curie == "X:1" assert ident.label == "" def test_falsy_result_returns_bare_identifier(self): nn = self._make_nn() - with patch.object(nn, 'normalize_curie', return_value=None): + with patch.object(nn, "normalize_curie", return_value=None): ident = nn.get_identifier("X:1") assert ident.curie == "X:1" assert ident.label == "" @@ -193,7 +201,7 @@ def test_caching(self): {"identifier": "X:1", "label": "Alpha"}, ], } - with patch.object(nn, 'normalize_curie', return_value=api_result) as mock_norm: + with patch.object(nn, "normalize_curie", return_value=api_result) as mock_norm: nn.get_identifier("X:1") nn.get_identifier("X:1") mock_norm.assert_called_once() @@ -214,7 +222,7 @@ def test_success_returns_list(self): {"identifier": "X:2", "label": "Beta"}, ], } - with patch.object(nn, 'normalize_curie', return_value=api_result): + with patch.object(nn, "normalize_curie", return_value=api_result): result = nn.get_clique_identifiers("X:1") assert len(result) == 2 assert all(isinstance(x, Identifier) for x in result) @@ -222,7 +230,7 @@ def test_success_returns_list(self): def test_missing_key_returns_none(self): nn = self._make_nn() api_result = {"id": {"identifier": "X:1"}} # no equivalent_identifiers - with patch.object(nn, 'normalize_curie', return_value=api_result): + with patch.object(nn, "normalize_curie", return_value=api_result): result = nn.get_clique_identifiers("X:1") assert result is None @@ -231,7 +239,7 @@ def test_caching(self): api_result = { "equivalent_identifiers": [{"identifier": "X:1"}], } - with patch.object(nn, 'normalize_curie', return_value=api_result) as mock_norm: + with patch.object(nn, "normalize_curie", return_value=api_result) as mock_norm: nn.get_clique_identifiers("X:1") nn.get_clique_identifiers("X:1") mock_norm.assert_called_once() From f9e549e62c9f5d150d582eacd1210fc9d004191b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 17:00:36 -0400 Subject: [PATCH 50/66] Rename lint workflow to CI and add unit test job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaced .github/workflows/lint.yml with ci.yml. The new file keeps the existing ruff lint/format job and adds a parallel test job that runs `pytest -v -m "not integration"` (unit tests only — no network required). 
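For reference, the job selects tests via the pytest marker already used in
this repo. A minimal sketch of how a test opts in (where the marker is
registered, e.g. pyproject.toml under [tool.pytest.ini_options], is an
assumption):

    import pytest

    def test_parse_duration_unit():
        # Plain unit test: selected by -m "not integration" on every PR.
        assert True

    @pytest.mark.integration
    def test_live_server_roundtrip():
        # Network-backed test: excluded from the per-PR run.
        assert True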
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/{lint.yml => ci.yml} | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) rename .github/workflows/{lint.yml => ci.yml} (54%) diff --git a/.github/workflows/lint.yml b/.github/workflows/ci.yml similarity index 54% rename from .github/workflows/lint.yml rename to .github/workflows/ci.yml index f4771d4..8ce9dd1 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/ci.yml @@ -1,10 +1,10 @@ -name: Lint +name: CI on: pull_request: jobs: - ruff: + lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -12,3 +12,11 @@ jobs: - run: uv sync --group dev - run: uv run ruff check src/ tests/ - run: uv run ruff format --check src/ tests/ + + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - run: uv sync --group dev + - run: uv run pytest -v -m "not integration" From 7d1d5ca438b029f8ca82032083295c1b46530f8c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 17:25:40 -0400 Subject: [PATCH 51/66] Improved documentation. --- src/babel_explorer/cli.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 0e25ea3..878720a 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -85,14 +85,12 @@ def xrefs( """ Fetches and prints the cross-references (xrefs) for the given CURIEs. - This function searches for xrefs associated with the provided CURIEs. - \f :param curies: A list of CURIEs (Compact URI) for which cross-references need to be retrieved. :type curies: list[str] - :param babel_url: Base URL of the Babel server + :param babel_url: Base URL of the Babel server from which to download DuckDB files. :type babel_url: str :return: None From 0d9e32c14b7c16a89e8e180c7437752b65bc9156 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 17:30:45 -0400 Subject: [PATCH 52/66] Add tests for parse_duration() in cli.py Covers all 5 branches: empty input, 'never', unit suffix, bare integer, and invalid values that raise click.BadParameter. Co-Authored-By: Claude Sonnet 4.6 --- tests/test_cli.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 tests/test_cli.py diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..ed12a69 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,52 @@ +""" +Tests for CLI helper functions. + +Unit tests — no network required. 
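+
+Run directly with: uv run pytest tests/test_cli.py -v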
+""" + +import pytest +import click + +from babel_explorer.cli import parse_duration + + +# ========================================================================== +# Unit Tests — no network required +# ========================================================================== + + +class TestParseDuration: + """Tests for parse_duration().""" + + @pytest.mark.parametrize( + "value, expected", + [ + ("never", float("inf")), + ("NEVER", float("inf")), + ("3h", 10800), + ("3H", 10800), + ("30m", 1800), + ("1d", 86400), + ("7200s", 7200), + ("7200", 7200), + ("0", 0), + (" 3h ", 10800), + ], + ) + def test_valid_inputs(self, value, expected): + assert parse_duration(value) == expected + + @pytest.mark.parametrize( + "value", + [ + "", + None, + "abc", + "3.5h", + "1.5", + "3x", + ], + ) + def test_invalid_inputs_raise_bad_parameter(self, value): + with pytest.raises(click.BadParameter): + parse_duration(value) From 0ca35eb7b881c7dc9d834746eb25ec6bd4492aa7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Mar 2026 17:39:41 -0400 Subject: [PATCH 53/66] Add CliRunner tests for xrefs, ids, and test-concord commands Co-Authored-By: Claude Sonnet 4.6 --- tests/test_cli.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index ed12a69..c8cc924 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,8 +6,10 @@ import pytest import click +from click.testing import CliRunner +from unittest.mock import patch, MagicMock -from babel_explorer.cli import parse_duration +from babel_explorer.cli import parse_duration, cli # ========================================================================== @@ -50,3 +52,95 @@ def test_valid_inputs(self, value, expected): def test_invalid_inputs_raise_bad_parameter(self, value): with pytest.raises(click.BadParameter): parse_duration(value) + + +class TestCliCommands: + """Tests for CLI commands using CliRunner — no network required.""" + + def test_xrefs_happy_path(self): + runner = CliRunner() + mock_xref = MagicMock() + mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" + + with patch("babel_explorer.cli.BabelDownloader"), \ + patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ + patch("babel_explorer.cli.NodeNorm"): + mock_bx.return_value.get_curie_xrefs.return_value = [mock_xref] + result = runner.invoke(cli, ["xrefs", "MONDO:0004979"]) + + assert result.exit_code == 0 + mock_bx.return_value.get_curie_xrefs.assert_called_once_with( + ("MONDO:0004979",), False, label_curies=False + ) + + def test_xrefs_recurse_and_labels_flags(self): + runner = CliRunner() + mock_xref = MagicMock() + mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" + + with patch("babel_explorer.cli.BabelDownloader"), \ + patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ + patch("babel_explorer.cli.NodeNorm"): + mock_bx.return_value.get_curie_xrefs.return_value = [mock_xref] + result = runner.invoke(cli, ["xrefs", "MONDO:0004979", "--recurse", "--labels"]) + + assert result.exit_code == 0 + mock_bx.return_value.get_curie_xrefs.assert_called_once_with( + ("MONDO:0004979",), True, label_curies=True + ) + + def test_xrefs_check_download_option(self): + runner = CliRunner() + + with patch("babel_explorer.cli.BabelDownloader") as mock_dl, \ + patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ + patch("babel_explorer.cli.NodeNorm"): + mock_bx.return_value.get_curie_xrefs.return_value = [] + result = runner.invoke(cli, ["xrefs", "MONDO:0004979", "--check-download", 
"1h"]) + + assert result.exit_code == 0 + _, kwargs = mock_dl.call_args + assert kwargs.get("freshness_seconds") == 3600 + + def test_ids_happy_path(self): + runner = CliRunner() + mock_id = MagicMock() + mock_id.__str__ = lambda self: "MONDO:0004979 record" + + with patch("babel_explorer.cli.BabelDownloader"), \ + patch("babel_explorer.cli.BabelXRefs") as mock_bx: + mock_bx.return_value.get_curie_ids.return_value = [mock_id] + result = runner.invoke(cli, ["ids", "MONDO:0004979"]) + + assert result.exit_code == 0 + mock_bx.return_value.get_curie_ids.assert_called_once_with(("MONDO:0004979",)) + + def test_test_concord_happy_path(self): + runner = CliRunner() + mock_ident = MagicMock() + mock_ident.curie = "MONDO:0004979" + mock_ident.label = "asthma" + mock_ident.biolink_type = "biolink:Disease" + + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [mock_ident] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979"]) + + assert result.exit_code == 0 + assert "asthma" in result.output + mock_nn.return_value.get_clique_identifiers.assert_called_once_with("MONDO:0004979") + + def test_test_concord_no_label(self): + runner = CliRunner() + mock_ident = MagicMock() + mock_ident.curie = "MONDO:0004979" + mock_ident.label = None + mock_ident.biolink_type = "biolink:Disease" + + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [mock_ident] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979"]) + + assert result.exit_code == 0 + assert "MONDO:0004979" in result.output + assert "biolink:Disease" in result.output From 17782a2149229c6ac33356bf54ed5b416b67ed8d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 26 Mar 2026 00:49:42 -0400 Subject: [PATCH 54/66] Reformatted code with ruff. 
--- tests/test_cli.py | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index c8cc924..09d415d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -62,9 +62,11 @@ def test_xrefs_happy_path(self): mock_xref = MagicMock() mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" - with patch("babel_explorer.cli.BabelDownloader"), \ - patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ - patch("babel_explorer.cli.NodeNorm"): + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): mock_bx.return_value.get_curie_xrefs.return_value = [mock_xref] result = runner.invoke(cli, ["xrefs", "MONDO:0004979"]) @@ -78,11 +80,15 @@ def test_xrefs_recurse_and_labels_flags(self): mock_xref = MagicMock() mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" - with patch("babel_explorer.cli.BabelDownloader"), \ - patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ - patch("babel_explorer.cli.NodeNorm"): + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): mock_bx.return_value.get_curie_xrefs.return_value = [mock_xref] - result = runner.invoke(cli, ["xrefs", "MONDO:0004979", "--recurse", "--labels"]) + result = runner.invoke( + cli, ["xrefs", "MONDO:0004979", "--recurse", "--labels"] + ) assert result.exit_code == 0 mock_bx.return_value.get_curie_xrefs.assert_called_once_with( @@ -92,11 +98,15 @@ def test_xrefs_recurse_and_labels_flags(self): def test_xrefs_check_download_option(self): runner = CliRunner() - with patch("babel_explorer.cli.BabelDownloader") as mock_dl, \ - patch("babel_explorer.cli.BabelXRefs") as mock_bx, \ - patch("babel_explorer.cli.NodeNorm"): + with ( + patch("babel_explorer.cli.BabelDownloader") as mock_dl, + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): mock_bx.return_value.get_curie_xrefs.return_value = [] - result = runner.invoke(cli, ["xrefs", "MONDO:0004979", "--check-download", "1h"]) + result = runner.invoke( + cli, ["xrefs", "MONDO:0004979", "--check-download", "1h"] + ) assert result.exit_code == 0 _, kwargs = mock_dl.call_args @@ -107,8 +117,10 @@ def test_ids_happy_path(self): mock_id = MagicMock() mock_id.__str__ = lambda self: "MONDO:0004979 record" - with patch("babel_explorer.cli.BabelDownloader"), \ - patch("babel_explorer.cli.BabelXRefs") as mock_bx: + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + ): mock_bx.return_value.get_curie_ids.return_value = [mock_id] result = runner.invoke(cli, ["ids", "MONDO:0004979"]) @@ -128,7 +140,9 @@ def test_test_concord_happy_path(self): assert result.exit_code == 0 assert "asthma" in result.output - mock_nn.return_value.get_clique_identifiers.assert_called_once_with("MONDO:0004979") + mock_nn.return_value.get_clique_identifiers.assert_called_once_with( + "MONDO:0004979" + ) def test_test_concord_no_label(self): runner = CliRunner() From d34c5c32ee272bbab4af4c241be3d95f25a9238a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 16:14:01 -0400 Subject: [PATCH 55/66] Update src/babel_explorer/cli.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/babel_explorer/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_explorer/cli.py 
b/src/babel_explorer/cli.py index 878720a..eb2cbe1 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -61,7 +61,7 @@ def cli(): "--nodenorm-url", type=str, default="https://nodenormalization-sri.renci.org/", - help="NodeNorm URL to check for concord changes", + help="NodeNorm base URL used for node normalization and label enrichment", ) @click.option("--recurse", is_flag=True, help="Recursively query returned xrefs") @click.option("--labels", is_flag=True, help="Include labels for CURIEs") From ec0a71c03b1b69acb06b3d572f01266edc7de5f8 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 16:28:10 -0400 Subject: [PATCH 56/66] Cache get_identifier() locals in _to_labeled_xref; root-anchor lib/ in .gitignore Both backported from add-nodenorm-frontend: - babel_xrefs.py: avoid calling get_identifier() twice per CURIE in _to_labeled_xref - .gitignore: anchor lib/ and lib64/ to repo root so nested lib dirs aren't ignored Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 5 +++-- src/babel_explorer/core/babel_xrefs.py | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 67d8b31..55c6b2a 100644 --- a/.gitignore +++ b/.gitignore @@ -17,8 +17,9 @@ dist/ downloads/ eggs/ .eggs/ -lib/ -lib64/ +# Python distribution lib directories (not web/src/lib/) +/lib/ +/lib64/ parts/ sdist/ var/ diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index c218761..53dfb28 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -122,15 +122,17 @@ def get_curie_xref(self, curie: str, label_curies: bool = False): def _to_labeled_xref(self, xref: CrossReference) -> LabeledCrossReference: """Convert a CrossReference to a LabeledCrossReference using NodeNorm.""" + subj_ident = self.nodenorm.get_identifier(xref.subj) + obj_ident = self.nodenorm.get_identifier(xref.obj) return LabeledCrossReference( subj=xref.subj, obj=xref.obj, filename=xref.filename, pred=xref.pred, - subj_label=self.nodenorm.get_identifier(xref.subj).label, - subj_biolink_type=self.nodenorm.get_identifier(xref.subj).biolink_type, - obj_label=self.nodenorm.get_identifier(xref.obj).label, - obj_biolink_type=self.nodenorm.get_identifier(xref.obj).biolink_type, + subj_label=subj_ident.label, + subj_biolink_type=subj_ident.biolink_type, + obj_label=obj_ident.label, + obj_biolink_type=obj_ident.biolink_type, ) def _get_curie_xrefs_recursive(self, curies: list[str], label_curies: bool = False): From 67be81e0b535de1a6a75fbbacc27153d483d6f98 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 16:47:31 -0400 Subject: [PATCH 57/66] Fix bugs and gaps identified in PR #1 code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - nodenorm.py: Identifier.biolink_type str→list[str] to match NodeNorm API - nodenorm.py: get_clique_identifiers returns [] instead of None; add return type annotation - nodenorm.py: log debug message when get_identifier finds no exact match - cli.py: parse_duration return type int|float; join biolink_type list for display - tests: update assertions for new biolink_type type; add test-concord edge cases (unknown CURIE producing no output, multiple CURIEs queried independently) - ci.yml: add workflow_dispatch trigger and integration-test job Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 10 ++++++++ src/babel_explorer/cli.py | 11 ++++----- src/babel_explorer/core/nodenorm.py | 11 +++++---- 
tests/test_cli.py | 37 +++++++++++++++++++++++++++-- tests/test_nodenorm.py | 10 ++++---- 5 files changed, 62 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8ce9dd1..5ee70c1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,6 +2,7 @@ name: CI on: pull_request: + workflow_dispatch: jobs: lint: @@ -20,3 +21,12 @@ jobs: - uses: astral-sh/setup-uv@v5 - run: uv sync --group dev - run: uv run pytest -v -m "not integration" + + integration-test: + runs-on: ubuntu-latest + if: github.event_name == 'workflow_dispatch' + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v5 + - run: uv sync --group dev + - run: uv run pytest -v -m "integration and not slow" diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index eb2cbe1..e6e3fab 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -6,7 +6,7 @@ from babel_explorer.core.nodenorm import NodeNorm -def parse_duration(value: str) -> float: +def parse_duration(value: str) -> int | float: """Parse a duration string like '3h', '30m', '1d', '7200', or 'never' → seconds.""" units = {"s": 1, "m": 60, "h": 3600, "d": 86400} lower = (value or "").strip().lower() @@ -169,13 +169,12 @@ def test_concord(curies, nodenorm_url): nodenorm = NodeNorm(nodenorm_url) for curie in curies: identifiers = nodenorm.get_clique_identifiers(curie) - for identifier in identifiers or []: + for identifier in identifiers: + biolink = ", ".join(identifier.biolink_type) if identifier.label: - print( - f"{curie}\t{identifier.curie}\t{identifier.label}\t{identifier.biolink_type}" - ) + print(f"{curie}\t{identifier.curie}\t{identifier.label}\t{biolink}") else: - print(f"{curie}\t{identifier.curie}\t\t{identifier.biolink_type}") + print(f"{curie}\t{identifier.curie}\t\t{biolink}") if __name__ == "__main__": diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index ec5b0a8..f4ead30 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -8,7 +8,7 @@ class Identifier: curie: str label: str = "" - biolink_type: str = "" + biolink_type: list[str] = dataclasses.field(default_factory=list) taxa: list[str] = dataclasses.field(default_factory=list) description: list[str] = dataclasses.field(default_factory=list) @@ -47,6 +47,9 @@ def get_identifier(self, curie: str): logging.debug(f"Found exact match for {curie}: {identifier}") return Identifier.from_dict(identifier) + logging.debug( + f"No exact match for {curie!r} in equivalent_identifiers; returning bare Identifier" + ) return Identifier(curie=curie) @functools.lru_cache(maxsize=None) @@ -83,12 +86,12 @@ def normalize_curie( return None @functools.lru_cache(maxsize=None) - def get_clique_identifiers(self, curie, **kwargs): + def get_clique_identifiers(self, curie, **kwargs) -> list[Identifier]: result = self.normalize_curie(curie, **kwargs) if not result: - return None + return [] if "equivalent_identifiers" not in result: - return None + return [] return list( map(lambda x: Identifier.from_dict(x), result["equivalent_identifiers"]) ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 09d415d..ac75fe6 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -132,7 +132,7 @@ def test_test_concord_happy_path(self): mock_ident = MagicMock() mock_ident.curie = "MONDO:0004979" mock_ident.label = "asthma" - mock_ident.biolink_type = "biolink:Disease" + mock_ident.biolink_type = ["biolink:Disease"] with patch("babel_explorer.cli.NodeNorm") as 
mock_nn: mock_nn.return_value.get_clique_identifiers.return_value = [mock_ident] @@ -149,7 +149,7 @@ def test_test_concord_no_label(self): mock_ident = MagicMock() mock_ident.curie = "MONDO:0004979" mock_ident.label = None - mock_ident.biolink_type = "biolink:Disease" + mock_ident.biolink_type = ["biolink:Disease"] with patch("babel_explorer.cli.NodeNorm") as mock_nn: mock_nn.return_value.get_clique_identifiers.return_value = [mock_ident] @@ -158,3 +158,36 @@ def test_test_concord_no_label(self): assert result.exit_code == 0 assert "MONDO:0004979" in result.output assert "biolink:Disease" in result.output + + def test_test_concord_unknown_curie_produces_no_output(self): + """When get_clique_identifiers returns [], no output is produced and exit code is 0.""" + runner = CliRunner() + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [] + result = runner.invoke(cli, ["test-concord", "UNKNOWN:9999"]) + assert result.exit_code == 0 + assert result.output.strip() == "" + + def test_test_concord_multiple_curies(self): + """Each CURIE is looked up independently.""" + runner = CliRunner() + mock_a = MagicMock() + mock_a.curie = "A:1" + mock_a.label = "Alpha" + mock_a.biolink_type = ["biolink:Disease"] + mock_b = MagicMock() + mock_b.curie = "B:2" + mock_b.label = "Beta" + mock_b.biolink_type = ["biolink:Gene"] + + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.side_effect = [ + [mock_a], + [mock_b], + ] + result = runner.invoke(cli, ["test-concord", "A:1", "B:2"]) + + assert result.exit_code == 0 + assert mock_nn.return_value.get_clique_identifiers.call_count == 2 + assert "Alpha" in result.output + assert "Beta" in result.output diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py index 363d077..71459e7 100644 --- a/tests/test_nodenorm.py +++ b/tests/test_nodenorm.py @@ -26,7 +26,7 @@ def test_creation_with_defaults(self): ident = Identifier(curie="MONDO:0004979") assert ident.curie == "MONDO:0004979" assert ident.label == "" - assert ident.biolink_type == "" + assert ident.biolink_type == [] assert ident.taxa == [] assert ident.description == [] @@ -34,12 +34,12 @@ def test_full_creation(self): ident = Identifier( curie="MONDO:0004979", label="asthma", - biolink_type="biolink:Disease", + biolink_type=["biolink:Disease"], taxa=["NCBITaxon:9606"], description=["A chronic respiratory disease"], ) assert ident.label == "asthma" - assert ident.biolink_type == "biolink:Disease" + assert ident.biolink_type == ["biolink:Disease"] assert ident.taxa == ["NCBITaxon:9606"] def test_from_dict_minimal(self): @@ -67,7 +67,7 @@ def test_from_dict_partial(self): ident = Identifier.from_dict(d) assert ident.curie == "X:1" assert ident.label == "Beta" - assert ident.biolink_type == "" + assert ident.biolink_type == [] def test_lt_ordering(self): a = Identifier(curie="A:1") @@ -232,7 +232,7 @@ def test_missing_key_returns_none(self): api_result = {"id": {"identifier": "X:1"}} # no equivalent_identifiers with patch.object(nn, "normalize_curie", return_value=api_result): result = nn.get_clique_identifiers("X:1") - assert result is None + assert result == [] def test_caching(self): nn = self._make_nn() From 4199ae2151745e9be86fe00dbd334c35f44af5a6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 16:55:35 -0400 Subject: [PATCH 58/66] Run integration tests on push to master and weekly on Tuesdays - Add push trigger for master branch (fires when PRs are merged) - Add schedule 
trigger: Tuesdays at 17:00 UTC (12pm EST / 1pm EDT) - Change integration-test job condition to run on all non-PR events Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5ee70c1..7f712ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,6 +2,10 @@ name: CI on: pull_request: + push: + branches: [master] + schedule: + - cron: "0 17 * * 2" # Tuesdays at 12pm EST (17:00 UTC); 1pm during EDT workflow_dispatch: jobs: @@ -24,7 +28,7 @@ jobs: integration-test: runs-on: ubuntu-latest - if: github.event_name == 'workflow_dispatch' + if: github.event_name != 'pull_request' steps: - uses: actions/checkout@v4 - uses: astral-sh/setup-uv@v5 From 17f6b09f9a610be97bd30eb918c2278e8b76e8b6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 18:09:17 -0400 Subject: [PATCH 59/66] Add module, class, and method docstrings to new files in PR #1 Source files: - nodenorm.py: module, Identifier class/from_dict, NodeNorm class/__init__/ get_identifier/normalize_curie/get_clique_identifiers - babel_xrefs.py: convert # comment to module docstring; CrossReference class/ from_tuple/curies property; LabeledCrossReference class; IdentifierRecord.__str__; BabelXRefs class/__init__/get_curie_xref - downloader.py: module, BabelDownloader.__init__, get_output_file - cli.py: cli() group, test_concord() command Test files (class docstrings only): - test_babel_xrefs.py: TestCrossReference, TestLabeledCrossReference, TestIdentifierRecord, TestBabelXRefsInit - test_nodenorm.py: TestIdentifier, TestNodeNormInit, TestNormalizeCurieMocked, TestGetIdentifierMocked, TestGetCliqueIdentifiersMocked Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/cli.py | 6 ++++ src/babel_explorer/core/babel_xrefs.py | 40 ++++++++++++++++++++++++-- src/babel_explorer/core/downloader.py | 14 +++++++++ src/babel_explorer/core/nodenorm.py | 33 ++++++++++++++++++++- tests/test_babel_xrefs.py | 8 ++++++ tests/test_nodenorm.py | 10 +++++++ 6 files changed, 107 insertions(+), 4 deletions(-) diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index e6e3fab..6e899df 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -40,6 +40,7 @@ def parse_duration(value: str) -> int | float: @click.group() def cli(): + """babel-explorer: query and explore Babel intermediate files.""" pass @@ -163,6 +164,11 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): help="NodeNorm URL to check for concord changes", ) def test_concord(curies, nodenorm_url): + """For each CURIE, print the current NodeNorm clique (all equivalent identifiers, labels, and Biolink types). + + Useful for inspecting how a potential Babel concordance change would affect NodeNorm: + run before and after a Babel rebuild to see how cliques would shift. + """ # We're trying to answer a simple question here: if the CURIEs we mention were combined, how would the cliques change in NodeNorm? # By definition, this can only combine all the cliques mentioned in the CURIEs. diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 53dfb28..e89acb1 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -1,6 +1,10 @@ -# Babel XRefs is a tool for accessing and querying the intermediate files -# that we make available with Babel builds. 
This allows you to find out -# why we consider two identifiers to be identical. +"""Query engine for Babel cross-reference intermediate files. + +Provides access to Concord.parquet and Identifiers.parquet via DuckDB, +allowing callers to discover why two biological/chemical identifiers are +considered identical in a Babel build. +""" + import dataclasses import logging import duckdb @@ -12,6 +16,8 @@ @dataclasses.dataclass(frozen=True) class CrossReference: + """A single cross-reference edge read from Concord.parquet.""" + filename: str subj: str pred: str @@ -19,12 +25,14 @@ class CrossReference: @staticmethod def from_tuple(tuple: tuple[str, str, str, str]): + """Construct from a ``(filename, subj, pred, obj)`` database row tuple.""" return CrossReference( filename=tuple[0], subj=tuple[1], pred=tuple[2], obj=tuple[3] ) @property def curies(self): + """The frozenset of both CURIEs in this edge (subject and object).""" return frozenset([self.subj, self.obj]) def __lt__(self, other): @@ -38,6 +46,8 @@ def __lt__(self, other): @dataclasses.dataclass(frozen=True) class LabeledCrossReference(CrossReference): + """A CrossReference enriched with human-readable labels and Biolink types from NodeNorm.""" + subj_label: str subj_biolink_type: str obj_label: str @@ -64,6 +74,7 @@ def from_row(row: tuple, column_names: list[str]): return IdentifierRecord(curie=row[curie_idx], extra_fields=extra) def __str__(self): + """Return a ``key=value`` string of the CURIE and all extra fields.""" parts = [f"curie={self.curie!r}"] for name, value in self.extra_fields: parts.append(f"{name}={value!r}") @@ -71,7 +82,20 @@ def __str__(self): class BabelXRefs: + """Query engine for Babel cross-reference and identifier Parquet files. + + Uses DuckDB for in-memory SQL queries against Concord.parquet and + Identifiers.parquet. NodeNorm is optional and only required when + ``label_curies=True`` is passed to enrichment-aware methods. + """ + def __init__(self, downloader: BabelDownloader, nodenorm: NodeNorm = None): + """ + :param downloader: A configured ``BabelDownloader`` that provides local paths + to the required Parquet files, downloading them on first access. + :param nodenorm: Optional ``NodeNorm`` client. Required only when callers pass + ``label_curies=True``; may be ``None`` for label-free queries. + """ self.downloader = downloader self.nodenorm = nodenorm @@ -101,6 +125,16 @@ def get_curie_ids(self, curies: list[str]) -> list[IdentifierRecord]: @functools.lru_cache(maxsize=None) def get_curie_xref(self, curie: str, label_curies: bool = False): + """Return all cross-references in Concord.parquet where *curie* is the subject or object. + + Results are LRU-cached per ``(curie, label_curies)`` pair. + + :param curie: The CURIE to look up. + :param label_curies: If ``True``, annotate each result with NodeNorm labels and + Biolink types. Requires a NodeNorm instance to have been passed to ``__init__``. + :raises ValueError: If ``label_curies=True`` but no NodeNorm instance is available. + :return: A list of ``CrossReference`` (or ``LabeledCrossReference``) objects. + """ if label_curies and self.nodenorm is None: raise ValueError( "label_curies=True requires a configured NodeNorm instance (nodenorm was None)." 
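A usage sketch implied by the docstrings above (the base URL and local
directory are illustrative placeholders, not project defaults):

    from babel_explorer.core.downloader import BabelDownloader
    from babel_explorer.core.nodenorm import NodeNorm
    from babel_explorer.core.babel_xrefs import BabelXRefs

    dl = BabelDownloader(
        url_base="https://example.org/babel/", local_path="data/example"
    )
    bx = BabelXRefs(dl, nodenorm=NodeNorm("https://nodenormalization-sri.renci.org/"))

    for xref in bx.get_curie_xref("MONDO:0004979", label_curies=True):
        print(xref)

    # Label-free queries need no NodeNorm instance; only label_curies=True
    # raises ValueError when nodenorm is None.
    BabelXRefs(dl).get_curie_xref("MONDO:0004979")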
diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 6ba9a38..f314c3d 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -1,3 +1,5 @@ +"""HTTP downloader for Babel Parquet files with ETag-based freshness checking.""" + import functools import json import os @@ -23,6 +25,17 @@ def __init__( freshness_seconds=3 * 3600, timeout: int = 30, ): + """ + :param url_base: Base URL of the Babel server (must end with ``/``). + :param local_path: Directory for cached downloads. Defaults to + ``tempfile.gettempdir()`` if ``None``; created automatically if it + does not exist. + :param retries: Maximum number of download retry attempts on failure. + :param freshness_seconds: How long a local file is considered fresh without + re-checking the server. Use ``float('inf')`` to never re-check, or ``0`` + to always issue a HEAD request. Defaults to 3 hours. + :param timeout: HTTP request timeout in seconds. + """ # We assume the URL base is correct (if not, we can fix it later). self.url_base = url_base self.retries = retries @@ -46,6 +59,7 @@ def __init__( @functools.lru_cache(maxsize=None) def get_output_file(self, filename): + """Return (and create) the local filesystem path for a given relative filename.""" filepath = os.path.join(self.local_path, filename) os.makedirs(os.path.dirname(filepath), exist_ok=True) return filepath diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index f4ead30..fae0a57 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -1,3 +1,5 @@ +"""NodeNorm API client for identifier normalisation and label enrichment.""" + import dataclasses import functools import requests @@ -6,6 +8,8 @@ @dataclasses.dataclass class Identifier: + """Normalised identifier record returned by the NodeNorm API.""" + curie: str label: str = "" biolink_type: list[str] = dataclasses.field(default_factory=list) @@ -17,6 +21,7 @@ def __lt__(self, other): @staticmethod def from_dict(d: dict): + """Parse an identifier entry from a NodeNorm API response dict.""" identifier = Identifier(curie=d["identifier"]) if "label" in d: identifier.label = d["label"] @@ -30,14 +35,29 @@ def from_dict(d: dict): class NodeNorm: + """Client for the NodeNormalization API (https://nodenormalization-sri.renci.org/).""" + def __init__(self, nodenorm_url: str = "", timeout: int = 30): + """ + :param nodenorm_url: Base URL of the NodeNorm service. Pass an empty string (default) + to skip all network calls and have every lookup return a bare ``Identifier``. + :param timeout: HTTP request timeout in seconds. + """ self.nodenorm_url = nodenorm_url self.timeout = timeout if self.nodenorm_url and not self.nodenorm_url.endswith("/"): self.nodenorm_url += "/" @functools.lru_cache(maxsize=None) - def get_identifier(self, curie: str): + def get_identifier(self, curie: str) -> "Identifier": + """Return the ``Identifier`` for *curie* by looking it up in its NodeNorm clique. + + Searches ``equivalent_identifiers`` for an entry whose ``identifier`` field matches + *curie* exactly. Falls back to a bare ``Identifier(curie=curie)`` (empty label and + type) if NodeNorm does not recognise the CURIE or it is not listed in the clique. + + Results are LRU-cached so repeated calls for the same CURIE are free. 
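+
+        Illustrative only (CURIE/label pair borrowed from this repo's test
+        fixtures)::
+
+            >>> nn.get_identifier("MONDO:0004979").label  # doctest: +SKIP
+            'asthma'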
+ """ result = self.normalize_curie(curie) logging.debug(f"Normalizing {curie} with NodeNorm to result: {result}") if not result: @@ -62,6 +82,12 @@ def normalize_curie( individual_types=True, include_taxa=True, ): + """Call ``get_normalized_nodes`` and return the per-CURIE result dict. + + :return: The normalisation dict for *curie* (contains ``id``, ``equivalent_identifiers``, + ``type``, etc.), or ``None`` if the CURIE is not recognised by NodeNorm. + :raises requests.HTTPError: If the API returns a non-2xx status code. + """ response = requests.get( f"{self.nodenorm_url}get_normalized_nodes", params={ @@ -87,6 +113,11 @@ def normalize_curie( @functools.lru_cache(maxsize=None) def get_clique_identifiers(self, curie, **kwargs) -> list[Identifier]: + """Return all ``Identifier`` objects in the NodeNorm clique for *curie*. + + :return: A list of ``Identifier`` objects (one per entry in ``equivalent_identifiers``), + or an empty list if the CURIE is unknown or has no equivalents. + """ result = self.normalize_curie(curie, **kwargs) if not result: return [] diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index d67f81b..7c48935 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -27,6 +27,8 @@ class TestCrossReference: + """Tests for the CrossReference frozen dataclass.""" + def test_creation(self): xr = CrossReference( filename="f.txt", subj="A:1", pred="skos:exactMatch", obj="B:2" @@ -85,6 +87,8 @@ def test_sorting(self): class TestLabeledCrossReference: + """Tests for the LabeledCrossReference frozen dataclass.""" + def test_creation(self): lxr = LabeledCrossReference( subj="A:1", @@ -149,6 +153,8 @@ def test_str(self): class TestIdentifierRecord: + """Tests for the IdentifierRecord frozen dataclass.""" + def test_creation(self): rec = IdentifierRecord(curie="MONDO:0004979") assert rec.curie == "MONDO:0004979" @@ -181,6 +187,8 @@ def test_str(self): class TestBabelXRefsInit: + """Tests for BabelXRefs constructor.""" + def test_init_without_nodenorm(self, tmp_path): dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) bx = BabelXRefs(dl) diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py index 71459e7..5fb088d 100644 --- a/tests/test_nodenorm.py +++ b/tests/test_nodenorm.py @@ -22,6 +22,8 @@ class TestIdentifier: + """Tests for the Identifier dataclass.""" + def test_creation_with_defaults(self): ident = Identifier(curie="MONDO:0004979") assert ident.curie == "MONDO:0004979" @@ -90,6 +92,8 @@ def test_sorting(self): class TestNodeNormInit: + """Tests for NodeNorm constructor and URL normalisation.""" + def test_default_url(self): nn = NodeNorm() assert nn.nodenorm_url == "" @@ -100,6 +104,8 @@ def test_custom_url(self): class TestNormalizeCurieMocked: + """Unit tests for NodeNorm.normalize_curie() with mocked HTTP responses.""" + def _make_nn(self): nn = NodeNorm(nodenorm_url="https://example.com/") nn.normalize_curie.cache_clear() @@ -156,6 +162,8 @@ def test_http_error_raises(self): class TestGetIdentifierMocked: + """Unit tests for NodeNorm.get_identifier() with mocked normalize_curie.""" + def _make_nn(self): nn = NodeNorm(nodenorm_url="https://example.com/") nn.normalize_curie.cache_clear() @@ -208,6 +216,8 @@ def test_caching(self): class TestGetCliqueIdentifiersMocked: + """Unit tests for NodeNorm.get_clique_identifiers() with mocked normalize_curie.""" + def _make_nn(self): nn = NodeNorm(nodenorm_url="https://example.com/") nn.normalize_curie.cache_clear() From 8216b0b4007bde67dcf27a3a7a3b5feb44cfc1cc Mon 
Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 1 Apr 2026 01:40:43 -0400 Subject: [PATCH 60/66] Fix LabeledCrossReference biolink_type fields to list[str]; simplify map to listcomp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - babel_xrefs.py: subj_biolink_type/obj_biolink_type str→list[str] to match Identifier.biolink_type after the nodenorm.py type change - nodenorm.py: replace map(lambda) with list comprehension in get_clique_identifiers - tests: update LabeledCrossReference construction to use list values Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/babel_xrefs.py | 4 ++-- src/babel_explorer/core/nodenorm.py | 4 +--- tests/test_babel_xrefs.py | 18 +++++++++--------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index 53dfb28..2df9ac6 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -39,9 +39,9 @@ def __lt__(self, other): @dataclasses.dataclass(frozen=True) class LabeledCrossReference(CrossReference): subj_label: str - subj_biolink_type: str + subj_biolink_type: list[str] obj_label: str - obj_biolink_type: str + obj_biolink_type: list[str] def __str__(self): return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", subj_biolink_type="{self.subj_biolink_type}", obj_label="{self.obj_label}", obj_biolink_type="{self.obj_biolink_type}")""" diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index f4ead30..f83a5d4 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -92,6 +92,4 @@ def get_clique_identifiers(self, curie, **kwargs) -> list[Identifier]: return [] if "equivalent_identifiers" not in result: return [] - return list( - map(lambda x: Identifier.from_dict(x), result["equivalent_identifiers"]) - ) + return [Identifier.from_dict(x) for x in result["equivalent_identifiers"]] diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index d67f81b..409c6fe 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -92,13 +92,13 @@ def test_creation(self): obj="B:2", filename="f", subj_label="Alpha", - subj_biolink_type="biolink:Disease", + subj_biolink_type=["biolink:Disease"], obj_label="Beta", - obj_biolink_type="biolink:Gene", + obj_biolink_type=["biolink:Gene"], ) assert lxr.subj == "A:1" assert lxr.subj_label == "Alpha" - assert lxr.obj_biolink_type == "biolink:Gene" + assert lxr.obj_biolink_type == ["biolink:Gene"] def test_inherits_from_cross_reference(self): lxr = LabeledCrossReference( @@ -107,9 +107,9 @@ def test_inherits_from_cross_reference(self): obj="B:2", filename="f", subj_label="", - subj_biolink_type="", + subj_biolink_type=[], obj_label="", - obj_biolink_type="", + obj_biolink_type=[], ) assert isinstance(lxr, CrossReference) @@ -120,9 +120,9 @@ def test_curies_property(self): obj="B:2", filename="f", subj_label="", - subj_biolink_type="", + subj_biolink_type=[], obj_label="", - obj_biolink_type="", + obj_biolink_type=[], ) assert lxr.curies == frozenset({"A:1", "B:2"}) @@ -133,9 +133,9 @@ def test_str(self): obj="B:2", filename="f", subj_label="Alpha", - subj_biolink_type="biolink:Disease", + subj_biolink_type=["biolink:Disease"], obj_label="Beta", - obj_biolink_type="biolink:Gene", + obj_biolink_type=["biolink:Gene"], ) s = str(lxr) assert "A:1" in s From 
ac418ff3d87440c77ad61b12c411cc6142a46dc1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 1 Apr 2026 02:52:54 -0400 Subject: [PATCH 61/66] Address PR #1 review: frozen Identifier, atomic rename, fail-open HEAD, type fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - nodenorm.py: Identifier is now frozen=True; rewrite from_dict as one-shot constructor to avoid post-construction mutation of lru_cache'd objects - nodenorm.py: remove **kwargs from get_clique_identifiers — unhashable and unused, would raise TypeError if any kwarg was ever passed - downloader.py: download to .tmp then os.replace() so the final file is never partially written; clean up .tmp on failure - downloader.py: _etag_matches returns True (fail open) on HEAD network error instead of False, avoiding spurious 2GB re-downloads on transient failures - cli.py: add nodenorm_url: str annotation in xrefs and test_concord; move test_concord inline comment to docstring - tests: update test_returns_false_on_request_error → test_returns_true_on_request_error - FUTURE.md: track CLI option deduplication refactor Co-Authored-By: Claude Sonnet 4.6 --- FUTURE.md | 7 +++++++ src/babel_explorer/cli.py | 10 ++++++---- src/babel_explorer/core/downloader.py | 25 +++++++++++++++++-------- src/babel_explorer/core/nodenorm.py | 25 +++++++++++-------------- tests/test_downloader.py | 5 +++-- 5 files changed, 44 insertions(+), 28 deletions(-) create mode 100644 FUTURE.md diff --git a/FUTURE.md b/FUTURE.md new file mode 100644 index 0000000..a2bb3fb --- /dev/null +++ b/FUTURE.md @@ -0,0 +1,7 @@ +# Future Work + +## Deduplicate CLI option blocks + +`--local-dir`, `--babel-url`, and `--check-download` are copy-pasted between the +`xrefs` and `ids` commands in `cli.py`. Extract a `@common_babel_options` Click +decorator so defaults are defined in one place and can't drift. diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index e6e3fab..207968f 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -76,7 +76,7 @@ def cli(): def xrefs( curies: list[str], babel_url: str, - nodenorm_url, + nodenorm_url: str, local_dir: str, recurse: bool, labels: bool, @@ -162,10 +162,12 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes", ) -def test_concord(curies, nodenorm_url): - # We're trying to answer a simple question here: if the CURIEs we mention were combined, how would the cliques change in NodeNorm? - # By definition, this can only combine all the cliques mentioned in the CURIEs. +def test_concord(curies: tuple[str, ...], nodenorm_url: str): + """ + For each input CURIE, show what clique NodeNorm currently maps it to. + Answers: if these CURIEs were merged in Babel, which NodeNorm cliques would combine? 
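+
+    Output is one tab-separated row per clique member: input CURIE, member
+    CURIE, label (blank when none), and comma-joined Biolink types.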
+ """ nodenorm = NodeNorm(nodenorm_url) for curie in curies: identifiers = nodenorm.get_clique_identifiers(curie) diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index 6ba9a38..63e6826 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -130,8 +130,10 @@ def _etag_matches(self, url, meta): response = requests.head(url, timeout=self.timeout) response.raise_for_status() except requests.RequestException as e: - self.logger.warning(f"HEAD request failed for {url}: {e}") - return False + self.logger.warning( + f"HEAD request failed for {url}: {e}; assuming file is current" + ) + return True remote_headers = response.headers @@ -321,20 +323,27 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024 * 1024): ) return local_path_to_download_to - # Tier 3: ETag changed — delete and re-download + # Tier 3: ETag changed — re-download self.logger.warning( f"Remote file changed, re-downloading: {local_path_to_download_to}" ) - os.remove(local_path_to_download_to) self.logger.info( f"Downloading {url_to_download} to {local_path_to_download_to}" ) - # Download with retry logic; get response headers back - response_headers = self._download_with_retry( - url_to_download, local_path_to_download_to, chunk_size - ) + # Download to a sibling .tmp file, then atomically replace the final destination. + # This ensures the final file is never partially written. + tmp_path = local_path_to_download_to + ".tmp" + try: + response_headers = self._download_with_retry( + url_to_download, tmp_path, chunk_size + ) + os.replace(tmp_path, local_path_to_download_to) + except Exception: + if os.path.exists(tmp_path): + os.remove(tmp_path) + raise # Save sidecar metadata if response_headers is not None: diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index f83a5d4..42845fb 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -4,7 +4,7 @@ import logging -@dataclasses.dataclass +@dataclasses.dataclass(frozen=True) class Identifier: curie: str label: str = "" @@ -16,17 +16,14 @@ def __lt__(self, other): return self.curie < other.curie @staticmethod - def from_dict(d: dict): - identifier = Identifier(curie=d["identifier"]) - if "label" in d: - identifier.label = d["label"] - if "taxa" in d: - identifier.taxa = d["taxa"] - if "description" in d: - identifier.description = d["description"] - if "type" in d: - identifier.biolink_type = d["type"] - return identifier + def from_dict(d: dict) -> "Identifier": + return Identifier( + curie=d["identifier"], + label=d.get("label", ""), + biolink_type=d.get("type", []), + taxa=d.get("taxa", []), + description=d.get("description", []), + ) class NodeNorm: @@ -86,8 +83,8 @@ def normalize_curie( return None @functools.lru_cache(maxsize=None) - def get_clique_identifiers(self, curie, **kwargs) -> list[Identifier]: - result = self.normalize_curie(curie, **kwargs) + def get_clique_identifiers(self, curie: str) -> list[Identifier]: + result = self.normalize_curie(curie) if not result: return [] if "equivalent_identifiers" not in result: diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 9b33e7a..d23b323 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -267,14 +267,15 @@ def test_fallback_last_modified_match(self, tmp_path): ): assert dl._etag_matches("https://example.com/f.parquet", meta) is True - def test_returns_false_on_request_error(self, tmp_path): + def 
test_returns_true_on_request_error(self, tmp_path): + """Network errors are treated as 'assume still fresh' to avoid triggering large re-downloads.""" dl = self._make_dl(tmp_path) meta = {"etag": '"abc"'} with patch( "babel_explorer.core.downloader.requests.head", side_effect=requests.ConnectionError("fail"), ): - assert dl._etag_matches("https://example.com/f.parquet", meta) is False + assert dl._etag_matches("https://example.com/f.parquet", meta) is True class TestGetDownloadedFileTiers: From 06cd300786b69e5f4437f57897b704ad7452171b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 1 Apr 2026 02:58:46 -0400 Subject: [PATCH 62/66] Sync CLAUDE.md with current code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix --expand → --recurse (the actual flag name) in Data Flow and Key Design Patterns - BabelXRefs: remove false claim about writing DuckDB databases to disk; all connections are in-memory (duckdb.connect() with no path) - Remove 'Generated DuckDB databases' entry from File Locations (nothing on disk) - Update test count table: numbers were stale and test_cli.py was missing entirely - Add Identifier to Key Dataclasses (now frozen=True as of recent fix) Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 7ad79fb..8ecea72 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -79,9 +79,9 @@ uv run ruff format 2. **BabelXRefs** (`src/babel_explorer/core/babel_xrefs.py`): - Main query engine for cross-references - - Uses DuckDB to query Parquet files (`Concord.parquet`, `Identifiers.parquet`, `Metadata.parquet`) - - Supports recursive expansion of cross-references - - Creates ephemeral DuckDB databases in `data//output/duckdbs/` + - Uses DuckDB to query Parquet files (`Concord.parquet`, `Identifiers.parquet`) + - Supports recursive expansion of cross-references via a single `WITH RECURSIVE` query + - Uses ephemeral in-memory DuckDB connections (nothing written to disk) 3. **NodeNorm** (`src/babel_explorer/core/nodenorm.py`): - Integration with NodeNormalization API (https://nodenormalization-sri.renci.org/) @@ -98,14 +98,14 @@ uv run ruff format 1. User provides CURIEs via CLI 2. BabelDownloader ensures required Parquet files are downloaded 3. BabelXRefs queries files using DuckDB -4. If `--labels` or `--expand` flags are set, NodeNorm is queried for additional metadata +4. If `--labels` or `--recurse` flags are set, NodeNorm is queried for additional metadata 5. 
Results are printed to stdout ### Key Design Patterns - **Lazy downloading**: Files are only downloaded when first accessed - **LRU caching**: Heavy use of `@functools.lru_cache` to avoid redundant downloads and API calls -- **Recursive expansion**: The `--expand` flag recursively follows all cross-references to build complete graphs +- **Recursive expansion**: The `--recurse` flag recursively follows all cross-references to build complete graphs - **DuckDB for querying**: In-memory SQL queries against Parquet files for fast lookups ## Testing @@ -119,9 +119,10 @@ Tests live in `tests/` and are split into fast **unit tests** (mocked, no networ | File | Unit | Integration | Slow | Total | |------|------|-------------|------|-------| -| `tests/test_downloader.py` | 22 | 3 | 1 | 26 | -| `tests/test_babel_xrefs.py` | 22 | 8 | 1 | 31 | -| `tests/test_nodenorm.py` | 18 | 5 | 0 | 23 | +| `tests/test_downloader.py` | 41 | 4 | 1 | 46 | +| `tests/test_babel_xrefs.py` | 23 | 20 | 3 | 46 | +| `tests/test_nodenorm.py` | 20 | 13 | 0 | 33 | +| `tests/test_cli.py` | 24 | 0 | 0 | 24 | ### Test Infrastructure @@ -131,6 +132,7 @@ Tests live in `tests/` and are split into fast **unit tests** (mocked, no networ ### Key Dataclasses +- **`Identifier`** — Frozen dataclass for a normalized NodeNorm entry (curie, label, biolink_type, taxa, description). Returned by `NodeNorm.get_identifier()` and `get_clique_identifiers()`. - **`CrossReference`** — Frozen dataclass for Concord.parquet rows (filename, subj, pred, obj) - **`LabeledCrossReference`** — Extends CrossReference with labels and biolink types from NodeNorm - **`IdentifierRecord`** — Frozen dataclass for Identifiers.parquet rows (curie + dynamic extra fields). Returned by `BabelXRefs.get_curie_ids()`. @@ -146,5 +148,4 @@ Tests live in `tests/` and are split into fast **unit tests** (mocked, no networ - Tests: `tests/` - Test CURIEs: `tests/data/valid_curies.txt` - Downloaded Babel files: `data//duckdb/*.parquet` -- Generated DuckDB databases: `data//output/duckdbs/` - Entry point: `src/babel_explorer/cli.py` From bf1c48c06ff7fa1c3adcd458f71684da9bb10b7b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 1 Apr 2026 18:49:09 -0400 Subject: [PATCH 63/66] Address PR #1 review: fix six correctness and quality issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - list→tuple on Identifier and LabeledCrossReference fields so frozen dataclasses are hashable (was a TypeError crash in get_curie_xrefs) - NodeNorm(''): add early return in normalize_curie so empty URL truly skips all network calls as documented - BabelDownloader: auto-append trailing slash to url_base so urljoin can't silently drop path segments - CI: fix push trigger branch master → main - Remove dead get_downloaded_dir method (lru_cache + NotImplementedError) - parse_duration: reject negative values with a clear BadParameter error Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 2 +- src/babel_explorer/cli.py | 11 ++++++++- src/babel_explorer/core/babel_xrefs.py | 4 ++-- src/babel_explorer/core/downloader.py | 19 ++-------------- src/babel_explorer/core/nodenorm.py | 14 +++++++----- tests/test_babel_xrefs.py | 18 +++++++-------- tests/test_cli.py | 2 ++ tests/test_downloader.py | 23 ++++++++++--------- tests/test_nodenorm.py | 31 +++++++++++++++++--------- 9 files changed, 67 insertions(+), 57 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f712ae..c3ef4c8 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -3,7 +3,7 @@ name: CI on: pull_request: push: - branches: [master] + branches: [main] schedule: - cron: "0 17 * * 2" # Tuesdays at 12pm EST (17:00 UTC); 1pm during EDT workflow_dispatch: diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 1854cc4..bcd8787 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -27,15 +27,24 @@ def parse_duration(value: str) -> int | float: f"Invalid duration {value!r}: expected an integer followed by an optional unit " "('s', 'm', 'h', or 'd'), or 'never'." ) + if amount < 0: + raise click.BadParameter( + f"Invalid duration {value!r}: duration must be non-negative." + ) return amount * units[lower[-1]] # Bare integer seconds try: - return int(lower) + result = int(lower) except ValueError: raise click.BadParameter( f"Invalid duration {value!r}: expected an integer number of seconds, optionally " "followed by 's', 'm', 'h', or 'd', or 'never'." ) + if result < 0: + raise click.BadParameter( + f"Invalid duration {value!r}: duration must be non-negative." + ) + return result @click.group() diff --git a/src/babel_explorer/core/babel_xrefs.py b/src/babel_explorer/core/babel_xrefs.py index a8b94b3..0c94074 100644 --- a/src/babel_explorer/core/babel_xrefs.py +++ b/src/babel_explorer/core/babel_xrefs.py @@ -49,9 +49,9 @@ class LabeledCrossReference(CrossReference): """A CrossReference enriched with human-readable labels and Biolink types from NodeNorm.""" subj_label: str - subj_biolink_type: list[str] + subj_biolink_type: tuple[str, ...] obj_label: str - obj_biolink_type: list[str] + obj_biolink_type: tuple[str, ...] def __str__(self): return f"""LabeledCrossReference(subj="{self.subj}", pred="{self.pred}", obj="{self.obj}", subj_label="{self.subj_label}", subj_biolink_type="{self.subj_biolink_type}", obj_label="{self.obj_label}", obj_biolink_type="{self.obj_biolink_type}")""" diff --git a/src/babel_explorer/core/downloader.py b/src/babel_explorer/core/downloader.py index c5dcada..4d2f9a2 100644 --- a/src/babel_explorer/core/downloader.py +++ b/src/babel_explorer/core/downloader.py @@ -36,7 +36,8 @@ def __init__( to always issue a HEAD request. Defaults to 3 hours. :param timeout: HTTP request timeout in seconds. """ - # We assume the URL base is correct (if not, we can fix it later). + if not url_base.endswith("/"): + url_base += "/" self.url_base = url_base self.retries = retries self.freshness_seconds = freshness_seconds @@ -368,19 +369,3 @@ def get_downloaded_file(self, dirpath: str, chunk_size: int = 1024 * 1024): f"Downloaded {url_to_download} to {local_path_to_download_to}: {bytes_downloaded} bytes" ) return local_path_to_download_to - - @functools.lru_cache(maxsize=None) - def get_downloaded_dir(self, dirpath: str): - """ - Download a directory recursively. - - NOTE: This method is not implemented in the Python-based downloader. - Use get_downloaded_file() for individual files instead. - - Raises: - NotImplementedError: This method is not implemented - """ - raise NotImplementedError( - "Recursive directory downloads are not supported. " - "Use get_downloaded_file() for individual files." 
- ) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index 11a7da9..9ce916d 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -12,9 +12,9 @@ class Identifier: curie: str label: str = "" - biolink_type: list[str] = dataclasses.field(default_factory=list) - taxa: list[str] = dataclasses.field(default_factory=list) - description: list[str] = dataclasses.field(default_factory=list) + biolink_type: tuple[str, ...] = () + taxa: tuple[str, ...] = () + description: tuple[str, ...] = () def __lt__(self, other): return self.curie < other.curie @@ -24,9 +24,9 @@ def from_dict(d: dict) -> "Identifier": return Identifier( curie=d["identifier"], label=d.get("label", ""), - biolink_type=d.get("type", []), - taxa=d.get("taxa", []), - description=d.get("description", []), + biolink_type=tuple(d.get("type", [])), + taxa=tuple(d.get("taxa", [])), + description=tuple(d.get("description", [])), ) @@ -84,6 +84,8 @@ def normalize_curie( ``type``, etc.), or ``None`` if the CURIE is not recognised by NodeNorm. :raises requests.HTTPError: If the API returns a non-2xx status code. """ + if not self.nodenorm_url: + return None response = requests.get( f"{self.nodenorm_url}get_normalized_nodes", params={ diff --git a/tests/test_babel_xrefs.py b/tests/test_babel_xrefs.py index 9fdb5da..114d269 100644 --- a/tests/test_babel_xrefs.py +++ b/tests/test_babel_xrefs.py @@ -96,13 +96,13 @@ def test_creation(self): obj="B:2", filename="f", subj_label="Alpha", - subj_biolink_type=["biolink:Disease"], + subj_biolink_type=("biolink:Disease",), obj_label="Beta", - obj_biolink_type=["biolink:Gene"], + obj_biolink_type=("biolink:Gene",), ) assert lxr.subj == "A:1" assert lxr.subj_label == "Alpha" - assert lxr.obj_biolink_type == ["biolink:Gene"] + assert lxr.obj_biolink_type == ("biolink:Gene",) def test_inherits_from_cross_reference(self): lxr = LabeledCrossReference( @@ -111,9 +111,9 @@ def test_inherits_from_cross_reference(self): obj="B:2", filename="f", subj_label="", - subj_biolink_type=[], + subj_biolink_type=(), obj_label="", - obj_biolink_type=[], + obj_biolink_type=(), ) assert isinstance(lxr, CrossReference) @@ -124,9 +124,9 @@ def test_curies_property(self): obj="B:2", filename="f", subj_label="", - subj_biolink_type=[], + subj_biolink_type=(), obj_label="", - obj_biolink_type=[], + obj_biolink_type=(), ) assert lxr.curies == frozenset({"A:1", "B:2"}) @@ -137,9 +137,9 @@ def test_str(self): obj="B:2", filename="f", subj_label="Alpha", - subj_biolink_type=["biolink:Disease"], + subj_biolink_type=("biolink:Disease",), obj_label="Beta", - obj_biolink_type=["biolink:Gene"], + obj_biolink_type=("biolink:Gene",), ) s = str(lxr) assert "A:1" in s diff --git a/tests/test_cli.py b/tests/test_cli.py index ac75fe6..3d73e55 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -47,6 +47,8 @@ def test_valid_inputs(self, value, expected): "3.5h", "1.5", "3x", + "-5", + "-5h", ], ) def test_invalid_inputs_raise_bad_parameter(self, value): diff --git a/tests/test_downloader.py b/tests/test_downloader.py index d23b323..7fe8609 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -60,6 +60,19 @@ def test_custom_freshness_seconds(self, tmp_path): ) assert dl.freshness_seconds == 0 + def test_url_base_trailing_slash_added(self, tmp_path): + """url_base without trailing slash gets one appended automatically.""" + dl = BabelDownloader( + url_base="https://example.com/path", local_path=str(tmp_path) + ) + assert dl.url_base == 
"https://example.com/path/" + + def test_url_base_with_trailing_slash_unchanged(self, tmp_path): + dl = BabelDownloader( + url_base="https://example.com/path/", local_path=str(tmp_path) + ) + assert dl.url_base == "https://example.com/path/" + def test_invalid_path_raises_value_error(self): """Using a file path (not a directory) should raise ValueError.""" with tempfile.NamedTemporaryFile(delete=False) as f: @@ -610,16 +623,6 @@ def test_append_mode_on_resume(self, tmp_path): assert out_path.read_bytes() == b"startend" -class TestGetDownloadedDir: - """Tests for get_downloaded_dir.""" - - def test_raises_not_implemented(self, tmp_path): - dl = BabelDownloader(url_base="https://example.com/", local_path=str(tmp_path)) - dl.get_downloaded_dir.cache_clear() - with pytest.raises(NotImplementedError): - dl.get_downloaded_dir("some/dir") - - # ========================================================================== # Integration Tests — require network access # ========================================================================== diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py index 5fb088d..8b30fcd 100644 --- a/tests/test_nodenorm.py +++ b/tests/test_nodenorm.py @@ -28,21 +28,21 @@ def test_creation_with_defaults(self): ident = Identifier(curie="MONDO:0004979") assert ident.curie == "MONDO:0004979" assert ident.label == "" - assert ident.biolink_type == [] - assert ident.taxa == [] - assert ident.description == [] + assert ident.biolink_type == () + assert ident.taxa == () + assert ident.description == () def test_full_creation(self): ident = Identifier( curie="MONDO:0004979", label="asthma", - biolink_type=["biolink:Disease"], - taxa=["NCBITaxon:9606"], - description=["A chronic respiratory disease"], + biolink_type=("biolink:Disease",), + taxa=("NCBITaxon:9606",), + description=("A chronic respiratory disease",), ) assert ident.label == "asthma" - assert ident.biolink_type == ["biolink:Disease"] - assert ident.taxa == ["NCBITaxon:9606"] + assert ident.biolink_type == ("biolink:Disease",) + assert ident.taxa == ("NCBITaxon:9606",) def test_from_dict_minimal(self): d = {"identifier": "X:1"} @@ -61,15 +61,15 @@ def test_from_dict_full(self): ident = Identifier.from_dict(d) assert ident.curie == "X:1" assert ident.label == "Alpha" - assert ident.biolink_type == ["biolink:NamedThing"] - assert ident.taxa == ["NCBITaxon:9606"] + assert ident.biolink_type == ("biolink:NamedThing",) + assert ident.taxa == ("NCBITaxon:9606",) def test_from_dict_partial(self): d = {"identifier": "X:1", "label": "Beta"} ident = Identifier.from_dict(d) assert ident.curie == "X:1" assert ident.label == "Beta" - assert ident.biolink_type == [] + assert ident.biolink_type == () def test_lt_ordering(self): a = Identifier(curie="A:1") @@ -102,6 +102,15 @@ def test_custom_url(self): nn = NodeNorm(nodenorm_url="https://custom.api/") assert nn.nodenorm_url == "https://custom.api/" + def test_empty_url_normalize_curie_returns_none_without_network(self): + """NodeNorm('') must not make any HTTP calls and must return None.""" + nn = NodeNorm("") + nn.normalize_curie.cache_clear() + with patch("babel_explorer.core.nodenorm.requests.get") as mock_get: + result = nn.normalize_curie("MONDO:0004979") + mock_get.assert_not_called() + assert result is None + class TestNormalizeCurieMocked: """Unit tests for NodeNorm.normalize_curie() with mocked HTTP responses.""" From 49e27c01a3a0f10916f1b99d13e412caa1ef4d43 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 9 Apr 2026 11:45:33 -0600 Subject: [PATCH 64/66] Add 
--format [text|json|tsv|csv] option to all CLI commands Introduces a central formatting.py module (write_records + _record_to_dict) that serialises any dataclass to text, JSON, TSV, or CSV without touching domain objects. A format_option decorator wires --format and --json-indent onto xrefs, ids, and test-concord. test-concord injects a query_curie column for non-text formats. 30 new unit tests in test_formatting.py; 7 CLI format tests added to test_cli.py. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/cli.py | 56 ++++++-- src/babel_explorer/formatting.py | 71 ++++++++++ tests/test_cli.py | 138 +++++++++++++++++- tests/test_formatting.py | 233 +++++++++++++++++++++++++++++++ 4 files changed, 483 insertions(+), 15 deletions(-) create mode 100644 src/babel_explorer/formatting.py create mode 100644 tests/test_formatting.py diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index bcd8787..098e831 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -4,6 +4,26 @@ from babel_explorer.core.downloader import BabelDownloader from babel_explorer.core.babel_xrefs import BabelXRefs from babel_explorer.core.nodenorm import NodeNorm +from babel_explorer.formatting import write_records, _record_to_dict + + +def format_option(f): + """Decorator adding --format and --json-indent options to a command.""" + f = click.option( + "--format", + "fmt", + default="text", + type=click.Choice(["text", "json", "tsv", "csv"]), + show_default=True, + help="Output format", + )(f) + f = click.option( + "--json-indent", + default=2, + show_default=True, + help="Indentation depth for JSON output", + )(f) + return f def parse_duration(value: str) -> int | float: @@ -83,6 +103,7 @@ def cli(): help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.", ) +@format_option def xrefs( curies: list[str], babel_url: str, @@ -91,6 +112,8 @@ def xrefs( recurse: bool, labels: bool, check_download: str, + fmt: str, + json_indent: int, ): """ Fetches and prints the cross-references (xrefs) for the given CURIEs. @@ -113,8 +136,7 @@ def xrefs( NodeNorm(nodenorm_url), ) xrefs = bxref.get_curie_xrefs(curies, recurse, label_curies=labels) - for xref in xrefs: - print(xref) + write_records(xrefs, fmt=fmt, indent=json_indent) @cli.command("ids") @@ -139,7 +161,8 @@ def xrefs( help="How often to re-check downloads (e.g. '3h', '30m', '1d', '0', 'never'). " "'never' disables re-checking and always uses cached files; '0' forces a re-check every time.", ) -def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): +@format_option +def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str, fmt: str, json_indent: int): """ Fetches and prints the ID records for the given CURIEs, along with Biolink type if provided. 
@@ -160,8 +183,7 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness) ) xrefs = bxref.get_curie_ids(curies) - for xref in xrefs: - print(xref) + write_records(xrefs, fmt=fmt, indent=json_indent) @cli.command("test-concord") @@ -172,21 +194,27 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str): default="https://nodenormalization-sri.renci.org/", help="NodeNorm URL to check for concord changes", ) -def test_concord(curies, nodenorm_url): +@format_option +def test_concord(curies, nodenorm_url, fmt, json_indent): """For each CURIE, print the current NodeNorm clique (all equivalent identifiers, labels, and Biolink types). Useful for inspecting how a potential Babel concordance change would affect NodeNorm: run before and after a Babel rebuild to see how cliques would shift. """ nodenorm = NodeNorm(nodenorm_url) - for curie in curies: - identifiers = nodenorm.get_clique_identifiers(curie) - for identifier in identifiers: - biolink = ", ".join(identifier.biolink_type) - if identifier.label: - print(f"{curie}\t{identifier.curie}\t{identifier.label}\t{biolink}") - else: - print(f"{curie}\t{identifier.curie}\t\t{biolink}") + if fmt == "text": + for curie in curies: + for identifier in nodenorm.get_clique_identifiers(curie): + biolink = ", ".join(identifier.biolink_type) + label = identifier.label or "" + print(f"{curie}\t{identifier.curie}\t{label}\t{biolink}") + else: + rows = [ + {"query_curie": curie, **_record_to_dict(ident)} + for curie in curies + for ident in nodenorm.get_clique_identifiers(curie) + ] + write_records(rows, fmt=fmt, indent=json_indent) if __name__ == "__main__": diff --git a/src/babel_explorer/formatting.py b/src/babel_explorer/formatting.py new file mode 100644 index 0000000..7e7701e --- /dev/null +++ b/src/babel_explorer/formatting.py @@ -0,0 +1,71 @@ +"""Output formatting for babel-explorer CLI commands. + +Provides write_records() to render any list of dataclass records (or plain +dicts) as text, JSON, TSV, or CSV. +""" + +import csv +import dataclasses +import json +import sys +from typing import Any + + +def _record_to_dict(record) -> dict[str, Any]: + """Convert a dataclass (or plain dict) to a flat dict. + + Handles IdentifierRecord's extra_fields, which asdict() returns as a + list of [col, val] pairs rather than a nested dict. + """ + if isinstance(record, dict): + return record + d = dataclasses.asdict(record) + if "extra_fields" in d: + for col, val in d.pop("extra_fields"): + d[col] = val + return d + + +def _flatten_for_tabular(row: dict) -> dict: + """Convert list/tuple fields to pipe-joined strings for TSV/CSV output.""" + return {k: "|".join(v) if isinstance(v, (list, tuple)) else v for k, v in row.items()} + + +def write_records(records, fmt: str, indent: int = 2, file=None): + """Write an iterable of dataclass records (or dicts) in the requested format. + + :param records: Iterable of dataclass instances or plain dicts. + :param fmt: One of "text", "json", "tsv", "csv". + :param indent: JSON indentation depth (ignored for other formats). + :param file: Output file-like object; defaults to sys.stdout. + :raises ValueError: If fmt is not a recognised format. 
+ """ + if file is None: + file = sys.stdout + records = list(records) + + if fmt == "text": + for r in records: + print(r, file=file) + + elif fmt == "json": + rows = [_record_to_dict(r) for r in records] + json.dump(rows, file, indent=indent, default=str) + print(file=file) # trailing newline + + elif fmt in ("tsv", "csv"): + if not records: + return + rows = [_flatten_for_tabular(_record_to_dict(r)) for r in records] + delimiter = "\t" if fmt == "tsv" else "," + writer = csv.DictWriter( + file, + fieldnames=list(rows[0].keys()), + delimiter=delimiter, + lineterminator="\n", + ) + writer.writeheader() + writer.writerows(rows) + + else: + raise ValueError(f"Unknown format: {fmt!r}") diff --git a/tests/test_cli.py b/tests/test_cli.py index 3d73e55..62112d4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,12 +4,16 @@ Unit tests — no network required. """ -import pytest +import json + import click +import pytest from click.testing import CliRunner from unittest.mock import patch, MagicMock from babel_explorer.cli import parse_duration, cli +from babel_explorer.core.babel_xrefs import CrossReference, IdentifierRecord +from babel_explorer.core.nodenorm import Identifier # ========================================================================== @@ -193,3 +197,135 @@ def test_test_concord_multiple_curies(self): assert mock_nn.return_value.get_clique_identifiers.call_count == 2 assert "Alpha" in result.output assert "Beta" in result.output + + +class TestOutputFormats: + """Tests for --format option on all commands.""" + + # Shared real dataclass instances (no mocking needed for formatting logic) + _xref = CrossReference(filename="Concord.parquet", subj="A:1", pred="skos:exactMatch", obj="B:2") + _id_record = IdentifierRecord(curie="A:1", extra_fields=(("type", "gene"), ("label", "Alpha"))) + _identifier = Identifier( + curie="MONDO:0004979", label="asthma", + biolink_type=("biolink:Disease",), taxa=(), description=(), + ) + + # -- xrefs -- + + def test_xrefs_format_json(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): + mock_bx.return_value.get_curie_xrefs.return_value = [self._xref] + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "json"]) + + assert result.exit_code == 0 + data = json.loads(result.output) + assert isinstance(data, list) + assert data[0]["subj"] == "A:1" + assert data[0]["obj"] == "B:2" + + def test_xrefs_format_tsv(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): + mock_bx.return_value.get_curie_xrefs.return_value = [self._xref] + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "tsv"]) + + assert result.exit_code == 0 + lines = result.output.splitlines() + assert lines[0] == "filename\tsubj\tpred\tobj" + assert "A:1" in lines[1] + + def test_xrefs_format_csv(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): + mock_bx.return_value.get_curie_xrefs.return_value = [self._xref] + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "csv"]) + + assert result.exit_code == 0 + lines = result.output.splitlines() + assert lines[0] == "filename,subj,pred,obj" + assert "A:1" in lines[1] + + # -- ids -- + + def 
test_ids_format_json_expands_extra_fields(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + ): + mock_bx.return_value.get_curie_ids.return_value = [self._id_record] + result = runner.invoke(cli, ["ids", "A:1", "--format", "json"]) + + assert result.exit_code == 0 + data = json.loads(result.output) + assert data[0]["curie"] == "A:1" + assert data[0]["type"] == "gene" + assert data[0]["label"] == "Alpha" + assert "extra_fields" not in data[0] + + def test_ids_format_tsv_expands_extra_fields(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + ): + mock_bx.return_value.get_curie_ids.return_value = [self._id_record] + result = runner.invoke(cli, ["ids", "A:1", "--format", "tsv"]) + + assert result.exit_code == 0 + lines = result.output.splitlines() + assert "type" in lines[0] + assert "label" in lines[0] + assert "gene" in lines[1] + + # -- test-concord -- + + def test_test_concord_format_json_includes_query_curie(self): + runner = CliRunner() + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [self._identifier] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979", "--format", "json"]) + + assert result.exit_code == 0 + data = json.loads(result.output) + assert data[0]["query_curie"] == "MONDO:0004979" + assert data[0]["curie"] == "MONDO:0004979" + assert data[0]["label"] == "asthma" + assert data[0]["biolink_type"] == ["biolink:Disease"] + + def test_test_concord_format_tsv(self): + runner = CliRunner() + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [self._identifier] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979", "--format", "tsv"]) + + assert result.exit_code == 0 + lines = result.output.splitlines() + assert "query_curie" in lines[0] + assert "MONDO:0004979" in lines[1] + + # -- format validation -- + + def test_invalid_format_rejected_by_click(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs"), + patch("babel_explorer.cli.NodeNorm"), + ): + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "xml"]) + + assert result.exit_code != 0 diff --git a/tests/test_formatting.py b/tests/test_formatting.py new file mode 100644 index 0000000..9b9aa90 --- /dev/null +++ b/tests/test_formatting.py @@ -0,0 +1,233 @@ +""" +Unit tests for formatting.py — no network, no mocking required. 
+""" + +import io +import json + +import pytest + +from babel_explorer.core.babel_xrefs import CrossReference, LabeledCrossReference, IdentifierRecord +from babel_explorer.core.nodenorm import Identifier +from babel_explorer.formatting import _record_to_dict, write_records + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def xref(): + return CrossReference(filename="Concord.parquet", subj="A:1", pred="skos:exactMatch", obj="B:2") + + +@pytest.fixture +def labeled_xref(): + return LabeledCrossReference( + filename="Concord.parquet", + subj="A:1", + pred="skos:exactMatch", + obj="B:2", + subj_label="Alpha", + subj_biolink_type=("biolink:Disease",), + obj_label="Beta", + obj_biolink_type=("biolink:Gene", "biolink:NamedThing"), + ) + + +@pytest.fixture +def id_record(): + return IdentifierRecord( + curie="A:1", + extra_fields=(("type", "gene"), ("label", "Alpha")), + ) + + +@pytest.fixture +def identifier(): + return Identifier( + curie="MONDO:0004979", + label="asthma", + biolink_type=("biolink:Disease",), + taxa=("NCBITaxon:9606",), + description=("A chronic inflammatory disease",), + ) + + +# --------------------------------------------------------------------------- +# Tests for _record_to_dict +# --------------------------------------------------------------------------- + + +class TestRecordToDict: + def test_cross_reference(self, xref): + d = _record_to_dict(xref) + assert d == {"filename": "Concord.parquet", "subj": "A:1", "pred": "skos:exactMatch", "obj": "B:2"} + + def test_labeled_cross_reference_has_all_eight_fields(self, labeled_xref): + d = _record_to_dict(labeled_xref) + assert set(d.keys()) == { + "filename", "subj", "pred", "obj", + "subj_label", "subj_biolink_type", "obj_label", "obj_biolink_type", + } + # dataclasses.asdict() preserves tuple types + assert d["subj_biolink_type"] == ("biolink:Disease",) + assert d["obj_biolink_type"] == ("biolink:Gene", "biolink:NamedThing") + + def test_identifier_record_extra_fields_expanded(self, id_record): + d = _record_to_dict(id_record) + assert "extra_fields" not in d + assert d["curie"] == "A:1" + assert d["type"] == "gene" + assert d["label"] == "Alpha" + + def test_identifier_record_no_extra_fields(self): + rec = IdentifierRecord(curie="X:1") + d = _record_to_dict(rec) + assert d == {"curie": "X:1"} + + def test_plain_dict_passthrough(self): + data = {"a": 1, "b": "hello"} + assert _record_to_dict(data) is data + + def test_identifier_dataclass(self, identifier): + d = _record_to_dict(identifier) + assert d["curie"] == "MONDO:0004979" + assert d["label"] == "asthma" + # dataclasses.asdict() preserves tuple types + assert d["biolink_type"] == ("biolink:Disease",) + assert d["taxa"] == ("NCBITaxon:9606",) + + +# --------------------------------------------------------------------------- +# Tests for write_records +# --------------------------------------------------------------------------- + + +class TestWriteRecords: + + # -- text format -- + + def test_text_uses_str(self, xref): + out = io.StringIO() + write_records([xref], "text", file=out) + assert out.getvalue().strip() == str(xref) + + def test_text_empty_no_output(self): + out = io.StringIO() + write_records([], "text", file=out) + assert out.getvalue() == "" + + def test_text_multiple_records(self, xref): + out = io.StringIO() + write_records([xref, xref], "text", file=out) + lines = out.getvalue().strip().splitlines() + assert 
len(lines) == 2 + + # -- json format -- + + def test_json_is_valid_list(self, xref): + out = io.StringIO() + write_records([xref], "json", file=out) + data = json.loads(out.getvalue()) + assert isinstance(data, list) + assert len(data) == 1 + assert data[0]["subj"] == "A:1" + + def test_json_empty_list(self): + out = io.StringIO() + write_records([], "json", file=out) + assert json.loads(out.getvalue()) == [] + + def test_json_indent_controls_formatting(self, xref): + out_pretty = io.StringIO() + write_records([xref], "json", indent=2, file=out_pretty) + + out_compact = io.StringIO() + write_records([xref], "json", indent=None, file=out_compact) + + # Pretty-printed output has more lines (has newlines per field) + assert out_pretty.getvalue().count("\n") > out_compact.getvalue().count("\n") + + def test_json_tuple_fields_serialized_as_arrays(self, labeled_xref): + # json.dump converts tuples to JSON arrays, so json.loads gives back lists + out = io.StringIO() + write_records([labeled_xref], "json", file=out) + data = json.loads(out.getvalue()) + assert isinstance(data[0]["subj_biolink_type"], list) + assert data[0]["obj_biolink_type"] == ["biolink:Gene", "biolink:NamedThing"] + + def test_json_plain_dict(self): + out = io.StringIO() + write_records([{"a": 1, "b": "x"}], "json", file=out) + assert json.loads(out.getvalue()) == [{"a": 1, "b": "x"}] + + # -- tsv format -- + + def test_tsv_has_header_row(self, xref): + out = io.StringIO() + write_records([xref], "tsv", file=out) + lines = out.getvalue().splitlines() + assert lines[0] == "filename\tsubj\tpred\tobj" + + def test_tsv_data_row(self, xref): + out = io.StringIO() + write_records([xref], "tsv", file=out) + lines = out.getvalue().splitlines() + assert lines[1] == "Concord.parquet\tA:1\tskos:exactMatch\tB:2" + + def test_tsv_tuple_fields_pipe_joined(self, labeled_xref): + out = io.StringIO() + write_records([labeled_xref], "tsv", file=out) + lines = out.getvalue().splitlines() + # Header row + assert "subj_biolink_type" in lines[0] + # Data row: multi-value tuple joined with pipe + assert "biolink:Gene|biolink:NamedThing" in lines[1] + + def test_tsv_empty_no_output(self): + out = io.StringIO() + write_records([], "tsv", file=out) + assert out.getvalue() == "" + + def test_tsv_identifier_record_extra_fields_expanded(self, id_record): + out = io.StringIO() + write_records([id_record], "tsv", file=out) + lines = out.getvalue().splitlines() + assert "curie" in lines[0] + assert "type" in lines[0] + assert "label" in lines[0] + assert "A:1" in lines[1] + + # -- csv format -- + + def test_csv_has_header_row(self, xref): + out = io.StringIO() + write_records([xref], "csv", file=out) + lines = out.getvalue().splitlines() + assert lines[0] == "filename,subj,pred,obj" + + def test_csv_data_row(self, xref): + out = io.StringIO() + write_records([xref], "csv", file=out) + lines = out.getvalue().splitlines() + assert lines[1] == "Concord.parquet,A:1,skos:exactMatch,B:2" + + def test_csv_empty_no_output(self): + out = io.StringIO() + write_records([], "csv", file=out) + assert out.getvalue() == "" + + def test_csv_tuple_fields_pipe_joined(self, labeled_xref): + out = io.StringIO() + write_records([labeled_xref], "csv", file=out) + lines = out.getvalue().splitlines() + assert "biolink:Gene|biolink:NamedThing" in lines[1] + + # -- invalid format -- + + def test_invalid_format_raises_value_error(self, xref): + out = io.StringIO() + with pytest.raises(ValueError, match="Unknown format"): + write_records([xref], "xml", file=out) From 
aecae5085acb623085b5894835e95dfb6fd4ce52 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 9 Apr 2026 12:10:12 -0600 Subject: [PATCH 65/66] Add console format with rich color highlighting; replace text default Replaces the 'text' default format with 'console', backed by the rich library. xrefs and test-concord highlight query CURIEs in bold cyan wherever they appear as subject or object; rich auto-strips markup when output is piped. ids uses console.print(str(record)) for TTY-aware plain output. formatting.py gains make_console() and hl_curie() utilities for new commands to reuse. LabeledCrossReference labels appear in parentheses next to CURIEs in console output. Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 1 + src/babel_explorer/cli.py | 51 ++++++++++++++++---- src/babel_explorer/formatting.py | 32 +++++++++--- tests/test_cli.py | 83 ++++++++++++++++++++++++++++++-- tests/test_formatting.py | 80 ++++++++++++++++++++++-------- uv.lock | 36 ++++++++++++++ 6 files changed, 241 insertions(+), 42 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 59c1b68..34a0be3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "click>=8.3.1", "duckdb>=1.4.2", "requests>=2.32.5", + "rich>=13", "tqdm>=4.67.0", ] diff --git a/src/babel_explorer/cli.py b/src/babel_explorer/cli.py index 098e831..db0d46f 100644 --- a/src/babel_explorer/cli.py +++ b/src/babel_explorer/cli.py @@ -4,7 +4,9 @@ from babel_explorer.core.downloader import BabelDownloader from babel_explorer.core.babel_xrefs import BabelXRefs from babel_explorer.core.nodenorm import NodeNorm -from babel_explorer.formatting import write_records, _record_to_dict +from babel_explorer.core.babel_xrefs import LabeledCrossReference +from babel_explorer.formatting import write_records, _record_to_dict, make_console, hl_curie +from rich.markup import escape def format_option(f): @@ -12,8 +14,8 @@ def format_option(f): f = click.option( "--format", "fmt", - default="text", - type=click.Choice(["text", "json", "tsv", "csv"]), + default="console", + type=click.Choice(["console", "json", "tsv", "csv"]), show_default=True, help="Output format", )(f) @@ -136,7 +138,24 @@ def xrefs( NodeNorm(nodenorm_url), ) xrefs = bxref.get_curie_xrefs(curies, recurse, label_curies=labels) - write_records(xrefs, fmt=fmt, indent=json_indent) + + if fmt == "console": + console = make_console() + query_set = set(curies) + for xref in xrefs: + subj_str = hl_curie(xref.subj, xref.subj in query_set) + obj_str = hl_curie(xref.obj, xref.obj in query_set) + if isinstance(xref, LabeledCrossReference): + if xref.subj_label: + subj_str += f" ({escape(xref.subj_label)})" + if xref.obj_label: + obj_str += f" ({escape(xref.obj_label)})" + console.print( + f"{subj_str} [dim]{escape(xref.pred)}[/dim] " + f"{obj_str} [dim italic]{escape(xref.filename)}[/dim italic]" + ) + else: + write_records(xrefs, fmt=fmt, indent=json_indent) @cli.command("ids") @@ -183,7 +202,13 @@ def ids(curies: list[str], babel_url: str, local_dir: str, check_download: str, BabelDownloader(babel_url, local_path=local_dir, freshness_seconds=freshness) ) xrefs = bxref.get_curie_ids(curies) - write_records(xrefs, fmt=fmt, indent=json_indent) + + if fmt == "console": + console = make_console() + for record in xrefs: + console.print(str(record)) + else: + write_records(xrefs, fmt=fmt, indent=json_indent) @cli.command("test-concord") @@ -202,12 +227,18 @@ def test_concord(curies, nodenorm_url, fmt, json_indent): run before and after a Babel rebuild to see how cliques would 
shift. """ nodenorm = NodeNorm(nodenorm_url) - if fmt == "text": + if fmt == "console": + console = make_console() + query_set = set(curies) for curie in curies: - for identifier in nodenorm.get_clique_identifiers(curie): - biolink = ", ".join(identifier.biolink_type) - label = identifier.label or "" - print(f"{curie}\t{identifier.curie}\t{label}\t{biolink}") + for ident in nodenorm.get_clique_identifiers(curie): + biolink = ", ".join(ident.biolink_type) + console.print( + f"{hl_curie(curie, True)} " + f"{hl_curie(ident.curie, ident.curie in query_set)} " + f"{escape(ident.label or '-')} " + f"[dim]{escape(biolink)}[/dim]" + ) else: rows = [ {"query_curie": curie, **_record_to_dict(ident)} diff --git a/src/babel_explorer/formatting.py b/src/babel_explorer/formatting.py index 7e7701e..191f1cd 100644 --- a/src/babel_explorer/formatting.py +++ b/src/babel_explorer/formatting.py @@ -1,7 +1,8 @@ """Output formatting for babel-explorer CLI commands. -Provides write_records() to render any list of dataclass records (or plain -dicts) as text, JSON, TSV, or CSV. +Provides: +- write_records() for machine-readable output (json, tsv, csv) +- make_console() and hl_curie() for rich console output """ import csv @@ -10,6 +11,9 @@ import sys from typing import Any +from rich.console import Console +from rich.markup import escape + def _record_to_dict(record) -> dict[str, Any]: """Convert a dataclass (or plain dict) to a flat dict. @@ -31,11 +35,27 @@ def _flatten_for_tabular(row: dict) -> dict: return {k: "|".join(v) if isinstance(v, (list, tuple)) else v for k, v in row.items()} +def make_console(file=None) -> Console: + """Create a rich Console with babel-explorer defaults. + + Auto-detects TTY and NO_COLOR; strips markup when output is piped. + highlight=False prevents rich from auto-highlighting numbers and strings. + """ + return Console(file=file, highlight=False) + + +def hl_curie(curie: str, highlight: bool) -> str: + """Return rich markup for a CURIE — bold cyan if it is a query CURIE.""" + escaped = escape(curie) + return f"[bold cyan]{escaped}[/bold cyan]" if highlight else escaped + + def write_records(records, fmt: str, indent: int = 2, file=None): """Write an iterable of dataclass records (or dicts) in the requested format. :param records: Iterable of dataclass instances or plain dicts. - :param fmt: One of "text", "json", "tsv", "csv". + :param fmt: One of "json", "tsv", "csv". (Console output is handled by + make_console/hl_curie in the CLI layer.) :param indent: JSON indentation depth (ignored for other formats). :param file: Output file-like object; defaults to sys.stdout. :raises ValueError: If fmt is not a recognised format. 
@@ -44,11 +64,7 @@ def write_records(records, fmt: str, indent: int = 2, file=None): file = sys.stdout records = list(records) - if fmt == "text": - for r in records: - print(r, file=file) - - elif fmt == "json": + if fmt == "json": rows = [_record_to_dict(r) for r in records] json.dump(rows, file, indent=indent, default=str) print(file=file) # trailing newline diff --git a/tests/test_cli.py b/tests/test_cli.py index 62112d4..f4f6dbb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -67,6 +67,10 @@ def test_xrefs_happy_path(self): runner = CliRunner() mock_xref = MagicMock() mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" + mock_xref.subj = "A:1" + mock_xref.obj = "B:2" + mock_xref.pred = "skos:exactMatch" + mock_xref.filename = "test.parquet" with ( patch("babel_explorer.cli.BabelDownloader"), @@ -84,7 +88,10 @@ def test_xrefs_happy_path(self): def test_xrefs_recurse_and_labels_flags(self): runner = CliRunner() mock_xref = MagicMock() - mock_xref.__str__ = lambda self: "A:1 skos:exactMatch B:2" + mock_xref.subj = "A:1" + mock_xref.obj = "B:2" + mock_xref.pred = "skos:exactMatch" + mock_xref.filename = "test.parquet" with ( patch("babel_explorer.cli.BabelDownloader"), @@ -210,7 +217,65 @@ class TestOutputFormats: biolink_type=("biolink:Disease",), taxa=(), description=(), ) - # -- xrefs -- + # -- console format (default) -- + + def test_xrefs_default_format_is_console(self): + """Default format is console — output contains the CURIEs as plain text (no TTY in runner).""" + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): + mock_bx.return_value.get_curie_xrefs.return_value = [self._xref] + result = runner.invoke(cli, ["xrefs", "A:1"]) + + assert result.exit_code == 0 + # Rich strips markup on non-TTY; plain CURIEs and predicate appear + assert "A:1" in result.output + assert "B:2" in result.output + assert "skos:exactMatch" in result.output + + def test_xrefs_console_shows_query_curie(self): + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs") as mock_bx, + patch("babel_explorer.cli.NodeNorm"), + ): + mock_bx.return_value.get_curie_xrefs.return_value = [self._xref] + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "console"]) + + assert result.exit_code == 0 + assert "A:1" in result.output + + def test_test_concord_console_format(self): + runner = CliRunner() + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [self._identifier] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979", "--format", "console"]) + + assert result.exit_code == 0 + assert "MONDO:0004979" in result.output + assert "asthma" in result.output + assert "biolink:Disease" in result.output + + def test_test_concord_console_no_label_shows_dash(self): + """Identifiers with no label display '-' in console format.""" + runner = CliRunner() + mock_ident = MagicMock() + mock_ident.curie = "MONDO:0004979" + mock_ident.label = None + mock_ident.biolink_type = ["biolink:Disease"] + + with patch("babel_explorer.cli.NodeNorm") as mock_nn: + mock_nn.return_value.get_clique_identifiers.return_value = [mock_ident] + result = runner.invoke(cli, ["test-concord", "MONDO:0004979", "--format", "console"]) + + assert result.exit_code == 0 + assert "-" in result.output + + # -- json format -- def test_xrefs_format_json(self): runner = CliRunner() @@ 
-291,7 +356,7 @@ def test_ids_format_tsv_expands_extra_fields(self): assert "label" in lines[0] assert "gene" in lines[1] - # -- test-concord -- + # -- test-concord structured formats -- def test_test_concord_format_json_includes_query_curie(self): runner = CliRunner() @@ -329,3 +394,15 @@ def test_invalid_format_rejected_by_click(self): result = runner.invoke(cli, ["xrefs", "A:1", "--format", "xml"]) assert result.exit_code != 0 + + def test_text_format_rejected_by_click(self): + """'text' was removed; it is no longer a valid choice.""" + runner = CliRunner() + with ( + patch("babel_explorer.cli.BabelDownloader"), + patch("babel_explorer.cli.BabelXRefs"), + patch("babel_explorer.cli.NodeNorm"), + ): + result = runner.invoke(cli, ["xrefs", "A:1", "--format", "text"]) + + assert result.exit_code != 0 diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 9b9aa90..2d402db 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -6,10 +6,11 @@ import json import pytest +from rich.console import Console from babel_explorer.core.babel_xrefs import CrossReference, LabeledCrossReference, IdentifierRecord from babel_explorer.core.nodenorm import Identifier -from babel_explorer.formatting import _record_to_dict, write_records +from babel_explorer.formatting import _record_to_dict, write_records, make_console, hl_curie # --------------------------------------------------------------------------- @@ -55,6 +56,50 @@ def identifier(): ) +# --------------------------------------------------------------------------- +# Tests for make_console and hl_curie +# --------------------------------------------------------------------------- + + +class TestConsoleUtilities: + def test_make_console_returns_console(self): + console = make_console() + assert isinstance(console, Console) + + def test_make_console_accepts_file(self): + out = io.StringIO() + console = make_console(file=out) + assert isinstance(console, Console) + console.print("hello") + assert "hello" in out.getvalue() + + def test_hl_curie_highlighted_contains_markup(self): + result = hl_curie("HGNC:1100", highlight=True) + assert "bold cyan" in result + assert "HGNC:1100" in result + + def test_hl_curie_not_highlighted_is_plain(self): + result = hl_curie("HGNC:1100", highlight=False) + assert result == "HGNC:1100" + assert "[" not in result + + def test_hl_curie_highlighted_renders_correctly(self): + """Markup renders to plain text on a non-TTY console.""" + out = io.StringIO() + console = Console(file=out, highlight=False, no_color=True) + console.print(hl_curie("HGNC:1100", highlight=True)) + assert "HGNC:1100" in out.getvalue() + + def test_hl_curie_highlighted_renders_with_color(self): + """On a forced-TTY console, ANSI codes are emitted.""" + out = io.StringIO() + console = Console(file=out, highlight=False, force_terminal=True) + console.print(hl_curie("HGNC:1100", highlight=True)) + output = out.getvalue() + assert "HGNC:1100" in output + assert "\x1b[" in output # ANSI escape present + + # --------------------------------------------------------------------------- # Tests for _record_to_dict # --------------------------------------------------------------------------- @@ -107,24 +152,6 @@ def test_identifier_dataclass(self, identifier): class TestWriteRecords: - # -- text format -- - - def test_text_uses_str(self, xref): - out = io.StringIO() - write_records([xref], "text", file=out) - assert out.getvalue().strip() == str(xref) - - def test_text_empty_no_output(self): - out = io.StringIO() - write_records([], 
"text", file=out) - assert out.getvalue() == "" - - def test_text_multiple_records(self, xref): - out = io.StringIO() - write_records([xref, xref], "text", file=out) - lines = out.getvalue().strip().splitlines() - assert len(lines) == 2 - # -- json format -- def test_json_is_valid_list(self, xref): @@ -225,9 +252,20 @@ def test_csv_tuple_fields_pipe_joined(self, labeled_xref): lines = out.getvalue().splitlines() assert "biolink:Gene|biolink:NamedThing" in lines[1] - # -- invalid format -- + # -- invalid formats (including console, which is handled at CLI layer) -- + + def test_text_format_raises_value_error(self, xref): + out = io.StringIO() + with pytest.raises(ValueError, match="Unknown format"): + write_records([xref], "text", file=out) + + def test_console_format_raises_value_error(self, xref): + """Console format is handled by the CLI, not write_records.""" + out = io.StringIO() + with pytest.raises(ValueError, match="Unknown format"): + write_records([xref], "console", file=out) - def test_invalid_format_raises_value_error(self, xref): + def test_unknown_format_raises_value_error(self, xref): out = io.StringIO() with pytest.raises(ValueError, match="Unknown format"): write_records([xref], "xml", file=out) diff --git a/uv.lock b/uv.lock index b8496b5..7b201e4 100644 --- a/uv.lock +++ b/uv.lock @@ -10,6 +10,7 @@ dependencies = [ { name = "click" }, { name = "duckdb" }, { name = "requests" }, + { name = "rich" }, { name = "tqdm" }, ] @@ -26,6 +27,7 @@ requires-dist = [ { name = "click", specifier = ">=8.3.1" }, { name = "duckdb", specifier = ">=1.4.2" }, { name = "requests", specifier = ">=2.32.5" }, + { name = "rich", specifier = ">=13" }, { name = "tqdm", specifier = ">=4.67.0" }, ] @@ -212,6 +214,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + [[package]] name = "packaging" version = "26.0" @@ -316,6 +339,19 
@@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, ] +[[package]] +name = "rich" +version = "14.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, +] + [[package]] name = "ruff" version = "0.15.2" From f7cde3a77d72aa70bc22b3aa3d29165e124d5fe6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 9 Apr 2026 13:41:39 -0600 Subject: [PATCH 66/66] Fix Identifier.from_dict splitting string fields into characters tuple() on a bare string iterates its characters, so biolink_type, taxa, and description would become ('b','i','o',...) when NodeNorm returns them as strings rather than lists. _to_tuple() now wraps a bare string in a 1-tuple. Four new unit tests cover the string case for each field. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_explorer/core/nodenorm.py | 12 ++++++--- tests/test_nodenorm.py | 38 +++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/src/babel_explorer/core/nodenorm.py b/src/babel_explorer/core/nodenorm.py index 9ce916d..4f0f6d1 100644 --- a/src/babel_explorer/core/nodenorm.py +++ b/src/babel_explorer/core/nodenorm.py @@ -21,12 +21,18 @@ def __lt__(self, other): @staticmethod def from_dict(d: dict) -> "Identifier": + def _to_tuple(val) -> tuple[str, ...]: + """Coerce a string or list to a tuple — guards against iterating string chars.""" + if not val: + return () + return (val,) if isinstance(val, str) else tuple(val) + return Identifier( curie=d["identifier"], label=d.get("label", ""), - biolink_type=tuple(d.get("type", [])), - taxa=tuple(d.get("taxa", [])), - description=tuple(d.get("description", [])), + biolink_type=_to_tuple(d.get("type")), + taxa=_to_tuple(d.get("taxa")), + description=_to_tuple(d.get("description")), ) diff --git a/tests/test_nodenorm.py b/tests/test_nodenorm.py index 8b30fcd..57b6dab 100644 --- a/tests/test_nodenorm.py +++ b/tests/test_nodenorm.py @@ -71,6 +71,44 @@ def test_from_dict_partial(self): assert ident.label == "Beta" assert ident.biolink_type == () + def test_from_dict_type_as_string(self): + """NodeNorm may return 'type' as a bare string for individual identifiers.""" + d = {"identifier": "X:1", "type": "biolink:Disease"} + ident = Identifier.from_dict(d) + assert ident.biolink_type == ("biolink:Disease",), ( + "biolink_type should be a 1-tuple, not a tuple of characters" + ) + + def test_from_dict_description_as_string(self): + """NodeNorm may return 'description' as a bare string.""" + d = {"identifier": "X:1", "description": "A chronic disease"} + ident = Identifier.from_dict(d) + assert ident.description == ("A chronic disease",), ( + "description should be a 1-tuple, not a tuple of 
characters" + ) + + def test_from_dict_taxa_as_string(self): + """NodeNorm may return 'taxa' as a bare string.""" + d = {"identifier": "X:1", "taxa": "NCBITaxon:9606"} + ident = Identifier.from_dict(d) + assert ident.taxa == ("NCBITaxon:9606",), ( + "taxa should be a 1-tuple, not a tuple of characters" + ) + + def test_from_dict_all_fields_as_strings(self): + """All three tuple fields as strings produce correct single-element tuples.""" + d = { + "identifier": "X:1", + "label": "Alpha", + "type": "biolink:NamedThing", + "taxa": "NCBITaxon:9606", + "description": "Some description", + } + ident = Identifier.from_dict(d) + assert ident.biolink_type == ("biolink:NamedThing",) + assert ident.taxa == ("NCBITaxon:9606",) + assert ident.description == ("Some description",) + def test_lt_ordering(self): a = Identifier(curie="A:1") b = Identifier(curie="B:2")