kbase · ialarmedalien · Mar 18, 2026 · Feb 13, 2026 · Feb 13, 2026 · Feb 17, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -14,7 +14,7 @@ on:
       - ready_for_review
 
 jobs:
-  code_format:
+  code_format_and_lint:
     name: Check code formatting
     runs-on: ubuntu-latest
     steps:
@@ -33,29 +33,13 @@ jobs:
         shell: bash
         run: uv run ruff format --check
 
-  code_linting:
-    name: Run code lint checks
-    runs-on: ubuntu-latest
-    needs: code_format
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v7.1.2
-        with:
-          enable-cache: true
-
-      - name: Install dependencies
-        run: uv sync
-
       - name: Run code linting checks
         continue-on-error: true
         run: uv run ruff check --output-format=github .
 
   spark_tests:
     name: Run container tests
-    needs: code_format
+    needs: code_format_and_lint
     runs-on: ubuntu-latest
 
     steps:
@@ -83,7 +67,7 @@ jobs:
   tests:
     name: Run local tests (${{ matrix.python-version }}, ${{ matrix.os }})
     runs-on: ${{ matrix.os }}
-    needs: code_format
+    needs: code_format_and_lint
     strategy:
       fail-fast: false
       matrix:

diff --git a/AI_COVENANT.md b/AI_COVENANT.md
@@ -0,0 +1,65 @@
+# AI Covenant for Developers
+
+This covenant establishes community norms for responsible AI use in the cdm-data-loaders project. It aims to maintain trust, quality, and accountability while embracing AI as a useful tool.
+
+## Core Principle: You Own Your Contributions
+
+**Everything you contribute is yours, regardless of what tools helped create it.**
+
+When you submit code, documentation, issues, or comments with AI assistance, you are the author. You are responsible for:
+
+- Understanding what you are submitting
+- Verifying correctness and appropriateness
+- Defending and explaining your choices during review
+- Ensuring it meets project standards
+
+Do not submit AI-generated code without checking it first, and do not submit anything you cannot fully stand behind.
+
+## AI-Assisted Code Reviews
+
+AI review tools (Claude, Copilot, CodeRabbit, etc.) provide **automated quality checks, not human reviews**.
+
+- AI comments are suggestions, not requirements
+- PR owners may close AI comments without response
+- Human reviewers may use AI feedback to inform their own review
+- A PR still requires human approval regardless of AI feedback
+
+## When to Disclose AI Assistance
+
+**Required disclosure:**
+
+- When proposing bug fixes or changes to code you don't fully understand, attribute the idea to AI so reviewers can assess appropriately.
+
+**Appreciated transparency:**
+
+- When brainstorming solutions, distinguish between "AI suggests X" and "I recommend X based on my expertise". This helps reviewers to prioritize ideas.
+
+**Not required:**
+
+- Routine use of AI for writing code, issues, or PR descriptions.
+- AI co-authorship in commit messages. This is actively discouraged.
+
+## What This Means in Practice
+
+| Situation                                        | Guidance                                     |
+| ------------------------------------------------ | -------------------------------------------- |
+| Writing code with Copilot/Claude/etc.            | No disclosure needed; you own the result     |
+| Submitting AI-suggested fix you fully understand | No disclosure needed                         |
+| Submitting AI-suggested fix in unfamiliar code   | Disclose AI origin for reviewer context      |
+| Drafting issue or PR description with AI         | No disclosure needed; ensure it's accurate   |
+| Brainstorming in discussions                     | Be clear about AI-generated vs. expert ideas |
+| Receiving AI review comments                     | Address or close at your discretion          |
+
+## Trust and Accountability
+
+This covenant is built on trust. By contributing to this repository, you agree that:
+
+1. You will not submit AI-generated content without reviewing it
+2. You will take responsibility for any issues arising from your contributions
+3. You will be honest about the origins of ideas when it matters for review quality
+
+---
+
+*This covenant may evolve as AI tools and community needs change. Feedback and suggestions are welcome.*
+
+Based heavily on the excellent AI Covenant established by the [LinkML project](https://github.com/linkml/linkml/blob/main/AI_COVENANT.md).
diff --git a/Dockerfile b/Dockerfile
@@ -3,14 +3,21 @@
 # Dockerfile is based heavily on the example uv dockerfile:
 # https://github.com/astral-sh/uv-docker-example
 
+# Pull the pre-built Rust app from ghcr.io
+FROM ghcr.io/ialarmedalien/xml_file_splitter:latest AS rust-app
+
 # Use a Python image with uv pre-installed
 FROM ghcr.io/astral-sh/uv:python3.13-trixie-slim
 
 # Set environment variable to noninteractive to prevent prompts during apt operations
 ENV DEBIAN_FRONTEND=noninteractive
 
-# add tini
-RUN apt-get update -y && apt-get install -y --no-install-recommends tini git
+# add tini, git, and ca-certificates; remove the apt package lists afterwards to keep the layer small
+RUN apt-get update -y && apt-get install -y --no-install-recommends tini git ca-certificates && \
+    rm -rf /var/lib/apt/lists/*
+
+# copy only the compiled xml-file-splitter binary from the Rust image
+COPY --from=rust-app /usr/local/bin/xml_file_splitter /usr/local/bin/xml_file_splitter
 
 # Setup a non-root user
 RUN groupadd --system --gid 999 nonroot \

diff --git a/README.md b/README.md
@@ -1,10 +1,11 @@
-# cdm-data-loader-utils
+# cdm-data-loaders
 
 Repo for CDM input data loading and wrangling
 
-- [cdm-data-loader-utils](#cdm-data-loader-utils)
+- [cdm-data-loaders](#cdm-data-loaders)
   - [Environment and python management](#environment-and-python-management)
   - [Installation](#installation)
+  - [Running import pipelines](#running-import-pipelines)
   - [Development](#development)
     - [Spark and other non-python dependencies](#spark-and-other-non-python-dependencies)
     - [Tests](#tests)
@@ -20,9 +21,17 @@ The data loader utils package uses [uv](https://docs.astral.sh/uv/) for python e
 
 ## Installation
 
-The data loader utils run on python 3.13 and above.
+The CDM data loaders run on python 3.13 and above.
 
-To install dependencies (including python), run
+Most python code can be run using the command
+
+```sh
+> uv run <path_to_file.py>
+```
+
+This will automatically launch a virtual environment and install all required dependencies.
+
+To manually set up the virtual environment and install dependencies (including python), run
 
 ```sh
 > uv sync
@@ -39,6 +48,17 @@ To activate a virtual environment with these dependencies installed, run
 If you are using IDEs like VSCode, they should pick up the creation of the new environment and offer it for executing python code.
 
 
+## Running import pipelines
+
+The repo provides a Docker container that can be used to run several import pipelines or to run unit tests for the repo. The [entrypoint script](scripts/entrypoint.sh) parses the container `run` arguments and launches the appropriate functions.
+
+Current endpoints include:
+
+- `test`: run the unit tests that do _not_ require external dependencies like Spark
+- `uniprot`: run the UniProtKB (UniProt protein database) import pipeline; see [the UniProtKB pipeline](src/cdm_data_loaders/pipelines/uniprot_kb_pipeline.py) for arguments
+- `uniref`: run the UniRef import pipeline; see [the UniRef pipeline](src/cdm_data_loaders/pipelines/uniref_pipeline.py) for arguments
+
+
 ## Development
 
 
@@ -64,7 +84,7 @@ Run the container interactively as the user `runner`; current directory is mount
 > docker run --rm -e NB_USER=runner -it -v .:/tmp/cdm ghcr.io/berdatalakehouse/spark_notebook:main
 ```
 
-This will launch a bash shell; the contents of the `cdm-data-loader-utils` directory are mounted at `/tmp/cdm`.
+This will launch a bash shell; the contents of the `cdm-data-loaders` directory are mounted at `/tmp/cdm`.
 
 
 Run the container and sleep:
@@ -81,24 +101,31 @@ See the [BERDataLakehouse/spark_notebook](https://github.com/BERDataLakehouse/sp
 
 ### Tests
 
-To run the tests, execute the command:
+Tests are categorised using pytest markers to allow developers to execute some or all of the tests. See [pyproject.toml](pyproject.toml) for the markers used.
+
+To run all tests (requires a running Spark instance), execute the command:
 
 ```sh
 > uv run pytest
 ```
 
-To generate coverage for the tests, run
+To run only tests that do not require Spark, run
 
 ```sh
-> uv run pytest --cov=src --cov-report=xml tests/
+> uv run pytest -m "not requires_spark"
+```
+
+To generate coverage for the tests, run
+
+```sh
+> uv run pytest --cov=src --cov-report=xml
 ```
 
 The standard python `coverage` package is used and coverage can be generated as html or other formats by changing the parameters.
 
 
 ## Loading genomes, contigs, and features
 
-The [genome loader](src/cdm_data_loader_utils/parsers/genome_loader.py) can be used to load and integrate data from related GFF and FASTA files. Currently, the loader requires a GFF file and two FASTA files (one for amino acid seqs, one for nucleic acid seqs) for each genome. The list of files to be processed should be specified in the genome paths file, which has the following format:
+The [genome loader](src/cdm_data_loaders/parsers/genome_loader.py) can be used to load and integrate data from related GFF and FASTA files. Currently, the loader requires a GFF file and two FASTA files (one for amino acid seqs, one for nucleic acid seqs) for each genome. The list of files to be processed should be specified in the genome paths file, which has the following format:
 
 ```json
 {

diff --git a/src/cdm_data_loader_utils/__init__.py → notebooks/__init__.py b/src/cdm_data_loader_utils/__init__.py → notebooks/__init__.py
diff --git a/notebooks/pangenome_refseq.py b/notebooks/pangenome_refseq.py
@@ -0,0 +1,146 @@
+"""
+Utility script to identify missing RefSeq assemblies relative to GTDB.
+
+This script:
+1. Reads a GTDB metastore table.
+2. Removes GB_/RS_ prefixes from genome_id.
+3. Downloads the latest RefSeq assembly summary.
+4. Computes missing GCF assemblies.
+5. Writes two Spark text outputs under the output directory (each is a directory holding a single part file):
+   - r214_assemblies
+   - missing_refseq_ids
+"""
+
+from __future__ import annotations
+
+import logging
+import shutil
+import tempfile
+import urllib.request
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import click
+from pyspark.sql.functions import regexp_replace
+
+from berdl_notebook_utils.setup_spark_session import get_spark_session
+
+if TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+# Module-level logger; main() configures the root logger at INFO level
+logger = logging.getLogger(__name__)
+
+# URL of the RefSeq assembly summary file fetched by download_refseq_summary()
+REFSEQ_URL = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt"
+
+
+def download_refseq_summary(output_path: Path) -> Path:
+    """
+    Download RefSeq assembly summary file.
+    """
+    logger.info("Downloading RefSeq assembly summary from %s", REFSEQ_URL)
+    urllib.request.urlretrieve(REFSEQ_URL, output_path)  # noqa: S310
+    return output_path
+
+
+def parse_refseq_gcf_ids(file_path: Path) -> list[str]:
+    """
+    Parse all GCF_ assembly accessions from the RefSeq summary file.
+    """
+    assembly_ids: list[str] = []
+
+    with file_path.open(encoding="utf-8") as file:
+        for line in file:
+            if line.startswith("#"):
+                continue
+
+            accession = line.split("\t", 1)[0]
+            if accession.startswith("GCF_"):
+                assembly_ids.append(accession)
+
+    return assembly_ids
+
+
+@click.command()
+@click.option(
+    "--gtdb-table",
+    required=True,
+    help="Metastore table containing genome_id column",
+)
+@click.option(
+    "--output-dir",
+    required=True,
+    help="Output directory (e.g. s3a://...) where text files will be written",
+)
+def main(gtdb_table: str, output_dir: str) -> None:
+    """
+    Run the missing RefSeq assembly detection pipeline.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    spark: SparkSession = get_spark_session()
+
+    # ------------------------------------------------------------------
+    # 1. Read GTDB genome table
+    # ------------------------------------------------------------------
+    r214_df = spark.table(gtdb_table).select("genome_id").distinct()
+
+    rm_prefix_df = (
+        r214_df.withColumn(
+            "assembly_id",
+            regexp_replace("genome_id", r"^(GB_|RS_)", ""),
+        )
+        .select("assembly_id")
+        .distinct()
+    )
+
+    logger.info("GTDB assemblies: %d", rm_prefix_df.count())
+
+    # ------------------------------------------------------------------
+    # 2. Download RefSeq summary securely
+    # ------------------------------------------------------------------
+    with tempfile.NamedTemporaryFile(delete=False) as tmp:
+        summary_path = Path(tmp.name)
+
+    download_refseq_summary(summary_path)
+
+    # ------------------------------------------------------------------
+    # 3. Parse RefSeq GCF IDs
+    # ------------------------------------------------------------------
+    refseq_ids = parse_refseq_gcf_ids(summary_path)
+
+    refseq_df = spark.createDataFrame(
+        [(x,) for x in refseq_ids],
+        ["assembly_id"],
+    )
+
+    logger.info("RefSeq assemblies: %d", refseq_df.count())
+
+    # ------------------------------------------------------------------
+    # 4. Compute missing assemblies
+    # ------------------------------------------------------------------
+    missing_df = refseq_df.join(
+        rm_prefix_df,
+        on="assembly_id",
+        how="left_anti",
+    )
+
+    logger.info("Missing RefSeq assemblies: %d", missing_df.count())
+
+    # ------------------------------------------------------------------
+    # 5. Distributed Spark text output
+    # ------------------------------------------------------------------
+
+    # Output 1: All GTDB assemblies
+    rm_prefix_df.select("assembly_id").orderBy("assembly_id").coalesce(1).write.mode("overwrite").text(
+        f"{output_dir}/r214_assemblies"
+    )
+
+    # Output 2: Missing RefSeq assemblies
+    missing_df.select("assembly_id").orderBy("assembly_id").coalesce(1).write.mode("overwrite").text(
+        f"{output_dir}/missing_refseq_ids"
+    )
+
+    logger.info("Output files successfully written to %s", output_dir)
+
+
+# Invoke the click command when this file is executed as a script
+if __name__ == "__main__":
+    main()