diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml new file mode 100644 index 00000000..77962029 --- /dev/null +++ b/.github/workflows/integration_tests.yaml @@ -0,0 +1,48 @@ +name: Integration tests + +on: + workflow_call: + + push: + branches: + - main + pull_request: + types: + - opened + - reopened + - synchronize + - ready_for_review + +permissions: + contents: read + +jobs: + integration_tests: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build integration test image + uses: docker/build-push-action@v6 + with: + context: . + load: true + tags: cdm-data-loaders-integration-tests:latest + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Run integration tests + run: | + docker compose up \ + --no-build \ + --abort-on-container-exit \ + --exit-code-from integration-tests + + - name: Tear down + if: always() + run: docker compose down --volumes diff --git a/README.md b/README.md index 0a3cbe1d..edb8198e 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Repo for CDM input data loading and wrangling - [Development](#development) - [Spark and other non-python dependencies](#spark-and-other-non-python-dependencies) - [Tests](#tests) + - [Integration tests (MinIO + NCBI FTP)](#integration-tests-minio--ncbi-ftp) - [Loading genomes, contigs, and features](#loading-genomes-contigs-and-features) - [Running bbmap stats and checkm2 on genome or contigset files](#running-bbmap-stats-and-checkm2-on-genome-or-contigset-files) @@ -70,6 +71,35 @@ uv run python -m ipykernel install --user --name cdm-data-loaders --display-name The `cdm-data-loaders` kernel should now be available from the dropdown list of kernels in the Jupyter notebook interface. +#### Jupyter Kernel Environment Variables + +Often you will need access to environment variables that are included in the default Lakehouse +Jupyter environment, but will not be automatically included in your custom Jupyter kernel. To address +this, first identify the needed variables and values, and add them to your new kernel configuration +with the following steps: + +Open a new Jupyter Notebook __with the default kernel__ and run this in a new cell: +```python +import os +for k, v in sorted(os.environ.items()): + if "AWS" in k or "S3" in k or "MINIO" in k: # replace with whatever keys you're interested in + print(f"{k}={v}") +``` +Take the output and add the environment vars to the `kernel.json` for your new kernel (e.g., in `cdm-data-loaders/.venv/share/jupyter/kernels/python3/kernel.json`): +```json +{ + "argv": ["..."], + "display_name": "cdm-data-loaders", + "language": "python", + "env": { + "AWS_ACCESS_KEY_ID": "...", + "AWS_SECRET_ACCESS_KEY": "...", + "AWS_DEFAULT_REGION": "...", + ... + } +} +``` + ## Running import pipelines @@ -146,6 +176,65 @@ To generate coverage for the tests, run The standard python `coverage` package is used and coverage can be generated as html or other formats by changing the parameters. +#### Integration tests (MinIO + NCBI FTP) + +End-to-end integration tests for the NCBI assembly pipeline live in `tests/integration/`. They exercise the full flow — manifest diffing, FTP download, S3 promote/archive — against a locally running [MinIO](https://min.io/) container and the real NCBI FTP server. 
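+
+For orientation, the automatic skip behaviour described below reduces to a guard of
+roughly this shape (a sketch only; the helper name and endpoint default are assumptions,
+not the exact code in `tests/integration/`):
+
+```python
+import boto3
+import pytest
+
+
+def _minio_reachable(endpoint: str = "http://localhost:9000") -> bool:
+    """Return True if a MinIO instance answers S3 calls at the given endpoint."""
+    try:
+        boto3.client(
+            "s3",
+            endpoint_url=endpoint,
+            aws_access_key_id="minioadmin",
+            aws_secret_access_key="minioadmin",
+        ).list_buckets()
+        return True
+    except Exception:  # any failure (connection refused, DNS, auth) counts as unreachable
+        return False
+
+
+# Module-level mark: `uv run pytest` stays green when MinIO is absent.
+pytestmark = pytest.mark.skipif(not _minio_reachable(), reason="MinIO is not reachable")
+```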
+ +**Requirements:** +- Docker (for MinIO) +- Network access to `ftp.ncbi.nlm.nih.gov` + +**Running with Docker Compose (easiest)** + +The [docker-compose.yml](docker-compose.yml) at the repo root defines both a MinIO service and the integration test runner. To build the image, start MinIO, and run the integration tests in one command: + +```sh +docker compose up --build --abort-on-container-exit +``` + +Compose will stream test output to the terminal and exit with the pytest exit code. To clean up afterwards: + +```sh +docker compose down --volumes +``` + +**Running manually** + +If you prefer to run the tests directly against a local MinIO instance (e.g. for faster iteration during development), follow the steps below. + +**1. Start MinIO locally:** + +```sh +docker run -d \ + --name minio \ + -p 9000:9000 \ + -p 9001:9001 \ + -e MINIO_ROOT_USER=minioadmin \ + -e MINIO_ROOT_PASSWORD=minioadmin \ + minio/minio:RELEASE.2025-02-28T09-55-16Z server /data --console-address ":9001" +``` + +**2. Run the integration tests:** + +```sh +> uv run pytest tests/integration/ -m integration -v +``` + +Tests are automatically skipped when MinIO is not reachable, so the default `uv run pytest` will never fail due to a missing MinIO instance. + +**3. Inspect results:** + +Buckets are **not** cleaned up after tests. Browse the MinIO console at [http://localhost:9001](http://localhost:9001) (login: `minioadmin` / `minioadmin`) to inspect the final state of each test bucket. Each test method creates its own bucket (e.g. `integ-test-promote-dry-run`). + +**4. Stop MinIO when done:** + +```sh +docker stop minio && docker rm minio +``` + +> **Note:** These tests download real assemblies from NCBI FTP and are inherently slow (~30–60s per assembly). They are also marked `slow_test` so you can exclude them independently: `uv run pytest -m "not slow_test"`. + + ## Loading genomes, contigs, and features The [genome loader](src/cdm_data_loaders/parsers/genome_loader.py) can be used to load and integrate data from related GFF and FASTA files. Currently, the loader requires a GFF file and two FASTA files (one for amino acid seqs, one for nucleic acid seqs) for each genome. The list of files to be processed should be specified in the genome paths file, which has the following format: diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..ba488cce --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,44 @@ +services: + minio: + image: quay.io/minio/minio:latest + command: server /data --console-address ":9001" + environment: + MINIO_ROOT_USER: minioadmin + MINIO_ROOT_PASSWORD: minioadmin + ports: + - "9000:9000" + - "9001:9001" + healthcheck: + test: [ "CMD", "mc", "ready", "local" ] + interval: 5s + timeout: 5s + retries: 5 + + integration-tests: + image: cdm-data-loaders-integration-tests:latest + build: + context: . + depends_on: + minio: + condition: service_healthy + environment: + MINIO_ENDPOINT_URL: http://minio:9000 + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + entrypoint: + - /bin/sh + - -c + - | + attempts=0 + until python3 -c " + import urllib.request, os + urllib.request.urlopen(os.environ['MINIO_ENDPOINT_URL'] + '/minio/health/live', timeout=1) + " 2>/dev/null; do + attempts=$$((attempts + 1)) + if [ "$$attempts" -ge 30 ]; then + echo 'Timed out waiting for MinIO.' && exit 1 + fi + echo 'Waiting for MinIO...' 
&& sleep 1 + done + exec /app/scripts/entrypoint.sh integration-test + command: [] diff --git a/docs/ncbi_ftp_e2e_walkthrough.md b/docs/ncbi_ftp_e2e_walkthrough.md new file mode 100644 index 00000000..83eb3e1e --- /dev/null +++ b/docs/ncbi_ftp_e2e_walkthrough.md @@ -0,0 +1,538 @@ +# NCBI FTP Pipeline — Local End-to-End Walkthrough + +Step-by-step instructions for running a small (≤ 10 assembly) end-to-end sync +of NCBI RefSeq records against a local MinIO container. The walkthrough uses +the two existing Jupyter notebooks for Phases 1 and 3, and the project's Docker +image for the Phase 2 download step. + +> **Prerequisites:** +> - Docker or Podman +> - [uv](https://docs.astral.sh/uv/) (for running notebooks locally) +> - Network access to `ftp.ncbi.nlm.nih.gov` + +--- + +## Architecture overview + +``` + Phase 1 (notebook) Phase 2 (container) Phase 3 (notebook) +┌────────────────────┐ ┌───────────────────────┐ ┌──────────────────────┐ +│ Manifest notebook │ │ ncbi_ftp_sync CLI │ │ Promote notebook │ +│ ─ download FTP │────▶│ ─ read manifest │────▶│ ─ promote staged │ +│ assembly summary │ │ ─ parallel FTP DL │ │ files to Lakehouse │ +│ ─ diff against │ │ ─ MD5 verify │ │ ─ archive old ver. │ +│ previous │ │ ─ write .md5 sidecars │ │ ─ trim manifest │ +│ ─ write manifests │ └──────────┬────────────┘ └──────────────────────┘ +└────────────────────┘ │ + local volume + mounted into + the container +``` + +--- + +## Path anatomy + +All S3 paths in this pipeline compose from a small set of variables. +Understanding this decomposition is the key to configuring the notebooks. + +### Path formats used + +| Format | Example | Description | +|--------|---------|-------------| +| **s3:// URI** | `s3://cdm-lake/staging/run1/` | Full URI with scheme + bucket + key | +| **bucket name** | `cdm-lake` | Just the bucket, no scheme | +| **S3 key prefix** | `tenant-general-warehouse/kbase/datasets/ncbi/` | Path within a bucket (no scheme, no bucket) | +| **S3 object key** | `staging/transfer_manifest.txt` | Single object key within a bucket | +| **local path** | `output/removed_manifest.txt` | Filesystem path on the host | + +### Lakehouse object (final location) + +``` +s3://{LAKEHOUSE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{filename} + └── bucket ─────┘ └── key prefix ──────┘└── build_accession_path() ────────────────────────┘ +``` + +Example: +``` +s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/raw_data/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly/GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz +``` + +### Staging object (Phase 2 output) + +``` +s3://{STAGING_BUCKET}/{STAGING_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{filename} + └── bucket ─────┘ └── key prefix ────┘└── build_accession_path() ────────────────────────┘ +``` + +### Local output (Phase 1) + +``` +{OUTPUT_DIR}/transfer_manifest.txt +{OUTPUT_DIR}/removed_manifest.txt +{OUTPUT_DIR}/updated_manifest.txt +{OUTPUT_DIR}/diff_summary.json +``` + +--- + +## 1. Setup + +### Local testing + +### Start MinIO + +```sh +docker run -d \ + --name minio \ + -p 9000:9000 \ + -p 9001:9001 \ + -e MINIO_ROOT_USER=minioadmin \ + -e MINIO_ROOT_PASSWORD=minioadmin \ + minio/minio:RELEASE.2025-02-28T09-55-16Z server /data --console-address ":9001" +``` + +(Note that a similar service is included in the `docker-compose` configuration file at the root of +this repository that is used in CI test workflows.) 
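+
+Before moving on, you can confirm the container is actually serving requests by polling
+MinIO's liveness endpoint. This is the same `/minio/health/live` check the Compose
+entrypoint performs; a minimal sketch, assuming the default port mapping above:
+
+```python
+import time
+import urllib.request
+
+# Poll the MinIO liveness endpoint for up to ~30 seconds.
+for _ in range(30):
+    try:
+        urllib.request.urlopen("http://localhost:9000/minio/health/live", timeout=1)
+        print("MinIO is live")
+        break
+    except OSError:  # connection refused / timeout while the container starts
+        print("Waiting for MinIO...")
+        time.sleep(1)
+else:
+    raise SystemExit("Timed out waiting for MinIO.")
+```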
+
+Create a test bucket via the [MinIO console](http://localhost:9001)
+(login: `minioadmin` / `minioadmin`), or from the command line using the
+included `scripts/s3_local.py` helper (no extra installs required; it only
+needs `boto3`, which is already a project dependency):
+
+```sh
+uv run python scripts/s3_local.py mb s3://cdm-lake
+uv run python scripts/s3_local.py mb s3://cts
+```
+
+### Lakehouse
+
+#### Build `cdm-data-loaders`
+
+First, clone the `cdm-data-loaders` repo in your Lakehouse user space. Then build the package
+in a virtual environment and register it as a Jupyter kernel:
+```bash
+cd cdm-data-loaders
+uv sync
+source .venv/bin/activate
+uv pip install -e .
+uv pip install ipykernel
+uv run python -m ipykernel install --user --name cdm-data-loaders --display-name "cdm-data-loaders"
+```
+Then, when you open the manifest or promote notebooks, choose the `cdm-data-loaders` kernel.
+
+#### Add the S3 Credentials to the Kernel
+
+Open a new Jupyter Notebook with the default kernel and run this in a new cell:
+```python
+import os
+for k, v in sorted(os.environ.items()):
+    if "AWS" in k or "S3" in k or "MINIO" in k:
+        print(f"{k}={v}")
+```
+Take the output and add the environment vars to the `kernel.json` for your new kernel (e.g., in `cdm-data-loaders/.venv/share/jupyter/kernels/python3/kernel.json`):
+```json
+{
+  "argv": ["..."],
+  "display_name": "cdm-data-loaders",
+  "language": "python",
+  "env": {
+    "AWS_ACCESS_KEY_ID": "...",
+    "AWS_SECRET_ACCESS_KEY": "...",
+    "AWS_DEFAULT_REGION": "...",
+    ...
+  }
+}
+```
+
+
+---
+
+## 2. Phase 1 — Generate manifests (notebook)
+
+Open `notebooks/ncbi_ftp_manifest.ipynb` in JupyterLab or VS Code.
+
+### Constants to change (Cell 3)
+
+| Constant               | Walkthrough value                                 | Format        | Why                                                 |
+|------------------------|---------------------------------------------------|---------------|-----------------------------------------------------|
+| `DATABASE`             | `"refseq"`                                        | string        | keep as-is                                          |
+| `PREFIX_FROM`          | `"900"`                                           | string        | high-numbered prefix → few assemblies, fast diffing |
+| `PREFIX_TO`            | `"900"`                                           | string        | single prefix bucket                                |
+| `LIMIT`                | `10`                                              | int           | cap to 10 assemblies                                |
+| `PREVIOUS_SUMMARY_URI` | `None`                                            | s3:// URI     | first run — everything is "new"                     |
+| `SNAPSHOT_UPLOAD_URI`  | `None`                                            | s3:// URI     | skip S3 upload for local testing                    |
+| `LAKEHOUSE_BUCKET`     | `"cdm-lake"` (or `None`)                          | bucket name   | set to prune assemblies already in the Lakehouse    |
+| `STORE_KEY_PREFIX`     | `"tenant-general-warehouse/kbase/datasets/ncbi/"` | S3 key prefix | default Lakehouse path prefix                       |
+| `OUTPUT_DIR`           | `Path("output")`                                  | local path    | keep as-is (local directory)                        |
+
+### Initialise the S3 client for MinIO
+
+If you set `PREVIOUS_SUMMARY_URI`, `SNAPSHOT_UPLOAD_URI`, `LAKEHOUSE_BUCKET`,
+or `STAGING_URI` to point at your local MinIO, you must initialise
+the S3 client **before** running the cells that use them. Insert a new cell
+after Cell 1 (Imports) with:
+
+```python
+from cdm_data_loaders.utils.s3 import get_s3_client, reset_s3_client
+
+reset_s3_client()
+get_s3_client({
+    "endpoint_url": "http://localhost:9000",
+    "aws_access_key_id": "minioadmin",
+    "aws_secret_access_key": "minioadmin",
+})
+```
+
+If all four of these S3 variables are `None` (purely local testing), this cell can
+be skipped — though on repeat runs you should set `LAKEHOUSE_BUCKET` so
+assemblies already promoted to the Lakehouse are pruned from the transfer
+manifest. 
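+
+To confirm the notebook's client is talking to MinIO rather than a real AWS endpoint,
+a quick sanity check (assuming the initialisation cell above has been run) is to list
+the buckets created during Setup:
+
+```python
+s3 = get_s3_client()  # returns the client initialised above
+print(sorted(b["Name"] for b in s3.list_buckets()["Buckets"]))
+# expected output in this walkthrough: ['cdm-lake', 'cts']
+```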
+ +### Optional: Bootstrap from existing store (Cell 5) + +If you have a pre-populated S3 store but lack a baseline assembly summary, +you can scan the store to generate a synthetic baseline. This is especially +useful for large stores (100K+ assemblies) where verifying against FTP +checksums would take days. + +**When to use this:** +- First run against an existing, pre-populated store +- You want to start diffing without waiting for checksum verification +- You don't have a previous assembly summary snapshot to compare against + +**How it works:** +1. Set `SCAN_STORE = True` in Cell 5 +2. The notebook scans all objects under `s3://{LAKEHOUSE_BUCKET}/{STORE_KEY_PREFIX}` +3. For each unique assembly found, it extracts the accession and uses the + earliest object `LastModified` as a conservative `seq_rel_date` +4. It saves the synthetic summary to `LOCAL_SYNTHETIC_SUMMARY` (default: + `output/synthetic_summary_from_store.txt`) +5. This becomes the baseline for diffing; subsequent runs can load this + file as `PREVIOUS_SUMMARY_URI` + +**Example (for a 500K-assembly store):** +```python +SCAN_STORE = True +LAKEHOUSE_BUCKET = "cdm-lake" +STORE_KEY_PREFIX = "tenant-general-warehouse/kbase/datasets/ncbi/" +LOCAL_SYNTHETIC_SUMMARY = Path("output/synthetic_summary_from_store.txt") + +# After running Cell 5, upload the result to S3 for future runs: +# s3 cp output/synthetic_summary_from_store.txt s3://cdm-lake/assembly_summaries/synthetic_base.txt +# Then in future runs, set: +# PREVIOUS_SUMMARY_URI = "s3://cdm-lake/assembly_summaries/synthetic_base.txt" +``` + +**Performance:** Scanning typically takes 5–10 minutes for 500K assemblies +(vs. ~6 days of checksum verification). + +### Run the notebook + +Execute all cells in order. After Cell 7 finishes you should see files in +`output/`: + +``` +output/ + transfer_manifest.txt # ≤ 10 FTP directory paths + removed_manifest.txt # empty on first run + updated_manifest.txt # empty on first run + diff_summary.json # counts of new/updated/replaced/suppressed +``` + +Inspect `transfer_manifest.txt` — each line is an FTP directory path like: + +``` +/genomes/all/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly +``` + +### Optional: upload manifests to S3 for CTS + +Cell 7 optionally uploads the manifests to an S3 staging prefix so that CTS +can stage them into the container. For local testing, set +`STAGING_URI = None` (the default) and copy the manifest manually in +Step 3b below. + +If you are testing against MinIO and want to exercise the S3 upload path: + +```python +STAGING_URI = "s3://cdm-lake/staging/run1/" +``` + +> **Tip:** If you re-run later with `PREVIOUS_SUMMARY_URI` pointing at a +> snapshot from a prior run you will see `updated`, `replaced`, and +> `suppressed` entries in the diff. + +--- + +## 3. Phase 2 — Download assemblies (container) + +Phase 2 uses the `ncbi_ftp_sync` CLI, which is the container's built-in entry +point for parallel FTP downloads. + +> **CTS (CDM Task Service):** In production, Phase 2 runs as a CTS job. +> CTS stages input files from S3 into the container's filesystem mount +> (`/input_dir`) and copies container output back to S3 (`/output_dir`). +> The container itself never receives S3 credentials. +> See [cdm-task-service](https://github.com/kbase/cdm-task-service) for details. + +For local testing without a CTS instance we run the container directly with +Docker (or Podman), mounting the manifest produced in Phase 1 as input and a +local staging directory as output. + +### 3a. 
Build the container image + +```sh +# From the repository root +docker build -t cdm-data-loaders . +``` + +### 3b. Prepare local directories + +```sh +mkdir -p notebooks/staging +cp notebooks/output/transfer_manifest.txt notebooks/staging/ +``` + +### 3c. Run the download + +```sh +docker run --rm \ + --userns=keep-id \ + -v "$(pwd)/notebooks/staging:/input:ro" \ + -v "$(pwd)/notebooks/staging:/output" \ + cdm-data-loaders ncbi_ftp_sync \ + --manifest /input/transfer_manifest.txt \ + --output-dir /output \ + --threads 2 \ + --limit 10 +``` + +> **Note:** `--userns=keep-id` maps your host UID into the container so +> bind-mount writes work with Podman's rootless mode. If you use Docker +> instead, replace it with `--user "$(id -u):$(id -g)"`. + +| Flag | Purpose | +|-----------------|-----------------------------------------------------------| +| `--manifest` | Path to the transfer manifest inside the container | +| `--output-dir` | Where downloads land (mounted from host `staging/`) | +| `--threads` | Parallel FTP connections (2 is polite for testing) | +| `--limit` | Redundant safety cap (already limited in Phase 1) | + +After the container exits, `notebooks/staging/` will contain: + +``` +staging/ + raw_data/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly/ + GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz + GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz.md5 + GCF_900000615.1_PRJEB7657_assembly_protein.faa.gz + GCF_900000615.1_PRJEB7657_assembly_protein.faa.gz.md5 + ... + download_report.json +``` + +Each data file has a `.md5` sidecar containing the hex digest verified against +the FTP server's `md5checksums.txt`. + +> **Without Docker:** You can also run the CLI directly if you have the project +> installed locally: +> +> ```sh +> uv run ncbi_ftp_sync \ +> --manifest notebooks/output/transfer_manifest.txt \ +> --output-dir staging \ +> --threads 2 --limit 10 +> ``` + +### 3d. Upload staged files to MinIO + +The download step writes to the local filesystem. To feed Phase 3 we need +to upload the staged files into MinIO under a staging prefix: + +```sh +uv run python scripts/s3_local.py cp notebooks/staging/raw_data/ s3://cts/staging/run1/raw_data/ +``` + +Verify the upload: + +```sh +uv run python scripts/s3_local.py ls s3://cts/staging/run1/ +``` + +--- + +## 4. Phase 3 — Promote & archive (notebook) + +Open `notebooks/ncbi_ftp_promote.ipynb`. + +### Constants to change (Cell 3) + +| Constant | Walkthrough value | Format | Why | +|-------------------------|------------------------------------------------------|--------|---------------------------------------------| +| `STAGING_BUCKET` | `"cts"` | bucket name | CTS staging bucket (Phase 2 writes here) | +| `LAKEHOUSE_BUCKET` | `"cdm-lake"` | bucket name | final Lakehouse destination | +| `STAGING_KEY_PREFIX` | `"staging/run1/"` | S3 key prefix | matches the upload prefix from Step 3d | +| `REMOVED_MANIFEST_PATH` | `None` | local path | nothing to remove on first run | +| `UPDATED_MANIFEST_PATH` | `None` | local path | nothing to archive on first run | +| `NCBI_RELEASE` | `None` | string | no release tag needed for local testing | +| `MANIFEST_S3_KEY` | `None` | S3 object key | skip manifest trimming | +| `LAKEHOUSE_KEY_PREFIX` | `"tenant-general-warehouse/kbase/datasets/ncbi/"` | S3 key prefix | keep default | +| `DRY_RUN` | `True` | bool | **start with dry-run!** | + +### Initialise the S3 client for MinIO + +The notebook calls `get_s3_client()` which, by default, tries to import +credentials from `berdl_notebook_utils`. 
For local MinIO you need to
+initialise the client manually **before** running Cell 4. Insert a new cell
+after Cell 2 (Imports) with:
+
+```python
+from cdm_data_loaders.utils.s3 import get_s3_client, reset_s3_client
+
+reset_s3_client()  # clear any cached client
+get_s3_client({
+    "endpoint_url": "http://localhost:9000",
+    "aws_access_key_id": "minioadmin",
+    "aws_secret_access_key": "minioadmin",
+})
+```
+
+### Run the notebook
+
+1. Execute all cells. With `DRY_RUN = True` the promote step will log what it
+   *would* do without moving any objects.
+2. Review the report in Cell 6.
+3. If the dry-run looks correct, set `DRY_RUN = False` in Cell 3 and re-run
+   from Cell 3.
+
+After promotion the final Lakehouse layout in MinIO will look like:
+
+```
+cdm-lake/
+  tenant-general-warehouse/kbase/datasets/ncbi/
+    raw_data/GCF/900/000/615/GCF_900000615.1_.../
+      GCF_900000615.1_..._genomic.fna.gz   (with md5 in user metadata)
+      GCF_900000615.1_..._protein.faa.gz
+      ...
+```
+
+---
+
+## 5. Inspect results in MinIO
+
+Browse the [MinIO console](http://localhost:9001) or use the CLI:
+
+```sh
+# List final Lakehouse objects
+uv run python scripts/s3_local.py ls \
+    s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/raw_data/
+
+# Check user metadata (md5) on a specific object
+uv run python scripts/s3_local.py head \
+    s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/raw_data/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly/GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz
+```
+
+### Frictionless metadata descriptors
+
+Each promoted assembly gets a [frictionless](https://framework.frictionlessdata.io/) data package descriptor stored at:
+
+```
+s3://{LAKEHOUSE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}metadata/{assembly_dir}_datapackage.json
+```
+
+For example:
+
+```
+s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/metadata/GCF_900000615.1_PRJEB7657_assembly_datapackage.json
+```
+
+The descriptor follows the KBase credit metadata schema (v1.0) and records:
+
+- **identifier** — `NCBI:{accession}`, e.g. `NCBI:GCF_900000615.1`
+- **resource_type** — always `"dataset"`
+- **resources** — list of promoted files with their final S3 key, byte size,
+  file format, and MD5 hash (when available)
+- **contributors / publisher** — NCBI organizational metadata
+- **meta.saved_by** — `"cdm-data-loaders-ncbi-ftp"`
+
+When an assembly is archived (updated or removed), its live descriptor is
+copied to:
+
+```
+s3://{LAKEHOUSE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}archive/{release_tag}/metadata/{assembly_dir}_datapackage.json
+```
+
+Use the last cell of `notebooks/ncbi_ftp_promote.ipynb` to list and preview
+all descriptors written in a promote run.
+
+To inspect a descriptor directly:
+
+```sh
+uv run python scripts/s3_local.py cat \
+    s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/metadata/GCF_900000615.1_PRJEB7657_assembly_datapackage.json
+```
+
+---
+
+## 6. Incremental run (second sync)
+
+To exercise the diff/update/archive logic, repeat the pipeline with a
+previous snapshot:
+
+1. **Phase 1:** Set `PREVIOUS_SUMMARY_URI` to an S3 path where you upload the
+   raw summary from the first run, or save the `raw_summary` string from Cell 4
+   to a local file and pass it via `parse_assembly_summary(Path("prev.txt"))`.
+2. **Phase 1:** The diff will now show `updated`, `replaced`, and
+   `suppressed` entries (if any changed between runs).
+3. **Phase 2:** Download the new manifest.
+4. **Phase 3:** Set `REMOVED_MANIFEST_PATH` and `UPDATED_MANIFEST_PATH` to the paths
+   from Phase 1. 
Updated assemblies will be archived before overwrite; + removed assemblies will be archived and deleted. + +--- + +## 7. Cleanup + +```sh +# Stop and remove MinIO +docker stop minio && docker rm minio + +# Remove local staging data +rm -rf staging/ output/ +``` + +--- + +## Troubleshooting + +| Symptom | Cause | Fix | +|---------|-------|-----| +| `berdl_notebook_utils` import error in notebook | Missing local MinIO client init | Add the `get_s3_client({...})` cell described in Step 4 | +| `connect_ftp() timeout` | NCBI FTP may be slow or rate-limited | Retry; reduce `--threads` to 1 | +| `CRC64NVME` errors uploading to MinIO | MinIO version too old (needs ≥ `2025-02-07`) | Pin to `minio/minio:RELEASE.2025-02-28T09-55-16Z` or newer | +| Phase 3 shows 0 promoted | Staging prefix doesn't match or bucket is wrong | Verify `STAGING_KEY_PREFIX` matches the S3 upload path from Step 3d | +| Container can't reach FTP | Docker network isolation | Use `--network host` or ensure DNS resolution works inside the container | + +--- + +## Reference: file filters + +Phase 2 downloads only files matching these suffixes (defined in +`cdm_data_loaders.ncbi_ftp.assembly.FILE_FILTERS`): + +| Suffix | Content | +|--------|---------| +| `_genomic.fna.gz` | Genome nucleotide sequences | +| `_genomic.gff.gz` | Genome annotations (GFF3) | +| `_protein.faa.gz` | Protein sequences | +| `_gene_ontology.gaf.gz` | GO annotations | +| `_assembly_report.txt` | Assembly metadata | +| `_assembly_stats.txt` | Assembly statistics | +| `_assembly_regions.txt` | Assembly regions | +| `_ani_contam_ranges.tsv` | ANI contamination ranges | +| `_gene_expression_counts.txt.gz` | Gene expression counts | +| `_normalized_gene_expression_counts.txt.gz` | Normalised expression counts | + +Plus the per-assembly `md5checksums.txt` which is always downloaded for +integrity verification. diff --git a/notebooks/ncbi_ftp_download.ipynb b/notebooks/ncbi_ftp_download.ipynb new file mode 100644 index 00000000..c8e10603 --- /dev/null +++ b/notebooks/ncbi_ftp_download.ipynb @@ -0,0 +1,238 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "79505884", + "metadata": {}, + "source": [ + "# NCBI Assembly Download & Stage (Phase 2)\n", + "\n", + "Downloads NCBI assemblies listed in a transfer manifest from the NCBI FTP server\n", + "and uploads them to an S3 staging prefix for Phase 3 promotion.\n", + "\n", + "**When to use this notebook vs the CTS container:**\n", + "- Use the CTS container (`ncbi_ftp_sync`) for production runs — it has restart/retry\n", + " support and runs in the data-transfer environment.\n", + "- Use this notebook when CTS is unavailable (e.g. local development, debugging, or\n", + " one-off re-downloads of failed assemblies).\n", + "\n", + "Steps:\n", + "1. Configure bucket, manifest source, staging prefix, and thread count\n", + "2. Preview the first 10 manifest lines to verify before committing\n", + "3. Download assemblies from NCBI FTP and upload to staging\n", + "4. 
Review the download/stage report" + ] + }, + { + "cell_type": "markdown", + "id": "76d92c54", + "metadata": {}, + "source": [ + "## Path formats quick reference\n", + "\n", + "| Suffix in variable name | Format | Example |\n", + "|-------------------------|--------|---------|\n", + "| `_BUCKET` | bucket name only | `cts` |\n", + "| `_KEY_PREFIX` | S3 key prefix (no scheme/bucket) | `staging/run1/` |\n", + "| `_S3_KEY` | S3 object key (no scheme/bucket) | `staging/run1/input/transfer_manifest.txt` |\n", + "| `_PATH` | local filesystem path | `output/transfer_manifest.txt` |\n", + "\n", + "Staging object: `s3://{STAGING_BUCKET}/{STAGING_KEY_PREFIX}raw_data/…/{filename}`\n", + "Report: `s3://{STAGING_BUCKET}/{STAGING_KEY_PREFIX}download_report.json`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38e7aa6d", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Imports and S3 client initialisation.\"\"\"\n", + "\n", + "import json\n", + "\n", + "from cdm_data_loaders.pipelines.ncbi_ftp_download import (\n", + " DEFAULT_STAGING_KEY_PREFIX,\n", + " download_and_stage,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34a18261", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Configure parameters.\n", + "\n", + "Provide exactly one of MANIFEST_S3_KEY (read from S3) or MANIFEST_LOCAL_PATH (read from disk).\n", + "Set the other to None.\n", + "\n", + "Disk space note: ensure sufficient free space in the system temp directory before running.\n", + "A rough estimate is ~500 MB per 1000 assemblies; large genomes can exceed 1 GB each.\n", + "Set LIMIT to a small number (e.g. 5) to test the workflow before a full run.\n", + "\"\"\"\n", + "\n", + "# S3 bucket where the manifest lives and where staged files will be written\n", + "# format: bucket name (no s3:// scheme)\n", + "STAGING_BUCKET = \"cts\"\n", + "\n", + "# S3 object key of the transfer manifest written by Phase 1\n", + "# format: S3 object key within STAGING_BUCKET (no scheme, no bucket)\n", + "# Set to None to use MANIFEST_LOCAL_PATH instead\n", + "MANIFEST_S3_KEY: str | None = \"io/matt-cohere/staging/run1/input/transfer_manifest.txt\"\n", + "\n", + "# Local path to the transfer manifest (alternative to MANIFEST_S3_KEY)\n", + "# format: local filesystem path\n", + "# Set to None to use MANIFEST_S3_KEY instead\n", + "MANIFEST_LOCAL_PATH: str | None = None\n", + "\n", + "# S3 key prefix for staged output files (must match what Phase 3 expects)\n", + "# format: S3 key prefix within STAGING_BUCKET (no scheme, no bucket)\n", + "STAGING_KEY_PREFIX = \"io/matt-cohere/staging/run1/output/\"\n", + "\n", + "# Number of parallel download and upload threads\n", + "THREADS = 4\n", + "\n", + "# Limit to first N assemblies (None = process all)\n", + "LIMIT: int | None = None\n", + "\n", + "# Dry-run mode — download locally but skip S3 uploads\n", + "DRY_RUN = False\n", + "\n", + "print(f\"Bucket: {STAGING_BUCKET}\")\n", + "print(f\"Manifest S3 key: {MANIFEST_S3_KEY}\")\n", + "print(f\"Manifest local: {MANIFEST_LOCAL_PATH}\")\n", + "print(f\"Staging prefix: {STAGING_KEY_PREFIX}\")\n", + "print(f\"Threads: {THREADS}\")\n", + "print(f\"Limit: {LIMIT}\")\n", + "print(f\"Dry-run: {DRY_RUN}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51a857f9", + "metadata": {}, + "outputs": [], + "source": [ + "from cdm_data_loaders.utils.s3 import get_s3_client, reset_s3_client\n", + "\n", + "# Provide S3 credentials (use for local testing against MinIO test container)\n", + 
"PROVIDE_CREDENTIALS = False # Set to False to rely on environment credentials (e.g. IAM role)\n", + "if PROVIDE_CREDENTIALS:\n", + " reset_s3_client() # Clear any existing client to ensure new credentials are used\n", + " get_s3_client(\n", + " {\n", + " \"endpoint_url\": \"http://localhost:9000\",\n", + " \"aws_access_key_id\": \"minioadmin\",\n", + " \"aws_secret_access_key\": \"minioadmin\",\n", + " }\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69beeaa9", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Preview the first 10 manifest lines before committing to the full run.\"\"\"\n", + "\n", + "if MANIFEST_S3_KEY is not None:\n", + " s3 = get_s3_client()\n", + " response = s3.get_object(Bucket=STAGING_BUCKET, Key=MANIFEST_S3_KEY)\n", + " manifest_lines = response[\"Body\"].read().decode().splitlines()\n", + "else:\n", + " with open(MANIFEST_LOCAL_PATH) as f:\n", + " manifest_lines = f.read().splitlines()\n", + "\n", + "data_lines = [l for l in manifest_lines if l.strip() and not l.startswith(\"#\")]\n", + "\n", + "print(f\"Total entries: {len(data_lines)}\")\n", + "print(\"First 10:\")\n", + "for line in data_lines[:10]:\n", + " print(f\" {line}\")\n", + "if len(data_lines) > 10:\n", + " print(f\" ... and {len(data_lines) - 10} more\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b76d273", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Download assemblies from NCBI FTP and upload to S3 staging.\"\"\"\n", + "\n", + "report = download_and_stage(\n", + " bucket=STAGING_BUCKET,\n", + " staging_key_prefix=STAGING_KEY_PREFIX,\n", + " manifest_s3_key=MANIFEST_S3_KEY,\n", + " manifest_local_path=MANIFEST_LOCAL_PATH,\n", + " threads=THREADS,\n", + " limit=LIMIT,\n", + " dry_run=DRY_RUN,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "192b9d34", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Display download and staging report.\"\"\"\n", + "\n", + "FAILURE_PREVIEW = 10\n", + "\n", + "print(\"=\" * 50)\n", + "print(\"DOWNLOAD & STAGE REPORT\")\n", + "print(\"=\" * 50)\n", + "print(f\"Attempted: {report['total_attempted']}\")\n", + "print(f\"Succeeded: {report['succeeded']}\")\n", + "print(f\"Failed: {report['failed']}\")\n", + "print(f\"Staged objects: {report['staged_objects']}\")\n", + "print(f\"Staging prefix: {report['staging_key_prefix']}\")\n", + "print(f\"Dry-run: {report['dry_run']}\")\n", + "print(f\"Timestamp: {report['timestamp']}\")\n", + "\n", + "if report[\"failed\"] > 0:\n", + " print(\"\\nFailed assemblies:\")\n", + " for failure in report[\"failures\"][:FAILURE_PREVIEW]:\n", + " print(f\" {failure['path']}: {failure['error']}\")\n", + " if report[\"failed\"] > FAILURE_PREVIEW:\n", + " print(f\" ... and {report['failed'] - FAILURE_PREVIEW} more\")\n", + "\n", + "if report[\"dry_run\"]:\n", + " print(\"\\nThis was a dry-run. 
Set DRY_RUN = False and re-run to upload to S3.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cdm-data-loaders (3.13.11)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb new file mode 100644 index 00000000..49045123 --- /dev/null +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -0,0 +1,463 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7a05af26", + "metadata": {}, + "source": [ + "# NCBI Assembly Manifest Generation (Phase 1)\n", + "\n", + "Downloads the current NCBI assembly summary from FTP, compares it against a\n", + "previous snapshot, and produces:\n", + "\n", + "- `transfer_manifest.txt` — assemblies to download in Phase 2\n", + "- `removed_manifest.txt` — assemblies to archive in Phase 3\n", + "- `diff_summary.json` — human-readable summary of changes\n", + "\n", + "All filtering (prefix range, limit) is applied here so downstream phases\n", + "receive a final, pre-filtered manifest.\n", + "\n", + "Optionally verifies candidates against the S3 Lakehouse (`LAKEHOUSE_BUCKET`) so\n", + "assemblies that were already downloaded and promoted are pruned from the\n", + "transfer manifest." + ] + }, + { + "cell_type": "markdown", + "id": "d0d3063c", + "metadata": {}, + "source": [ + "## Path formats quick reference\n", + "\n", + "| Suffix in variable name | Format | Example |\n", + "|-------------------------|--------|---------|\n", + "| `_URI` | `s3://bucket/key/…` | `s3://cdm-lake/staging/run1/` |\n", + "| `_BUCKET` | bucket name only | `cdm-lake` |\n", + "| `_KEY_PREFIX` | S3 key prefix (no scheme/bucket) | `tenant-general-warehouse/kbase/datasets/ncbi/` |\n", + "| `_DIR` / `_PATH` | local filesystem path | `output/removed_manifest.txt` |\n", + "\n", + "Lakehouse object: `s3://{LAKEHOUSE_BUCKET}/{STORE_KEY_PREFIX}raw_data/…/{filename}`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "319383dc", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Imports and S3 client initialisation.\"\"\"\n", + "\n", + "import json\n", + "from pathlib import Path\n", + "\n", + "from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST\n", + "from cdm_data_loaders.ncbi_ftp.manifest import (\n", + " AssemblyRecord,\n", + " compute_diff,\n", + " download_assembly_summary,\n", + " filter_by_prefix_range,\n", + " parse_assembly_summary,\n", + " write_diff_summary,\n", + " write_removed_manifest,\n", + " write_transfer_manifest,\n", + " write_updated_manifest,\n", + ")\n", + "from cdm_data_loaders.utils.s3 import get_s3_client, split_s3_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b196d5a3", + "metadata": {}, + "outputs": [], + "source": [ + "from cdm_data_loaders.utils.s3 import get_s3_client, reset_s3_client\n", + "\n", + "# Provide S3 credentials (use for local testing against MinIO test container)\n", + "PROVIDE_CREDENTIALS = False # Set to False to rely on environment credentials (e.g. 
IAM role)\n", + "if PROVIDE_CREDENTIALS:\n", + " reset_s3_client() # Clear any existing client to ensure new credentials are used\n", + " get_s3_client(\n", + " {\n", + " \"endpoint_url\": \"http://localhost:9000\",\n", + " \"aws_access_key_id\": \"minioadmin\",\n", + " \"aws_secret_access_key\": \"minioadmin\",\n", + " }\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d8cdb6f", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Configure parameters.\"\"\"\n", + "\n", + "# Which NCBI database to sync: \"refseq\" or \"genbank\"\n", + "DATABASE = \"refseq\"\n", + "\n", + "# Accession prefix filtering (3-digit, inclusive). Set to None to skip.\n", + "PREFIX_FROM: str | None = \"900\" # e.g. \"000\"\n", + "PREFIX_TO: str | None = \"900\" # e.g. \"003\"\n", + "\n", + "# Maximum number of new/updated assemblies to include (None = unlimited)\n", + "LIMIT: int | None = 10\n", + "\n", + "# Previous assembly summary snapshot\n", + "# format: s3:// URI (e.g. \"s3://cdm-lake/.../assembly_summary_refseq_prev.txt\")\n", + "PREVIOUS_SUMMARY_URI: str | None = None\n", + "\n", + "# S3 location where the new snapshot will be uploaded after diffing\n", + "# format: s3:// URI\n", + "SNAPSHOT_UPLOAD_URI: str | None = (\n", + " \"s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/assembly_summary_refseq.txt\"\n", + ")\n", + "\n", + "# Verify candidates against the S3 Lakehouse — prune assemblies already present.\n", + "# Set LAKEHOUSE_BUCKET to your bucket name to enable, or None to skip.\n", + "# STORE_KEY_PREFIX should point to the directory containing `raw_data/`.\n", + "# format: bucket name (no s3:// scheme)\n", + "LAKEHOUSE_BUCKET: str | None = \"cdm-lake\"\n", + "# format: S3 key prefix within LAKEHOUSE_BUCKET (no scheme, no bucket)\n", + "STORE_KEY_PREFIX = \"tenant-general-warehouse/kbase/datasets/ncbi/\"\n", + "\n", + "# Local output directory for manifest files\n", + "# format: local directory path\n", + "OUTPUT_DIR = Path(\"output\")\n", + "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "print(f\"Database: {DATABASE}\")\n", + "print(f\"Prefix range: {PREFIX_FROM} -> {PREFIX_TO}\")\n", + "print(f\"Limit: {LIMIT}\")\n", + "print(f\"Verify against S3: {LAKEHOUSE_BUCKET or 'disabled'}\")\n", + "print(f\"Output dir: {OUTPUT_DIR}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b10c3aaf", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Download current assembly summary from NCBI FTP.\"\"\"\n", + "\n", + "raw_summary = download_assembly_summary(database=DATABASE, ftp_host=FTP_HOST)\n", + "current = parse_assembly_summary(raw_summary)\n", + "print(f\"Parsed {len(current)} assemblies from current {DATABASE} summary\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceb5af5a", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Optional: Bootstrap baseline by scanning current store (if no previous summary available).\n", + "\n", + "If you have a pre-populated S3 store but no previous assembly summary snapshot,\n", + "you can scan the store to generate a synthetic summary. This becomes the baseline\n", + "for the diff.\n", + "\n", + "Set SCAN_STORE=True below to enable. The scan will:\n", + " 1. List all objects under LAKEHOUSE_BUCKET/STORE_KEY_PREFIX\n", + " 2. Extract accessions matching the DATABASE type (GCF_ for refseq, GCA_ for genbank)\n", + " 3. Apply user-provided SYNTHETIC_RELEASE_DATE to all records\n", + " 4. Build AssemblyRecord for each assembly found\n", + " 5. 
Save to LOCAL_SYNTHETIC_SUMMARY for re-use in future runs\n", + "\n", + "Typical use case: First run against 500K+ existing assemblies. Scanning takes\n", + "significant time (potentially 15-30+ min for large stores with many files per assembly).\n", + "On subsequent runs the saved file is loaded directly — set FORCE_RESCAN=True to override.\n", + "\"\"\"\n", + "\n", + "SCAN_STORE = True # Set to True to scan your store\n", + "FORCE_RESCAN = False # Set to True to ignore an existing LOCAL_SYNTHETIC_SUMMARY and rescan\n", + "SYNTHETIC_RELEASE_DATE = \"2025/10/31\" # YYYY/MM/DD applied to all synthetic records\n", + "LOCAL_SYNTHETIC_SUMMARY = Path(\"output/synthetic_summary_from_store.txt\")\n", + "\n", + "if SCAN_STORE and LAKEHOUSE_BUCKET:\n", + " if LOCAL_SYNTHETIC_SUMMARY.exists() and not FORCE_RESCAN:\n", + " print(f\"Loading existing synthetic summary from {LOCAL_SYNTHETIC_SUMMARY} (set FORCE_RESCAN=True to rescan)\")\n", + " previous = parse_assembly_summary(LOCAL_SYNTHETIC_SUMMARY)\n", + " print(f\"Loaded {len(previous)} assemblies from synthetic summary\")\n", + " else:\n", + " import time as _time\n", + " from cdm_data_loaders.ncbi_ftp.manifest import scan_store_to_synthetic_summary\n", + " from tqdm.notebook import tqdm\n", + "\n", + " print(f\"Scanning s3://{LAKEHOUSE_BUCKET}/{STORE_KEY_PREFIX} for existing {DATABASE} assemblies ...\")\n", + " print(\"Note: large stores (500K+ assemblies) may take 15-30+ minutes.\")\n", + " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\", leave=True, mininterval=2.0)\n", + "\n", + " _last_refresh = _time.monotonic()\n", + " _REFRESH_INTERVAL = 2.0 # seconds between display updates\n", + "\n", + " def _track_scan(count: int, acc: str) -> None:\n", + " global _last_refresh\n", + " progress.n = count\n", + " now = _time.monotonic()\n", + " if now - _last_refresh >= _REFRESH_INTERVAL:\n", + " progress.set_postfix(acc=acc, refresh=True)\n", + " _last_refresh = now\n", + "\n", + " synthetic = scan_store_to_synthetic_summary(\n", + " LAKEHOUSE_BUCKET,\n", + " STORE_KEY_PREFIX,\n", + " SYNTHETIC_RELEASE_DATE,\n", + " database=DATABASE,\n", + " progress_callback=_track_scan,\n", + " )\n", + " progress.n = len(synthetic)\n", + " progress.refresh()\n", + "\n", + " print(f\"Found {len(synthetic)} {DATABASE} assemblies in store\")\n", + "\n", + " # Save synthetic summary to file for future runs\n", + " LOCAL_SYNTHETIC_SUMMARY.parent.mkdir(parents=True, exist_ok=True)\n", + " with LOCAL_SYNTHETIC_SUMMARY.open(\"w\") as f:\n", + " for acc in sorted(synthetic.keys()):\n", + " rec = synthetic[acc]\n", + " f.write(\n", + " f\"{rec.accession}\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t{rec.status}\\t.\\t.\\t.\\t{rec.seq_rel_date}\\t.\\t.\\t.\\t.\\t{rec.ftp_path}\\t.\\n\"\n", + " )\n", + " print(f\"Saved synthetic summary to {LOCAL_SYNTHETIC_SUMMARY}\")\n", + "\n", + " # Use it as the previous baseline\n", + " previous = synthetic\n", + "else:\n", + " if SCAN_STORE:\n", + " print(\"SCAN_STORE=True but LAKEHOUSE_BUCKET not set. Skipping.\")\n", + " print(\"Skipping store scan (SCAN_STORE=False). Will load/use PREVIOUS_SUMMARY_URI instead.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88954378", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Load previous summary from S3 (or from synthetic scan, or start fresh).\n", + "\n", + "If you ran the store scan in the previous cell, SCAN_STORE=True above will have\n", + "set `previous` already. 
Otherwise, try to load from PREVIOUS_SUMMARY_URI.\n", + "\"\"\"\n", + "\n", + "if \"previous\" not in locals() or previous is None:\n", + " # Store scan didn't run, or was skipped. Try to load from S3.\n", + " if PREVIOUS_SUMMARY_URI:\n", + " s3 = get_s3_client()\n", + " bucket, key = split_s3_path(PREVIOUS_SUMMARY_URI)\n", + " resp = s3.get_object(Bucket=bucket, Key=key)\n", + " prev_text = resp[\"Body\"].read().decode(\"utf-8\")\n", + " previous = parse_assembly_summary(prev_text)\n", + " print(f\"Loaded {len(previous)} assemblies from previous snapshot\")\n", + " else:\n", + " print(\"No previous snapshot and SCAN_STORE=False — all current 'latest' assemblies will be marked as new\")\n", + " previous = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18482b3c", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Compute diff and apply prefix filter.\"\"\"\n", + "\n", + "# Filter current assemblies by prefix range\n", + "filtered = filter_by_prefix_range(current, prefix_from=PREFIX_FROM, prefix_to=PREFIX_TO)\n", + "print(f\"After prefix filter: {len(filtered)} assemblies\")\n", + "\n", + "# Also filter previous if present\n", + "filtered_prev = filter_by_prefix_range(previous, prefix_from=PREFIX_FROM, prefix_to=PREFIX_TO) if previous else None\n", + "\n", + "# Compute diff\n", + "diff = compute_diff(filtered, previous_assemblies=filtered_prev)\n", + "\n", + "print(f\"New: {len(diff.new)}\")\n", + "print(f\"Updated: {len(diff.updated)}\")\n", + "print(f\"Replaced: {len(diff.replaced)}\")\n", + "print(f\"Suppressed: {len(diff.suppressed)}\")\n", + "print(f\"Total to transfer: {len(diff.new) + len(diff.updated)}\")\n", + "print(f\"Total to remove: {len(diff.replaced) + len(diff.suppressed)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91ad314a", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Verify candidates against S3 Lakehouse, then apply LIMIT.\n", + "\n", + "Verification (optional): for each candidate, fetch md5checksums.txt from\n", + "NCBI FTP and compare against md5 metadata on existing S3 objects.\n", + "Assemblies already present with matching checksums are pruned.\n", + "\n", + "LIMIT is applied *after* verification so the cap counts only assemblies\n", + "that genuinely need downloading.\n", + "\"\"\"\n", + "\n", + "# -- Verify against Lakehouse --\n", + "if LAKEHOUSE_BUCKET:\n", + " from cdm_data_loaders.ncbi_ftp.manifest import verify_transfer_candidates\n", + " from tqdm.notebook import tqdm\n", + "\n", + " candidates = diff.new + diff.updated\n", + " total = len(candidates)\n", + " print(f\"Verifying {total} candidates against s3://{LAKEHOUSE_BUCKET}/{STORE_KEY_PREFIX} ...\")\n", + "\n", + " progress = tqdm(total=total, unit=\"assembly\", desc=\"Verifying checksums\", leave=True)\n", + "\n", + " def _update_progress(done: int, _total: int, acc: str) -> None:\n", + " progress.n = done\n", + " progress.set_postfix(acc=acc, refresh=False)\n", + " progress.refresh()\n", + "\n", + " if total == 0:\n", + " print(\"No candidates to verify; skipping checksum checks.\")\n", + " confirmed = set()\n", + " else:\n", + " confirmed = set(\n", + " verify_transfer_candidates(\n", + " candidates,\n", + " filtered,\n", + " LAKEHOUSE_BUCKET,\n", + " STORE_KEY_PREFIX,\n", + " ftp_host=FTP_HOST,\n", + " progress_callback=_update_progress,\n", + " )\n", + " )\n", + "\n", + " progress.refresh()\n", + "\n", + " before = len(diff.new) + len(diff.updated)\n", + " diff.new = [a for a in diff.new if a in confirmed]\n", + " 
diff.updated = [a for a in diff.updated if a in confirmed]\n", + " after = len(diff.new) + len(diff.updated)\n", + " print(f\"Verified: {after} need downloading, {before - after} pruned (already in store)\")\n", + "else:\n", + " print(\"Skipping S3 verification (LAKEHOUSE_BUCKET not set)\")\n", + "\n", + "# -- Apply LIMIT --\n", + "if LIMIT is not None:\n", + " original_new = len(diff.new)\n", + " original_updated = len(diff.updated)\n", + " combined = diff.new + diff.updated\n", + " limited = combined[:LIMIT]\n", + " limited_set = set(limited)\n", + " diff.new = [a for a in diff.new if a in limited_set]\n", + " diff.updated = [a for a in diff.updated if a in limited_set]\n", + " print(f\"After limit ({LIMIT}): {len(diff.new)} new, {len(diff.updated)} updated\")\n", + " print(f\" (was {original_new} new, {original_updated} updated)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9e2b631", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Write manifest files and upload snapshot to S3.\"\"\"\n", + "\n", + "# Write transfer manifest\n", + "transfer_path = OUTPUT_DIR / \"transfer_manifest.txt\"\n", + "paths = write_transfer_manifest(diff, filtered, transfer_path, ftp_host=FTP_HOST)\n", + "print(f\"Transfer manifest: {len(paths)} entries -> {transfer_path}\")\n", + "\n", + "# Write removed manifest\n", + "removed_path = OUTPUT_DIR / \"removed_manifest.txt\"\n", + "removed = write_removed_manifest(diff, removed_path)\n", + "print(f\"Removed manifest: {len(removed)} entries -> {removed_path}\")\n", + "\n", + "# Write updated manifest (for Phase 3 pre-overwrite archiving)\n", + "updated_path = OUTPUT_DIR / \"updated_manifest.txt\"\n", + "updated = write_updated_manifest(diff, updated_path)\n", + "print(f\"Updated manifest: {len(updated)} entries -> {updated_path}\")\n", + "\n", + "# Write diff summary\n", + "summary_path = OUTPUT_DIR / \"diff_summary.json\"\n", + "summary = write_diff_summary(diff, summary_path, DATABASE, PREFIX_FROM, PREFIX_TO)\n", + "print(f\"Diff summary -> {summary_path}\")\n", + "print(json.dumps(summary[\"counts\"], indent=2))\n", + "\n", + "# Upload new snapshot to S3 for future diffing\n", + "if SNAPSHOT_UPLOAD_URI:\n", + " s3 = get_s3_client()\n", + " bucket, key = split_s3_path(SNAPSHOT_UPLOAD_URI)\n", + " s3.put_object(Bucket=bucket, Key=key, Body=raw_summary.encode(\"utf-8\"))\n", + " print(f\"Uploaded new snapshot to {SNAPSHOT_UPLOAD_URI}\")\n", + "else:\n", + " print(\"Skipping S3 snapshot upload (SNAPSHOT_UPLOAD_URI not set)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "112c497a", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Upload manifests to S3 for CTS input staging (optional).\n", + "\n", + "Note: STAGING_URI is a full s3:// URI. The promote notebook splits this into\n", + "LAKEHOUSE_BUCKET + STAGING_KEY_PREFIX (separate bucket and key prefix parameters).\n", + "\n", + "This is for local testing. The CTS will stage the container's input folder in production.\n", + "\"\"\"\n", + "\n", + "# S3 location where CTS will read input files from.\n", + "# Set to None to skip upload (local-only testing).\n", + "# format: s3:// URI (e.g. 
\"s3://cdm-lake/staging/run1/\")\n", + "STAGING_URI: str | None = \"s3://cts/io/matt-cohere/staging/run1/input/\"\n", + "\n", + "if STAGING_URI:\n", + " s3 = get_s3_client()\n", + " bucket, prefix = split_s3_path(STAGING_URI)\n", + " prefix = prefix.rstrip(\"/\") + \"/\"\n", + "\n", + " for manifest in [\"transfer_manifest.txt\", \"removed_manifest.txt\", \"updated_manifest.txt\", \"diff_summary.json\"]:\n", + " local_path = OUTPUT_DIR / manifest\n", + " if local_path.exists():\n", + " key = f\"{prefix}{manifest}\"\n", + " s3.upload_file(Filename=str(local_path), Bucket=bucket, Key=key)\n", + " print(f\"Uploaded {manifest} -> s3://{bucket}/{key}\")\n", + "\n", + " print(f\"\\nManifests staged for CTS at s3://{bucket}/{prefix}\")\n", + "else:\n", + " print(\"Skipping S3 manifest upload (STAGING_URI not set)\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cdm-data-loaders (3.13.11)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb new file mode 100644 index 00000000..6e16e1a4 --- /dev/null +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -0,0 +1,316 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2eda19a9", + "metadata": {}, + "source": [ + "# NCBI Assembly Promote & Archive (Phase 3)\n", + "\n", + "Promotes staged assembly files from S3 staging (written by CTS Phase 2)\n", + "to their final Lakehouse paths, archives replaced/suppressed assemblies,\n", + "and trims the transfer manifest for resumability.\n", + "\n", + "Steps:\n", + "1. Configure staging prefix, removed manifest, updated manifest, and release tag\n", + "2. Scan staged files and display summary\n", + "3. Archive existing versions of updated assemblies (pre-overwrite)\n", + "4. Promote files to final paths with MD5 metadata\n", + "5. Archive replaced/suppressed assemblies\n", + "6. 
Trim manifest (remove promoted entries)" + ] + }, + { + "cell_type": "markdown", + "id": "2f98c43e", + "metadata": {}, + "source": [ + "## Path formats quick reference\n", + "\n", + "| Suffix in variable name | Format | Example |\n", + "|-------------------------|--------|---------|\n", + "| `_BUCKET` | bucket name only | `cdm-lake` |\n", + "| `_KEY_PREFIX` | S3 key prefix (no scheme/bucket) | `staging/run1/` |\n", + "| `_S3_KEY` | S3 object key (no scheme/bucket) | `staging/transfer_manifest.txt` |\n", + "| `_PATH` | local filesystem path | `output/removed_manifest.txt` |\n", + "\n", + "Lakehouse object: `s3://{LAKEHOUSE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/…/{filename}`\n", + "Staging object: `s3://{STAGING_BUCKET}/{STAGING_KEY_PREFIX}raw_data/…/{filename}`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b736665", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Imports and S3 client initialisation.\"\"\"\n", + "\n", + "import json\n", + "\n", + "from cdm_data_loaders.ncbi_ftp.promote import (\n", + " DEFAULT_LAKEHOUSE_KEY_PREFIX,\n", + " promote_from_s3,\n", + ")\n", + "from cdm_data_loaders.utils.s3 import get_s3_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b36a556c", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Configure parameters.\n", + "\n", + "Path layout (how variables compose into a full S3 object path):\n", + " s3://{LAKEHOUSE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{file}\n", + " s3://{STAGING_BUCKET}/{STAGING_KEY_PREFIX}raw_data/{assembly_dir}/{file}\n", + "\"\"\"\n", + "\n", + "# S3 bucket where CTS Phase 2 writes staged files\n", + "# format: bucket name (no s3:// scheme)\n", + "STAGING_BUCKET = \"cts\"\n", + "\n", + "# S3 bucket for the final Lakehouse destination\n", + "# format: bucket name (no s3:// scheme)\n", + "LAKEHOUSE_BUCKET = \"cdm-lake\"\n", + "\n", + "# Staging prefix written by CTS Phase 2\n", + "# format: S3 key prefix within STAGING_BUCKET (no scheme, no bucket)\n", + "STAGING_KEY_PREFIX = \"io/matt-cohere/staging/run1/output/\"\n", + "\n", + "# Local path to removed_manifest.txt from Phase 1 (or None to skip archiving)\n", + "# format: local file path\n", + "REMOVED_MANIFEST_PATH: str | None = None # e.g. \"output/removed_manifest.txt\"\n", + "\n", + "# Local path to updated_manifest.txt from Phase 1 (or None to skip pre-overwrite archiving)\n", + "# format: local file path\n", + "UPDATED_MANIFEST_PATH: str | None = None # e.g. \"output/updated_manifest.txt\"\n", + "\n", + "# NCBI release tag for archive metadata (e.g. \"2024-01\")\n", + "NCBI_RELEASE: str | None = None\n", + "\n", + "# S3 key of transfer_manifest.txt for trimming after promotion (or None to skip).\n", + "# Only needed if the manifest was uploaded to S3 (e.g. via the staging cell in Phase 1).\n", + "# format: S3 object key within STAGING_BUCKET (no scheme, no bucket)\n", + "MANIFEST_S3_KEY: str | None = (\n", + " \"io/matt-cohere/staging/run1/input/transfer_manifest.txt\" # e.g. \"staging/transfer_manifest.txt\"\n", + ")\n", + "\n", + "# Local path to transfer_manifest.txt (used when the manifest has not been uploaded to S3).\n", + "# Used only for the object-count estimate in the scan step; set to None to skip.\n", + "# format: local file path\n", + "MANIFEST_LOCAL_PATH: str | None = None # e.g. 
\"output/transfer_manifest.txt\"\n", + "\n", + "# Final Lakehouse path prefix\n", + "# format: S3 key prefix within LAKEHOUSE_BUCKET (no scheme, no bucket)\n", + "LAKEHOUSE_KEY_PREFIX = DEFAULT_LAKEHOUSE_KEY_PREFIX\n", + "\n", + "# Dry-run mode — log actions without making changes\n", + "DRY_RUN = False\n", + "\n", + "print(f\"Staging bucket: {STAGING_BUCKET}\")\n", + "print(f\"Lakehouse bucket: {LAKEHOUSE_BUCKET}\")\n", + "print(f\"Staging key prefix: {STAGING_KEY_PREFIX}\")\n", + "print(f\"Removed manifest: {REMOVED_MANIFEST_PATH}\")\n", + "print(f\"Updated manifest: {UPDATED_MANIFEST_PATH}\")\n", + "print(f\"NCBI release: {NCBI_RELEASE}\")\n", + "print(f\"Manifest S3 key: {MANIFEST_S3_KEY}\")\n", + "print(f\"Manifest local path: {MANIFEST_LOCAL_PATH}\")\n", + "print(f\"Lakehouse prefix: {LAKEHOUSE_KEY_PREFIX}\")\n", + "print(f\"Dry-run: {DRY_RUN}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccfd88d9", + "metadata": {}, + "outputs": [], + "source": [ + "from cdm_data_loaders.utils.s3 import get_s3_client, reset_s3_client\n", + "\n", + "# Provide S3 credentials (use for local testing against MinIO test container)\n", + "PROVIDE_CREDENTIALS = False # Set to False to rely on environment credentials (e.g. IAM role)\n", + "if PROVIDE_CREDENTIALS:\n", + " reset_s3_client() # Clear any existing client to ensure new credentials are used\n", + " get_s3_client(\n", + " {\n", + " \"endpoint_url\": \"http://localhost:9000\",\n", + " \"aws_access_key_id\": \"minioadmin\",\n", + " \"aws_secret_access_key\": \"minioadmin\",\n", + " }\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e521fd45", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Scan staged files and display summary.\"\"\"\n", + "\n", + "import tqdm\n", + "\n", + "s3 = get_s3_client()\n", + "paginator = s3.get_paginator(\"list_objects_v2\")\n", + "\n", + "# Estimate total objects from manifest so tqdm can show a percentage.\n", + "# Each assembly typically produces ~11 data files + ~10 .md5 sidecars = ~21 objects.\n", + "_FILES_PER_ASSEMBLY_EST = 21\n", + "estimated_total = None\n", + "if MANIFEST_S3_KEY:\n", + " try:\n", + " _resp = s3.get_object(Bucket=STAGING_BUCKET, Key=MANIFEST_S3_KEY)\n", + " _lines = [\n", + " ln.strip() for ln in _resp[\"Body\"].read().decode().splitlines() if ln.strip() and not ln.startswith(\"#\")\n", + " ]\n", + " estimated_total = len(_lines) * _FILES_PER_ASSEMBLY_EST\n", + " print(f\"Manifest (S3) has {len(_lines)} assemblies → estimated ~{estimated_total} staged objects\")\n", + " except Exception as e:\n", + " print(f\"Could not read S3 manifest for estimate: {e}\")\n", + "elif MANIFEST_LOCAL_PATH:\n", + " try:\n", + " from pathlib import Path\n", + "\n", + " _lines = [\n", + " ln.strip()\n", + " for ln in Path(MANIFEST_LOCAL_PATH).read_text().splitlines()\n", + " if ln.strip() and not ln.startswith(\"#\")\n", + " ]\n", + " estimated_total = len(_lines) * _FILES_PER_ASSEMBLY_EST\n", + " print(f\"Manifest (local) has {len(_lines)} assemblies → estimated ~{estimated_total} staged objects\")\n", + " except Exception as e:\n", + " print(f\"Could not read local manifest for estimate: {e}\")\n", + "\n", + "staged: list[str] = []\n", + "with tqdm.tqdm(total=estimated_total, unit=\"obj\", desc=\"Scanning staging prefix\", dynamic_ncols=True) as pbar:\n", + " for page in paginator.paginate(Bucket=STAGING_BUCKET, Prefix=STAGING_KEY_PREFIX):\n", + " keys = [obj[\"Key\"] for obj in page.get(\"Contents\", [])]\n", + " staged.extend(keys)\n", + " 
pbar.update(len(keys))\n", + "\n", + "sidecars = []\n", + "data_files = []\n", + "for k in staged:\n", + " if k.endswith((\".md5\", \".crc64nvme\")):\n", + " sidecars.append(k)\n", + " else:\n", + " data_files.append(k)\n", + "\n", + "print(f\"Staged objects: {len(staged)}\")\n", + "print(f\" Data files: {len(data_files)}\")\n", + "print(f\" Sidecars: {len(sidecars)}\")\n", + "\n", + "# Show first few data files\n", + "PREVIEW_COUNT = 10\n", + "for key in data_files[:PREVIEW_COUNT]:\n", + " print(f\" {key}\")\n", + "if len(data_files) > PREVIEW_COUNT:\n", + " print(f\" ... and {len(data_files) - PREVIEW_COUNT} more\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10a46367", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Promote staged files to final Lakehouse paths.\"\"\"\n", + "\n", + "report = promote_from_s3(\n", + " staging_key_prefix=STAGING_KEY_PREFIX,\n", + " staging_bucket=STAGING_BUCKET,\n", + " lakehouse_bucket=LAKEHOUSE_BUCKET,\n", + " removed_manifest_path=REMOVED_MANIFEST_PATH,\n", + " updated_manifest_path=UPDATED_MANIFEST_PATH,\n", + " ncbi_release=NCBI_RELEASE,\n", + " manifest_s3_key=MANIFEST_S3_KEY,\n", + " lakehouse_key_prefix=LAKEHOUSE_KEY_PREFIX,\n", + " dry_run=DRY_RUN,\n", + ")\n", + "\n", + "print(json.dumps(report, indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d18a1e0", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Display promotion report.\"\"\"\n", + "\n", + "print(\"=\" * 50)\n", + "print(\"PROMOTION REPORT\")\n", + "print(\"=\" * 50)\n", + "print(f\"Promoted: {report['promoted']}\")\n", + "print(f\"Archived: {report['archived']}\")\n", + "print(f\"Failed: {report['failed']}\")\n", + "print(f\"Dry-run: {report['dry_run']}\")\n", + "print(f\"Timestamp: {report['timestamp']}\")\n", + "\n", + "if report[\"failed\"] > 0:\n", + " print(\"\\n⚠️ Some operations failed — check logs above for details.\")\n", + "\n", + "if report[\"dry_run\"]:\n", + " print(\"\\n📋 This was a dry-run. 
Set DRY_RUN = False and re-run to apply changes.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fb27b941602401d91542211134fc71a", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Inspect frictionless descriptors written to metadata/.\"\"\"\n", + "\n", + "from cdm_data_loaders.ncbi_ftp.metadata import build_descriptor_key\n", + "\n", + "s3 = get_s3_client()\n", + "paginator = s3.get_paginator(\"list_objects_v2\")\n", + "\n", + "descriptor_keys: list[str] = []\n", + "for page in paginator.paginate(Bucket=LAKEHOUSE_BUCKET, Prefix=LAKEHOUSE_KEY_PREFIX + \"metadata/\"):\n", + " descriptor_keys.extend(obj[\"Key\"] for obj in page.get(\"Contents\", []))\n", + "\n", + "print(f\"Found {len(descriptor_keys)} descriptor(s) in metadata/\")\n", + "\n", + "for key in descriptor_keys[:5]: # preview first 5\n", + " obj = s3.get_object(Bucket=LAKEHOUSE_BUCKET, Key=key)\n", + " descriptor = json.loads(obj[\"Body\"].read())\n", + " print()\n", + " print(f\" Key: {key}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cdm-data-loaders (3.13.11)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index c1642088..60785ba0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ [project.scripts] all_the_bacteria = "cdm_data_loaders.pipelines.all_the_bacteria:cli" +ncbi_ftp_sync = "cdm_data_loaders.pipelines.ncbi_ftp_download:cli" ncbi_rest_api = "cdm_data_loaders.pipelines.ncbi_rest_api:cli" uniprot = "cdm_data_loaders.pipelines.uniprot_kb:cli" uniref = "cdm_data_loaders.pipelines.uniref:cli" @@ -167,7 +168,8 @@ ignore = [ [tool.ruff.lint.per-file-ignores] "*.ipynb" = ["T201"] # ignore printing in notebooks -"tests/**/*.py" = ["S101", "T201", "FBT001", "FBT002"] # use of assert, booleans +"tests/**/*.py" = ["S101", "T201", "FBT001", "FBT002", "ARG002"] # use of assert, booleans, unused mock args +"tests/integration/**/*.py" = ["S101", "T201", "FBT001", "FBT002", "ARG002", "ANN401"] "tests/utils/test_s3.py" = ["ANN401"] "**/__init__.py" = ["D104"] @@ -188,16 +190,16 @@ log_cli = true log_cli_level = "INFO" log_level = "INFO" addopts = ["-v"] -markers = ["requires_spark: must be run in an environment where spark is available", "s3: tests that mock s3 interactions", "slow_test: does what it says on the tin"] +markers = ["requires_spark: must be run in an environment where spark is available", "s3: tests that mock s3 interactions", "slow_test: does what it says on the tin", "integration: end-to-end tests requiring a running MinIO instance and network access", "external_request: tests that make real network requests to external services (e.g. 
NCBI FTP)"] # environment settings for running tests [tool.pytest_env] USER = "fake_user" KBASE_AUTH_TOKEN = "test-token-123" CDM_TASK_SERVICE_URL = "http://localhost:8080" -MINIO_ENDPOINT_URL = "http://localhost:9000" -MINIO_ACCESS_KEY = "minioadmin" -MINIO_SECRET_KEY = "minioadmin" +MINIO_ENDPOINT_URL = { value = "http://localhost:9000", skip_if_set = true } +MINIO_ACCESS_KEY = { value = "minioadmin", skip_if_set = true } +MINIO_SECRET_KEY = { value = "minioadmin", skip_if_set = true } MINIO_SECURE_FLAG = "false" BERDL_POD_IP = "192.168.1.100" SPARK_MASTER_URL = "spark://localhost:7077" diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index f9edcbfd..c1c0b6d3 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -VALID_COMMANDS=(all_the_bacteria ncbi_rest_api uniprot uniref xml_split test bash) +VALID_COMMANDS=(all_the_bacteria ncbi_ftp_sync ncbi_rest_api uniprot uniref xml_split test integration-test bash) usage() { local joined @@ -21,6 +21,10 @@ case "$cmd" in all_the_bacteria) exec /usr/bin/tini -- uv run --no-sync all_the_bacteria "$@" ;; + ncbi_ftp_sync) + # Run the NCBI FTP assembly download pipeline (Phase 2) + exec /usr/bin/tini -- uv run --no-sync ncbi_ftp_sync "$@" + ;; ncbi_rest_api) exec /usr/bin/tini -- uv run --no-sync ncbi_rest_api "$@" ;; @@ -36,6 +40,10 @@ case "$cmd" in test) exec /usr/bin/tini -- uv run --no-sync pytest -m "not requires_spark" ;; + integration-test) + # run the integration tests (requires a running MinIO instance) + exec /usr/bin/tini -- uv run --no-sync pytest -m "integration" -v "$@" + ;; bash) exec /usr/bin/tini -- /bin/bash ;; diff --git a/scripts/s3_local.py b/scripts/s3_local.py new file mode 100755 index 00000000..60bac49f --- /dev/null +++ b/scripts/s3_local.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +# ruff: noqa: T201, EM101, EM102, TRY003, D103 +"""Thin S3 CLI for local MinIO testing (no aws-cli install required). 
+ +Usage (all commands assume ``uv run`` from the repo root): + + uv run python scripts/s3_local.py mb s3://cdm-lake + uv run python scripts/s3_local.py cp staging/raw_data/ s3://cdm-lake/staging/run1/raw_data/ + uv run python scripts/s3_local.py ls s3://cdm-lake/staging/run1/ + uv run python scripts/s3_local.py head s3://cdm-lake/some/key.gz + +Environment variables (with defaults for the walkthrough): + + MINIO_ENDPOINT_URL http://localhost:9000 + MINIO_ACCESS_KEY minioadmin + MINIO_SECRET_KEY minioadmin +""" + +import json +import os +import sys +from pathlib import Path + +import boto3 +from botocore.client import BaseClient + + +def _client() -> BaseClient: + return boto3.client( + "s3", + endpoint_url=os.environ.get("MINIO_ENDPOINT_URL", "http://localhost:9000"), + aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"), + aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"), + ) + + +def _split(uri: str) -> tuple[str, str]: + """Split ``s3://bucket/key`` into ``(bucket, key)``.""" + if not uri.startswith("s3://"): + raise SystemExit(f"Expected s3:// URI, got: {uri}") + parts = uri[5:].split("/", 1) + return parts[0], parts[1] if len(parts) > 1 else "" + + +# ── subcommands ───────────────────────────────────────────────────────── + + +def cmd_mb(args: list[str]) -> None: + """Create a bucket: ``mb s3://bucket``.""" + if not args: + raise SystemExit("Usage: s3_local.py mb s3://BUCKET") + bucket, _ = _split(args[0]) + s3 = _client() + try: + s3.head_bucket(Bucket=bucket) + print(f"Bucket already exists: {bucket}") + except Exception: # noqa: BLE001 + s3.create_bucket(Bucket=bucket) + print(f"Created bucket: {bucket}") + + +def cmd_cp(args: list[str]) -> None: + """Recursive upload: ``cp LOCAL_DIR s3://bucket/prefix/``.""" + if len(args) < 2: # noqa: PLR2004 + raise SystemExit("Usage: s3_local.py cp LOCAL_DIR s3://BUCKET/PREFIX/") + local_dir = Path(args[0]) + bucket, prefix = _split(args[1]) + prefix = prefix.rstrip("/") + "/" if prefix else "" + s3 = _client() + count = 0 + for path in sorted(local_dir.rglob("*")): + if path.is_dir(): + continue + rel = path.relative_to(local_dir) + key = f"{prefix}{rel}" + s3.upload_file(Filename=str(path), Bucket=bucket, Key=key) + count += 1 + print(f" {key}") + print(f"Uploaded {count} files to s3://{bucket}/{prefix}") + + +def cmd_ls(args: list[str]) -> None: + """List objects: ``ls s3://bucket/prefix/ [--limit N]``.""" + if not args: + raise SystemExit("Usage: s3_local.py ls s3://BUCKET/PREFIX/ [--limit N]") + bucket, prefix = _split(args[0]) + limit = 20 + if "--limit" in args: + idx = args.index("--limit") + limit = int(args[idx + 1]) + s3 = _client() + paginator = s3.get_paginator("list_objects_v2") + shown = 0 + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + for obj in page.get("Contents", []): + print(f" {obj['Size']:>10} {obj['Key']}") + shown += 1 + if shown >= limit: + return + + +def cmd_head(args: list[str]) -> None: + """Show metadata: ``head s3://bucket/key``.""" + if not args: + raise SystemExit("Usage: s3_local.py head s3://BUCKET/KEY") + bucket, key = _split(args[0]) + s3 = _client() + resp = s3.head_object(Bucket=bucket, Key=key) + meta = resp.get("Metadata", {}) + print(json.dumps(meta, indent=2)) + + +# ── dispatch ──────────────────────────────────────────────────────────── + +COMMANDS = {"mb": cmd_mb, "cp": cmd_cp, "ls": cmd_ls, "head": cmd_head} + + +def main() -> None: + if len(sys.argv) < 2 or sys.argv[1] not in COMMANDS: # noqa: PLR2004 + cmds = ", ".join(COMMANDS) + raise 
SystemExit(f"Usage: s3_local.py <{cmds}> [args ...]\n\n{__doc__}") + COMMANDS[sys.argv[1]](sys.argv[2:]) + + +if __name__ == "__main__": + main() diff --git a/src/cdm_data_loaders/ncbi_ftp/__init__.py b/src/cdm_data_loaders/ncbi_ftp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/cdm_data_loaders/ncbi_ftp/assembly.py b/src/cdm_data_loaders/ncbi_ftp/assembly.py new file mode 100644 index 00000000..b424702b --- /dev/null +++ b/src/cdm_data_loaders/ncbi_ftp/assembly.py @@ -0,0 +1,206 @@ +"""NCBI FTP assembly-specific domain logic. + +Provides path helpers, file filters, MD5 checksum parsing, and single-assembly +download logic for NCBI GenBank/RefSeq assemblies. Orchestration (batching, +threading, CLI) lives in :mod:`cdm_data_loaders.pipelines.ncbi_ftp_download`. +""" + +import contextlib +import re +import time +from ftplib import FTP +from pathlib import Path +from typing import Any + +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.checksums import compute_md5 +from cdm_data_loaders.utils.ftp_client import connect_ftp, ftp_noop_keepalive, ftp_retrieve_text + +logger = get_cdm_logger() + +FTP_HOST = "ftp.ncbi.nlm.nih.gov" + +FILE_FILTERS = [ + "_gene_ontology.gaf.gz", + "_genomic.fna.gz", + "_genomic.gff.gz", + "_protein.faa.gz", + "_ani_contam_ranges.tsv", + "_assembly_regions.txt", + "_assembly_report.txt", + "_assembly_stats.txt", + "_gene_expression_counts.txt.gz", + "_normalized_gene_expression_counts.txt.gz", +] + + +def parse_md5_checksums_file(text: str) -> dict[str, str]: + """Parse an NCBI ``md5checksums.txt`` file into a filename-to-hash mapping. + + Each line has the format `` ./`` (two-space separator). + + :param text: raw text of the md5checksums.txt file + :return: dict mapping filename to MD5 hex digest + """ + checksums: dict[str, str] = {} + for raw_line in text.strip().splitlines(): + stripped = raw_line.strip() + if not stripped: + continue + parts = stripped.split(" ", maxsplit=1) + if len(parts) == 2: # noqa: PLR2004 + md5_hash, filename = parts + checksums[filename.removeprefix("./")] = md5_hash.strip() + return checksums + + +# ── Path helpers ───────────────────────────────────────────────────────── + + +def build_accession_path(assembly_dir: str) -> str: + """Build the relative output path for an assembly directory. + + Produces ``raw_data/{GCF|GCA}/{000}/{001}/{215}/{assembly_dir}/``. + + :param assembly_dir: full assembly directory name (e.g. ``GCF_000001215.4_Release_6...``) + :return: relative path string + :raises ValueError: if the assembly directory name cannot be parsed + """ + m = re.match(r"GC[AF]_(\d{3})(\d{3})(\d{3})\.\d+.*", assembly_dir) + if not m: + msg = f"Cannot parse accession: {assembly_dir}" + raise ValueError(msg) + p1, p2, p3 = m.groups() + return f"raw_data/{assembly_dir[:3]}/{p1}/{p2}/{p3}/{assembly_dir}/" + + +def parse_assembly_path(assembly_path: str) -> tuple[str, str, str]: + """Extract database, assembly_dir, and accession from an FTP assembly path. + + :param assembly_path: FTP directory path (e.g. 
``/genomes/all/GCF/000/.../GCF_000001215.4_Rel.../``) + :return: tuple of ``(database, assembly_dir, accession)`` + :raises ValueError: if the path cannot be parsed + """ + m = re.search( + r"/(GC[AF])/\d{3}/\d{3}/\d{3}/((GC[AF]_\d{9}\.\d+)_[^/]+)/?$", + assembly_path.rstrip("/"), + ) + if not m: + msg = f"Cannot parse assembly path: {assembly_path}" + raise ValueError(msg) + return m.group(1), m.group(2), m.group(3) + + +# ── Single assembly download ──────────────────────────────────────────── + + +def _download_and_verify( # noqa: PLR0913 + ftp: FTP, + filename: str, + dest_dir: Path, + md5_checksums: dict[str, str], + stats: dict[str, Any], + last_activity: float, +) -> float: + """Download one file, verify its MD5, and write a sidecar if valid.""" + last_activity = ftp_noop_keepalive(ftp, last_activity) + local_file = dest_dir / filename + expected_md5 = md5_checksums.get(filename) + + for attempt in range(1, 4): + logger.debug(" Downloading %s (attempt %d/3)", filename, attempt) + with local_file.open("wb") as f: + ftp.retrbinary(f"RETR {filename}", f.write) + last_activity = time.monotonic() + + if expected_md5: + actual_md5 = compute_md5(str(local_file)) + if actual_md5 != expected_md5: + logger.warning( + " MD5 mismatch for %s: expected %s, got %s", + filename, + expected_md5, + actual_md5, + ) + if attempt < 3: # noqa: PLR2004 + continue + stats["files_skipped_checksum_mismatch"] += 1 + local_file.unlink(missing_ok=True) + return last_activity + logger.debug(" MD5 verified: %s", filename) + else: + stats["files_without_checksum"] += 1 + + if expected_md5: + (dest_dir / f"{filename}.md5").write_text(expected_md5) + + stats["files_downloaded"] += 1 + return last_activity + + return last_activity + + +def download_assembly_to_local( + assembly_path: str, + output_dir: str | Path, + ftp_host: str = FTP_HOST, + ftp: FTP | None = None, +) -> dict[str, Any]: + """Download one assembly from NCBI FTP to a local directory. + + Creates a directory structure under *output_dir* matching the S3 layout, + downloads filtered files, verifies MD5 checksums, and writes ``.md5`` + sidecar files for downstream metadata. 
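+
+    Illustrative call (a sketch — the FTP path and output directory are
+    placeholder values)::
+
+        stats = download_assembly_to_local(
+            "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/",
+            output_dir="downloads",
+        )
+        print(stats["files_downloaded"])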
+ + :param assembly_path: FTP directory path for the assembly + :param output_dir: base output directory + :param ftp_host: FTP hostname + :param ftp: optional existing FTP connection (caller manages lifecycle) + :return: dict with download statistics + """ + _database, assembly_dir, accession = parse_assembly_path(assembly_path) + rel_path = build_accession_path(assembly_dir) + dest_dir = Path(output_dir) / rel_path + dest_dir.mkdir(parents=True, exist_ok=True) + + logger.debug("Downloading %s -> %s", accession, dest_dir) + + owns_ftp = ftp is None + if owns_ftp: + ftp = connect_ftp(ftp_host) + stats: dict[str, Any] = { + "accession": accession, + "assembly_dir": assembly_dir, + "files_downloaded": 0, + "files_skipped_checksum_mismatch": 0, + "files_without_checksum": 0, + } + + try: + ftp.cwd(assembly_path.rstrip("/")) + + files: list[str] = [] + ftp.retrlines("NLST", files.append) + + # Download and parse md5checksums.txt + md5_checksums: dict[str, str] = {} + if "md5checksums.txt" in files: + md5_text = ftp_retrieve_text(ftp, "md5checksums.txt") + md5_checksums = parse_md5_checksums_file(md5_text) + (dest_dir / "md5checksums.txt").write_text(md5_text) + stats["files_downloaded"] += 1 + + target_files = [f for f in files if any(f.endswith(s) for s in FILE_FILTERS)] + last_activity = time.monotonic() + + for filename in target_files: + last_activity = _download_and_verify(ftp, filename, dest_dir, md5_checksums, stats, last_activity) + + logger.debug(" %s: %d files downloaded", accession, stats["files_downloaded"]) + + finally: + if owns_ftp: + with contextlib.suppress(Exception): + ftp.quit() + + return stats diff --git a/src/cdm_data_loaders/ncbi_ftp/manifest.py b/src/cdm_data_loaders/ncbi_ftp/manifest.py new file mode 100644 index 00000000..b14d2923 --- /dev/null +++ b/src/cdm_data_loaders/ncbi_ftp/manifest.py @@ -0,0 +1,624 @@ +"""Phase 1: Assembly summary diffing and manifest generation. + +Downloads the current NCBI assembly summary from FTP, compares it against a +previous snapshot, and produces ``transfer_manifest.txt`` (assemblies to +download), ``removed_manifest.txt`` (assemblies to archive), and a JSON diff +summary. All filtering logic (prefix range, limit) lives here so that +downstream phases receive a final, pre-filtered manifest. 
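+
+Typical Phase 1 flow (illustrative sketch — uses only functions defined in
+this module; the output paths are placeholders)::
+
+    current = parse_assembly_summary(download_assembly_summary("refseq"))
+    previous = parse_assembly_summary("output/previous_summary.txt")
+    diff = compute_diff(current, previous_assemblies=previous)
+    write_transfer_manifest(diff, current, "output/transfer_manifest.txt")
+    write_removed_manifest(diff, "output/removed_manifest.txt")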
+""" + +import contextlib +import csv +import json +import re +import time +from collections.abc import Callable, Iterable +from dataclasses import dataclass, field +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +from cdm_data_loaders.ncbi_ftp.assembly import ( + FILE_FILTERS, + FTP_HOST, + build_accession_path, + parse_md5_checksums_file, +) +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.ftp_client import connect_ftp, ftp_noop_keepalive, ftp_retrieve_text +from cdm_data_loaders.utils.s3 import get_s3_client, head_object + +logger = get_cdm_logger() + +SUMMARY_FTP_PATHS: dict[str, str] = { + "refseq": "/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt", + "genbank": "/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt", +} + + +# ── Data structures ───────────────────────────────────────────────────── + + +@dataclass +class AssemblyRecord: + """Parsed row from an NCBI assembly summary file.""" + + accession: str + status: str + seq_rel_date: str + ftp_path: str + assembly_dir: str + + +@dataclass +class DiffResult: + """Result of comparing current and previous assembly summaries.""" + + new: list[str] = field(default_factory=list) + updated: list[str] = field(default_factory=list) + replaced: list[str] = field(default_factory=list) + suppressed: list[str] = field(default_factory=list) + + +# ── Assembly summary download & parsing ────────────────────────────────── + + +def download_assembly_summary(database: str = "refseq", ftp_host: str = FTP_HOST) -> str: + """Download the assembly summary file from NCBI FTP. + + :param database: ``"refseq"`` or ``"genbank"`` + :param ftp_host: FTP hostname + :return: raw text content of the summary file + """ + ftp_path = SUMMARY_FTP_PATHS.get(database) + if not ftp_path: + msg = f"Unknown database: {database}" + raise ValueError(msg) + + logger.info("Downloading assembly_summary_%s.txt from NCBI FTP ...", database) + ftp = connect_ftp(ftp_host) + try: + content = ftp_retrieve_text(ftp, ftp_path) + finally: + with contextlib.suppress(Exception): + ftp.quit() + + logger.info("Downloaded assembly summary (%d bytes)", len(content)) + return content + + +def parse_assembly_summary(source: str | Path | list[str]) -> dict[str, AssemblyRecord]: + """Parse an NCBI assembly summary into a dict of assembly records. + + Accepts a file path, raw text string, or list of lines. + + Columns of interest (0-indexed): + 0: assembly_accession (e.g. 
GCF_000001215.4) + 10: version_status ("latest", "replaced", "suppressed") + 14: seq_rel_date + 19: ftp_path (full FTP URL or "na") + + :param source: file path, raw text, or list of lines + :return: dict mapping accession to :class:`AssemblyRecord` + """ + assemblies: dict[str, AssemblyRecord] = {} + + def _parse_lines(lines: Iterable[str]) -> None: + reader = csv.reader( + (line.rstrip("\n") for line in lines if not line.startswith("#")), + delimiter="\t", + ) + for row in reader: + if len(row) < 20: # noqa: PLR2004 + continue + accession = row[0] + ftp_path = row[19] + if ftp_path == "na": + continue + assemblies[accession] = AssemblyRecord( + accession=accession, + status=row[10], + seq_rel_date=row[14], + ftp_path=ftp_path, + assembly_dir=ftp_path.rstrip("/").split("/")[-1], + ) + + if isinstance(source, Path) or (isinstance(source, str) and "\n" not in source and Path(source).is_file()): + with Path(source).open() as f: + _parse_lines(f) + elif isinstance(source, list): + _parse_lines(source) + else: + _parse_lines(source.splitlines(keepends=True)) + + logger.info("Parsed %d assemblies from summary", len(assemblies)) + return assemblies + + +def get_latest_assembly_paths(assemblies: dict[str, AssemblyRecord], ftp_host: str = FTP_HOST) -> list[tuple[str, str]]: + """Extract FTP directory paths for all assemblies with ``latest`` status. + + :param assemblies: parsed assembly records + :param ftp_host: FTP hostname for URL stripping + :return: list of ``(accession, ftp_dir_path)`` tuples + """ + paths: list[tuple[str, str]] = [] + for accession, rec in assemblies.items(): + if rec.status != "latest": + continue + ftp_path = _ftp_dir_from_url(rec.ftp_path, ftp_host) + paths.append((accession, ftp_path.rstrip("/") + "/")) + return paths + + +# ── Prefix filtering ──────────────────────────────────────────────────── + + +def accession_prefix(accession: str) -> str | None: + """Extract the 3-digit prefix from an accession (e.g. ``GCF_000005845.2`` → ``"000"``).""" + m = re.match(r"GC[AF]_(\d{3})\d{6}\.\d+", accession) + return m.group(1) if m else None + + +def filter_by_prefix_range( + assemblies: dict[str, AssemblyRecord], + prefix_from: str | None = None, + prefix_to: str | None = None, +) -> dict[str, AssemblyRecord]: + """Filter assemblies to those whose 3-digit accession prefix is in range. + + Both bounds are inclusive. If neither is set, returns all assemblies. + + :param assemblies: dict of parsed assembly records + :param prefix_from: lower bound (inclusive), e.g. ``"000"`` + :param prefix_to: upper bound (inclusive), e.g. ``"003"`` + :return: filtered dict + """ + if prefix_from is None and prefix_to is None: + return assemblies + filtered: dict[str, AssemblyRecord] = {} + for acc, rec in assemblies.items(): + pfx = accession_prefix(acc) + if pfx is None: + continue + if prefix_from is not None and pfx < prefix_from: + continue + if prefix_to is not None and pfx > prefix_to: + continue + filtered[acc] = rec + return filtered + + +# ── Diff computation ──────────────────────────────────────────────────── + + +def compute_diff( # noqa: PLR0912 + current: dict[str, AssemblyRecord], + previous_assemblies: dict[str, AssemblyRecord] | None = None, + previous_accessions: set[str] | None = None, +) -> DiffResult: + """Compute the diff between current and previous assembly state. 
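+
+    Minimal sketch (a single synthetic record; all field values are made up)::
+
+        rec = AssemblyRecord(
+            accession="GCF_000000001.1", status="latest", seq_rel_date="2024/01/01",
+            ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/000/001/dir",
+            assembly_dir="dir",
+        )
+        diff = compute_diff({"GCF_000000001.1": rec}, previous_accessions=set())
+        # diff.new == ["GCF_000000001.1"]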
+
+    :param current: the new NCBI summary (parsed)
+    :param previous_assemblies: full parsed previous summary, or None if using fallback
+    :param previous_accessions: set of known accessions (store-scan fallback)
+    :return: diff result with new/updated/replaced/suppressed lists
+    """
+    diff = DiffResult()
+
+    if previous_assemblies is not None:
+        known = set(previous_assemblies.keys())
+    elif previous_accessions is not None:
+        known = previous_accessions
+    else:
+        known = set()
+
+    for acc, rec in current.items():
+        if rec.status == "replaced":
+            if acc in known:
+                diff.replaced.append(acc)
+            continue
+        if rec.status == "suppressed":
+            if acc in known:
+                diff.suppressed.append(acc)
+            continue
+        if rec.status != "latest":
+            continue
+
+        if acc not in known:
+            diff.new.append(acc)
+        elif previous_assemblies is not None:
+            prev = previous_assemblies.get(acc)
+            if prev and (rec.seq_rel_date > prev.seq_rel_date or rec.assembly_dir != prev.assembly_dir):
+                diff.updated.append(acc)
+
+    # Accessions in previous but entirely absent from current (withdrawn)
+    current_accs = set(current.keys())
+    for acc in known:
+        if acc not in current_accs and acc not in diff.suppressed:
+            diff.suppressed.append(acc)
+
+    diff.new.sort()
+    diff.updated.sort()
+    diff.replaced.sort()
+    diff.suppressed.sort()
+    return diff
+
+
+# ── FTP URL helpers ──────────────────────────────────────────────────────
+
+
+def _ftp_dir_from_url(ftp_url: str, ftp_host: str = FTP_HOST) -> str:
+    """Convert an FTP URL from the assembly summary to an FTP directory path."""
+    if ftp_url.startswith("https://"):
+        return ftp_url.replace(f"https://{ftp_host}", "")
+    if ftp_url.startswith("ftp://"):
+        return ftp_url.replace(f"ftp://{ftp_host}", "")
+    return ftp_url
+
+
+# ── Synthetic summary from S3 store scan ────────────────────────────────
+
+
+def _extract_accession_from_s3_key(key: str) -> str | None:
+    """Extract the assembly accession from an S3 object key.
+
+    Looks for the pattern GCF_#########.# or GCA_#########.# (nine digits,
+    then a version) in the key path.
+
+    :param key: S3 object key
+    :return: accession (e.g. "GCF_000001215.4") or None if not found
+    """
+    m = re.search(r"(GC[AF]_\d{3}\d{6}\.\d+)", key)
+    return m.group(1) if m else None
+
+
+def _extract_assembly_dir_from_s3_key(key: str) -> str | None:
+    """Extract the assembly directory name from an S3 object key.
+
+    The assembly directory is the path component that starts with the accession
+    and carries the assembly name suffix (e.g. "GCF_000001215.4_Release_6_plus_ISO1_MT").
+
+    :param key: S3 object key
+    :return: assembly directory name or None if not found
+    """
+    # Match the accession and capture it plus everything up to the next /
+    m = re.search(r"(GC[AF]_\d{3}\d{6}\.\d+[^/]*)/", key)
+    return m.group(1) if m else None
+
+
+_DATABASE_ACC_PREFIX: dict[str, str] = {
+    "refseq": "GCF_",
+    "genbank": "GCA_",
+}
+
+
+def scan_store_to_synthetic_summary(
+    bucket: str,
+    key_prefix: str,
+    release_date: str,
+    database: str = "refseq",
+    progress_callback: Callable[[int, str], None] | None = None,
+) -> dict[str, AssemblyRecord]:
+    """Scan an S3 store and build a synthetic assembly summary from existing objects.
+
+    This function is useful when bootstrapping a diff against an existing,
+    pre-populated S3 store that lacks a baseline assembly summary.
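+
+    Sketch (bucket, prefix, and date are illustrative)::
+
+        baseline = scan_store_to_synthetic_summary(
+            bucket="cdm-lake",
+            key_prefix="tenant-general-warehouse/kbase/datasets/ncbi/",
+            release_date="2024/01/01",
+        )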
+ + For each assembly found in the store: + - Extracts the accession and assembly directory name from S3 paths + - Applies the provided ``release_date`` as synthetic ``seq_rel_date`` for + all assemblies + - Creates an ``AssemblyRecord`` with ``status="latest"`` + - Filters to accessions matching the expected prefix for ``database`` + (``GCF_`` for ``"refseq"``, ``GCA_`` for ``"genbank"``) + + The function paginates through S3 to handle large stores efficiently. + + :param bucket: S3 bucket name + :param key_prefix: S3 key prefix (all objects under this prefix are scanned) + :param release_date: release date string in ``YYYY/MM/DD`` format used for + all synthetic records + :param database: ``"refseq"`` or ``"genbank"`` — controls which accession + prefix is included (``GCF_`` or ``GCA_`` respectively) + :param progress_callback: optional callable invoked after each accession is + processed with ``(count, accession)`` where count is the running total + of unique accessions found + :return: dict mapping accession to ``AssemblyRecord`` + """ + try: + datetime.strptime(release_date, "%Y/%m/%d") + except ValueError as exc: + msg = f"Invalid release_date '{release_date}'. Expected format YYYY/MM/DD." + raise ValueError(msg) from exc + + acc_prefix = _DATABASE_ACC_PREFIX.get(database) + if acc_prefix is None: + msg = f"Unknown database: {database!r}. Expected 'refseq' or 'genbank'." + raise ValueError(msg) + + s3 = get_s3_client() + assemblies: dict[str, AssemblyRecord] = {} + processed_count = 0 + + try: + paginator = s3.get_paginator("list_objects_v2") + pages = paginator.paginate(Bucket=bucket, Prefix=key_prefix) + + for page in pages: + for obj in page.get("Contents", []): + acc = _extract_accession_from_s3_key(obj["Key"]) + if not acc or not acc.startswith(acc_prefix): + continue + assembly_dir = _extract_assembly_dir_from_s3_key(obj["Key"]) + + if not acc or not assembly_dir: + continue + + if acc not in assemblies: + # First object for this accession; store it. + # Construct a fake FTP path that ends with assembly_dir so + # that round-tripping through parse_assembly_summary (which + # derives assembly_dir via ftp_path.rstrip("/").split("/")[-1]) + # yields the correct assembly_dir and therefore correct diffs. + fake_ftp_path = f"https://ftp.ncbi.nlm.nih.gov/synthetic/{assembly_dir}" + assemblies[acc] = AssemblyRecord( + accession=acc, + status="latest", + seq_rel_date=release_date, + ftp_path=fake_ftp_path, + assembly_dir=assembly_dir, + ) + processed_count += 1 + if progress_callback is not None: + progress_callback(processed_count, acc) + + except Exception as e: # noqa: BLE001 + logger.error("Error scanning store: %s", e) + raise + + logger.info("Scanned S3 store: found %d unique assemblies", len(assemblies)) + return assemblies + + +# ── Checksum verification against S3 store ─────────────────────────────── + + +def verify_transfer_candidates( # noqa: PLR0912, PLR0915 + accessions: list[str], + current_assemblies: dict[str, AssemblyRecord], + bucket: str, + key_prefix: str, + ftp_host: str = FTP_HOST, + progress_callback: Callable[[int, int, str], None] | None = None, +) -> list[str]: + """Verify which transfer candidates actually need downloading. + + For each accession, downloads ``md5checksums.txt`` from NCBI FTP and + compares the checksums of filtered files against the ``md5`` user metadata + on corresponding S3 objects. Only accessions where at least one file + differs or is missing from S3 are returned. 
+ + This acts as a final gate before Phase 2: even if the summary diff flags an + assembly, we skip it if every file in the store already matches. + + :param accessions: list of candidate accessions (new + updated from diff) + :param current_assemblies: parsed current assembly summary + :param bucket: S3 bucket name + :param key_prefix: S3 key prefix for the Lakehouse dataset root + :param ftp_host: NCBI FTP hostname + :param progress_callback: optional callable invoked after each accession is + processed with ``(done, total, accession)`` so callers can display a + progress bar. ``done`` is the 1-based count of completed accessions. + :return: filtered list of accessions that actually need downloading + """ + if not accessions: + return [] + + s3 = get_s3_client() + ftp: Any = None # lazily connected only when needed + confirmed: list[str] = [] + pruned = 0 + skipped_missing = 0 + last_activity = time.monotonic() + + try: + for done, acc in enumerate(accessions, start=1): + rec = current_assemblies.get(acc) + if not rec: + confirmed.append(acc) + if progress_callback is not None: + progress_callback(done, len(accessions), acc) + continue + + # Build S3 prefix for this assembly + s3_rel = build_accession_path(rec.assembly_dir) + s3_prefix = f"{key_prefix}{s3_rel}" + + # Quick check: does *anything* exist under this prefix? + resp = s3.list_objects_v2(Bucket=bucket, Prefix=s3_prefix, MaxKeys=1) + if resp.get("KeyCount", 0) == 0: + # Nothing in the store — definitely needs downloading + confirmed.append(acc) + skipped_missing += 1 + if progress_callback is not None: + progress_callback(done, len(accessions), acc) + continue + + # Objects exist — need FTP md5 checksums to decide + if ftp is None: + ftp = connect_ftp(ftp_host) + + last_activity = ftp_noop_keepalive(ftp, last_activity) + + ftp_dir = _ftp_dir_from_url(rec.ftp_path, ftp_host) + try: + md5_text = ftp_retrieve_text(ftp, ftp_dir.rstrip("/") + "/md5checksums.txt") + last_activity = time.monotonic() + ftp_checksums = parse_md5_checksums_file(md5_text) + except Exception: # noqa: BLE001 + logger.warning("Cannot fetch md5checksums.txt for %s, keeping in transfer list", acc) + confirmed.append(acc) + if progress_callback is not None: + progress_callback(done, len(accessions), acc) + continue + + # Filter to files we'd actually download + target_checksums = { + fname: md5 + for fname, md5 in ftp_checksums.items() + if any(fname.endswith(suffix) for suffix in FILE_FILTERS) + } + + if not target_checksums: + confirmed.append(acc) + if progress_callback is not None: + progress_callback(done, len(accessions), acc) + continue + + # Short-circuit: if any file differs or is missing, keep the assembly + needs_update = False + for fname, expected_md5 in target_checksums.items(): + s3_path = f"{bucket}/{s3_prefix}{fname}" + obj_info = head_object(s3_path) + + if obj_info is None: + needs_update = True + break + + s3_md5 = obj_info["metadata"].get("md5", "") + if s3_md5 != expected_md5: + logger.debug("MD5 mismatch for %s/%s: S3=%s FTP=%s", acc, fname, s3_md5, expected_md5) + needs_update = True + break + + if needs_update: + confirmed.append(acc) + else: + pruned += 1 + logger.debug("Pruned %s — all files match S3 checksums", acc) + + if progress_callback is not None: + progress_callback(done, len(accessions), acc) + finally: + if ftp is not None: + with contextlib.suppress(Exception): + ftp.quit() + + logger.info( + "Checksum verification: %d confirmed (%d missing from store), %d pruned (of %d candidates)", + len(confirmed), + skipped_missing, + 
pruned, + len(accessions), + ) + return confirmed + + +# ── Manifest writing ──────────────────────────────────────────────────── + + +def write_transfer_manifest( + diff: DiffResult, + current_assemblies: dict[str, AssemblyRecord], + output_path: str | Path, + ftp_host: str = FTP_HOST, +) -> list[str]: + """Write the transfer manifest (new + updated assemblies). + + Each line is an FTP directory path suitable for Phase 2 download. + + :param diff: computed diff result + :param current_assemblies: parsed current assembly summary + :param output_path: path to write the manifest file + :param ftp_host: FTP hostname for URL stripping + :return: list of FTP paths written + """ + to_transfer = diff.new + diff.updated + paths: list[str] = [] + for acc in sorted(to_transfer): + rec = current_assemblies.get(acc) + if not rec: + continue + ftp_path = _ftp_dir_from_url(rec.ftp_path, ftp_host) + paths.append(ftp_path.rstrip("/") + "/") + + with Path(output_path).open("w") as f: + f.writelines(p + "\n" for p in paths) + + logger.info("Wrote %d entries to transfer manifest: %s", len(paths), output_path) + return paths + + +def write_removed_manifest(diff: DiffResult, output_path: str | Path) -> list[str]: + """Write the removed manifest (replaced + suppressed accessions). + + :param diff: computed diff result + :param output_path: path to write the manifest file + :return: list of accessions written + """ + removed = sorted(diff.replaced + diff.suppressed) + with Path(output_path).open("w") as f: + f.writelines(acc + "\n" for acc in removed) + logger.info("Wrote %d entries to removed manifest: %s", len(removed), output_path) + return removed + + +def write_updated_manifest(diff: DiffResult, output_path: str | Path) -> list[str]: + """Write the updated manifest (accessions whose content changed). + + This file is consumed by Phase 3 to archive existing S3 objects + before they are overwritten by the new versions. + + :param diff: computed diff result + :param output_path: path to write the manifest file + :return: list of accessions written + """ + updated = sorted(diff.updated) + with Path(output_path).open("w") as f: + f.writelines(acc + "\n" for acc in updated) + logger.info("Wrote %d entries to updated manifest: %s", len(updated), output_path) + return updated + + +def write_diff_summary( + diff: DiffResult, + output_path: str | Path, + database: str, + prefix_from: str | None = None, + prefix_to: str | None = None, +) -> dict[str, Any]: + """Write a JSON diff summary file. 
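+
+    The written JSON has the shape (abridged)::
+
+        {
+          "database": "refseq",
+          "timestamp": "...",
+          "prefix_range": {"from": "000", "to": "003"},
+          "counts": {"new": 2, "updated": 1, "replaced": 0, "suppressed": 0, ...},
+          "accessions": {"new": [...], "updated": [...], ...}
+        }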
+ + :param diff: computed diff result + :param output_path: path to write the JSON file + :param database: database name (``"refseq"`` or ``"genbank"``) + :param prefix_from: lower bound of prefix filter (if any) + :param prefix_to: upper bound of prefix filter (if any) + :return: the summary dict that was written + """ + summary: dict[str, Any] = { + "database": database, + "timestamp": datetime.now(UTC).isoformat(), + "prefix_range": { + "from": prefix_from, + "to": prefix_to, + }, + "counts": { + "new": len(diff.new), + "updated": len(diff.updated), + "replaced": len(diff.replaced), + "suppressed": len(diff.suppressed), + "total_to_transfer": len(diff.new) + len(diff.updated), + "total_to_remove": len(diff.replaced) + len(diff.suppressed), + }, + "accessions": { + "new": diff.new, + "updated": diff.updated, + "replaced": diff.replaced, + "suppressed": diff.suppressed, + }, + } + with Path(output_path).open("w") as f: + json.dump(summary, f, indent=2) + logger.info("Wrote diff summary to: %s", output_path) + return summary diff --git a/src/cdm_data_loaders/ncbi_ftp/metadata.py b/src/cdm_data_loaders/ncbi_ftp/metadata.py new file mode 100644 index 00000000..b5b6175e --- /dev/null +++ b/src/cdm_data_loaders/ncbi_ftp/metadata.py @@ -0,0 +1,254 @@ +"""Frictionless data package descriptor creation for NCBI FTP assemblies. + +Creates KBase credit metadata descriptors for each promoted assembly, +matching the schema produced by ``kbase-transfers/scripts/ncbi/download_genomes.py``. + +Each descriptor is a frictionless ``Package``-compatible JSON document +describing the assembly's data files, stored at:: + + {key_prefix}metadata/{assembly_dir}_datapackage.json + +and archived alongside raw data at:: + + {key_prefix}archive/{release_tag}/metadata/{assembly_dir}_datapackage.json + +The descriptor ``resources`` list records the final Lakehouse S3 key, byte +size, file format, and MD5 hash of each promoted data file. +""" + +import json +import tempfile +from datetime import UTC, datetime +from pathlib import Path +from typing import Any, TypedDict + +from frictionless import Package + +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.s3 import copy_object, get_s3_client + +logger = get_cdm_logger() + +_NCBI_CONTRIBUTOR = { + "contributor_type": "Organization", + "name": "National Center for Biotechnology Information", + "contributor_id": "ROR:02meqm098", + "contributor_roles": "DataCurator", +} +_NCBI_PUBLISHER = { + "organization_name": "National Center for Biotechnology Information", + "organization_id": "ROR:02meqm098", +} +_SAVED_BY = "cdm-data-loaders-ncbi-ftp" +_SCHEMA_VERSION = "1.0" + + +class DescriptorResource(TypedDict, total=False): + """A single resource entry in the frictionless descriptor ``resources`` list.""" + + name: str + path: str + format: str + bytes: int | None + hash: str | None + + +# ── Public helpers ──────────────────────────────────────────────────────── + + +def build_descriptor_key(assembly_dir: str, key_prefix: str) -> str: + """Return the S3 key for the live descriptor of *assembly_dir*. + + :param assembly_dir: full assembly directory name, e.g. ``GCF_000001215.4_Release_6_plus_ISO1_MT`` + :param key_prefix: Lakehouse key prefix (trailing slash optional) + :return: S3 key, e.g. 
``tenant-general-warehouse/.../ncbi/metadata/GCF_..._datapackage.json`` + """ + prefix = key_prefix.rstrip("/") + "/" + return f"{prefix}metadata/{assembly_dir}_datapackage.json" + + +def build_archive_descriptor_key( + assembly_dir: str, release_tag: str, key_prefix: str, archive_reason: str = "unknown" +) -> str: + """Return the S3 key for the archived descriptor of *assembly_dir*. + + :param assembly_dir: full assembly directory name + :param release_tag: NCBI release tag used in the archive path, e.g. ``"2024-01"`` + :param key_prefix: Lakehouse key prefix + :param archive_reason: reason for archival, encoded as a path segment + :return: S3 key under ``archive/{release_tag}/{archive_reason}/metadata/`` + """ + prefix = key_prefix.rstrip("/") + "/" + return f"{prefix}archive/{release_tag}/{archive_reason}/metadata/{assembly_dir}_datapackage.json" + + +def create_descriptor( + assembly_dir: str, + accession_full: str, + resources: list[DescriptorResource], + *, + timestamp: int | None = None, +) -> dict[str, Any]: + """Build a KBase credit metadata descriptor for an NCBI assembly. + + Matches the schema produced by + ``kbase-transfers/scripts/ncbi/download_genomes.py::create_frictionless_descriptor()``. + + Resource names are lowercased. Resources whose ``hash`` value is ``None`` + have the ``hash`` key removed entirely (frictionless does not accept null + hash values). + + :param assembly_dir: full assembly directory name (includes the accession + suffix, e.g. ``GCF_000001215.4_Release_6_plus_ISO1_MT``) + :param accession_full: accession without suffix, e.g. ``GCF_000001215.4`` + :param resources: list of :class:`DescriptorResource` dicts + :param timestamp: Unix timestamp to embed; defaults to ``datetime.now(UTC)`` + :return: descriptor dict ready for serialisation and frictionless validation + """ + ts = timestamp if timestamp is not None else int(datetime.now(UTC).timestamp()) + version = accession_full.rsplit(".", 1)[-1] # e.g. "4" from "GCF_000001215.4" + + # Normalise resources: lowercase name, drop null hash + normalised: list[dict[str, Any]] = [] + for res in resources: + entry: dict[str, Any] = { + "name": res["name"].lower(), + "path": res["path"], + "format": res.get("format", ""), + } + if res.get("bytes") is not None: + entry["bytes"] = res["bytes"] + if res.get("hash") is not None: + entry["hash"] = res["hash"] + normalised.append(entry) + + return { + "identifier": f"NCBI:{accession_full}", + "resource_type": "dataset", + "version": version, + "titles": [{"title": f"NCBI Genome Assembly {assembly_dir}"}], + "descriptions": [ + {"description_text": (f"Genome assembly files for {accession_full} downloaded from NCBI Datasets")} + ], + "url": f"https://www.ncbi.nlm.nih.gov/datasets/genome/{accession_full}/", + "contributors": [_NCBI_CONTRIBUTOR], + "publisher": _NCBI_PUBLISHER, + "license": {}, + "meta": { + "credit_metadata_schema_version": _SCHEMA_VERSION, + "credit_metadata_source": [ + { + "source_name": "NCBI Genomes FTP", + "source_url": "ftp.ncbi.nlm.nih.gov/genomes/all/", + "access_timestamp": ts, + } + ], + "saved_by": _SAVED_BY, + "timestamp": ts, + }, + "resources": normalised, + } + + +def validate_descriptor(descriptor: dict[str, Any], accession_full: str) -> None: + """Validate a descriptor with frictionless. 
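+
+    Sketch (one illustrative resource; raises on schema errors)::
+
+        desc = create_descriptor(
+            "GCF_000001215.4_Release_6_plus_ISO1_MT",
+            "GCF_000001215.4",
+            resources=[{"name": "x.fna.gz", "path": "raw_data/x.fna.gz", "format": "gz"}],
+        )
+        validate_descriptor(desc, "GCF_000001215.4")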
+ + :param descriptor: descriptor dict from :func:`create_descriptor` + :param accession_full: accession (used only in error messages) + :raises ValueError: if frictionless reports any metadata errors + """ + errors = list(Package.metadata_validate(descriptor)) + if errors: + error_details = "; ".join(str(e) for e in errors) + msg = f"Frictionless validation failed for {accession_full}: {error_details}" + raise ValueError(msg) + logger.debug("Frictionless descriptor valid for %s", accession_full) + + +def upload_descriptor( + descriptor: dict[str, Any], + assembly_dir: str, + bucket: str, + key_prefix: str, + *, + dry_run: bool = False, +) -> str: + """Serialise and upload a descriptor to the live ``metadata/`` path. + + :param descriptor: descriptor dict from :func:`create_descriptor` + :param assembly_dir: full assembly directory name + :param bucket: S3 bucket name + :param key_prefix: Lakehouse key prefix + :param dry_run: if True, log without uploading + :return: S3 key the descriptor was (or would be) written to + """ + key = build_descriptor_key(assembly_dir, key_prefix) + + if dry_run: + logger.debug("[dry-run] would upload descriptor: s3://%s/%s", bucket, key) + return key + + s3 = get_s3_client() + body = json.dumps(descriptor, indent=2).encode() + + with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp: + tmp_path = tmp.name + tmp.write(body) + + try: + s3.upload_file(Filename=tmp_path, Bucket=bucket, Key=key) + logger.debug("Uploaded descriptor: s3://%s/%s", bucket, key) + finally: + Path(tmp_path).unlink() + + return key + + +def archive_descriptor( # noqa: PLR0913 + assembly_dir: str, + bucket: str, + key_prefix: str, + release_tag: str, + *, + archive_reason: str = "unknown", + dry_run: bool = False, +) -> bool: + """Copy the live descriptor to the archive path. + + If the live descriptor does not yet exist (e.g. archival is triggered + before the first promote), logs a warning and returns ``False``. 
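+
+    Sketch (all argument values are placeholders)::
+
+        archived = archive_descriptor(
+            "GCF_000001215.4_Release_6_plus_ISO1_MT",
+            "cdm-lake",
+            "tenant-general-warehouse/kbase/datasets/ncbi/",
+            "2024-01",
+            archive_reason="updated",
+            dry_run=True,
+        )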
+ + :param assembly_dir: full assembly directory name + :param bucket: S3 bucket name + :param key_prefix: Lakehouse key prefix + :param release_tag: NCBI release tag for the archive path + :param archive_reason: metadata value describing why archived (matches raw data metadata) + :param dry_run: if True, log without copying + :return: ``True`` if the descriptor was (or would be) archived; ``False`` if not found + """ + source_key = build_descriptor_key(assembly_dir, key_prefix) + archive_key = build_archive_descriptor_key(assembly_dir, release_tag, key_prefix, archive_reason) + + if dry_run: + logger.debug("[dry-run] would archive descriptor: s3://%s/%s -> %s", bucket, source_key, archive_key) + return True + + s3 = get_s3_client() + try: + s3.head_object(Bucket=bucket, Key=source_key) + except s3.exceptions.NoSuchKey: + logger.warning("Descriptor not found, skipping archive: s3://%s/%s", bucket, source_key) + return False + except Exception as e: + # head_object raises ClientError with 404 when key is absent + if hasattr(e, "response") and e.response.get("Error", {}).get("Code") in ("404", "NoSuchKey"): # type: ignore[union-attr] + logger.warning("Descriptor not found, skipping archive: s3://%s/%s", bucket, source_key) + return False + raise + + _ = copy_object( + f"{bucket}/{source_key}", + f"{bucket}/{archive_key}", + ) + logger.debug("Archived descriptor: s3://%s/%s -> %s", bucket, source_key, archive_key) + return True diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py new file mode 100644 index 00000000..4fbe6707 --- /dev/null +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -0,0 +1,468 @@ +"""Phase 3: Promote staged files to final Lakehouse paths in S3. + +Walks staged files in an S3 staging prefix (written by CTS after Phase 2), +uploads each to the final Lakehouse path with MD5 metadata from sidecar files, +archives replaced/suppressed and updated assemblies, and trims the transfer +manifest so that a re-run of Phase 2 only downloads remaining entries. +""" + +import re +import tempfile +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import UTC, datetime +from pathlib import Path, PurePosixPath +from typing import Any + +import botocore.exceptions +import tqdm + +from cdm_data_loaders.ncbi_ftp.metadata import ( + DescriptorResource, + archive_descriptor, + build_descriptor_key, + create_descriptor, + upload_descriptor, +) +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.s3 import ( + copy_object, + delete_objects, + get_s3_client, + object_exists, + upload_file, +) + +logger = get_cdm_logger() + +DEFAULT_LAKEHOUSE_KEY_PREFIX = "tenant-general-warehouse/kbase/datasets/ncbi/" + + +# ── Promote from S3 staging prefix ────────────────────────────────────── + + +def promote_from_s3( # noqa: PLR0913 + staging_key_prefix: str, + staging_bucket: str, + lakehouse_bucket: str, + removed_manifest_path: str | Path | None = None, + updated_manifest_path: str | Path | None = None, + ncbi_release: str | None = None, + manifest_s3_key: str | None = None, + lakehouse_key_prefix: str = DEFAULT_LAKEHOUSE_KEY_PREFIX, + *, + dry_run: bool = False, +) -> dict[str, Any]: + """Promote files from an S3 staging prefix to the final Lakehouse path. + + Downloads each file to a temp location and re-uploads to the final path + with MD5 metadata from ``.md5`` sidecar files. 
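+
+    Sketch (dry-run, using the bucket names from the walkthrough notebook)::
+
+        report = promote_from_s3(
+            staging_key_prefix="io/matt-cohere/staging/run1/output/",
+            staging_bucket="cts",
+            lakehouse_bucket="cdm-lake",
+            dry_run=True,
+        )
+        print(report["promoted"], report["archived"], report["failed"])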
+ + :param staging_key_prefix: S3 key prefix where CTS output was written + :param staging_bucket: S3 bucket containing the staged files (e.g. ``"cts"``) + :param lakehouse_bucket: S3 bucket for the final Lakehouse destination (e.g. ``"cdm-lake"``) + :param removed_manifest_path: local path to the removed_manifest file + :param updated_manifest_path: local path to the updated_manifest file + :param ncbi_release: NCBI release version tag for archiving + :param manifest_s3_key: S3 object key for transfer_manifest.txt (for trimming) + :param lakehouse_key_prefix: S3 key prefix for final Lakehouse locations + :param dry_run: if True, log actions without side effects + :return: report dict with counts + """ + s3 = get_s3_client() + paginator = s3.get_paginator("list_objects_v2") + normalized_staging_key_prefix = staging_key_prefix.rstrip("/") + "/" + + # Collect all objects under the staging prefix + staged_objects: list[str] = [] + for page in paginator.paginate(Bucket=staging_bucket, Prefix=normalized_staging_key_prefix): + staged_objects.extend(obj["Key"] for obj in page.get("Contents", [])) + + # Separate data files from sidecars + sidecars = {k for k in staged_objects if k.endswith((".crc64nvme", ".md5"))} + data_files = [k for k in staged_objects if k not in sidecars] + + logger.info("Found %d data files and %d sidecars in staging", len(data_files), len(sidecars)) + + # Archive all affected assemblies BEFORE promoting or deleting + archived = 0 + for manifest_file, reason, delete in [ + (updated_manifest_path, "updated", False), + (removed_manifest_path, "replaced_or_suppressed", True), + ]: + if manifest_file and Path(str(manifest_file)).is_file(): + archived += _archive_assemblies( + str(manifest_file), + lakehouse_bucket=lakehouse_bucket, + ncbi_release=ncbi_release, + lakehouse_key_prefix=lakehouse_key_prefix, + archive_reason=reason, + delete_source=delete, + dry_run=dry_run, + ) + + promoted, failed, descriptors_written, promoted_accessions = _promote_data_files( + data_files, + sidecars, + normalized_staging_key_prefix, + lakehouse_key_prefix, + staging_bucket, + lakehouse_bucket, + dry_run=dry_run, + ) + + # Trim manifest for resumability + if manifest_s3_key and promoted_accessions and not dry_run: + _trim_manifest(manifest_s3_key, staging_bucket, promoted_accessions) + + if descriptors_written: + logger.info("Wrote %d frictionless descriptor(s)", descriptors_written) + + report: dict[str, Any] = { + "timestamp": datetime.now(UTC).isoformat(), + "promoted": promoted, + "archived": archived, + "failed": failed, + "dry_run": dry_run, + } + + logger.info( + "PROMOTE SUMMARY: %d promoted, %d archived, %d failed%s", + promoted, + archived, + failed, + " (dry-run)" if dry_run else "", + ) + return report + + +# ── Promote data files (per-file loop) ────────────────────────────────── + + +def _promote_data_files( # noqa: PLR0913, PLR0915 + data_files: list[str], + sidecars: set[str], + normalized_staging_prefix: str, + lakehouse_key_prefix: str, + staging_bucket: str, + lakehouse_bucket: str, + *, + dry_run: bool, +) -> tuple[int, int, int, set[str]]: + """Promote each data file from staging to the final Lakehouse path. + + Files are grouped by assembly. When all files for an assembly are promoted + successfully, the frictionless descriptor is written immediately and the staged + files (including sidecars) are deleted from staging. This prevents staging + accumulation across runs and ensures partial runs leave descriptors for all + completed assemblies. 
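+
+    For example, a staged key such as
+    ``<staging_prefix>raw_data/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_genomic.fna.gz``
+    is grouped under the pair
+    ``("GCF_000001215.4_Release_6_plus_ISO1_MT", "GCF_000001215.4")``.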
+ + :return: (promoted_count, failed_count, descriptors_written, promoted_accessions) + """ + s3 = get_s3_client() + promoted = 0 + failed = 0 + descriptors_written = 0 + promoted_accessions: set[str] = set() + + # Group files by assembly; skip download_report.json and non-raw_data paths + assembly_files: defaultdict[tuple[str, str], list[str]] = defaultdict(list) + for staged_key in data_files: + if staged_key.endswith("download_report.json"): + continue + rel_path = staged_key[len(normalized_staging_prefix) :] + if not rel_path.startswith("raw_data/"): + continue + acc_match = re.search(r"(GC[AF]_\d{9}\.\d+)", staged_key) + adir_match = re.search(r"raw_data/GC[AF]/\d+/\d+/\d+/([^/]+)/", staged_key) + if acc_match and adir_match: + assembly_files[(adir_match.group(1), acc_match.group(1))].append(staged_key) + + def _promote_one(staged_key: str) -> tuple[DescriptorResource, str]: + """Download one staged file, re-upload to Lakehouse with MD5 metadata. + + :return: ``(resource_dict, staged_key)`` on success; raises on failure. + """ + rel_path = staged_key[len(normalized_staging_prefix) :] + final_key = lakehouse_key_prefix + rel_path + final_key_path = PurePosixPath(final_key) + + with tempfile.NamedTemporaryFile(delete=False) as tmp: + tmp_path = tmp.name + try: + s3.download_file(Bucket=staging_bucket, Key=staged_key, Filename=tmp_path) + + metadata: dict[str, str] = {} + md5_key = staged_key + ".md5" + if md5_key in sidecars: + md5_obj = s3.get_object(Bucket=staging_bucket, Key=md5_key) + metadata["md5"] = md5_obj["Body"].read().decode().strip() + + upload_succeeded = upload_file( + tmp_path, + f"{lakehouse_bucket}/{final_key_path.parent}", + tags=metadata, + object_name=final_key_path.name, + show_progress=False, + ) + if not upload_succeeded: + msg = f"upload_file returned False for {staged_key}" + raise RuntimeError(msg) + + fname = final_key_path.name + ext = fname.rsplit(".", 1)[-1] if "." 
in fname else "" + resource: DescriptorResource = { + "name": fname.lower(), + "path": final_key, + "format": ext, + "bytes": Path(tmp_path).stat().st_size, + "hash": metadata.get("md5"), + } + return resource, staged_key + finally: + Path(tmp_path).unlink() + + total_files = sum(len(v) for v in assembly_files.values()) + _dry_run_log_count = 0 + with tqdm.tqdm(total=total_files, unit="file", desc="Promoting") as pbar: + for (adir, acc), files in assembly_files.items(): + assembly_failed = 0 + resources: list[DescriptorResource] = [] + promoted_keys: list[str] = [] + + if dry_run: + for staged_key in files: + rel_path = staged_key[len(normalized_staging_prefix) :] + final_key = lakehouse_key_prefix + rel_path + if _dry_run_log_count < 10: + logger.info("[dry-run] would promote: %s -> %s", staged_key, final_key) + else: + logger.debug("[dry-run] would promote: %s -> %s", staged_key, final_key) + _dry_run_log_count += 1 + promoted += 1 + pbar.update(1) + continue + + # Download and re-upload all files for this assembly concurrently + n_workers = min(32, len(files)) + with ThreadPoolExecutor(max_workers=n_workers) as executor: + futures = {executor.submit(_promote_one, key): key for key in files} + for future in as_completed(futures): + staged_key = futures[future] + try: + resource, _ = future.result() + resources.append(resource) + promoted_keys.append(staged_key) + promoted += 1 + promoted_accessions.add(acc) + except Exception: + logger.exception("Failed to promote %s", staged_key) + assembly_failed += 1 + pbar.update(1) + + failed += assembly_failed + + # Write descriptor and delete staged files immediately after a fully successful assembly + if assembly_failed == 0 and promoted_keys: + try: + descriptor_key = build_descriptor_key(adir, lakehouse_key_prefix) + if object_exists(f"{lakehouse_bucket}/{descriptor_key}"): + logger.debug("Descriptor already exists, skipping: %s", descriptor_key) + else: + descriptor = create_descriptor(adir, acc, resources) + descriptor_key = upload_descriptor( + descriptor, adir, lakehouse_bucket, lakehouse_key_prefix, dry_run=False + ) + logger.debug("Uploaded descriptor: %s", descriptor_key) + descriptors_written += 1 + except Exception: + logger.exception("Failed to write descriptor for %s", adir) + + # Batch-delete all staged data files and their sidecars in one API call + keys_to_delete = list(promoted_keys) + for key in promoted_keys: + for sidecar_ext in (".md5", ".crc64nvme"): + if key + sidecar_ext in sidecars: + keys_to_delete.append(key + sidecar_ext) + del_errors = delete_objects(staging_bucket, keys_to_delete) + for err in del_errors: + logger.warning("Failed to delete staged file %s: %s", err.get("Key"), err.get("Message")) + + return promoted, failed, descriptors_written, promoted_accessions + + +# ── Archive assemblies ────────────────────────────────────────────────── + + +def _archive_assemblies( # noqa: PLR0913 + manifest_local_path: str, + lakehouse_bucket: str, + ncbi_release: str | None = None, + lakehouse_key_prefix: str = DEFAULT_LAKEHOUSE_KEY_PREFIX, + archive_reason: str = "unknown", + *, + delete_source: bool = False, + dry_run: bool = False, +) -> int: + """Archive assembly objects to ``archive/{release_tag}/``. + + Copies S3 objects matching each accession to the archive prefix. + When *delete_source* is True (replaced/suppressed), the original + objects are deleted after copying. When False (updated), the + originals remain in place to be overwritten by the promote step. 
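+
+    For example, with ``ncbi_release="2024-01"`` and
+    ``archive_reason="replaced_or_suppressed"``, an object at
+    ``{lakehouse_key_prefix}raw_data/GCF/000/001/215/.../file.gz`` is copied to
+    ``{lakehouse_key_prefix}archive/2024-01/replaced_or_suppressed/raw_data/GCF/000/001/215/.../file.gz``.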
+ + :param manifest_local_path: local path to a manifest file (one accession per line) + :param lakehouse_bucket: S3 bucket for the Lakehouse (source and archive destination) + :param ncbi_release: release tag used in the archive path + :param lakehouse_key_prefix: S3 key prefix for the Lakehouse dataset root + :param archive_reason: metadata value describing why the object was archived + :param delete_source: if True, delete the source object after copying + :param dry_run: if True, log without making changes + :return: number of objects archived + """ + s3 = get_s3_client() + release_tag = ncbi_release or "unknown" + archived = 0 + + with Path(manifest_local_path).open() as f: + accessions = [line.strip() for line in f if line.strip()] + + _dry_run_log_count = 0 + for accession in tqdm.tqdm(accessions, unit="accession", desc="Archiving"): + m = re.match(r"(GC[AF])_(\d{3})(\d{3})(\d{3})\.\d+", accession) + if not m: + logger.warning("Cannot parse accession for archival: %s", accession) + continue + + db = m.group(1) + p1, p2, p3 = m.group(2), m.group(3), m.group(4) + source_prefix = f"{lakehouse_key_prefix}raw_data/{db}/{p1}/{p2}/{p3}/" + + paginator = s3.get_paginator("list_objects_v2") + matching_keys: list[str] = [] + for page in paginator.paginate(Bucket=lakehouse_bucket, Prefix=source_prefix): + matching_keys.extend(obj["Key"] for obj in page.get("Contents", []) if accession in obj["Key"]) + + if not matching_keys: + logger.debug("No objects found for %s, skipping archive", accession) + continue + + # Infer assembly_dir from key paths for descriptor archival + assembly_dir: str | None = None + for key in matching_keys: + adir_match = re.search(r"raw_data/GC[AF]/\d+/\d+/\d+/([^/]+)/", key) + if adir_match: + assembly_dir = adir_match.group(1) + break + + key_pairs = [ + ( + source_key, + f"{lakehouse_key_prefix}archive/{release_tag}/{archive_reason}/{source_key[len(lakehouse_key_prefix) :]}", + ) + for source_key in matching_keys + ] + + if dry_run: + for source_key, archive_key in key_pairs: + if _dry_run_log_count < 10: + logger.info("[dry-run] would archive: %s -> %s", source_key, archive_key) + else: + logger.debug("[dry-run] would archive: %s -> %s", source_key, archive_key) + _dry_run_log_count += 1 + archived += len(key_pairs) + continue + + # Copy all files for this accession concurrently + keys_to_delete: list[str] = [] + n_workers = min(32, len(key_pairs)) + with ThreadPoolExecutor(max_workers=n_workers) as executor: + futures = { + executor.submit( + copy_object, + f"{lakehouse_bucket}/{src}", + f"{lakehouse_bucket}/{arch}", + ): src + for src, arch in key_pairs + } + for future in as_completed(futures): + src = futures[future] + try: + future.result() + archived += 1 + if delete_source: + keys_to_delete.append(src) + logger.debug(" Archived: %s", src) + except Exception: + logger.exception("Failed to archive %s", src) + + # Batch-delete source keys in a single API call + if keys_to_delete: + del_errors = delete_objects(lakehouse_bucket, keys_to_delete) + for err in del_errors: + logger.warning("Failed to delete %s: %s", err.get("Key"), err.get("Message")) + + # Archive the frictionless descriptor alongside raw data + if assembly_dir: + try: + archived_desc = archive_descriptor( + assembly_dir, + lakehouse_bucket, + lakehouse_key_prefix, + release_tag, + archive_reason=archive_reason, + dry_run=dry_run, + ) + if not archived_desc: + logger.debug("No descriptor found to archive for %s", assembly_dir) + except Exception: + logger.exception("Failed to archive descriptor for 
%s", assembly_dir) + + logger.info("Archived %d objects for %d accessions (%s)", archived, len(accessions), archive_reason) + return archived + + +# ── Manifest trimming ─────────────────────────────────────────────────── + + +def _trim_manifest(manifest_s3_key: str, staging_bucket: str, promoted_accessions: set[str]) -> None: + """Remove promoted accessions from the transfer manifest in S3. + + :param manifest_s3_key: S3 object key of the transfer_manifest.txt + :param staging_bucket: S3 bucket containing the transfer manifest + :param promoted_accessions: set of accessions that were successfully promoted + """ + s3 = get_s3_client() + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as tmp: + tmp_path = tmp.name + + try: + try: + s3.download_file(Bucket=staging_bucket, Key=manifest_s3_key, Filename=tmp_path) + except s3.exceptions.NoSuchKey: + logger.warning("Manifest not found in S3 (s3://%s/%s) — skipping trim", staging_bucket, manifest_s3_key) + return + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + logger.warning("Manifest not found in S3 (s3://%s/%s) — skipping trim", staging_bucket, manifest_s3_key) + return + raise + + with Path(tmp_path).open() as f: + lines = f.readlines() + + remaining = [line for line in lines if line.strip() and not any(acc in line for acc in promoted_accessions)] + + with Path(tmp_path).open("w") as f: + f.writelines(remaining) + + s3.upload_file(Filename=tmp_path, Bucket=staging_bucket, Key=manifest_s3_key) + logger.info( + "Trimmed manifest: %d -> %d entries (%d promoted)", + len(lines), + len(remaining), + len(lines) - len(remaining), + ) + finally: + Path(tmp_path).unlink() diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py new file mode 100644 index 00000000..9eff6af3 --- /dev/null +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -0,0 +1,383 @@ +"""NCBI FTP assembly download pipeline (Phase 2). + +Orchestrates parallel downloading of NCBI assemblies listed in a transfer +manifest. Settings, batching, CLI entry point, and CTS integration live here; +domain-specific download logic is in :mod:`cdm_data_loaders.ncbi_ftp.assembly`. 
+""" + +import json +import logging +import shutil +import tempfile +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import UTC, datetime +from ftplib import error_temp +from pathlib import Path +from typing import Any + +import tqdm +from pydantic import AliasChoices, Field +from tenacity import before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_exponential +from pydantic_settings import BaseSettings, SettingsConfigDict + +from cdm_data_loaders.ncbi_ftp.assembly import ( + FTP_HOST, + build_accession_path, + download_assembly_to_local, + parse_assembly_path, +) +from cdm_data_loaders.pipelines.core import run_cli +from cdm_data_loaders.pipelines.cts_defaults import DEFAULT_SETTINGS_CONFIG_DICT, INPUT_MOUNT, OUTPUT_MOUNT +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.ftp_client import ThreadLocalFTP +from cdm_data_loaders.utils.s3 import get_s3_client, upload_file + +logger = get_cdm_logger() + + +# ── Constants ──────────────────────────────────────────────────────────── + +DEFAULT_STAGING_KEY_PREFIX = "staging/" + + +class DownloadSettings(BaseSettings): + """Configuration for the NCBI FTP assembly download pipeline.""" + + model_config = SettingsConfigDict(**DEFAULT_SETTINGS_CONFIG_DICT) + + manifest: str = Field( + default=f"{INPUT_MOUNT}/transfer_manifest.txt", + description="Path to the transfer manifest file listing FTP paths to download", + validation_alias=AliasChoices("m", "manifest"), + ) + output_dir: str = Field( + default=OUTPUT_MOUNT, + description="Output directory for downloaded assembly files", + validation_alias=AliasChoices("output-dir", "output_dir"), + ) + threads: int = Field( + default=4, + ge=1, + le=32, + description="Number of parallel download threads", + validation_alias=AliasChoices("t", "threads"), + ) + ftp_host: str = Field( + default=FTP_HOST, + description="NCBI FTP hostname", + validation_alias=AliasChoices("ftp-host", "ftp_host"), + ) + limit: int | None = Field( + default=None, + ge=1, + description="Limit to first N assemblies (for testing)", + validation_alias=AliasChoices("l", "limit"), + ) + + +# ── Private helpers ───────────────────────────────────────────────────── + + +def _upload_assembly_dir( + assembly_dir: Path, + tmp_root: Path, + bucket: str, + staging_key_prefix: str, +) -> int: + """Upload all files under *assembly_dir* to S3, deleting each file immediately after upload. + + Empty directories are removed after all files are uploaded. If the + directory does not exist (e.g. the assembly had no files) the function + returns zero without raising. 
+ + :param assembly_dir: local directory for one assembly + :param tmp_root: root of the temp directory (used to compute relative S3 paths) + :param bucket: destination S3 bucket + :param staging_key_prefix: S3 key prefix within *bucket* + :return: number of files uploaded + """ + if not assembly_dir.exists(): + return 0 + count = 0 + for f in sorted(assembly_dir.rglob("*")): + if f.is_file(): + relative = f.relative_to(tmp_root) + dest_prefix = f"{bucket}/{staging_key_prefix.rstrip('/')}/{relative.parent}" + if upload_file(f, dest_prefix, show_progress=False): + count += 1 + else: + logger.warning("Failed to upload %s to %s", f, dest_prefix) + f.unlink() + shutil.rmtree(assembly_dir, ignore_errors=True) + return count + + +# ── Batch download ─────────────────────────────────────────────────────── + + +def download_batch( + manifest_path: str | Path, + output_dir: str | Path, + threads: int = 4, + ftp_host: str = FTP_HOST, + limit: int | None = None, +) -> dict[str, Any]: + """Download all assemblies listed in the manifest. + + :param manifest_path: path to the transfer manifest file + :param output_dir: base output directory + :param threads: number of parallel download threads + :param ftp_host: FTP hostname + :param limit: optional limit for testing + :return: report dict with overall stats + """ + with Path(manifest_path).open() as f: + assembly_paths = [line.strip() for line in f if line.strip() and not line.startswith("#")] + + if limit: + assembly_paths = assembly_paths[:limit] + + logger.info("Starting download of %d assemblies with %d threads", len(assembly_paths), threads) + + pool = ThreadLocalFTP(ftp_host) + lock = threading.Lock() + success_count = 0 + failed: list[dict[str, str]] = [] + all_stats: list[dict[str, Any]] = [] + + def _download_one(path: str) -> tuple[str, Exception | None]: + nonlocal success_count + + @retry( + retry=retry_if_exception_type((error_temp, BrokenPipeError, ConnectionResetError, EOFError)), + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=5, max=60), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def _attempt() -> dict[str, Any]: + return download_assembly_to_local(path, output_dir, ftp_host=ftp_host, ftp=pool.get()) + + try: + stats = _attempt() + except Exception as e: # noqa: BLE001 + return path, e + else: + with lock: + success_count += 1 + all_stats.append(stats) + return path, None + + try: + with tqdm.tqdm( + total=len(assembly_paths), unit="assembly", desc="Downloading from NCBI FTP", smoothing=0.01 + ) as pbar: + with ThreadPoolExecutor(max_workers=threads) as executor: + futures = {executor.submit(_download_one, p): p for p in assembly_paths} + for future in as_completed(futures): + path, error = future.result() + if error: + logger.error("FAILED: %s: %s", path, error) + with lock: + failed.append({"path": path, "error": str(error)}) + pbar.update(1) + finally: + pool.close_all() + + report: dict[str, Any] = { + "timestamp": datetime.now(UTC).isoformat(), + "total_attempted": len(assembly_paths), + "succeeded": success_count, + "failed": len(failed), + "failures": failed, + "assembly_stats": all_stats, + } + + report_path = Path(output_dir) / "download_report.json" + report_path.parent.mkdir(parents=True, exist_ok=True) + with report_path.open("w") as f: + json.dump(report, f, indent=2) + logger.info("Download report written to: %s", report_path) + + logger.info( + "SUMMARY: %d attempted, %d succeeded, %d failed", + len(assembly_paths), + success_count, + len(failed), + ) + + 
return report + + +# ── CTS entry point ───────────────────────────────────────────────────── + + +def run_download(config: DownloadSettings) -> None: + """Main CTS entry point for Phase 2 download. + + :param config: validated download settings + """ + report = download_batch( + manifest_path=config.manifest, + output_dir=config.output_dir, + threads=config.threads, + ftp_host=config.ftp_host, + limit=config.limit, + ) + if report["failed"] > 0: + msg = f"Download completed with {report['failed']} failures" + raise RuntimeError(msg) + + +def cli() -> None: + """CLI entry point for ``ncbi_ftp_sync``.""" + run_cli(DownloadSettings, run_download) + + +# ── Notebook / interactive entry point ────────────────────────────────── + + +def download_and_stage( + *, + bucket: str, + staging_key_prefix: str, + manifest_s3_key: str | None = None, + manifest_local_path: str | Path | None = None, + threads: int = 4, + ftp_host: str = FTP_HOST, + limit: int | None = None, + dry_run: bool = False, +) -> dict[str, Any]: + """Download assemblies from NCBI FTP and stage them to S3 (Phase 2). + + Exactly one of *manifest_s3_key* or *manifest_local_path* must be given. + + Downloads and uploads are pipelined per assembly: each worker downloads one + assembly, immediately uploads its files to S3, then deletes the local copies + before picking up the next assembly. At most *threads* assembly directories + exist on disk simultaneously, preventing disk exhaustion on large batches. + + :param bucket: destination S3 bucket name + :param staging_key_prefix: key prefix inside the bucket (e.g. ``"staging/run1/"``) + :param manifest_s3_key: S3 object key of the transfer manifest within *bucket* + :param manifest_local_path: local path to the transfer manifest file + :param threads: number of parallel download-and-upload workers + :param ftp_host: NCBI FTP hostname + :param limit: optional limit for testing + :param dry_run: when ``True``, download but skip all S3 uploads + :return: download report extended with ``staged_objects``, ``staging_key_prefix``, ``dry_run`` + """ + if manifest_s3_key is not None and manifest_local_path is not None: + msg = "Provide exactly one of manifest_s3_key or manifest_local_path, not both" + raise ValueError(msg) + if manifest_s3_key is None and manifest_local_path is None: + msg = "One of manifest_s3_key or manifest_local_path must be provided" + raise ValueError(msg) + + with tempfile.TemporaryDirectory() as _tmpdir: + tmp = Path(_tmpdir) + manifest_dest = tmp / "transfer_manifest.txt" + + if manifest_s3_key is not None: + s3 = get_s3_client() + response = s3.get_object(Bucket=bucket, Key=manifest_s3_key) + manifest_dest.write_bytes(response["Body"].read()) + logger.info("Manifest read from S3: s3://%s/%s", bucket, manifest_s3_key) + else: + manifest_dest.write_bytes(Path(manifest_local_path).read_bytes()) + logger.info("Manifest read from local path: %s", manifest_local_path) + + with manifest_dest.open() as f: + assembly_paths = [line.strip() for line in f if line.strip() and not line.startswith("#")] + + if limit: + assembly_paths = assembly_paths[:limit] + + logger.info("Starting download & stage of %d assemblies with %d threads", len(assembly_paths), threads) + + pool = ThreadLocalFTP(ftp_host) + lock = threading.Lock() + success_count = 0 + staged_objects = 0 + failed: list[dict[str, str]] = [] + all_stats: list[dict[str, Any]] = [] + + def _download_upload_one(path: str) -> tuple[str, Exception | None]: + nonlocal success_count, staged_objects + + @retry( + 
retry=retry_if_exception_type((error_temp, BrokenPipeError, ConnectionResetError, EOFError)), + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=5, max=60), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def _attempt() -> dict[str, Any]: + return download_assembly_to_local(path, tmp, ftp_host=ftp_host, ftp=pool.get()) + + try: + stats = _attempt() + except Exception as e: # noqa: BLE001 + return path, e + + if not dry_run: + _db, assembly_dir_name, _accession = parse_assembly_path(path) + assembly_local_dir = tmp / build_accession_path(assembly_dir_name) + count = _upload_assembly_dir(assembly_local_dir, tmp, bucket, staging_key_prefix) + with lock: + staged_objects += count + + with lock: + success_count += 1 + all_stats.append(stats) + return path, None + + try: + with tqdm.tqdm( + total=len(assembly_paths), unit="assembly", desc="Downloading & staging", smoothing=0.01 + ) as pbar: + with ThreadPoolExecutor(max_workers=threads) as executor: + futures = {executor.submit(_download_upload_one, p): p for p in assembly_paths} + for future in as_completed(futures): + path, error = future.result() + if error: + logger.error("FAILED: %s: %s", path, error) + with lock: + failed.append({"path": path, "error": str(error)}) + pbar.update(1) + finally: + pool.close_all() + + report: dict[str, Any] = { + "timestamp": datetime.now(UTC).isoformat(), + "total_attempted": len(assembly_paths), + "succeeded": success_count, + "failed": len(failed), + "failures": failed, + "assembly_stats": all_stats, + } + + if not dry_run: + report_path = tmp / "download_report.json" + with report_path.open("w") as f: + json.dump(report, f, indent=2) + if upload_file(report_path, f"{bucket}/{staging_key_prefix.rstrip('/')}", show_progress=False): + staged_objects += 1 + else: + logger.warning("Failed to upload download report to s3://%s/%s", bucket, staging_key_prefix) + logger.info("Staged %d objects to s3://%s/%s", staged_objects, bucket, staging_key_prefix) + + logger.info( + "SUMMARY: %d attempted, %d succeeded, %d failed", + len(assembly_paths), + success_count, + len(failed), + ) + + return { + **report, + "staged_objects": staged_objects, + "staging_key_prefix": staging_key_prefix, + "dry_run": dry_run, + } diff --git a/src/cdm_data_loaders/utils/checksums.py b/src/cdm_data_loaders/utils/checksums.py new file mode 100644 index 00000000..021098a6 --- /dev/null +++ b/src/cdm_data_loaders/utils/checksums.py @@ -0,0 +1,55 @@ +"""General-purpose file checksum utilities. + +Provides MD5 and CRC64/NVME checksum computation and verification for local +files. These are protocol-agnostic primitives used by download pipelines +and S3 metadata workflows. +""" + +import base64 +import hashlib +from pathlib import Path + +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger + +logger = get_cdm_logger() + + +def compute_md5(file_path: str | Path) -> str: + """Compute the MD5 hex digest of a file. + + :param file_path: path to the file + :return: lowercase hex MD5 string + """ + md5_hash = hashlib.md5() # noqa: S324 + with Path(file_path).open("rb") as f: + for chunk in iter(lambda: f.read(1 << 20), b""): + md5_hash.update(chunk) + return md5_hash.hexdigest() + + +def verify_md5(file_path: str | Path, expected_md5: str) -> bool: + """Verify a file's MD5 checksum against an expected value. 
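+
+    Example (file name and digest are hypothetical)::
+
+        if not verify_md5("genomic.fna.gz", "9e107d9d372bb6826bd81d3542a419d6"):
+            raise ValueError("MD5 mismatch for genomic.fna.gz")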
+ + :param file_path: path to the file + :param expected_md5: expected lowercase hex MD5 string + :return: True if the checksum matches + """ + return compute_md5(file_path) == expected_md5 + + +def compute_crc64nvme(file_path: str | Path) -> str: + """Compute the CRC64/NVME checksum of a file. + + Returns the base64-encoded string matching the format used by S3-native + checksums (``ChecksumCRC64NVME``). + + :param file_path: path to the file + :return: base64-encoded CRC64/NVME checksum + """ + from awscrt.checksums import crc64nvme as _crc64nvme # noqa: PLC0415 + + crc = 0 + with Path(file_path).open("rb") as f: + for chunk in iter(lambda: f.read(1 << 20), b""): + crc = _crc64nvme(chunk, crc) + return base64.b64encode(crc.to_bytes(8, byteorder="big")).decode() diff --git a/src/cdm_data_loaders/utils/ftp_client.py b/src/cdm_data_loaders/utils/ftp_client.py new file mode 100644 index 00000000..bd372924 --- /dev/null +++ b/src/cdm_data_loaders/utils/ftp_client.py @@ -0,0 +1,189 @@ +"""General-purpose FTP client utilities. + +Provides resilient FTP connections with TCP keepalive, NOOP pings, retry +on transient errors, and thread-local connection management for parallel +downloads. Protocol-agnostic — callers supply the FTP hostname. +""" + +import contextlib +import logging +import socket +import threading +import time +from ftplib import FTP, error_temp +from pathlib import Path + +from tenacity import ( + before_sleep_log, + retry, + retry_if_exception_type, + stop_after_attempt, + wait_fixed, +) + +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger + +logger = get_cdm_logger() + +DEFAULT_TIMEOUT = 60 + + +def connect_ftp(host: str, timeout: int = DEFAULT_TIMEOUT) -> FTP: + """Connect and log in to an FTP server with TCP keepalive enabled. + + :param host: FTP hostname + :param timeout: connection timeout in seconds + :return: logged-in FTP connection + """ + ftp = FTP(host, timeout=timeout) # noqa: S321 + ftp.login() + _set_keepalive(ftp) + return ftp + + +def _set_keepalive(ftp: FTP, idle: int = 30, interval: int = 10, count: int = 3) -> None: + """Enable TCP keepalive on the FTP control socket. + + Prevents idle-timeout disconnects (e.g. '421 No transfer timeout') when + the control connection sits idle during data transfers or checksum + verification. + """ + sock = ftp.sock + sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) + if hasattr(socket, "TCP_KEEPIDLE"): + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, idle) + if hasattr(socket, "TCP_KEEPINTVL"): + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, interval) + if hasattr(socket, "TCP_KEEPCNT"): + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, count) + + +def ftp_noop_keepalive(ftp: FTP, last_activity: float, interval: int = 25) -> float: + """Send NOOP if the connection has been idle longer than *interval* seconds. + + :param ftp: active FTP connection + :param last_activity: monotonic timestamp of last FTP activity + :param interval: seconds of idle time before sending NOOP + :return: updated last-activity timestamp + """ + if time.monotonic() - last_activity > interval: + with contextlib.suppress(Exception): + ftp.sendcmd("NOOP") + return time.monotonic() + return last_activity + + +def ftp_list_dir(ftp: FTP, path: str, retries: int = 3) -> list[str]: + """List files in an FTP directory with retry on transient errors. 
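+
+    Example (the directory digits are illustrative)::
+
+        ftp = connect_ftp("ftp.ncbi.nlm.nih.gov")
+        names = ftp_list_dir(ftp, "/genomes/all/GCF/900/123/456")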
+ + :param ftp: active FTP connection + :param path: remote directory path + :param retries: number of retry attempts + :return: list of filenames + """ + ftp.cwd(path) + + @retry( + retry=retry_if_exception_type(error_temp), + stop=stop_after_attempt(retries), + wait=wait_fixed(2), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def _list() -> list[str]: + files: list[str] = [] + ftp.retrlines("NLST", files.append) + return files + + return _list() + + +def ftp_download_file(ftp: FTP, remote_path: str, local_path: str, retries: int = 3) -> None: + """Download a single file from FTP with retry on transient errors. + + :param ftp: active FTP connection + :param remote_path: full remote file path + :param local_path: local destination path + :param retries: number of retry attempts + """ + + @retry( + retry=retry_if_exception_type(error_temp), + stop=stop_after_attempt(retries), + wait=wait_fixed(2), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def _download() -> None: + with Path(local_path).open("wb") as f: + ftp.retrbinary(f"RETR {remote_path}", f.write) + + _download() + + +def ftp_retrieve_text(ftp: FTP, remote_path: str) -> str: + """Retrieve a text file from FTP, returning its content as a string. + + :param ftp: active FTP connection + :param remote_path: full remote file path + :return: file content + """ + lines: list[str] = [] + ftp.retrlines(f"RETR {remote_path}", lines.append) + return "\n".join(lines) + + +class ThreadLocalFTP: + """Manage thread-local FTP connections for parallel downloads. + + Each thread gets its own FTP connection, created on first access. + Call :meth:`close_all` when done to cleanly shut down all connections. + """ + + def __init__(self, host: str, timeout: int = DEFAULT_TIMEOUT) -> None: + """Initialise with FTP host and timeout. + + :param host: FTP hostname (required — no default) + :param timeout: connection timeout in seconds + """ + self._host = host + self._timeout = timeout + self._local = threading.local() + self._lock = threading.Lock() + self._connections: list[FTP] = [] + + def get(self) -> FTP: + """Return the FTP connection for the current thread, reconnecting if stale. + + Sends a NOOP to verify the connection is still alive before returning it. + If the server has closed the connection (e.g. after an idle timeout or + session limit), the dead socket is discarded and a fresh connection is + established transparently. 
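+
+        Typical worker-pool usage (sketch; ``download`` and ``paths`` stand
+        in for caller code)::
+
+            pool = ThreadLocalFTP("ftp.ncbi.nlm.nih.gov")
+            try:
+                with ThreadPoolExecutor(max_workers=4) as executor:
+                    executor.map(lambda p: download(pool.get(), p), paths)
+            finally:
+                pool.close_all()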
+ """ + ftp = getattr(self._local, "ftp", None) + if ftp is not None: + try: + ftp.voidcmd("NOOP") + except Exception: + # Connection is stale — discard it and reconnect below + with contextlib.suppress(Exception): + ftp.quit() + with self._lock: + with contextlib.suppress(ValueError): + self._connections.remove(ftp) + ftp = None + self._local.ftp = None + if ftp is None: + ftp = connect_ftp(self._host, self._timeout) + self._local.ftp = ftp + with self._lock: + self._connections.append(ftp) + return ftp + + def close_all(self) -> None: + """Close all thread-local FTP connections.""" + with self._lock: + for ftp in self._connections: + with contextlib.suppress(Exception): + ftp.quit() + self._connections.clear() diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index 5d803f00..738309e8 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -66,10 +66,8 @@ def get_s3_client(args: dict[str, str] | None = None) -> botocore.client.BaseCli "aws_access_key_id": settings.MINIO_ACCESS_KEY, "aws_secret_access_key": settings.MINIO_SECRET_KEY, } - except (ModuleNotFoundError, ImportError, NameError): - logger.exception("Error initialising boto3 client") - raise - except Exception: + except (ModuleNotFoundError, ImportError, NameError) as e: + logger.exception("Failed to load berdl settings") raise required_args = ["endpoint_url", "aws_access_key_id", "aws_secret_access_key"] @@ -149,17 +147,32 @@ def list_matching_objects(s3_path: str) -> list[dict[str, Any]]: return contents -def head_object(s3_path: str) -> dict[str, Any]: - """Check whether an object exists on s3. +def head_object(s3_path: str) -> dict[str, Any] | None: + """Return metadata for an S3 object, or None if it does not exist. + + The returned dict contains: + - ``size``: content length in bytes + - ``metadata``: user metadata dict + - ``checksum_crc64nvme``: CRC64NVME checksum string (if available) :param s3_path: path to the object on s3, INCLUDING the bucket name :type s3_path: str - :return: response from the head_object request - :rtype: dict[str, Any] + :return: dict with object info, or None if the object does not exist + :rtype: dict[str, Any] | None """ s3 = get_s3_client() (bucket, key) = split_s3_path(s3_path) - return s3.head_object(Bucket=bucket, Key=key) + try: + resp = s3.head_object(Bucket=bucket, Key=key, ChecksumMode="ENABLED") + except Exception as e: # noqa: BLE001 + if e.response["Error"]["Code"] == "404": # type: ignore[union-attr] + return None + raise + return { + "size": resp["ContentLength"], + "metadata": resp.get("Metadata", {}), + "checksum_crc64nvme": resp.get("ChecksumCRC64NVME"), + } def object_exists(s3_path: str) -> bool: @@ -170,9 +183,12 @@ def object_exists(s3_path: str) -> bool: :return: True if the object exists, False otherwise :rtype: bool """ + s3 = get_s3_client() + + (bucket, key) = split_s3_path(s3_path) try: - head_object(s3_path) - except Exception as e: + s3.head_object(Bucket=bucket, Key=key) + except Exception as e: # noqa: BLE001 error_string = str(e) if not error_string.startswith("An error occurred (404) when calling the HeadObject operation: Not Found"): logger.exception("Error performing head operation on s3 object") @@ -184,15 +200,24 @@ def upload_file( local_file_path: Path | str, destination_dir: str, object_name: str | None = None, + tags: dict[str, str] | None = None, + show_progress: bool = True, ) -> bool: """Upload an object to an S3 bucket. 
+
+    When *tags* is supplied the file is always uploaded (no existence check)
+    and the dict is attached as S3 user metadata. When *tags* is ``None``
+    (the default) the existing behaviour is preserved: the upload is skipped if
+    the object is already present.
+
     :param local_file_path: File to upload
     :type local_file_path: Path | str
     :param destination_dir: path to the destination directory on s3, INCLUDING the bucket name and EXCLUDING the file name
     :type destination_dir: str
     :param object_name: S3 object name. If not specified, the name of the file from local_file_path is used.
     :type object_name: str | None
+    :param tags: user metadata key/value pairs to attach to the object; when provided the upload always runs
+    :type tags: dict[str, str] | None
     :return: True if file was uploaded, else False
     :rtype: bool
     """
@@ -207,29 +232,39 @@
         object_name = local_file_path.name
 
     s3_path = f"{destination_dir.removesuffix('/')}/{object_name}"
-    if object_exists(s3_path):
-        logger.info("File already present: %s", s3_path)
+    if tags is None and object_exists(s3_path):
+        logger.debug("File already present: %s", s3_path)
         return True
 
     s3 = get_s3_client()
     (bucket, key) = split_s3_path(s3_path)
 
+    extra_args = {**DEFAULT_EXTRA_ARGS, **(({"Metadata": tags}) if tags is not None else {})}
+
     # Upload the file
-    file_size = local_file_path.stat().st_size
-    with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar:
-        logger.info("uploading %s to %s", str(local_file_path), s3_path)
-        try:
+    logger.debug("uploading %s to %s", str(local_file_path), s3_path)
+    try:
+        if show_progress:
+            file_size = local_file_path.stat().st_size
+            with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar:
+                s3.upload_file(
+                    Filename=str(local_file_path),
+                    Bucket=bucket,
+                    Key=key,
+                    Callback=pbar.update,
+                    ExtraArgs=extra_args,
+                )
+        else:
             s3.upload_file(
                 Filename=str(local_file_path),
                 Bucket=bucket,
                 Key=key,
-                Callback=pbar.update,
-                ExtraArgs=DEFAULT_EXTRA_ARGS,
+                ExtraArgs=extra_args,
             )
-        except Exception:
-            logger.exception("Error uploading to s3")
-            return False
-    return True
+    except Exception:  # noqa: BLE001
+        logger.exception("Error uploading to s3")
+        return False
+    return True
 
 
 def stream_to_s3(url: str, s3_path: str, requests: ModuleType) -> str:
@@ -261,7 +296,9 @@
     return f"{bucket}/{key}"
 
 
-def download_file(s3_path: str, local_file_path: str | Path, version_id: str | None = None) -> None:
+def download_file(
+    s3_path: str, local_file_path: str | Path, version_id: str | None = None, show_progress: bool = True
+) -> None:
     """Download an object from s3.
 
     WARNING: will overwrite existing files but will not overwrite a file whilst trying to make a directory
@@ -294,7 +331,7 @@
     # Get the object size
     try:
        object_size = s3.head_object(**kwargs)["ContentLength"]
-    except Exception as e:
+    except Exception as e:  # noqa: BLE001
        error_string = str(e)
        if error_string.startswith("An error occurred (404) when calling the HeadObject operation: Not Found"):
            logger.exception("File not found: %s", s3_path)
@@ -307,13 +344,21 @@
    # set ``unit_scale=True`` so tqdm uses SI unit prefixes
    # ``unit="B"`` means it adds the string "B" as a suffix
    # progress is reported as (e.g.) "14.5kB/s".
- with tqdm.tqdm(total=object_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: + if show_progress: + with tqdm.tqdm(total=object_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: + s3.download_file( + Bucket=bucket, + Key=key, + ExtraArgs=extra_args, + Filename=str(local_file_path), + Callback=pbar.update, + ) + else: s3.download_file( Bucket=bucket, Key=key, ExtraArgs=extra_args, Filename=str(local_file_path), - Callback=pbar.update, ) @@ -391,8 +436,14 @@ def upload_dir( return all_successful -def copy_object(current_s3_path: str, new_s3_path: str) -> dict[str, Any]: - """Copy an object from one place to another, adding in a CRC64NVME checksum. +def copy_object( + current_s3_path: str, + new_s3_path: str, +) -> dict[str, Any]: + """Copy an object from one place to another, inheriting the source user metadata. + + Source user metadata (e.g. ``md5``) is preserved on the destination because + ``MetadataDirective`` is omitted, which defaults to ``COPY``. A successful copy operation will return a response where resp["ResponseMetadata"]["HTTPStatusCode"] == 200 @@ -400,11 +451,11 @@ def copy_object(current_s3_path: str, new_s3_path: str) -> dict[str, Any]: Errors (e.g, buckets or keys not existing, wrong credentials, etc.) are passed directly to the user without being caught. - :param current_path: path to the file on s3, INCLUDING the bucket name - :type current_path: str - :param new_path: the desired new file path on s3, INCLUDING the bucket name - :type new_path: str - :return: dictionary containing response + :param current_s3_path: path to the file on s3, INCLUDING the bucket name + :type current_s3_path: str + :param new_s3_path: the desired new file path on s3, INCLUDING the bucket name + :type new_s3_path: str + :return: dictionary containing response from the copy operation :rtype: dict[str, Any] """ s3 = get_s3_client() @@ -415,7 +466,6 @@ def copy_object(current_s3_path: str, new_s3_path: str) -> dict[str, Any]: CopySource={"Bucket": current_s3_bucket, "Key": current_s3_key}, Bucket=new_s3_bucket, Key=new_s3_key, - **DEFAULT_EXTRA_ARGS, ) @@ -468,7 +518,6 @@ def copy_directory(current_s3_path: str, new_s3_path: str) -> tuple[dict[str, st CopySource={"Bucket": current_s3_bucket, "Key": current_key}, Bucket=new_s3_bucket, Key=new_key, - **DEFAULT_EXTRA_ARGS, ) if resp["ResponseMetadata"]["HTTPStatusCode"] == SUCCESS_RESPONSE: successes[source_path] = dest_path @@ -498,3 +547,28 @@ def delete_object(s3_path: str) -> dict[str, Any]: s3 = get_s3_client() (bucket, key) = split_s3_path(s3_path) return s3.delete_object(Bucket=bucket, Key=key) + + +def delete_objects(bucket: str, keys: list[str]) -> list[dict[str, Any]]: + """Delete multiple objects from an S3 bucket in a single API call. + + Splits into batches of 1000 (the S3 API maximum per request). 
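+
+    Example (bucket and keys are illustrative)::
+
+        errors = delete_objects("staging-bucket", ["staging/a.txt", "staging/a.txt.md5"])
+        for err in errors:
+            logger.warning("Failed to delete %s: %s", err.get("Key"), err.get("Message"))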
+
+    :param bucket: S3 bucket name (no protocol prefix)
+    :param keys: list of S3 keys to delete
+    :return: list of per-key error dicts returned by S3 (empty if all succeeded)
+    :rtype: list[dict[str, Any]]
+    """
+    if not keys:
+        return []
+
+    s3 = get_s3_client()
+    errors: list[dict[str, Any]] = []
+    for i in range(0, len(keys), 1000):
+        batch = keys[i : i + 1000]
+        resp = s3.delete_objects(
+            Bucket=bucket,
+            Delete={"Objects": [{"Key": k} for k in batch], "Quiet": False},
+        )
+        errors.extend(resp.get("Errors", []))
+    return errors
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
new file mode 100644
index 00000000..b8bdf9ba
--- /dev/null
+++ b/tests/integration/conftest.py
@@ -0,0 +1,263 @@
+"""Shared fixtures and helpers for MinIO-backed integration tests.
+
+Integration tests are auto-skipped when MinIO is not reachable. Each test
+method gets its own bucket (derived from the test node name) that is emptied
+on re-run but **never deleted** after the test — this lets developers inspect
+the final state of the object store via the MinIO console.
+"""
+
+import hashlib
+import os
+import re
+from pathlib import Path
+from typing import Any
+from unittest.mock import patch
+
+import boto3
+import botocore.client
+import botocore.config
+import botocore.exceptions
+import pytest
+
+import cdm_data_loaders.ncbi_ftp.manifest as manifest_mod
+import cdm_data_loaders.ncbi_ftp.promote as promote_mod
+import cdm_data_loaders.utils.s3 as s3_utils
+from cdm_data_loaders.ncbi_ftp.assembly import build_accession_path
+from cdm_data_loaders.utils.s3 import reset_s3_client
+
+# ── MinIO connection defaults ───────────────────────────────────────────
+# Overridable via the environment; the fallbacks match the documented local
+# `docker run` setup, so a bare `uv run pytest` still imports cleanly (and
+# auto-skips) when no MinIO variables are set.
+
+MINIO_ENDPOINT_URL = os.environ.get("MINIO_ENDPOINT_URL", "http://localhost:9000")
+MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
+MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")
+
+# Maximum length of a bucket name per S3/DNS spec
+_MAX_BUCKET_LEN = 63
+
+
+# ── MinIO reachability check ────────────────────────────────────────────
+
+_minio_available: bool | None = None
+
+
+def _minio_reachable() -> bool:
+    """Return True if the MinIO endpoint accepts connections."""
+    try:
+        client = boto3.client(
+            "s3",
+            endpoint_url=MINIO_ENDPOINT_URL,
+            aws_access_key_id=MINIO_ACCESS_KEY,
+            aws_secret_access_key=MINIO_SECRET_KEY,
+            config=botocore.config.Config(
+                connect_timeout=1,
+                read_timeout=1,
+                retries={"max_attempts": 1},
+            ),
+        )
+        client.list_buckets()
+    except Exception:  # noqa: BLE001
+        return False
+    return True
+
+
+def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:  # noqa: ARG001
+    """Auto-skip ``@pytest.mark.integration`` tests when MinIO is unreachable."""
+    global _minio_available  # noqa: PLW0603
+    if _minio_available is None:
+        _minio_available = _minio_reachable()
+    if _minio_available:
+        return
+    skip_marker = pytest.mark.skip(reason="MinIO not reachable — skipping integration tests")
+    for item in items:
+        if "integration" in item.keywords:
+            item.add_marker(skip_marker)
+
+
+# ── Fixtures ──────────────────────────────────────────────────────────── 
+
+
+@pytest.fixture
+def minio_s3_client() -> botocore.client.BaseClient:
+    """Function-scoped real boto3 S3 client pointed at the local MinIO instance.
+
+    Patches ``get_s3_client`` on every module that uses it so internal calls
+    are transparently routed to MinIO.
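+
+    Tests receive it through normal fixture injection, e.g.::
+
+        def test_roundtrip(minio_s3_client, test_bucket):
+            minio_s3_client.put_object(Bucket=test_bucket, Key="k", Body=b"v")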
+ """ + client = boto3.client( + "s3", + endpoint_url=MINIO_ENDPOINT_URL, + aws_access_key_id=MINIO_ACCESS_KEY, + aws_secret_access_key=MINIO_SECRET_KEY, + ) + + reset_s3_client() + with ( + patch.object(s3_utils, "get_s3_client", return_value=client), + patch.object(promote_mod, "get_s3_client", return_value=client), + patch.object(manifest_mod, "head_object", wraps=s3_utils.head_object), + patch.object(s3_utils, "_s3_client", client), + ): + yield client + reset_s3_client() + + +def _bucket_name_from_node(node_id: str) -> str: + """Derive a DNS-compliant S3 bucket name from a pytest node ID. + + :param node_id: e.g. ``tests/integration/test_promote_e2e.py::test_dry_run`` + :return: e.g. ``integ-test-dry-run`` + """ + # Extract test function name from the node ID + parts = node_id.split("::") + name = parts[-1] if parts else node_id + # Lowercase, replace non-alphanumeric with hyphens, collapse multiples + name = re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-") + name = f"integ-{name}" + if len(name) > _MAX_BUCKET_LEN: + # Truncate but keep it unique via a short hash suffix + suffix = hashlib.md5(name.encode()).hexdigest()[:6] # noqa: S324 + name = f"{name[: _MAX_BUCKET_LEN - 7]}-{suffix}" + return name + + +@pytest.fixture +def test_bucket(minio_s3_client: botocore.client.BaseClient, request: pytest.FixtureRequest) -> str: + """Create a per-test-method bucket in MinIO and return its name. + + On re-run, any existing objects are deleted first so the test starts clean. + The bucket is **not** deleted after the test. + """ + bucket = _bucket_name_from_node(request.node.nodeid) + s3 = minio_s3_client + + try: + s3.head_bucket(Bucket=bucket) + # Bucket exists — empty it for a clean run + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + for obj in page.get("Contents", []): + s3.delete_object(Bucket=bucket, Key=obj["Key"]) + except s3.exceptions.NoSuchBucket: + s3.create_bucket(Bucket=bucket) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("404", "NoSuchBucket"): + s3.create_bucket(Bucket=bucket) + else: + raise + + return bucket + + +@pytest.fixture +def staging_test_bucket(minio_s3_client: botocore.client.BaseClient, request: pytest.FixtureRequest) -> str: + """Create a per-test staging bucket in MinIO and return its name. + + Mirrors ``test_bucket`` but uses a ``staging-`` prefix so staging and + Lakehouse buckets are distinct within the same test. + """ + bucket = "staging-" + _bucket_name_from_node(request.node.nodeid) + if len(bucket) > _MAX_BUCKET_LEN: + suffix = hashlib.md5(bucket.encode()).hexdigest()[:6] # noqa: S324 + bucket = f"{bucket[: _MAX_BUCKET_LEN - 7]}-{suffix}" + s3 = minio_s3_client + + try: + s3.head_bucket(Bucket=bucket) + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + for obj in page.get("Contents", []): + s3.delete_object(Bucket=bucket, Key=obj["Key"]) + except s3.exceptions.NoSuchBucket: + s3.create_bucket(Bucket=bucket) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("404", "NoSuchBucket"): + s3.create_bucket(Bucket=bucket) + else: + raise + + return bucket + + +# ── Helpers ───────────────────────────────────────────────────────────── + + +def stage_files_to_minio( + s3: botocore.client.BaseClient, + bucket: str, + local_dir: str | Path, + staging_prefix: str, +) -> list[str]: + """Upload a local directory tree to a MinIO staging prefix. 
+ + :param s3: boto3 S3 client + :param bucket: target bucket + :param local_dir: local root directory to upload + :param staging_prefix: S3 key prefix (e.g. ``"staging/run1/"``) + :return: list of S3 keys uploaded + """ + local_dir = Path(local_dir) + keys: list[str] = [] + for path in sorted(local_dir.rglob("*")): + if path.is_dir(): + continue + rel = path.relative_to(local_dir) + key = f"{staging_prefix.rstrip('/')}/{rel}" + s3.upload_file(Filename=str(path), Bucket=bucket, Key=key) + keys.append(key) + return keys + + +def seed_lakehouse( # noqa: PLR0913 + s3: botocore.client.BaseClient, + bucket: str, + accession: str, + files: dict[str, str | bytes], + path_prefix: str, + assembly_dir: str | None = None, +) -> list[str]: + """Seed assembly files at the final Lakehouse path in MinIO. + + :param s3: boto3 S3 client + :param bucket: target bucket + :param accession: assembly accession (e.g. ``"GCF_000001215.4"``) + :param files: mapping of filename → content (str or bytes) + :param path_prefix: Lakehouse prefix (e.g. ``"tenant-general-warehouse/…/ncbi/"``) + :param assembly_dir: full assembly dir name; if None, uses ``accession`` + :return: list of S3 keys created + """ + adir = assembly_dir or accession + rel = build_accession_path(adir) + keys: list[str] = [] + for fname, content in files.items(): + key = f"{path_prefix}{rel}{fname}" + body = content.encode() if isinstance(content, str) else content + md5 = hashlib.md5(body).hexdigest() # noqa: S324 + s3.put_object(Bucket=bucket, Key=key, Body=body, Metadata={"md5": md5}) + keys.append(key) + return keys + + +def list_all_keys(s3: botocore.client.BaseClient, bucket: str, prefix: str = "") -> list[str]: + """List all object keys in a bucket under a prefix. + + :param s3: boto3 S3 client + :param bucket: bucket name + :param prefix: optional key prefix filter + :return: sorted list of keys + """ + keys: list[str] = [] + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + keys.extend(obj["Key"] for obj in page.get("Contents", [])) + return sorted(keys) + + +def get_object_metadata(s3: botocore.client.BaseClient, bucket: str, key: str) -> dict[str, Any]: + """Return the S3 user metadata dict for an S3 object (from HeadObject). + + :param s3: boto3 S3 client + :param bucket: bucket name + :param key: object key + :return: user metadata dict + """ + resp = s3.head_object(Bucket=bucket, Key=key) + return resp.get("Metadata", {}) diff --git a/tests/integration/test_download_e2e.py b/tests/integration/test_download_e2e.py new file mode 100644 index 00000000..37456e3e --- /dev/null +++ b/tests/integration/test_download_e2e.py @@ -0,0 +1,187 @@ +"""End-to-end tests for Phase 2 — FTP download of assemblies. + +These tests download real (small) assemblies from the NCBI FTP server. +Marked ``integration`` and ``slow_test``; auto-skipped when MinIO is +unreachable. +""" + +import json + +import pytest + +from pathlib import Path +from unittest.mock import patch + +import cdm_data_loaders.utils.s3 as s3_utils +from cdm_data_loaders.ncbi_ftp.manifest import ( + compute_diff, + download_assembly_summary, + filter_by_prefix_range, + parse_assembly_summary, + write_transfer_manifest, +) +from cdm_data_loaders.pipelines.ncbi_ftp_download import download_batch, download_and_stage + +# Use same stable prefix as manifest tests +STABLE_PREFIX = "900" + + +def _manifest_for_one_assembly(tmp_path: Path) -> tuple[Path, str]: + """Create a transfer manifest containing exactly one FTP path. 
+ + Returns ``(manifest_path, accession)`` for the first latest assembly + in the stable prefix range. + """ + raw = download_assembly_summary(database="refseq") + full = parse_assembly_summary(raw) + filtered = filter_by_prefix_range(full, prefix_from=STABLE_PREFIX, prefix_to=STABLE_PREFIX) + diff = compute_diff(filtered, previous_assemblies=None) + + assert len(diff.new) > 0, f"No new assemblies in prefix {STABLE_PREFIX}" + + manifest_path = tmp_path / "transfer_manifest.txt" + write_transfer_manifest(diff, filtered, manifest_path) + + return manifest_path, diff.new[0] + + +@pytest.mark.integration +@pytest.mark.slow_test +@pytest.mark.external_request +class TestDownloadSmallBatch: + """Download a single assembly from NCBI FTP and verify local output.""" + + def test_download_small_batch(self, tmp_path: Path) -> None: + """Download one assembly and verify directory structure and report.""" + manifest_path, _acc = _manifest_for_one_assembly(tmp_path) + + output_dir = tmp_path / "output" + output_dir.mkdir() + + report = download_batch( + manifest_path=str(manifest_path), + output_dir=str(output_dir), + threads=1, + limit=1, + ) + + assert report["succeeded"] >= 1 + assert report["failed"] == 0 + + # Verify directory structure exists + raw_data = output_dir / "raw_data" + assert raw_data.exists(), "Expected raw_data/ directory in output" + + # Should have at least one assembly directory with files + assembly_dirs = list(raw_data.rglob("GCF_*")) + assert len(assembly_dirs) > 0, "Expected at least one assembly directory" + + # Check for .md5 sidecar files + md5_files = list(raw_data.rglob("*.md5")) + assert len(md5_files) > 0, "Expected .md5 sidecar files" + + # Check download report + report_file = output_dir / "download_report.json" + assert report_file.exists() + with report_file.open() as f: + saved_report = json.load(f) + assert saved_report["succeeded"] >= 1 + + +@pytest.mark.integration +@pytest.mark.slow_test +@pytest.mark.external_request +class TestDownloadResumeIncomplete: + """Verify download handles re-runs when some files are already present.""" + + def test_download_resume(self, tmp_path: Path) -> None: + """Re-running download on the same manifest succeeds without errors.""" + manifest_path, _acc = _manifest_for_one_assembly(tmp_path) + + output_dir = tmp_path / "output" + output_dir.mkdir() + + # First download + report1 = download_batch( + manifest_path=str(manifest_path), + output_dir=str(output_dir), + threads=1, + limit=1, + ) + assert report1["succeeded"] >= 1 + + files_after_first = set(output_dir.rglob("*")) + + # Second download — same manifest + report2 = download_batch( + manifest_path=str(manifest_path), + output_dir=str(output_dir), + threads=1, + limit=1, + ) + + # Should succeed without errors (files overwritten or skipped) + assert report2["succeeded"] >= 1 + assert report2["failed"] == 0 + + # All original files should still exist + files_after_second = set(output_dir.rglob("*")) + assert files_after_first.issubset(files_after_second) + + +@pytest.mark.integration +@pytest.mark.slow_test +@pytest.mark.external_request +def test_download_and_stage_e2e( + tmp_path: Path, + minio_s3_client, + test_bucket: str, +) -> None: + """Download one assembly and verify it is staged under the expected S3 prefix.""" + manifest_path, _acc = _manifest_for_one_assembly(tmp_path) + + staging_prefix = "staging/e2e-test/" + + # Seed the manifest in MinIO so download_and_stage can read it from S3 + manifest_s3_key = f"{staging_prefix}input/transfer_manifest.txt" + 
minio_s3_client.put_object( + Bucket=test_bucket, + Key=manifest_s3_key, + Body=manifest_path.read_bytes(), + ) + + with patch.object(s3_utils, "get_s3_client", return_value=minio_s3_client): + report = download_and_stage( + bucket=test_bucket, + staging_key_prefix=staging_prefix, + manifest_s3_key=manifest_s3_key, + threads=1, + limit=1, + dry_run=False, + ) + + assert report["succeeded"] >= 1 + assert report["failed"] == 0 + assert report["staged_objects"] > 0 + assert report["staging_key_prefix"] == staging_prefix + assert report["dry_run"] is False + + # Verify raw_data/ files and .md5 sidecars are staged + paginator = minio_s3_client.get_paginator("list_objects_v2") + staged_keys = [ + obj["Key"] + for page in paginator.paginate(Bucket=test_bucket, Prefix=f"{staging_prefix}raw_data/") + for obj in page.get("Contents", []) + ] + assert len(staged_keys) > 0, "Expected staged files under raw_data/" + + data_files = [k for k in staged_keys if not k.endswith(".md5")] + md5_files = [k for k in staged_keys if k.endswith(".md5")] + assert len(data_files) > 0, "Expected data files" + assert len(md5_files) > 0, "Expected .md5 sidecar files" + + # Verify download_report.json was also uploaded + report_key = f"{staging_prefix}download_report.json" + resp = minio_s3_client.get_object(Bucket=test_bucket, Key=report_key) + saved_report = json.loads(resp["Body"].read()) + assert saved_report["succeeded"] >= 1 diff --git a/tests/integration/test_full_pipeline.py b/tests/integration/test_full_pipeline.py new file mode 100644 index 00000000..63776ae5 --- /dev/null +++ b/tests/integration/test_full_pipeline.py @@ -0,0 +1,222 @@ +"""End-to-end tests for the full NCBI assembly pipeline (Phase 1 → 2 → 3). + +Exercises the entire flow: download summary from real NCBI FTP, compute diff, +download a single assembly, stage in MinIO, promote to final Lakehouse path. + +Marked ``integration`` and ``slow_test``; auto-skipped when MinIO is +unreachable. 
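+
+Run explicitly with the ``integration`` marker, e.g.
+``uv run pytest tests/integration/test_full_pipeline.py -m integration -v``.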
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + from pathlib import Path + +from cdm_data_loaders.ncbi_ftp.manifest import ( + AssemblyRecord, + compute_diff, + download_assembly_summary, + filter_by_prefix_range, + parse_assembly_summary, + write_removed_manifest, + write_transfer_manifest, + write_updated_manifest, +) +from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 +from cdm_data_loaders.pipelines.ncbi_ftp_download import download_batch + +from .conftest import get_object_metadata, list_all_keys, stage_files_to_minio, staging_test_bucket # noqa: F401 + +STABLE_PREFIX = "900" +STAGING_PREFIX = "staging/run1/" +PATH_PREFIX = DEFAULT_LAKEHOUSE_KEY_PREFIX + + +@pytest.mark.integration +@pytest.mark.slow_test +@pytest.mark.external_request +class TestFullPipelineSmallBatch: + """Run the complete pipeline for a single assembly: diff → download → promote.""" + + def test_full_pipeline_small_batch( + self, + minio_s3_client: object, + test_bucket: str, + staging_test_bucket: str, + tmp_path: Path, + ) -> None: + """Single assembly flows through all three phases into MinIO.""" + s3 = minio_s3_client + + # ── Phase 1: Manifest generation ──────────────────────────────── + raw = download_assembly_summary(database="refseq") + full = parse_assembly_summary(raw) + filtered = filter_by_prefix_range(full, prefix_from=STABLE_PREFIX, prefix_to=STABLE_PREFIX) + + diff = compute_diff(filtered, previous_assemblies=None) + assert len(diff.new) > 0, f"No new assemblies in prefix {STABLE_PREFIX}" + + manifest_path = tmp_path / "transfer_manifest.txt" + _ = write_transfer_manifest(diff, filtered, manifest_path) + + # ── Phase 2: Download one assembly from real FTP ──────────────── + output_dir = tmp_path / "output" + output_dir.mkdir() + + report = download_batch( + manifest_path=str(manifest_path), + output_dir=str(output_dir), + threads=1, + limit=1, + ) + assert report["succeeded"] >= 1 + assert report["failed"] == 0 + + # ── Upload local output to MinIO staging ──────────────────────── + keys = stage_files_to_minio(s3, staging_test_bucket, output_dir, STAGING_PREFIX) + assert len(keys) > 0, "Expected files staged to MinIO" + + # ── Phase 3: Promote from staging to final path ───────────────── + promote_report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + assert promote_report["promoted"] >= 1 + assert promote_report["failed"] == 0 + + # ── Verify final state ────────────────────────────────────────── + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) >= 1, "Expected files at final Lakehouse path" + + # At least one file should have MD5 metadata + has_md5 = False + for key in final_keys: + meta = get_object_metadata(s3, test_bucket, key) + if meta.get("md5"): + has_md5 = True + break + assert has_md5, "Expected at least one file with MD5 metadata" + + +@pytest.mark.integration +@pytest.mark.slow_test +@pytest.mark.external_request +class TestFullPipelineIncrementalSync: + """Run the pipeline twice to test incremental sync with archival.""" + + def test_full_pipeline_incremental( + self, + minio_s3_client: object, + test_bucket: str, + staging_test_bucket: str, + tmp_path: Path, + ) -> None: + """Second sync archives the old version and promotes the new one.""" + s3 = minio_s3_client + + # ── First sync: Phase 1 → 2 → 3 
──────────────────────────────── + raw = download_assembly_summary(database="refseq") + full = parse_assembly_summary(raw) + filtered = filter_by_prefix_range(full, prefix_from=STABLE_PREFIX, prefix_to=STABLE_PREFIX) + + diff1 = compute_diff(filtered, previous_assemblies=None) + assert len(diff1.new) > 0, f"No new assemblies in prefix {STABLE_PREFIX}" + + manifest1 = tmp_path / "transfer_manifest_1.txt" + _ = write_transfer_manifest(diff1, filtered, manifest1) + + output1 = tmp_path / "output1" + output1.mkdir() + report1 = download_batch(str(manifest1), str(output1), threads=1, limit=1) + assert report1["succeeded"] >= 1 + + stage_files_to_minio(s3, staging_test_bucket, output1, STAGING_PREFIX) + + # Upload manifest to MinIO for trimming (manifest lives in staging bucket) + manifest_key = "ncbi/transfer_manifest.txt" + s3.upload_file(Filename=str(manifest1), Bucket=staging_test_bucket, Key=manifest_key) + + promote1 = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + manifest_s3_key=manifest_key, + lakehouse_key_prefix=PATH_PREFIX, + ) + assert promote1["promoted"] >= 1 + + first_sync_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(first_sync_keys) >= 1 + + # ── Second sync: Manufacture "previous" with a tweak ──────────── + # Treat first-sync state as "previous", but modify one assembly's + # seq_rel_date so it shows up as "updated". + previous: dict[str, AssemblyRecord] = {} + for acc, rec in filtered.items(): + previous[acc] = AssemblyRecord( + accession=rec.accession, + status=rec.status, + seq_rel_date=rec.seq_rel_date, + ftp_path=rec.ftp_path, + assembly_dir=rec.assembly_dir, + ) + + # Pick the first accession that was actually downloaded + downloaded_acc = diff1.new[0] + if downloaded_acc in previous: + previous[downloaded_acc].seq_rel_date = "1999/01/01" + + diff2 = compute_diff(filtered, previous_assemblies=previous) + + # The modified assembly should appear as "updated" + if downloaded_acc in previous: + assert downloaded_acc in diff2.updated, f"Expected {downloaded_acc} in updated list" + + manifest2 = tmp_path / "transfer_manifest_2.txt" + _ = write_transfer_manifest(diff2, filtered, manifest2) + + updated_manifest = tmp_path / "updated_manifest.txt" + _ = write_updated_manifest(diff2, updated_manifest) + + removed_manifest = tmp_path / "removed_manifest.txt" + _ = write_removed_manifest(diff2, removed_manifest) + + # Phase 2 — re-download the updated assembly + output2 = tmp_path / "output2" + output2.mkdir() + report2 = download_batch(str(manifest2), str(output2), threads=1, limit=1) + assert report2["succeeded"] >= 1 + + # Clean staging and re-stage + staging_keys = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + for key in staging_keys: + s3.delete_object(Bucket=staging_test_bucket, Key=key) + stage_files_to_minio(s3, staging_test_bucket, output2, STAGING_PREFIX) + + # Phase 3 — promote with archival + promote2 = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + updated_manifest_path=str(updated_manifest), + ncbi_release="test-incremental", + lakehouse_key_prefix=PATH_PREFIX, + ) + assert promote2["failed"] == 0 + + # Verify archive exists + archive_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "archive/test-incremental/") + if promote2["archived"] > 0: + assert len(archive_keys) >= 1 + for key in archive_keys: + assert "/updated/" in key + + # Final Lakehouse path should still 
have files + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) >= 1 diff --git a/tests/integration/test_manifest_e2e.py b/tests/integration/test_manifest_e2e.py new file mode 100644 index 00000000..fb46ad3a --- /dev/null +++ b/tests/integration/test_manifest_e2e.py @@ -0,0 +1,301 @@ +"""End-to-end tests for Phase 1 — manifest generation and diffing. + +These tests hit the real NCBI FTP server (with tight prefix filters) and +optionally use MinIO for checksum verification. Marked ``integration`` +and ``slow_test``; auto-skipped when MinIO is unreachable. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +from cdm_data_loaders.ncbi_ftp.assembly import FILE_FILTERS, FTP_HOST, build_accession_path, parse_md5_checksums_file +from cdm_data_loaders.ncbi_ftp.manifest import ( + AssemblyRecord, + compute_diff, + download_assembly_summary, + filter_by_prefix_range, + parse_assembly_summary, + scan_store_to_synthetic_summary, + verify_transfer_candidates, + write_diff_summary, + write_removed_manifest, + write_transfer_manifest, + write_updated_manifest, +) +from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX +from cdm_data_loaders.utils.ftp_client import connect_ftp, ftp_retrieve_text + +if TYPE_CHECKING: + from pathlib import Path + +# Use a high-numbered prefix range that typically has only a handful of +# assemblies, keeping FTP traffic minimal. +STABLE_PREFIX = "900" + + +# ── Helpers ───────────────────────────────────────────────────────────── + + +def _download_and_filter() -> tuple[dict[str, AssemblyRecord], dict[str, AssemblyRecord]]: + """Download the current refseq summary and filter to the stable prefix range. + + Returns ``(full_parsed, filtered)``. 
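+
+    Illustrative shapes (an orientation sketch, not asserted here)::
+
+        full, filtered = _download_and_filter()
+        acc, rec = next(iter(filtered.items()))  # e.g. ("GCF_900...", AssemblyRecord)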
+ """ + raw = download_assembly_summary(database="refseq") + full = parse_assembly_summary(raw) + filtered = filter_by_prefix_range(full, prefix_from=STABLE_PREFIX, prefix_to=STABLE_PREFIX) + return full, filtered + + +# ── Tests ─────────────────────────────────────────────────────────────── + + +@pytest.mark.integration +@pytest.mark.slow_test +@pytest.mark.external_request +class TestFreshSyncNoPrevious: + """Phase 1 with no previous snapshot — everything is 'new'.""" + + def test_fresh_sync_no_previous(self, tmp_path: Path) -> None: + """All assemblies in range appear as new when there is no previous snapshot.""" + _full, filtered = _download_and_filter() + assert len(filtered) > 0, f"Expected assemblies in prefix {STABLE_PREFIX}" + + diff = compute_diff(filtered, previous_assemblies=None) + + # With no previous, every *latest* assembly is new + latest_count = sum(1 for r in filtered.values() if r.status == "latest") + assert len(diff.new) == latest_count + assert len(diff.updated) == 0 + assert len(diff.replaced) == 0 + assert len(diff.suppressed) == 0 + + # Write manifests + transfer_path = tmp_path / "transfer_manifest.txt" + removed_path = tmp_path / "removed_manifest.txt" + updated_path = tmp_path / "updated_manifest.txt" + summary_path = tmp_path / "diff_summary.json" + + paths = write_transfer_manifest(diff, filtered, transfer_path) + removed = write_removed_manifest(diff, removed_path) + updated = write_updated_manifest(diff, updated_path) + write_diff_summary(diff, summary_path, "refseq", STABLE_PREFIX, STABLE_PREFIX) + + assert len(paths) == latest_count + assert len(removed) == 0 + assert len(updated) == 0 + assert transfer_path.exists() + assert summary_path.exists() + + +@pytest.mark.integration +@pytest.mark.slow_test +@pytest.mark.external_request +class TestIncrementalDiffSyntheticPrevious: + """Phase 1 incremental diff with a manufactured 'previous' snapshot.""" + + def test_incremental_diff(self, tmp_path: Path) -> None: + """Detects new, updated, replaced, and suppressed assemblies correctly.""" + _full, filtered = _download_and_filter() + latest = {a: r for a, r in filtered.items() if r.status == "latest"} + assert len(latest) >= 2, f"Need >=2 latest assemblies in prefix {STABLE_PREFIX}" # noqa: PLR2004 + + accs = sorted(latest.keys()) + + # Build synthetic previous: copy current, then mutate + previous: dict[str, AssemblyRecord] = {} + for acc, rec in filtered.items(): + previous[acc] = AssemblyRecord( + accession=rec.accession, + status=rec.status, + seq_rel_date=rec.seq_rel_date, + ftp_path=rec.ftp_path, + assembly_dir=rec.assembly_dir, + ) + + # Remove the first latest → should appear as "new" in diff + new_acc = accs[0] + del previous[new_acc] + + # Modify seq_rel_date of the second latest → should appear as "updated" + updated_acc = accs[1] + previous[updated_acc].seq_rel_date = "1999/01/01" + + # Add a fake accession to previous that is not in current → "suppressed" + fake_suppressed = "GCF_900999999.1" + previous[fake_suppressed] = AssemblyRecord( + accession=fake_suppressed, + status="latest", + seq_rel_date="2020/01/01", + ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/999/999/GCF_900999999.1_FakeAsm", + assembly_dir="GCF_900999999.1_FakeAsm", + ) + + diff = compute_diff(filtered, previous_assemblies=previous) + + assert new_acc in diff.new + assert updated_acc in diff.updated + assert fake_suppressed in diff.suppressed + + # Write and verify manifests + transfer_path = tmp_path / "transfer_manifest.txt" + removed_path = tmp_path / 
"removed_manifest.txt" + updated_path = tmp_path / "updated_manifest.txt" + + paths = write_transfer_manifest(diff, filtered, transfer_path) + removed = write_removed_manifest(diff, removed_path) + updated_list = write_updated_manifest(diff, updated_path) + + assert len(paths) >= 2 # noqa: PLR2004 # at least the new + updated + assert fake_suppressed in removed + assert updated_acc in updated_list + + +@pytest.mark.integration +@pytest.mark.slow_test +@pytest.mark.external_request +class TestVerifyTransferCandidatesPrunes: + """verify_transfer_candidates should prune assemblies already in the store.""" + + def test_prunes_existing_matching_md5( + self, + minio_s3_client: object, + test_bucket: str, + ) -> None: + """Assemblies with matching MD5 metadata in MinIO are pruned from the transfer list.""" + _full, filtered = _download_and_filter() + latest = {a: r for a, r in filtered.items() if r.status == "latest"} + if not latest: + pytest.skip(f"No latest assemblies in prefix {STABLE_PREFIX}") + + # Pick one assembly to pre-seed in MinIO with correct checksums + acc = next(iter(sorted(latest))) + rec = latest[acc] + ftp_dir = rec.ftp_path.replace("https://ftp.ncbi.nlm.nih.gov", "") + + # Fetch the real md5checksums.txt from FTP + ftp = connect_ftp(FTP_HOST) + try: + md5_text = ftp_retrieve_text(ftp, ftp_dir.rstrip("/") + "/md5checksums.txt") + finally: + ftp.quit() + + checksums = parse_md5_checksums_file(md5_text) + + # Seed MinIO with dummy files that have the right MD5 metadata + rel = build_accession_path(rec.assembly_dir) + s3 = minio_s3_client + path_prefix = DEFAULT_LAKEHOUSE_KEY_PREFIX + for fname, md5 in checksums.items(): + if any(fname.endswith(suffix) for suffix in FILE_FILTERS): + key = f"{path_prefix}{rel}{fname}" + s3.put_object( + Bucket=test_bucket, + Key=key, + Body=b"placeholder", + Metadata={"md5": md5}, + ) + + # verify_transfer_candidates should prune the seeded assembly + candidates = sorted(latest.keys()) + result = verify_transfer_candidates( + candidates, + filtered, + bucket=test_bucket, + key_prefix=path_prefix, + ) + + assert acc not in result, f"Expected {acc} to be pruned (MD5 matches)" + # Other candidates without seeded data should remain + remaining_candidates = [c for c in candidates if c != acc] + for c in remaining_candidates: + assert c in result, f"Expected {c} to remain (not seeded)" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestScanStoreToSyntheticSummary: + """Test synthetic assembly summary generation from MinIO store.""" + + def test_builds_summary_from_minio_store( + self, + minio_s3_client: object, + test_bucket: str, + ) -> None: + """Verify synthetic summary captures assemblies from MinIO.""" + s3 = minio_s3_client + path_prefix = DEFAULT_LAKEHOUSE_KEY_PREFIX + + # Seed MinIO with a couple of assemblies + assemblies = { + "GCF_000001215.4_v1": ["_genomic.fna.gz", "_protein.faa.gz"], + "GCF_000005845.2_v2": ["_genomic.fna.gz"], + } + + for assembly_dir, files in assemblies.items(): + for fname in files: + key = f"{path_prefix}refseq/{assembly_dir}/{assembly_dir}{fname}" + s3.put_object( + Bucket=test_bucket, + Key=key, + Body=b"placeholder", + ) + + # Scan the store + result = scan_store_to_synthetic_summary(test_bucket, path_prefix, "2024/04/01") + + # Should have found both assemblies + assert "GCF_000001215.4" in result + assert "GCF_000005845.2" in result + + # Verify basic record structure + rec1 = result["GCF_000001215.4"] + assert rec1.accession == "GCF_000001215.4" + assert rec1.status == "latest" + assert 
rec1.assembly_dir == "GCF_000001215.4_v1" + + def test_synthetic_summary_diff_against_current( + self, + minio_s3_client: object, + test_bucket: str, + ) -> None: + """Verify synthetic summary can be used as baseline for diffing.""" + s3 = minio_s3_client + path_prefix = DEFAULT_LAKEHOUSE_KEY_PREFIX + + # Seed MinIO with one assembly + key1 = f"{path_prefix}refseq/GCF_000001215.4_old/GCF_000001215.4_old_genomic.fna.gz" + s3.put_object(Bucket=test_bucket, Key=key1, Body=b"data") + + # Build synthetic summary from store + synthetic = scan_store_to_synthetic_summary(test_bucket, path_prefix, "2024/04/20") + assert "GCF_000001215.4" in synthetic + + # Simulate current NCBI summary with one new and one existing + current = { + "GCF_000001215.4": AssemblyRecord( + accession="GCF_000001215.4", + status="latest", + seq_rel_date=synthetic["GCF_000001215.4"].seq_rel_date, + ftp_path="", + assembly_dir="GCF_000001215.4_old", + ), + "GCF_000005845.2": AssemblyRecord( + accession="GCF_000005845.2", + status="latest", + seq_rel_date="2024/04/20", + ftp_path="", + assembly_dir="GCF_000005845.2_new", + ), + } + + # Compute diff + diff = compute_diff(current, previous_assemblies=synthetic) + + # Should find one new and zero updated + assert "GCF_000005845.2" in diff.new + assert "GCF_000001215.4" not in diff.new # Already in store + assert len(diff.updated) == 0 # Same date, same dir diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py new file mode 100644 index 00000000..20c72022 --- /dev/null +++ b/tests/integration/test_promote_e2e.py @@ -0,0 +1,1315 @@ +"""End-to-end tests for Phase 3 — promote and archive in MinIO. + +Pre-stages fake assembly files in MinIO and exercises ``promote_from_s3`` +with various combinations of manifests, archive operations, dry-run mode, +manifest trimming, and incomplete staging. + +Marked ``integration`` and ``slow_test``; auto-skipped when MinIO is +unreachable. Each test method gets its own bucket. 
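+
+Staging layout assumed by the helpers below (illustrative; the relative path
+comes from ``build_accession_path``)::
+
+    staging/run1/raw_data/GCF/900/000/001/GCF_900000001.1_FakeAssemblyA/
+        GCF_900000001.1_FakeAssemblyA_genomic.fna.gz
+        GCF_900000001.1_FakeAssemblyA_genomic.fna.gz.md5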
+""" + +import hashlib +import json + +import pytest + +from cdm_data_loaders.ncbi_ftp.assembly import build_accession_path +from cdm_data_loaders.ncbi_ftp.metadata import ( + build_archive_descriptor_key, + build_descriptor_key, + create_descriptor, +) +from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 + +from .conftest import get_object_metadata, list_all_keys, seed_lakehouse, staging_test_bucket # noqa: F401 + +from pathlib import Path + +# Fake assembly details used across tests +ACCESSION_A = "GCF_900000001.1" +ASSEMBLY_DIR_A = "GCF_900000001.1_FakeAssemblyA" +ACCESSION_B = "GCF_900000002.1" +ASSEMBLY_DIR_B = "GCF_900000002.1_FakeAssemblyB" +ACCESSION_C = "GCF_900000003.1" +ASSEMBLY_DIR_C = "GCF_900000003.1_FakeAssemblyC" + +STAGING_PREFIX = "staging/run1/" +PATH_PREFIX = DEFAULT_LAKEHOUSE_KEY_PREFIX + +# Fake file contents for staging +FAKE_GENOMIC = b">seq1\nATCGATCG\n" +FAKE_PROTEIN = b">prot1\nMKKL\n" + + +def _md5(data: bytes) -> str: + return hashlib.md5(data).hexdigest() # noqa: S324 + + +def _stage_assembly( + s3: object, + bucket: str, + assembly_dir: str, +) -> None: + """Stage a fake assembly with data files and .md5 sidecars under the staging prefix.""" + rel = build_accession_path(assembly_dir) + base = f"{STAGING_PREFIX}{rel}" + + files = { + f"{assembly_dir}_genomic.fna.gz": FAKE_GENOMIC, + f"{assembly_dir}_protein.faa.gz": FAKE_PROTEIN, + } + + for fname, content in files.items(): + key = f"{base}{fname}" + s3.put_object(Bucket=bucket, Key=key, Body=content) + # Write .md5 sidecar + md5_key = f"{key}.md5" + s3.put_object(Bucket=bucket, Key=md5_key, Body=_md5(content).encode()) + + +def _write_manifest(tmp_path: Path, accessions: list[str], name: str) -> Path: + """Write a manifest file (one accession per line).""" + path = tmp_path / name + path.write_text("\n".join(accessions) + "\n") + return path + + +# ── Tests ─────────────────────────────────────────────────────────────── + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteFromStaging: + """Promote staged files to final Lakehouse paths.""" + + def test_promote_from_staging(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: + """Staged files appear at the final Lakehouse path with MD5 metadata.""" + s3 = minio_s3_client + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + assert report["promoted"] >= 2 # noqa: PLR2004 # genomic + protein + assert report["failed"] == 0 + assert report["dry_run"] is False + + # Verify files at final path + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) >= 2 # noqa: PLR2004 + + # Verify MD5 metadata is set + for key in final_keys: + meta = get_object_metadata(s3, test_bucket, key) + assert "md5" in meta, f"Missing md5 metadata on {key}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteIdempotent: + """Promoting the same staging data twice should succeed without errors.""" + + def test_promote_idempotent(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: + """Second promote on empty staging succeeds and leaves the lakehouse unchanged. + + After the first promote, staged files are deleted. A second run therefore + finds nothing to promote — which is correct and expected. 
The lakehouse
+        contents must be identical after both runs.
+        """
+        s3 = minio_s3_client
+        _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A)
+
+        report1 = promote_from_s3(
+            staging_key_prefix=STAGING_PREFIX,
+            staging_bucket=staging_test_bucket,
+            lakehouse_bucket=test_bucket,
+            lakehouse_key_prefix=PATH_PREFIX,
+        )
+        keys_after_first = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/")
+
+        report2 = promote_from_s3(
+            staging_key_prefix=STAGING_PREFIX,
+            staging_bucket=staging_test_bucket,
+            lakehouse_bucket=test_bucket,
+            lakehouse_key_prefix=PATH_PREFIX,
+        )
+        keys_after_second = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/")
+
+        assert report1["failed"] == 0
+        assert report1["promoted"] >= 1
+        assert report2["failed"] == 0
+        assert report2["promoted"] == 0  # staging was cleared by the first run
+        assert keys_after_first == keys_after_second
+
+
+@pytest.mark.integration
+@pytest.mark.slow_test
+class TestPromoteArchiveUpdated:
+    """Archive existing assemblies before overwriting with updated versions."""
+
+    def test_archive_updated(
+        self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path
+    ) -> None:
+        """Updated assemblies are archived before being overwritten."""
+        s3 = minio_s3_client
+
+        # Seed "old" version at the final Lakehouse path
+        old_files = {
+            f"{ASSEMBLY_DIR_A}_genomic.fna.gz": "old genomic content",
+            f"{ASSEMBLY_DIR_A}_protein.faa.gz": "old protein content",
+        }
+        seed_lakehouse(s3, test_bucket, ACCESSION_A, old_files, PATH_PREFIX, ASSEMBLY_DIR_A)
+
+        # Stage "new" version
+        _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A)
+
+        updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt")
+
+        report = promote_from_s3(
+            staging_key_prefix=STAGING_PREFIX,
+            staging_bucket=staging_test_bucket,
+            lakehouse_bucket=test_bucket,
+            updated_manifest_path=str(updated_manifest),
+            ncbi_release="2024-01",
+            lakehouse_key_prefix=PATH_PREFIX,
+        )
+
+        assert report["archived"] >= 2  # noqa: PLR2004
+        assert report["promoted"] >= 2  # noqa: PLR2004
+        assert report["failed"] == 0
+
+        # Verify archive exists
+        archive_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "archive/2024-01/")
+        assert len(archive_keys) >= 2  # noqa: PLR2004
+
+        # Verify archive metadata
+        for key in archive_keys:
+            assert "/updated/" in key
+            assert "/2024-01/" in key
+
+
+@pytest.mark.integration
+@pytest.mark.slow_test
+class TestPromoteArchiveRemoved:
+    """Archive and delete replaced/suppressed assemblies."""
+
+    def test_archive_removed(
+        self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path
+    ) -> None:
+        """Removed assemblies are archived and source objects are deleted."""
+        s3 = minio_s3_client
+
+        # Seed assemblies at final path
+        files = {
+            f"{ASSEMBLY_DIR_A}_genomic.fna.gz": "content to archive",
+        }
+        seed_lakehouse(s3, test_bucket, ACCESSION_A, files, PATH_PREFIX, ASSEMBLY_DIR_A)
+
+        removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt")
+
+        # Nothing is staged for this accession: an empty staging area is fine,
+        # since archiving removed assemblies needs no staged data files.
+        report = promote_from_s3(
+            staging_key_prefix=STAGING_PREFIX,
+            staging_bucket=staging_test_bucket,
+            lakehouse_bucket=test_bucket,
+            removed_manifest_path=str(removed_manifest),
+            ncbi_release="2024-01",
+            lakehouse_key_prefix=PATH_PREFIX,
+        )
+
+        assert report["archived"] >= 1
+        assert report["failed"] == 0
+
+        # Verify archive exists
+        archive_keys = list_all_keys(s3, test_bucket, PATH_PREFIX +
"archive/2024-01/") + assert len(archive_keys) >= 1 + + # Verify archive metadata + for key in archive_keys: + assert "/replaced_or_suppressed/" in key + + # Verify source objects are deleted + rel = build_accession_path(ASSEMBLY_DIR_A) + source_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + rel) + assert len(source_keys) == 0, f"Expected source objects deleted, found: {source_keys}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteDryRun: + """Dry-run mode should not create any objects.""" + + def test_promote_dry_run(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: + """Dry-run logs actions but creates no objects at the final path.""" + s3 = minio_s3_client + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + dry_run=True, + ) + + assert report["dry_run"] is True + assert report["promoted"] >= 1 + + # No objects should exist at the final path + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) == 0, f"Dry-run should not create objects, found: {final_keys}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteTrimsManifest: + """Manifest trimming removes promoted accessions.""" + + def test_trims_manifest( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Transfer manifest in MinIO is trimmed to exclude promoted accessions.""" + s3 = minio_s3_client + + # Upload a transfer manifest with 3 entries to MinIO (manifest lives in staging) + manifest_key = "ncbi/transfer_manifest.txt" + manifest_lines = [ + "/genomes/all/GCF/900/000/001/GCF_900000001.1_FakeAssemblyA/\n", + "/genomes/all/GCF/900/000/002/GCF_900000002.1_FakeAssemblyB/\n", + "/genomes/all/GCF/900/000/003/GCF_900000003.1_FakeAssemblyC/\n", + ] + s3.put_object(Bucket=staging_test_bucket, Key=manifest_key, Body="".join(manifest_lines).encode()) + + # Stage only assemblies A and B (not C) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_B) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + manifest_s3_key=manifest_key, + lakehouse_key_prefix=PATH_PREFIX, + ) + + assert report["failed"] == 0 + + # Read back the manifest from MinIO (it lives in staging) + resp = s3.get_object(Bucket=staging_test_bucket, Key=manifest_key) + remaining = resp["Body"].read().decode() + remaining_lines = [line.strip() for line in remaining.strip().splitlines() if line.strip()] + + # Only C should remain (A and B were promoted) + assert len(remaining_lines) == 1, f"Expected 1 remaining entry, got {len(remaining_lines)}: {remaining_lines}" + assert "GCF_900000003" in remaining_lines[0] + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteIncompleteStaging: + """Incomplete staging (sidecar only, no data) should not promote anything.""" + + def test_incomplete_staging(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: + """Only .md5 sidecars staged → nothing promoted.""" + s3 = minio_s3_client + + # Stage only .md5 sidecars (no data files) + rel = build_accession_path(ASSEMBLY_DIR_A) + base = f"{STAGING_PREFIX}{rel}" + fname = f"{ASSEMBLY_DIR_A}_genomic.fna.gz" + md5_key = f"{base}{fname}.md5" + 
s3.put_object(Bucket=staging_test_bucket, Key=md5_key, Body=_md5(FAKE_GENOMIC).encode()) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + # .md5 files are sidecars and should not be promoted as data + assert report["promoted"] == 0 + assert report["failed"] == 0 + + # No objects at final path + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) == 0 + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteCreatesDescriptor: + """Promote step writes a frictionless descriptor for each promoted assembly.""" + + def test_descriptor_created(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: + """After promote, a JSON descriptor exists under ``metadata/``.""" + s3 = minio_s3_client + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) + obj = s3.get_object(Bucket=test_bucket, Key=descriptor_key) + body = json.loads(obj["Body"].read()) + + assert body["identifier"] == f"NCBI:{ACCESSION_A}" + assert body["resource_type"] == "dataset" + + def test_descriptor_resources_include_promoted_files( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Descriptor's ``resources`` list references the final Lakehouse key.""" + s3 = minio_s3_client + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) + obj = s3.get_object(Bucket=test_bucket, Key=descriptor_key) + body = json.loads(obj["Body"].read()) + + resource_paths = [r["path"] for r in body["resources"]] + assert any(PATH_PREFIX + "raw_data/" in p for p in resource_paths) + + def test_descriptor_resources_have_md5( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Resources with .md5 sidecars include the hash value.""" + s3 = minio_s3_client + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) + obj = s3.get_object(Bucket=test_bucket, Key=descriptor_key) + body = json.loads(obj["Body"].read()) + + # Both staged files have .md5 sidecars + for resource in body["resources"]: + assert "hash" in resource, f"Expected hash in resource: {resource}" + + def test_multiple_assemblies_get_separate_descriptors( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Each assembly gets its own descriptor file.""" + s3 = minio_s3_client + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_B) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + for assembly_dir, accession in [(ASSEMBLY_DIR_A, ACCESSION_A), (ASSEMBLY_DIR_B, 
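+            # (Descriptor shape implied by the assertions in this class; a
+            # sketch, not the full frictionless schema:
+            #     {"identifier": "NCBI:GCF_9000000XX.1",
+            #      "resource_type": "dataset",
+            #      "resources": [{"path": ".../raw_data/...", "hash": "<md5>"}]})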
ACCESSION_B)]: + key = build_descriptor_key(assembly_dir, PATH_PREFIX) + obj = s3.get_object(Bucket=test_bucket, Key=key) + body = json.loads(obj["Body"].read()) + assert body["identifier"] == f"NCBI:{accession}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteArchiveUpdatedIncludesDescriptor: + """Archiving updated assemblies also archives the descriptor.""" + + def test_archive_copies_descriptor( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """After archiving an updated assembly, the descriptor appears under archive/.""" + s3 = minio_s3_client + + # Seed old version at Lakehouse path *including* a live descriptor + old_files = {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": "old content"} + seed_lakehouse(s3, test_bucket, ACCESSION_A, old_files, PATH_PREFIX, ASSEMBLY_DIR_A) + # Pre-upload a descriptor so archive_descriptor can find it + descriptor = create_descriptor(ASSEMBLY_DIR_A, ACCESSION_A, []) + # Upload directly to MinIO (not via promote) + descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) + s3.put_object(Bucket=test_bucket, Key=descriptor_key, Body=json.dumps(descriptor).encode()) + + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + updated_manifest_path=str(updated_manifest), + ncbi_release="2024-01", + lakehouse_key_prefix=PATH_PREFIX, + ) + + archive_key = build_archive_descriptor_key(ASSEMBLY_DIR_A, "2024-01", PATH_PREFIX, "updated") + # Confirm the archive descriptor object exists + resp = s3.head_object(Bucket=test_bucket, Key=archive_key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteArchiveRemovedIncludesDescriptor: + """Archiving removed assemblies also archives the descriptor.""" + + def test_archive_removed_copies_descriptor( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """After archiving a removed assembly, the descriptor is under archive/.""" + s3 = minio_s3_client + + # Seed the assembly at final Lakehouse path + files = {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": "content"} + seed_lakehouse(s3, test_bucket, ACCESSION_A, files, PATH_PREFIX, ASSEMBLY_DIR_A) + # Pre-upload a descriptor + descriptor = create_descriptor(ASSEMBLY_DIR_A, ACCESSION_A, []) + descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) + s3.put_object(Bucket=test_bucket, Key=descriptor_key, Body=json.dumps(descriptor).encode()) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + removed_manifest_path=str(removed_manifest), + ncbi_release="2024-01", + lakehouse_key_prefix=PATH_PREFIX, + ) + + archive_key = build_archive_descriptor_key(ASSEMBLY_DIR_A, "2024-01", PATH_PREFIX, "replaced_or_suppressed") + resp = s3.head_object(Bucket=test_bucket, Key=archive_key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteDryRunNoDescriptor: + """Dry-run must not write any descriptor files.""" + + def test_dry_run_no_descriptor(self, minio_s3_client: object, test_bucket: str, 
staging_test_bucket: str) -> None: + """Dry-run does not upload a descriptor to the metadata/ prefix.""" + s3 = minio_s3_client + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + dry_run=True, + ) + + metadata_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "metadata/") + assert len(metadata_keys) == 0, f"Dry-run should not create descriptor files, found: {metadata_keys}" + + +# ── Parallel archiving tests ───────────────────────────────────────────── + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestArchiveMultiFileConcurrent: + """Verify parallel copy archives all files correctly with correct content.""" + + def test_all_files_archived_with_correct_content( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Every file is archived with byte-identical content when copied concurrently.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + + # Seed many files for assembly A at final Lakehouse path + many_files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"GENOMIC_CONTENT", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"PROTEIN_CONTENT", + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"RNA_CONTENT", + f"{ASSEMBLY_DIR_A}_assembly_report.txt": b"ASSEMBLY_REPORT", + f"{ASSEMBLY_DIR_A}_assembly_stats.txt": b"ASSEMBLY_STATS", + f"{ASSEMBLY_DIR_A}_cds_from_genomic.fna.gz": b"CDS_CONTENT", + } + seed_lakehouse(s3, test_bucket, ACCESSION_A, many_files, PATH_PREFIX, ASSEMBLY_DIR_A) + + updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") + + archived = _archive_assemblies( + str(updated_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="updated", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + assert archived == len(many_files) + + # Verify every archived file has correct content + rel = build_accession_path(ASSEMBLY_DIR_A) + for fname, expected_body in many_files.items(): + archive_key = f"{PATH_PREFIX}archive/2024-01/updated/{rel}{fname}" + obj = s3.get_object(Bucket=test_bucket, Key=archive_key) + actual_body = obj["Body"].read() + assert actual_body == expected_body, f"Content mismatch for {fname}" + + def test_archive_key_paths_are_correct( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Archived keys follow the exact ``archive/{release}/{reason}/{rel_path}`` pattern.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + files = {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": b"content"} + seed_lakehouse(s3, test_bucket, ACCESSION_B, files, PATH_PREFIX, ASSEMBLY_DIR_B) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_B], "removed_manifest.txt") + _archive_assemblies( + str(removed_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-02", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + rel = build_accession_path(ASSEMBLY_DIR_B) + expected_key = f"{PATH_PREFIX}archive/2024-02/replaced_or_suppressed/{rel}{ASSEMBLY_DIR_B}_genomic.fna.gz" + resp = s3.head_object(Bucket=test_bucket, Key=expected_key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestArchiveDeleteSourceBatch: + 
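+    # Observable contract of _archive_assemblies, inferred from the tests in
+    # this file rather than from an authoritative spec: returns the number of
+    # files copied in this run, writes each copy to
+    #     {lakehouse_key_prefix}archive/{ncbi_release}/{archive_reason}/{rel_path}{fname}
+    # and deletes source objects only when delete_source=True.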
"""Verify batch delete removes all source objects after concurrent copy.""" + + def test_all_sources_deleted_after_archive( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """After archive with delete_source=True, no source objects remain.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + many_files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"protein", + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"rna", + f"{ASSEMBLY_DIR_A}_assembly_report.txt": b"report", + } + source_keys = seed_lakehouse(s3, test_bucket, ACCESSION_A, many_files, PATH_PREFIX, ASSEMBLY_DIR_A) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + archived = _archive_assemblies( + str(removed_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=True, + ) + + assert archived == len(many_files) + # Source keys must all be gone + for key in source_keys: + remaining = list_all_keys(s3, test_bucket, key) + assert len(remaining) == 0, f"Source not deleted: {key}" + + def test_archive_present_source_gone( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Archive destinations exist AND sources are gone after replaced_or_suppressed archive.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"protein", + } + seed_lakehouse(s3, test_bucket, ACCESSION_A, files, PATH_PREFIX, ASSEMBLY_DIR_A) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + _archive_assemblies( + str(removed_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=True, + ) + + rel = build_accession_path(ASSEMBLY_DIR_A) + # Archive keys present + archive_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}archive/2024-01/replaced_or_suppressed/") + assert len(archive_keys) == len(files), f"Expected {len(files)} archive keys, got: {archive_keys}" + # Source keys absent + source_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel}") + assert len(source_keys) == 0, f"Source objects remain: {source_keys}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPartialArchiveResume: + """Corner case: a prior archive run was interrupted mid-way. + + Re-running must complete cleanly without errors, leave all archive keys + present with current content, and (when delete_source=True) remove all + source keys regardless of which files were processed in the prior run. + """ + + def test_partial_updated_archive_resumes( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Re-running after a partial updated archive overwrites stale copies and archives missing files. + + Scenario: 3 files, file_a was archived in a prior run (stale content), + file_b and file_c were not. Re-run should overwrite file_a with current + content and archive file_b, file_c. 
+ """ + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + rel = build_accession_path(ASSEMBLY_DIR_A) + + file_a = f"{ASSEMBLY_DIR_A}_genomic.fna.gz" + file_b = f"{ASSEMBLY_DIR_A}_protein.faa.gz" + file_c = f"{ASSEMBLY_DIR_A}_rna.fna.gz" + + current_content = {file_a: b"current-genomic", file_b: b"current-protein", file_c: b"current-rna"} + seed_lakehouse(s3, test_bucket, ACCESSION_A, current_content, PATH_PREFIX, ASSEMBLY_DIR_A) + + # Pre-seed a stale archive copy for file_a (simulating prior partial run) + archive_prefix = f"{PATH_PREFIX}archive/2024-01/updated/{rel}" + s3.put_object(Bucket=test_bucket, Key=f"{archive_prefix}{file_a}", Body=b"stale-genomic") + + updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") + archived = _archive_assemblies( + str(updated_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="updated", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + # All 3 files counted + assert archived == 3 # noqa: PLR2004 + # file_a overwritten with current content + obj_a = s3.get_object(Bucket=test_bucket, Key=f"{archive_prefix}{file_a}") + assert obj_a["Body"].read() == b"current-genomic", "file_a archive should be overwritten" + # file_b and file_c now archived + for fname in (file_b, file_c): + resp = s3.head_object(Bucket=test_bucket, Key=f"{archive_prefix}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + # Sources untouched (delete_source=False) + source_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel}") + assert len(source_keys) == len(current_content) + + def test_partial_replaced_archive_resumes_and_deletes( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Re-running replaced_or_suppressed archive after partial run completes and deletes all sources. + + Scenario: file_a was copied+deleted in prior run (no longer at source), + file_b was copied but NOT deleted (still at source), file_c was untouched. + Re-run processes file_b and file_c, deletes both. Result: no sources remain. 
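+
+        (Assumes the returned count covers only files copied in this run;
+        file_a was finished previously, so it is neither re-copied nor
+        re-counted.)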
+ """ + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + rel = build_accession_path(ASSEMBLY_DIR_A) + + file_a = f"{ASSEMBLY_DIR_A}_genomic.fna.gz" + file_b = f"{ASSEMBLY_DIR_A}_protein.faa.gz" + file_c = f"{ASSEMBLY_DIR_A}_rna.fna.gz" + archive_prefix = f"{PATH_PREFIX}archive/2024-01/replaced_or_suppressed/{rel}" + + # Only file_b and file_c remain at source (file_a already gone) + s3.put_object( + Bucket=test_bucket, + Key=f"{PATH_PREFIX}{rel}{file_b}", + Body=b"protein", + Metadata={"md5": hashlib.md5(b"protein").hexdigest()}, # noqa: S324 + ) + s3.put_object( + Bucket=test_bucket, + Key=f"{PATH_PREFIX}{rel}{file_c}", + Body=b"rna", + Metadata={"md5": hashlib.md5(b"rna").hexdigest()}, # noqa: S324 + ) + # file_a already at archive destination + s3.put_object(Bucket=test_bucket, Key=f"{archive_prefix}{file_a}", Body=b"genomic") + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + archived = _archive_assemblies( + str(removed_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=True, + ) + + # 2 newly archived (file_b and file_c) + assert archived == 2 # noqa: PLR2004 + # file_b and file_c archive keys exist + for fname in (file_b, file_c): + resp = s3.head_object(Bucket=test_bucket, Key=f"{archive_prefix}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + # No source keys remain (file_b and file_c were deleted) + source_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel}") + assert len(source_keys) == 0, f"Source objects remain: {source_keys}" + # file_a archive key is still intact + resp_a = s3.head_object(Bucket=test_bucket, Key=f"{archive_prefix}{file_a}") + assert resp_a["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + def test_full_rerun_after_complete_archive_is_idempotent( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Running archive again when all files already exist at archive paths is safe (no errors).""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"protein", + } + seed_lakehouse(s3, test_bucket, ACCESSION_A, files, PATH_PREFIX, ASSEMBLY_DIR_A) + + updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") + + # First run — archives all files + archived_1 = _archive_assemblies( + str(updated_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="updated", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + # Second run — same manifest, same source files still present + archived_2 = _archive_assemblies( + str(updated_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="updated", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + assert archived_1 == len(files) + assert archived_2 == len(files) + # Archive keys still present with correct content + rel = build_accession_path(ASSEMBLY_DIR_A) + for fname, expected_body in files.items(): + key = f"{PATH_PREFIX}archive/2024-01/updated/{rel}{fname}" + obj = s3.get_object(Bucket=test_bucket, Key=key) + assert obj["Body"].read() == expected_body + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestArchiveMultiAccessionManifest: + """Multiple 
accessions in a single manifest are all archived.""" + + def test_two_accessions_both_archived( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Both accessions are archived with correct keys when listed in one manifest.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + + files_a = {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic-A"} + files_b = {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": b"genomic-B"} + seed_lakehouse(s3, test_bucket, ACCESSION_A, files_a, PATH_PREFIX, ASSEMBLY_DIR_A) + seed_lakehouse(s3, test_bucket, ACCESSION_B, files_b, PATH_PREFIX, ASSEMBLY_DIR_B) + + manifest = _write_manifest(tmp_path, [ACCESSION_A, ACCESSION_B], "removed_manifest.txt") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=True, + ) + + assert archived == 2 # noqa: PLR2004 + rel_a = build_accession_path(ASSEMBLY_DIR_A) + rel_b = build_accession_path(ASSEMBLY_DIR_B) + key_a = f"{PATH_PREFIX}archive/2024-01/replaced_or_suppressed/{rel_a}{ASSEMBLY_DIR_A}_genomic.fna.gz" + key_b = f"{PATH_PREFIX}archive/2024-01/replaced_or_suppressed/{rel_b}{ASSEMBLY_DIR_B}_genomic.fna.gz" + assert s3.head_object(Bucket=test_bucket, Key=key_a)["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + assert s3.head_object(Bucket=test_bucket, Key=key_b)["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + # Sources deleted + assert len(list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel_a}")) == 0 + assert len(list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel_b}")) == 0 + + def test_three_accessions_correct_archive_reason_segment( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Archive keys for all three accessions include the archive_reason segment.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + accessions_and_dirs = [ + (ACCESSION_A, ASSEMBLY_DIR_A), + (ACCESSION_B, ASSEMBLY_DIR_B), + (ACCESSION_C, ASSEMBLY_DIR_C), + ] + for accession, assembly_dir in accessions_and_dirs: + seed_lakehouse( + s3, + test_bucket, + accession, + {f"{assembly_dir}_genomic.fna.gz": b"data"}, + PATH_PREFIX, + assembly_dir, + ) + + manifest = _write_manifest(tmp_path, [acc for acc, _ in accessions_and_dirs], "removed_manifest.txt") + _archive_assemblies( + str(manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-03", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + all_archive_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}archive/2024-03/") + assert len(all_archive_keys) == 3 # noqa: PLR2004 + for key in all_archive_keys: + assert "/replaced_or_suppressed/" in key, f"Archive key missing reason segment: {key}" + assert "/2024-03/" in key, f"Archive key missing release segment: {key}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestArchiveDryRunParallel: + """Dry-run with many files leaves everything unchanged.""" + + def test_dry_run_no_copies_no_deletes( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Dry-run with multiple files per accession creates no archive keys and keeps sources.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + many_files = { + 
f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"protein", + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"rna", + } + source_keys = seed_lakehouse(s3, test_bucket, ACCESSION_A, many_files, PATH_PREFIX, ASSEMBLY_DIR_A) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + archived = _archive_assemblies( + str(removed_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=True, + dry_run=True, + ) + + assert archived == len(many_files) + # No archive keys + archive_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}archive/") + assert len(archive_keys) == 0, f"Dry-run created archive keys: {archive_keys}" + # All sources still present + for key in source_keys: + remaining = list_all_keys(s3, test_bucket, key) + assert len(remaining) == 1, f"Source missing after dry-run: {key}" + + +# ── Concurrent promotion tests ──────────────────────────────────────────── + + +def _stage_many( + s3: object, + bucket: str, + assembly_dir: str, + files: dict[str, bytes], + *, + with_md5: bool = True, +) -> None: + """Stage *files* with optional .md5 sidecars under the standard staging prefix.""" + rel = build_accession_path(assembly_dir) + base = f"{STAGING_PREFIX}{rel}" + for fname, content in files.items(): + key = f"{base}{fname}" + s3.put_object(Bucket=bucket, Key=key, Body=content) + if with_md5: + s3.put_object(Bucket=bucket, Key=f"{key}.md5", Body=_md5(content).encode()) + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteMultiFileConcurrent: + """Verify concurrent promotion lands all files with correct content and MD5.""" + + def test_six_files_all_promoted_with_correct_content( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Every staged file arrives at the correct final key with byte-identical content.""" + s3 = minio_s3_client + many_files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"GENOMIC", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"PROTEIN", + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"RNA", + f"{ASSEMBLY_DIR_A}_assembly_report.txt": b"REPORT", + f"{ASSEMBLY_DIR_A}_assembly_stats.txt": b"STATS", + f"{ASSEMBLY_DIR_A}_cds_from_genomic.fna.gz": b"CDS", + } + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, many_files) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + assert report["promoted"] == len(many_files) + assert report["failed"] == 0 + + rel = build_accession_path(ASSEMBLY_DIR_A) + for fname, expected_body in many_files.items(): + key = f"{PATH_PREFIX}{rel}{fname}" + obj = s3.get_object(Bucket=test_bucket, Key=key) + assert obj["Body"].read() == expected_body, f"Content mismatch: {fname}" + + def test_md5_metadata_correct_per_file( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Each promoted file carries MD5 metadata matching its own content, not another file's.""" + s3 = minio_s3_client + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"GENOMIC_UNIQUE", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"PROTEIN_UNIQUE", + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"RNA_UNIQUE", + } + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, files, with_md5=True) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + 
lakehouse_key_prefix=PATH_PREFIX, + ) + + rel = build_accession_path(ASSEMBLY_DIR_A) + for fname, content in files.items(): + key = f"{PATH_PREFIX}{rel}{fname}" + meta = get_object_metadata(s3, test_bucket, key) + assert meta.get("md5") == _md5(content), f"Wrong MD5 metadata on {fname}" + + def test_file_without_sidecar_has_no_md5_metadata( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """A file staged without a .md5 sidecar is promoted but has no md5 metadata key.""" + s3 = minio_s3_client + fname = f"{ASSEMBLY_DIR_A}_genomic.fna.gz" + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, {fname: FAKE_GENOMIC}, with_md5=False) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + rel = build_accession_path(ASSEMBLY_DIR_A) + meta = get_object_metadata(s3, test_bucket, f"{PATH_PREFIX}{rel}{fname}") + assert "md5" not in meta, f"Expected no md5 metadata, got: {meta}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteStagingCleanup: + """After a fully successful promote, all staged files and sidecars are deleted.""" + + def test_staged_data_files_deleted( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Data files are removed from staging after a successful assembly promote.""" + s3 = minio_s3_client + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC, + f"{ASSEMBLY_DIR_A}_protein.faa.gz": FAKE_PROTEIN, + } + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, files, with_md5=False) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + remaining_staging = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + assert len(remaining_staging) == 0, f"Staging not cleaned: {remaining_staging}" + + def test_md5_sidecars_deleted(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: + """Both data files and .md5 sidecars are removed from staging after promote.""" + s3 = minio_s3_client + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC, + f"{ASSEMBLY_DIR_A}_protein.faa.gz": FAKE_PROTEIN, + } + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, files, with_md5=True) + + # Verify sidecars exist before promote + before_keys = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + assert any(k.endswith(".md5") for k in before_keys), "Test setup: expected .md5 sidecars" + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + after_keys = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + assert len(after_keys) == 0, f"Staging not fully cleaned (including sidecars): {after_keys}" + + def test_two_assemblies_staging_both_cleaned( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Staging for both assemblies is fully cleaned when both assemblies succeed.""" + s3 = minio_s3_client + _stage_many( + s3, + staging_test_bucket, + ASSEMBLY_DIR_A, + {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC}, + with_md5=True, + ) + _stage_many( + s3, + staging_test_bucket, + ASSEMBLY_DIR_B, + {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": FAKE_GENOMIC}, + with_md5=True, + ) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + 
staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + assert report["promoted"] == 2 # noqa: PLR2004 + assert report["failed"] == 0 + remaining = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + assert len(remaining) == 0, f"Staging not fully cleaned after two-assembly promote: {remaining}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteTwoAssembliesBothLand: + """Both assemblies staged together are both promoted to correct Lakehouse paths.""" + + def test_both_assemblies_at_correct_final_paths( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Each assembly's files appear at distinct, correctly-routed final Lakehouse paths.""" + s3 = minio_s3_client + files_a = {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic-A"} + files_b = {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": b"genomic-B"} + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, files_a) + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_B, files_b) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + assert report["promoted"] == 2 # noqa: PLR2004 + assert report["failed"] == 0 + + rel_a = build_accession_path(ASSEMBLY_DIR_A) + rel_b = build_accession_path(ASSEMBLY_DIR_B) + obj_a = s3.get_object(Bucket=test_bucket, Key=f"{PATH_PREFIX}{rel_a}{ASSEMBLY_DIR_A}_genomic.fna.gz") + obj_b = s3.get_object(Bucket=test_bucket, Key=f"{PATH_PREFIX}{rel_b}{ASSEMBLY_DIR_B}_genomic.fna.gz") + assert obj_a["Body"].read() == b"genomic-A" + assert obj_b["Body"].read() == b"genomic-B" + + def test_final_path_keys_do_not_overlap( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Files for assembly A and assembly B land at distinct paths — no key collision.""" + s3 = minio_s3_client + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"a"}) + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_B, {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": b"b"}) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + rel_a = build_accession_path(ASSEMBLY_DIR_A) + rel_b = build_accession_path(ASSEMBLY_DIR_B) + keys_a = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel_a}") + keys_b = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel_b}") + assert len(keys_a) == 1 + assert len(keys_b) == 1 + assert keys_a[0] != keys_b[0] + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteDryRunMultiFile: + """dry_run leaves staging untouched and writes nothing to the Lakehouse.""" + + def test_dry_run_many_files_staging_untouched( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """All staged files (data + .md5) survive a dry-run promote unchanged.""" + s3 = minio_s3_client + many_files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC, + f"{ASSEMBLY_DIR_A}_protein.faa.gz": FAKE_PROTEIN, + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"RNA", + } + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, many_files, with_md5=True) + staging_before = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + dry_run=True, + ) + + 
assert report["promoted"] == len(many_files) + assert report["dry_run"] is True + + # Staging unchanged + staging_after = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + assert staging_after == staging_before, "Dry-run should not alter staging" + + # Nothing at final path + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) == 0, f"Dry-run created Lakehouse objects: {final_keys}" + + def test_dry_run_two_assemblies_nothing_written( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Dry-run with two staged assemblies creates no Lakehouse objects.""" + s3 = minio_s3_client + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC}) + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_B, {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": FAKE_GENOMIC}) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + dry_run=True, + ) + + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) == 0, f"Dry-run created objects: {final_keys}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteSecondRunOnEmptyStaging: + """After staging is cleaned, a second promote run promotes 0 files without error.""" + + def test_second_run_promoted_zero( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Re-running promote on already-cleaned staging succeeds with promoted=0.""" + s3 = minio_s3_client + _stage_many( + s3, + staging_test_bucket, + ASSEMBLY_DIR_A, + {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC}, + with_md5=True, + ) + + report1 = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + report2 = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + assert report1["promoted"] == 1 + assert report2["promoted"] == 0 + assert report2["failed"] == 0 + + # Final key still present after second run + rel = build_accession_path(ASSEMBLY_DIR_A) + final_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel}") + assert len(final_keys) == 1 + + def test_lakehouse_unchanged_on_second_run( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Lakehouse contents are identical before and after a second (no-op) promote run.""" + s3 = minio_s3_client + _stage_many( + s3, + staging_test_bucket, + ASSEMBLY_DIR_A, + {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC, f"{ASSEMBLY_DIR_A}_protein.faa.gz": FAKE_PROTEIN}, + with_md5=True, + ) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + keys_after_first = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + keys_after_second = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + + assert keys_after_first == keys_after_second diff --git a/tests/ncbi_ftp/__init__.py b/tests/ncbi_ftp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/ncbi_ftp/conftest.py 
b/tests/ncbi_ftp/conftest.py new file mode 100644 index 00000000..07fff0c6 --- /dev/null +++ b/tests/ncbi_ftp/conftest.py @@ -0,0 +1,69 @@ +"""Shared fixtures for ncbi_ftp tests.""" + +from collections.abc import Generator +from unittest.mock import patch + +import boto3 +import botocore.client +import pytest +from moto import mock_aws + +import cdm_data_loaders.ncbi_ftp.promote as promote_mod +import cdm_data_loaders.utils.s3 as s3_utils +from tests.s3_helpers import strip_checksum_algorithm +from cdm_data_loaders.utils.s3 import CDM_LAKE_BUCKET, reset_s3_client + +AWS_REGION = "us-east-1" +TEST_BUCKET = CDM_LAKE_BUCKET + + +# Minimal assembly_summary_refseq.txt content (tab-separated, 20+ columns) +SAMPLE_SUMMARY = ( + "# assembly_accession\tbioproject\tbiosample\twgs_master\trefseq_category\t" + "taxid\tspecies_taxid\torganism_name\tinfraspecific_name\tisolate\t" + "version_status\tassembly_level\trelease_type\tgenome_rep\tseq_rel_date\t" + "asm_name\t16\t17\t18\tftp_path\n" + "GCF_000001215.4\tPRJNA13812\tSAMN02803731\t\treference genome\t7227\t7227\t" + "Drosophila melanogaster\t\t\tlatest\tChromosome\tMajor\tFull\t2014/10/21\t" + "Release_6_plus_ISO1_MT\t\t\t\t" + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT\n" + "GCF_000001405.40\tPRJNA168\tna\t\treference genome\t9606\t9606\t" + "Homo sapiens\t\t\tlatest\tChromosome\tPatch\tFull\t2022/02/03\t" + "GRCh38.p14\t\t\t\t" + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14\n" + "GCF_000005845.2\tPRJNA57779\tSAMN02604091\t\trepresentative genome\t511145\t562\t" + "Escherichia coli\t\t\treplaced\tComplete Genome\tMajor\tFull\t2013/09/26\t" + "ASM584v2\t\t\t\t" + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2\n" + "GCF_000009999.1\tPRJNA999\tSAMN999\t\tna\t0\t0\t" + "Test organism\t\t\tsuppressed\tScaffold\tMajor\tFull\t2010/01/01\t" + "ASM999v1\t\t\t\t" + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/999/GCF_000009999.1_ASM999v1\n" + "GCF_000099999.1\tPRJNA888\tSAMN888\t\tna\t0\t0\t" + "Test organism 2\t\t\tlatest\tContig\tMajor\tFull\t2023/06/15\t" + "ASM9999v1\t\t\t\tna\n" +) + + +@pytest.fixture +def mock_s3_client() -> Generator[botocore.client.BaseClient]: + """Yield a mocked S3 client with the CDM Lake bucket created.""" + with mock_aws(): + client = boto3.client("s3", region_name=AWS_REGION) + client.create_bucket(Bucket=TEST_BUCKET) + + reset_s3_client() + with ( + patch.object(s3_utils, "get_s3_client", return_value=client), + patch.object(promote_mod, "get_s3_client", return_value=client), + ): + yield client + reset_s3_client() + + +@pytest.fixture +def mock_s3_client_no_checksum(mock_s3_client: botocore.client.BaseClient) -> botocore.client.BaseClient: + """Mocked S3 client with copy_object and upload_file patched to strip ChecksumAlgorithm.""" + mock_s3_client.copy_object = strip_checksum_algorithm(mock_s3_client.copy_object) # type: ignore[method-assign] + mock_s3_client.upload_file = strip_checksum_algorithm(mock_s3_client.upload_file) # type: ignore[method-assign] + return mock_s3_client diff --git a/tests/ncbi_ftp/test_assembly.py b/tests/ncbi_ftp/test_assembly.py new file mode 100644 index 00000000..261f4676 --- /dev/null +++ b/tests/ncbi_ftp/test_assembly.py @@ -0,0 +1,94 @@ +"""Tests for ncbi_ftp.assembly module — path helpers, file filtering, checksum parsing.""" + +import pytest + +from cdm_data_loaders.ncbi_ftp.assembly import ( + build_accession_path, + parse_assembly_path, + 
parse_md5_checksums_file, +) + + +# ── Path helpers ───────────────────────────────────────────────────────── + + +@pytest.mark.parametrize( + ("assembly_dir", "expected"), + [ + pytest.param( + "GCF_000001215.4_Release_6_plus_ISO1_MT", + "raw_data/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/", + id="gcf", + ), + pytest.param( + "GCA_012345678.1_ASM1234v1", + "raw_data/GCA/012/345/678/GCA_012345678.1_ASM1234v1/", + id="gca", + ), + ], +) +def test_build_accession_path(assembly_dir: str, expected: str) -> None: + """Verify accession path construction for various inputs.""" + assert build_accession_path(assembly_dir) == expected + + +def test_build_accession_path_invalid() -> None: + """Verify ValueError on invalid assembly name.""" + with pytest.raises(ValueError, match="Cannot parse"): + build_accession_path("invalid_name") + + +@pytest.mark.parametrize( + ("path", "expected"), + [ + pytest.param( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/", + ("GCF", "GCF_000001215.4_Release_6_plus_ISO1_MT", "GCF_000001215.4"), + id="with_trailing_slash", + ), + pytest.param( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT", + ("GCF", "GCF_000001215.4_Release_6_plus_ISO1_MT", "GCF_000001215.4"), + id="without_trailing_slash", + ), + ], +) +def test_parse_assembly_path(path: str, expected: tuple[str, str, str]) -> None: + """Verify db, assembly_dir, and accession are parsed correctly.""" + assert parse_assembly_path(path) == expected + + +def test_parse_assembly_path_invalid() -> None: + """Verify ValueError on invalid path.""" + with pytest.raises(ValueError, match="Cannot parse"): + parse_assembly_path("/random/path/") + + +# ── parse_md5_checksums_file ───────────────────────────────────────────── + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + pytest.param( + "abc123 ./GCF_000001215.4_genomic.fna.gz\ndef456 ./GCF_000001215.4_genomic.gff.gz\n", + {"GCF_000001215.4_genomic.fna.gz": "abc123", "GCF_000001215.4_genomic.gff.gz": "def456"}, + id="dot_slash_prefix", + ), + pytest.param( + "abc123 GCF_000001215.4_genomic.fna.gz\n", + {"GCF_000001215.4_genomic.fna.gz": "abc123"}, + id="no_dot_slash_prefix", + ), + pytest.param("", {}, id="empty_string"), + pytest.param(" \n \n", {}, id="whitespace_only"), + pytest.param( + "abc123 file1.txt\n\n\ndef456 file2.txt\n", + {"file1.txt": "abc123", "file2.txt": "def456"}, + id="blank_lines_ignored", + ), + ], +) +def test_parse_md5_checksums_file(text: str, expected: dict[str, str]) -> None: + """Verify parse_md5_checksums_file handles various input formats.""" + assert parse_md5_checksums_file(text) == expected diff --git a/tests/ncbi_ftp/test_manifest.py b/tests/ncbi_ftp/test_manifest.py new file mode 100644 index 00000000..5611e925 --- /dev/null +++ b/tests/ncbi_ftp/test_manifest.py @@ -0,0 +1,676 @@ +"""Tests for ncbi_ftp.manifest module — assembly summary parsing, diff, filtering, writing.""" + +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from cdm_data_loaders.ncbi_ftp.manifest import ( + AssemblyRecord, + DiffResult, + _extract_accession_from_s3_key, + _extract_assembly_dir_from_s3_key, + _ftp_dir_from_url, + accession_prefix, + compute_diff, + filter_by_prefix_range, + get_latest_assembly_paths, + parse_assembly_summary, + scan_store_to_synthetic_summary, + verify_transfer_candidates, + write_diff_summary, + write_removed_manifest, + write_transfer_manifest, + write_updated_manifest, +) + +from .conftest import SAMPLE_SUMMARY + 
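+# Column layout assumed for assembly_summary files, inferred from the
+# SAMPLE_SUMMARY fixture header in conftest.py (not from NCBI documentation):
+#   col 0 = assembly_accession, col 10 = version_status,
+#   col 14 = seq_rel_date, col 19 = ftp_path.
+# Rows whose ftp_path is "na" (e.g. GCF_000099999.1) are expected to be
+# dropped by parse_assembly_summary.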
+_EXPECTED_TWO = 2 + + +# ── parse_assembly_summary ─────────────────────────────────────────────── + + +_EXPECTED_ASSEMBLIES = { + "GCF_000001215.4": AssemblyRecord( + accession="GCF_000001215.4", + status="latest", + seq_rel_date="2014/10/21", + ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT", + assembly_dir="GCF_000001215.4_Release_6_plus_ISO1_MT", + ), + "GCF_000001405.40": AssemblyRecord( + accession="GCF_000001405.40", + status="latest", + seq_rel_date="2022/02/03", + ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14", + assembly_dir="GCF_000001405.40_GRCh38.p14", + ), + "GCF_000005845.2": AssemblyRecord( + accession="GCF_000005845.2", + status="replaced", + seq_rel_date="2013/09/26", + ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2", + assembly_dir="GCF_000005845.2_ASM584v2", + ), + "GCF_000009999.1": AssemblyRecord( + accession="GCF_000009999.1", + status="suppressed", + seq_rel_date="2010/01/01", + ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/999/GCF_000009999.1_ASM999v1", + assembly_dir="GCF_000009999.1_ASM999v1", + ), + # GCF_000099999.1 is excluded because ftp_path == "na" +} + + +def test_parse_assembly_summary() -> None: + """SAMPLE_SUMMARY is parsed to the expected assemblies.""" + assert parse_assembly_summary(SAMPLE_SUMMARY) == _EXPECTED_ASSEMBLIES + + +def test_parse_assembly_summary_empty() -> None: + """Comment-only input returns empty dict.""" + assert parse_assembly_summary("# comment only\n") == {} + + +@pytest.mark.parametrize("source", ["file", "file_str", "list_of_lines"]) +def test_parse_assembly_summary_input_types(source: str, tmp_path: Path) -> None: + """Parsing works from a file path, string path, and list of lines.""" + if source == "list_of_lines": + arg = SAMPLE_SUMMARY.splitlines(keepends=True) + else: + f = tmp_path / "summary.tsv" + f.write_text(SAMPLE_SUMMARY) + arg = f if source == "file" else str(f) + assert parse_assembly_summary(arg) == _EXPECTED_ASSEMBLIES + + +# ── get_latest_assembly_paths ──────────────────────────────────────────── + + +def test_get_latest_assembly_paths() -> None: + """Only 'latest' assemblies appear; paths are FTP directories with trailing slash.""" + assert dict(get_latest_assembly_paths(parse_assembly_summary(SAMPLE_SUMMARY))) == { + "GCF_000001215.4": "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/", + "GCF_000001405.40": "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/", + } + + +def test_get_latest_assembly_paths_empty() -> None: + """Empty input returns empty list.""" + assert get_latest_assembly_paths(parse_assembly_summary("# empty\n")) == [] + + +# ── compute_diff ───────────────────────────────────────────────────────── + + +def test_compute_diff_new() -> None: + """All latest assemblies are new with no previous state; result is sorted.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions=set()) + assert "GCF_000001215.4" in diff.new + assert "GCF_000001405.40" in diff.new + assert "GCF_000005845.2" not in diff.new # replaced + assert diff.new == sorted(diff.new) + + +def test_compute_diff_updated() -> None: + """seq_rel_date moving forward marks updated; moving backward does not.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + previous = parse_assembly_summary(SAMPLE_SUMMARY) + previous["GCF_000001215.4"].seq_rel_date = "2010/01/01" + assert "GCF_000001215.4" 
in compute_diff(current, previous_assemblies=previous).updated + + previous2 = parse_assembly_summary(SAMPLE_SUMMARY) + previous2["GCF_000001215.4"].seq_rel_date = "2099/12/31" + assert "GCF_000001215.4" not in compute_diff(current, previous_assemblies=previous2).updated + + +def test_compute_diff_removed() -> None: + """Replaced, suppressed, and entirely-absent accessions are classified correctly.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + assert "GCF_000005845.2" in compute_diff(current, previous_accessions={"GCF_000005845.2"}).replaced + assert "GCF_000009999.1" in compute_diff(current, previous_accessions={"GCF_000009999.1"}).suppressed + # Accession absent from current entirely → suppressed + assert ( + "GCF_000001215.4" + in compute_diff(parse_assembly_summary("# empty\n"), previous_accessions={"GCF_000001215.4"}).suppressed + ) + + +def test_compute_diff_scan_store_fallback() -> None: + """Known accessions are not marked new; unknown ones are.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions={"GCF_000001215.4"}) + assert "GCF_000001215.4" not in diff.new + assert "GCF_000001405.40" in diff.new + + +# ── accession_prefix & filter_by_prefix_range ──────────────────────────── + + +@pytest.mark.parametrize( + ("accession", "expected"), + [ + pytest.param("GCF_000001215.4", "000", id="three_zeros"), + pytest.param("GCF_123456789.1", "123", id="non_zero"), + pytest.param("invalid", None, id="invalid"), + ], +) +def test_accession_prefix(accession: str, expected: str | None) -> None: + """3-digit prefix is extracted from the accession; invalid input returns None.""" + assert accession_prefix(accession) == expected + + +def test_filter_by_prefix_range() -> None: + """Range filter is inclusive; out-of-range excluded; no range returns all.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + assert len(filter_by_prefix_range(assemblies, "000", "000")) == len(assemblies) + assert len(filter_by_prefix_range(assemblies, "001", "999")) == 0 + assert len(filter_by_prefix_range(assemblies)) == len(assemblies) + + +# ── Manifest writing ──────────────────────────────────────────────────── + + +def test_write_transfer_manifest(tmp_path: Path) -> None: + """Transfer manifest is written with FTP paths that start with /genomes/ and end with /.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions=set()) + manifest_file = tmp_path / "transfer.txt" + paths = write_transfer_manifest(diff, current, manifest_file) + assert len(paths) > 0 + lines = [line.strip() for line in manifest_file.read_text().splitlines() if line.strip()] + assert len(lines) == len(paths) + for line in lines: + assert line.startswith("/genomes/") + assert line.endswith("/") + + +def test_write_removed_manifest(tmp_path: Path) -> None: + """Removed manifest lists replaced and suppressed accessions.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions={"GCF_000005845.2", "GCF_000009999.1"}) + removed_file = tmp_path / "removed.txt" + removed = write_removed_manifest(diff, removed_file) + assert len(removed) == _EXPECTED_TWO + lines = [line.strip() for line in removed_file.read_text().splitlines() if line.strip()] + assert len(lines) == _EXPECTED_TWO + + +def test_write_updated_manifest(tmp_path: Path) -> None: + """Updated manifest lists only updated accessions, sorted.""" + diff = DiffResult(new=["GCF_000001215.4"], updated=["GCF_000005845.2", 
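+        # Input order here is deliberately unsorted; the sorted manifest
+        # asserted below must come from the code under test, not this input.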
"GCF_000001405.40"]) + updated_file = tmp_path / "updated.txt" + updated = write_updated_manifest(diff, updated_file) + assert len(updated) == _EXPECTED_TWO + lines = [line.strip() for line in updated_file.read_text().splitlines() if line.strip()] + assert lines == ["GCF_000001405.40", "GCF_000005845.2"] + + +def test_write_diff_summary(tmp_path: Path) -> None: + """Diff summary JSON is written with correct counts, prefix range, and database.""" + diff = DiffResult(new=["a"], updated=["b"], replaced=["c"], suppressed=[]) + summary_file = tmp_path / "summary.json" + summary = write_diff_summary(diff, summary_file, "refseq", "000", "003") + assert json.loads(summary_file.read_text()) == summary + assert {k: summary[k] for k in ("database", "counts", "prefix_range", "accessions")} == { + "database": "refseq", + "counts": { + "new": 1, + "updated": 1, + "replaced": 1, + "suppressed": 0, + "total_to_transfer": 2, + "total_to_remove": 1, + }, + "prefix_range": {"from": "000", "to": "003"}, + "accessions": {"new": ["a"], "updated": ["b"], "replaced": ["c"], "suppressed": []}, + } + + +# ── _ftp_dir_from_url ─────────────────────────────────────────────────── + + +@pytest.mark.parametrize( + ("url", "expected", "kwargs"), + [ + pytest.param( + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + {}, + id="https_url", + ), + pytest.param( + "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + {}, + id="ftp_url", + ), + pytest.param( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + {}, + id="bare_path", + ), + pytest.param( + "ftp://custom.host.example.com/genomes/all/GCF/000/001/215", + "/genomes/all/GCF/000/001/215", + {"ftp_host": "custom.host.example.com"}, + id="custom_ftp_host", + ), + ], +) +def test_ftp_dir_from_url(url: str, expected: str, kwargs: dict) -> None: + assert _ftp_dir_from_url(url, **kwargs) == expected + + +# ── verify_transfer_candidates ─────────────────────────────────────────── + + +_MD5_CHECKSUMS_TXT = ( + "d41d8cd98f00b204e9800998ecf8427e ./GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna.gz\n" + "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4 ./GCF_000001215.4_Release_6_plus_ISO1_MT_protein.faa.gz\n" + "ffffffffffffffffffffffffffffffff ./GCF_000001215.4_Release_6_plus_ISO1_MT_assembly_report.txt\n" + "0000000000000000000000000000dead ./GCF_000001215.4_Release_6_plus_ISO1_MT_README.txt\n" +) + + +def _mock_s3_with_objects() -> MagicMock: + """Return a mock S3 client whose list_objects_v2 always reports objects exist.""" + client = MagicMock() + client.list_objects_v2.return_value = {"KeyCount": 1} + return client + + +def _mock_s3_empty() -> MagicMock: + """Return a mock S3 client whose list_objects_v2 reports no objects.""" + client = MagicMock() + client.list_objects_v2.return_value = {"KeyCount": 0} + return client + + +_BUCKET = "cdm-lake" +_KEY_PREFIX = "tenant-general-warehouse/kbase/datasets/ncbi/" + + +def _assemblies() -> dict: + return parse_assembly_summary(SAMPLE_SUMMARY) + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def 
test_verify_transfer_candidates_prunes_when_all_match( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """Assemblies where every file matches S3 are pruned from the list.""" + mock_connect.return_value = MagicMock() + + def head_side_effect(s3_path: str) -> dict | None: + if "_genomic.fna.gz" in s3_path: + return {"size": 100, "metadata": {"md5": "d41d8cd98f00b204e9800998ecf8427e"}, "checksum_crc64nvme": None} + if "_protein.faa.gz" in s3_path: + return {"size": 100, "metadata": {"md5": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4"}, "checksum_crc64nvme": None} + if "_assembly_report.txt" in s3_path: + return {"size": 100, "metadata": {"md5": "ffffffffffffffffffffffffffffffff"}, "checksum_crc64nvme": None} + return None + + mock_head.side_effect = head_side_effect + assert verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) == [] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_keeps_when_md5_differs( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """Assembly is kept when at least one file has a different MD5.""" + mock_connect.return_value = MagicMock() + mock_head.return_value = {"size": 100, "metadata": {"md5": "WRONG"}, "checksum_crc64nvme": None} + assert verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) == ["GCF_000001215.4"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object", return_value=None) +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_keeps_when_s3_object_missing( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """Assembly is kept when at least one file doesn't exist in S3.""" + mock_connect.return_value = MagicMock() + assert verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) == ["GCF_000001215.4"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_keeps_when_no_md5_metadata( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """Assembly is kept when S3 object exists but has no md5 metadata.""" + mock_connect.return_value = MagicMock() + mock_head.return_value = {"size": 100, "metadata": {}, "checksum_crc64nvme": None} + assert verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) == ["GCF_000001215.4"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", side_effect=Exception("FTP error")) 
+@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_keeps_when_ftp_fails( + mock_connect: MagicMock, mock_retrieve: MagicMock, mock_s3: MagicMock +) -> None: + """Assembly is kept (conservative) when md5checksums.txt cannot be fetched.""" + mock_connect.return_value = MagicMock() + assert verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) == ["GCF_000001215.4"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_empty_input(mock_connect: MagicMock) -> None: + """Empty accession list returns empty result without connecting.""" + assert verify_transfer_candidates([], {}, _BUCKET, "prefix/") == [] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_unknown_accession_kept(mock_connect: MagicMock) -> None: + """Accessions not in assemblies dict are kept (conservative).""" + mock_connect.return_value = MagicMock() + assert verify_transfer_candidates(["GCF_999999999.1"], {}, _BUCKET, "prefix/") == ["GCF_999999999.1"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object", return_value=None) +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_short_circuits_on_first_mismatch( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """Verification stops checking after the first missing/mismatched file.""" + mock_connect.return_value = MagicMock() + verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) + assert mock_head.call_count == 1 + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_mixed( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """A mix of matching and non-matching assemblies: matched pruned, unmatched kept.""" + mock_connect.return_value = MagicMock() + + def head_side_effect(s3_path: str) -> dict | None: + if "GCF_000001215.4_Release_6_plus_ISO1_MT/" in s3_path: + if "_genomic.fna.gz" in s3_path: + return {"size": 1, "metadata": {"md5": "d41d8cd98f00b204e9800998ecf8427e"}, "checksum_crc64nvme": None} + if "_protein.faa.gz" in s3_path: + return {"size": 1, "metadata": {"md5": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4"}, "checksum_crc64nvme": None} + if "_assembly_report.txt" in s3_path: + return {"size": 1, "metadata": {"md5": "ffffffffffffffffffffffffffffffff"}, "checksum_crc64nvme": None} + return None + + mock_head.side_effect = head_side_effect + result = verify_transfer_candidates(["GCF_000001215.4", "GCF_000001405.40"], _assemblies(), _BUCKET, _KEY_PREFIX) + assert result == ["GCF_000001405.40"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_empty()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_skips_ftp_when_folder_missing( + mock_connect: MagicMock, + mock_s3: MagicMock, +) -> None: + """Accessions with no objects in S3 are confirmed without 
FTP round-trip.""" + result = verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) + assert result == ["GCF_000001215.4"] + mock_connect.assert_not_called() + + +# ── Synthetic summary from S3 store scan ──────────────────────────────── + + +@pytest.mark.parametrize( + ("key", "expected"), + [ + pytest.param( + "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz", + "GCF_000001215.4", + id="long_path", + ), + pytest.param("some/path/GCA_999999999.1_whatever/data.txt", "GCA_999999999.1", id="short_path"), + pytest.param("some/random/path", None, id="no_accession"), + pytest.param("", None, id="empty"), + ], +) +def test_extract_accession_from_s3_key(key: str, expected: str | None) -> None: + """Accession is extracted from S3 key paths; invalid/empty paths return None.""" + assert _extract_accession_from_s3_key(key) == expected + + +@pytest.mark.parametrize( + ("key", "expected"), + [ + pytest.param( + "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz", + "GCF_000001215.4_Release_6_plus_ISO1_MT", + id="long_path", + ), + pytest.param( + "prefix/GCA_999999999.1_assembly_name/subdir/data.txt", + "GCA_999999999.1_assembly_name", + id="subdir", + ), + pytest.param("some/random/path", None, id="no_assembly_dir"), + pytest.param("", None, id="empty"), + ], +) +def test_extract_assembly_dir_from_s3_key(key: str, expected: str | None) -> None: + """Assembly directory is extracted from S3 key paths; invalid/empty paths return None.""" + assert _extract_assembly_dir_from_s3_key(key) == expected + + +def _make_mock_s3_paginator() -> MagicMock: + """Return a mock S3 client with two assemblies (GCF_000001215.4, GCF_000005845.2).""" + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() + mock.get_paginator.return_value = mock_paginator + mock_paginator.paginate.return_value = [ + { + "Contents": [ + { + "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6/file1.gz", + "LastModified": datetime(2024, 1, 15, tzinfo=timezone.utc), + }, + { + "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6/file2.gz", + "LastModified": datetime(2024, 1, 16, tzinfo=timezone.utc), + }, + { + "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000005845.2_Assembly/file.gz", + "LastModified": datetime(2024, 2, 20, tzinfo=timezone.utc), + }, + ] + } + ] + return mock + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_builds_summary(mock_get_s3: MagicMock) -> None: + """Synthetic summary is built correctly with provided release_date for all assemblies.""" + mock_get_s3.return_value = _make_mock_s3_paginator() + assert scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") == { + "GCF_000001215.4": AssemblyRecord( + accession="GCF_000001215.4", + status="latest", + seq_rel_date="2024/01/31", + ftp_path="https://ftp.ncbi.nlm.nih.gov/synthetic/GCF_000001215.4_Release_6", + assembly_dir="GCF_000001215.4_Release_6", + ), + "GCF_000005845.2": AssemblyRecord( + accession="GCF_000005845.2", + status="latest", + seq_rel_date="2024/01/31", + ftp_path="https://ftp.ncbi.nlm.nih.gov/synthetic/GCF_000005845.2_Assembly", + assembly_dir="GCF_000005845.2_Assembly", + ), + } + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_applies_release_date_to_all(mock_get_s3: MagicMock) -> None: + """Provided release_date is 
used even when files have different LastModified dates.""" + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() + mock.get_paginator.return_value = mock_paginator + mock_paginator.paginate.return_value = [ + { + "Contents": [ + { + "Key": "prefix/GCF_000001215.4_v1/file_newer.gz", + "LastModified": datetime(2024, 3, 20, tzinfo=timezone.utc), + }, + { + "Key": "prefix/GCF_000001215.4_v1/file_older.gz", + "LastModified": datetime(2024, 1, 10, tzinfo=timezone.utc), + }, + ] + } + ] + mock_get_s3.return_value = mock + assert ( + scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/03/31")["GCF_000001215.4"].seq_rel_date + == "2024/03/31" + ) + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_raises_for_invalid_release_date(mock_get_s3: MagicMock) -> None: + """Invalid release_date format is rejected.""" + mock_get_s3.return_value = _make_mock_s3_paginator() + with pytest.raises(ValueError, match="Invalid release_date"): + scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024-03-31") + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_invokes_progress_callback(mock_get_s3: MagicMock) -> None: + """Progress callback is called once per unique assembly discovered.""" + mock_get_s3.return_value = _make_mock_s3_paginator() + calls: list[tuple[int, str]] = [] + scan_store_to_synthetic_summary( + "test-bucket", "prefix/", "2024/01/31", progress_callback=lambda n, a: calls.append((n, a)) + ) + assert len(calls) == 2 + assert calls[0][0] == 1 + assert calls[1][0] == 2 + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_handles_empty_store(mock_get_s3: MagicMock) -> None: + """Empty store returns empty dict.""" + mock = MagicMock() + mock_paginator = MagicMock() + mock.get_paginator.return_value = mock_paginator + mock_paginator.paginate.return_value = [{"Contents": []}] + mock_get_s3.return_value = mock + assert scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") == {} + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_skips_objects_without_accession(mock_get_s3: MagicMock) -> None: + """Objects without valid accessions in the key are skipped.""" + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() + mock.get_paginator.return_value = mock_paginator + mock_paginator.paginate.return_value = [ + { + "Contents": [ + {"Key": "prefix/some/random/file.txt", "LastModified": datetime(2024, 1, 1, tzinfo=timezone.utc)}, + { + "Key": "prefix/GCF_000001215.4_Assembly/valid_file.gz", + "LastModified": datetime(2024, 2, 1, tzinfo=timezone.utc), + }, + ] + } + ] + mock_get_s3.return_value = mock + result = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") + assert len(result) == 1 + assert "GCF_000001215.4" in result + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_assembly_dir_survives_round_trip(mock_get_s3: MagicMock, tmp_path: Path) -> None: + """assembly_dir is preserved after save-to-file / parse-back round-trip. + + Regression: previously ftp_path was written as "" causing assembly_dir="" + and compute_diff flagging every assembly as updated. 
+ """ + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() + mock_paginator.paginate.return_value = [ + { + "Contents": [ + { + "Key": "prefix/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz", + "LastModified": datetime(2024, 3, 10, tzinfo=timezone.utc), + } + ] + } + ] + mock.get_paginator.return_value = mock_paginator + mock_get_s3.return_value = mock + + synthetic = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/03/10") + + out_file = tmp_path / "synthetic_summary.txt" + with out_file.open("w") as f: + for acc in sorted(synthetic.keys()): + rec = synthetic[acc] + f.write( + f"{rec.accession}\t.\t.\t.\t.\t.\t.\t.\t.\t.\t{rec.status}\t.\t.\t.\t{rec.seq_rel_date}\t.\t.\t.\t.\t{rec.ftp_path}\t.\n" + ) + + reparsed = parse_assembly_summary(out_file) + assert "GCF_000001215.4" in reparsed + reparsed_rec = reparsed["GCF_000001215.4"] + original_rec = synthetic["GCF_000001215.4"] + assert reparsed_rec.assembly_dir == original_rec.assembly_dir + assert reparsed_rec.seq_rel_date == original_rec.seq_rel_date + assert reparsed_rec.status == original_rec.status diff --git a/tests/ncbi_ftp/test_metadata.py b/tests/ncbi_ftp/test_metadata.py new file mode 100644 index 00000000..df1df775 --- /dev/null +++ b/tests/ncbi_ftp/test_metadata.py @@ -0,0 +1,263 @@ +"""Unit tests for cdm_data_loaders.ncbi_ftp.metadata.""" + +from __future__ import annotations + +import json +import time +from typing import TYPE_CHECKING +from unittest.mock import MagicMock, patch +from urllib.parse import urlparse + +import boto3 +import pytest +from moto import mock_aws + +if TYPE_CHECKING: + from collections.abc import Generator + + import botocore.client + +import cdm_data_loaders.ncbi_ftp.metadata as metadata_mod +import cdm_data_loaders.utils.s3 as s3_utils +from cdm_data_loaders.ncbi_ftp.metadata import ( + DescriptorResource, + archive_descriptor, + build_archive_descriptor_key, + build_descriptor_key, + create_descriptor, + upload_descriptor, + validate_descriptor, +) +from cdm_data_loaders.utils.s3 import reset_s3_client +from tests.ncbi_ftp.conftest import TEST_BUCKET + +AWS_REGION = "us-east-1" + +_ACCESSION = "GCF_000001215.4" +_ASSEMBLY_DIR = "GCF_000001215.4_Release_6_plus_ISO1_MT" +_RELEASE_TAG = "2024-01" +_KEY_PREFIX = "tenant-general-warehouse/kbase/datasets/ncbi/" +_TIMESTAMP = 1_700_000_000 + +_SAMPLE_RESOURCES: list[DescriptorResource] = [ + { + "name": "GCF_000001215.4_genomic.fna.gz", + "path": f"{_KEY_PREFIX}raw_data/GCF/000/001/215/{_ASSEMBLY_DIR}/GCF_000001215.4_genomic.fna.gz", + "format": "gz", + "bytes": 1024, + "hash": "abc123", + }, + { + "name": "GCF_000001215.4_assembly_report.txt", + "path": f"{_KEY_PREFIX}raw_data/GCF/000/001/215/{_ASSEMBLY_DIR}/GCF_000001215.4_assembly_report.txt", + "format": "txt", + "bytes": 512, + "hash": None, # no md5 sidecar for this one + }, +] + + +# ── build_descriptor_key / build_archive_descriptor_key ───────────────── + + +@pytest.mark.parametrize("prefix", [_KEY_PREFIX, _KEY_PREFIX.rstrip("/")]) +def test_build_descriptor_key(prefix: str) -> None: + """Key is under metadata/, ends with _datapackage.json, trailing slash on prefix is normalized.""" + key = build_descriptor_key(_ASSEMBLY_DIR, prefix) + assert key == f"{_KEY_PREFIX}metadata/{_ASSEMBLY_DIR}_datapackage.json" + assert "//" not in key + + +@pytest.mark.parametrize( + ("prefix", "tag"), + [ + pytest.param(_KEY_PREFIX, _RELEASE_TAG, id="trailing_slash"), + pytest.param(_KEY_PREFIX.rstrip("/"), _RELEASE_TAG, id="no_trailing_slash"), + 
pytest.param(_KEY_PREFIX, "2025-06", id="different_tag"), + ], +) +def test_build_archive_descriptor_key(prefix: str, tag: str) -> None: + """Archive key includes tag and reason segment; no double slash; prefix trailing slash normalized.""" + key = build_archive_descriptor_key(_ASSEMBLY_DIR, tag, prefix, "updated") + assert key == f"{_KEY_PREFIX}archive/{tag}/updated/metadata/{_ASSEMBLY_DIR}_datapackage.json" + assert "//" not in key + + +# ── create_descriptor ──────────────────────────────────────────────────── + + +def test_create_descriptor() -> None: + """create_descriptor produces a fully populated descriptor matching the expected structure.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + + # URL hostname is computed; can't express as equality + host = urlparse(d["url"]).hostname + assert host is not None and (host == "ncbi.nlm.nih.gov" or host.endswith(".ncbi.nlm.nih.gov")) + + # resource[1]: hash=None → key absent; bytes=512 → key present + r1 = d["resources"][1] + assert "hash" not in r1 + assert "bytes" in r1 + + assert {k: d[k] for k in ("identifier", "resource_type", "version", "license", "contributors")} == { + "identifier": f"NCBI:{_ACCESSION}", + "resource_type": "dataset", + "version": "4", + "license": {}, + "contributors": [ + { + "name": "National Center for Biotechnology Information", + "contributor_id": "ROR:02meqm098", + "contributor_type": "Organization", + "contributor_roles": "DataCurator", + } + ], + } + assert { + k: d["meta"][k] for k in ("saved_by", "credit_metadata_schema_version", "timestamp", "credit_metadata_source") + } == { + "saved_by": "cdm-data-loaders-ncbi-ftp", + "credit_metadata_schema_version": "1.0", + "timestamp": _TIMESTAMP, + "credit_metadata_source": [ + { + "access_timestamp": _TIMESTAMP, + "source_name": "NCBI Genomes FTP", + "source_url": "ftp.ncbi.nlm.nih.gov/genomes/all/", + } + ], + } + assert _ASSEMBLY_DIR in d["titles"][0]["title"] + assert _ACCESSION in d["descriptions"][0]["description_text"] + r0 = d["resources"][0] + assert {k: r0[k] for k in ("hash", "bytes", "path")} == { + "hash": "abc123", + "bytes": 1024, + "path": _SAMPLE_RESOURCES[0]["path"], + } + + +def test_create_descriptor_default_timestamp_is_recent() -> None: + """Default timestamp is close to current time when not specified.""" + before = int(time.time()) + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES) + after = int(time.time()) + assert before <= d["meta"]["timestamp"] <= after + 1 + + +def test_create_descriptor_resource_name_lowercased() -> None: + """Resource names are converted to lowercase.""" + resources: list[DescriptorResource] = [ + {"name": "FILE_UPPER.FNA.GZ", "path": "s3://bucket/a", "format": "gz", "bytes": 100, "hash": "x"}, + ] + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, resources, timestamp=_TIMESTAMP) + assert d["resources"][0]["name"] == "file_upper.fna.gz" + + +def test_create_descriptor_null_bytes_omitted() -> None: + """Resources with bytes=None have the 'bytes' key removed from the output.""" + resources: list[DescriptorResource] = [ + {"name": "f.txt", "path": "s3://b/f.txt", "format": "txt", "bytes": None, "hash": "x"}, + ] + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, resources, timestamp=_TIMESTAMP) + assert "bytes" not in d["resources"][0] + + +def test_create_descriptor_empty_resources() -> None: + """Empty resources list produces a valid descriptor.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, [], timestamp=_TIMESTAMP) + assert d["resources"] == 
[] + + +# ── validate_descriptor ────────────────────────────────────────────────── + + +def test_validate_descriptor_valid() -> None: + """Valid descriptor does not raise.""" + validate_descriptor( + create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP), + _ACCESSION, + ) + + +def test_validate_descriptor_empty_raises() -> None: + """Empty dict fails frictionless validation and raises.""" + with pytest.raises((ValueError, Exception)): + validate_descriptor({}, _ACCESSION) + + +# ── upload_descriptor / archive_descriptor ─────────────────────────────── + + +@pytest.fixture +def mock_s3() -> Generator[botocore.client.BaseClient]: + """Mocked S3 client with the CDM Lake bucket pre-created.""" + with mock_aws(): + client = boto3.client("s3", region_name=AWS_REGION) + client.create_bucket(Bucket=TEST_BUCKET) + reset_s3_client() + with ( + patch.object(s3_utils, "get_s3_client", return_value=client), + patch.object(metadata_mod, "get_s3_client", return_value=client), + ): + yield client + reset_s3_client() + + +@pytest.fixture +def mock_s3_with_descriptor(mock_s3: botocore.client.BaseClient) -> tuple[botocore.client.BaseClient, MagicMock]: + """mock_s3 with a live descriptor pre-uploaded and copy_object patched.""" + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + live_key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + mock_s3.put_object(Bucket=TEST_BUCKET, Key=live_key, Body=json.dumps(descriptor).encode()) + with patch.object(metadata_mod, "copy_object") as mock_copy: + yield mock_s3, mock_copy + + +@pytest.mark.s3 +def test_upload_descriptor(mock_s3: botocore.client.BaseClient) -> None: + """Uploaded object is valid JSON at the expected key with the expected identifier.""" + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX) + expected_key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + assert key == expected_key + assert key.startswith(_KEY_PREFIX) + assert key.endswith("_datapackage.json") + body = json.loads(mock_s3.get_object(Bucket=TEST_BUCKET, Key=key)["Body"].read()) + assert body["identifier"] == f"NCBI:{_ACCESSION}" + + +@pytest.mark.s3 +def test_upload_descriptor_dry_run(mock_s3: botocore.client.BaseClient) -> None: + """Dry-run returns the correct key but creates no S3 object.""" + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, dry_run=True) + assert key == build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + objs = mock_s3.list_objects_v2(Bucket=TEST_BUCKET).get("Contents", []) + assert not any(o["Key"] == key for o in objs) + + +@pytest.mark.s3 +def test_archive_descriptor(mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock]) -> None: + """archive_descriptor returns True and calls copy_object with the correct keys.""" + _, mock_copy = mock_s3_with_descriptor + result = archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG) + assert result is True + mock_copy.assert_called_once() + args = mock_copy.call_args[0] + assert f"{TEST_BUCKET}/{build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX)}" in args + assert f"{TEST_BUCKET}/{build_archive_descriptor_key(_ASSEMBLY_DIR, _RELEASE_TAG, _KEY_PREFIX)}" in args + + +@pytest.mark.s3 +def test_archive_descriptor_dry_run(mock_s3_with_descriptor: 
tuple[botocore.client.BaseClient, MagicMock]) -> None: + """Dry-run returns True but does not call copy_object.""" + _, mock_copy = mock_s3_with_descriptor + assert archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG, dry_run=True) is True + mock_copy.assert_not_called() + + +@pytest.mark.s3 +def test_archive_descriptor_missing_returns_false(mock_s3: botocore.client.BaseClient) -> None: + """Returns False when no descriptor exists at the live key.""" + assert archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG) is False diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py new file mode 100644 index 00000000..345572db --- /dev/null +++ b/tests/ncbi_ftp/test_notebooks.py @@ -0,0 +1,86 @@ +"""Smoke tests for NCBI FTP notebooks — syntax and import validation.""" + +import ast +import json +from pathlib import Path + +import pytest + +from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST # noqa: F401 +from cdm_data_loaders.ncbi_ftp.manifest import ( # noqa: F401 + AssemblyRecord, + compute_diff, + download_assembly_summary, + filter_by_prefix_range, + parse_assembly_summary, + write_diff_summary, + write_removed_manifest, + write_transfer_manifest, + write_updated_manifest, +) +from cdm_data_loaders.ncbi_ftp.promote import ( + DEFAULT_LAKEHOUSE_KEY_PREFIX, + promote_from_s3, +) +from cdm_data_loaders.utils.s3 import split_s3_path # noqa: F401 + +NOTEBOOKS_DIR = Path(__file__).resolve().parents[2] / "notebooks" + +NCBI_NOTEBOOKS = [ + "ncbi_ftp_manifest.ipynb", + "ncbi_ftp_promote.ipynb", + "ncbi_ftp_download.ipynb", +] + + +def _extract_code_cells(notebook_path: Path) -> list[str]: + """Extract source code from all code cells in a notebook. + + :param notebook_path: path to the .ipynb file + :return: list of source code strings, one per code cell + """ + with notebook_path.open() as f: + nb = json.load(f) + return ["".join(cell.get("source", [])) for cell in nb.get("cells", []) if cell.get("cell_type") == "code"] + + +@pytest.mark.parametrize("notebook", NCBI_NOTEBOOKS) +def test_notebook_syntax(notebook: str) -> None: + """Every code cell is syntactically valid Python and non-empty.""" + path = NOTEBOOKS_DIR / notebook + assert path.exists(), f"Notebook not found: {path}" + cells = _extract_code_cells(path) + assert len(cells) > 0, f"No code cells found in {notebook}" + for i, source in enumerate(cells, 1): + assert source.strip(), f"{notebook} cell {i} is empty" + try: + ast.parse(source, filename=f"{notebook}:cell{i}") + except SyntaxError as exc: + pytest.fail(f"{notebook} cell {i} has a syntax error: {exc}") + + +def test_manifest_notebook_imports() -> None: + """All manifest notebook imports are verified at module load time above.""" + assert isinstance(FTP_HOST, str) and FTP_HOST + assert AssemblyRecord is not None + assert callable(download_assembly_summary) + assert callable(compute_diff) + assert callable(write_updated_manifest) + + +def test_promote_notebook_imports() -> None: + """All promote notebook imports are verified at module load time above.""" + assert callable(promote_from_s3) + assert isinstance(DEFAULT_LAKEHOUSE_KEY_PREFIX, str) + assert callable(split_s3_path) + + +def test_download_notebook_imports() -> None: + """All download notebook imports resolve without error.""" + from cdm_data_loaders.pipelines.ncbi_ftp_download import ( # noqa: F401 + DEFAULT_STAGING_KEY_PREFIX, + download_and_stage, + ) + + assert callable(download_and_stage) + assert isinstance(DEFAULT_STAGING_KEY_PREFIX, str) diff --git 
a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py new file mode 100644 index 00000000..ae0ff887 --- /dev/null +++ b/tests/ncbi_ftp/test_promote.py @@ -0,0 +1,1005 @@ +"""Tests for ncbi_ftp.promote module — S3 promote, archive, manifest trimming.""" + +import hashlib +from pathlib import Path + +import botocore.client +import pytest + +from cdm_data_loaders.ncbi_ftp.promote import ( + DEFAULT_LAKEHOUSE_KEY_PREFIX, + _archive_assemblies, + _trim_manifest, + promote_from_s3, +) +from tests.ncbi_ftp.conftest import TEST_BUCKET + + +# ── Promotion test constants ───────────────────────────────────────────── + +_STAGE_PREFIX = "staging/run1/" + +# Assembly 1 +_ACC1 = "GCF_000001215.4" +_DIR1 = "GCF_000001215.4_Release_6" +_STG1 = f"{_STAGE_PREFIX}raw_data/GCF/000/001/215/{_DIR1}/" +_LKH1 = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{_DIR1}/" + +# Assembly 2 +_ACC2 = "GCF_000005845.2" +_DIR2 = "GCF_000005845.2_ASM584v2" +_STG2 = f"{_STAGE_PREFIX}raw_data/GCF/000/005/845/{_DIR2}/" +_LKH2 = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{_DIR2}/" + + +def _stage( + s3: botocore.client.BaseClient, + staging_base: str, + files: dict[str, bytes], + *, + with_md5: bool = True, + with_crc64: bool = False, +) -> list[str]: + """Stage files at *staging_base*, optionally adding .md5 / .crc64nvme sidecars. + + Returns list of all staged keys (data files only, not sidecars). + """ + keys = [] + for fname, content in files.items(): + key = f"{staging_base}{fname}" + s3.put_object(Bucket=TEST_BUCKET, Key=key, Body=content) + keys.append(key) + if with_md5: + s3.put_object( + Bucket=TEST_BUCKET, + Key=f"{key}.md5", + Body=hashlib.md5(content).hexdigest().encode(), # noqa: S324 + ) + if with_crc64: + s3.put_object(Bucket=TEST_BUCKET, Key=f"{key}.crc64nvme", Body=b"fake-crc") + return keys + + +def _stage_files(s3_client: botocore.client.BaseClient, prefix: str) -> None: + """Upload sample staged files to mock S3.""" + for key in [ + f"{prefix}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz", + f"{prefix}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz.md5", + f"{prefix}download_report.json", + ]: + body = b"md5hash123" if key.endswith(".md5") else b"data" + s3_client.put_object(Bucket=TEST_BUCKET, Key=key, Body=body) + + +@pytest.mark.s3 +def test_promote_dry_run_no_writes(mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: + """Verify dry_run does not write any objects.""" + prefix = "staging/run1/" + _stage_files(mock_s3_client_no_checksum, prefix) + + report = promote_from_s3( + staging_key_prefix=prefix, staging_bucket=TEST_BUCKET, lakehouse_bucket=TEST_BUCKET, dry_run=True + ) + assert report["promoted"] == 1 + assert report["dry_run"] is True + + final_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=final_key).get("KeyCount", 0) == 0 + + +@pytest.mark.s3 +def test_promote_with_metadata(mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: + """Objects are promoted with MD5 metadata; download_report.json is skipped.""" + prefix = "staging/run1/" + _stage_files(mock_s3_client_no_checksum, prefix) + + report = promote_from_s3(staging_key_prefix=prefix, staging_bucket=TEST_BUCKET, lakehouse_bucket=TEST_BUCKET) + assert report["promoted"] == 1 # only .fna.gz, not download_report.json + assert report["failed"] == 0 + + final_key = 
f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=final_key) + assert resp["Metadata"].get("md5") == "md5hash123" + + +@pytest.mark.s3 +@pytest.mark.parametrize( + ("manifest_body", "promoted_set", "expected_present", "expected_absent"), + [ + pytest.param( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6/\n" + "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\n", + {"GCF_000001215.4"}, + ["GCF_000001405.40"], + ["GCF_000001215.4"], + id="partial", + ), + pytest.param( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6/\n", + {"GCF_000001215.4"}, + [], + ["GCF_000001215.4"], + id="all", + ), + ], +) +def test_trim_manifest( + mock_s3_client_no_checksum: botocore.client.BaseClient, + manifest_body: str, + promoted_set: set[str], + expected_present: list[str], + expected_absent: list[str], +) -> None: + """Promoted accessions are removed; others remain (partial) or the manifest empties (all).""" + manifest_key = "manifests/transfer_manifest.txt" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=manifest_key, Body=manifest_body.encode()) + _trim_manifest(manifest_key, TEST_BUCKET, promoted_set) + remaining = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=manifest_key)["Body"].read().decode() + for acc in expected_present: + assert acc in remaining + for acc in expected_absent: + assert acc not in remaining + + +@pytest.mark.s3 +def test_archive_assemblies_removed(mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path) -> None: + """Removed accessions are archived and originals deleted.""" + accession = "GCF_000005845.2" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") + + assert ( + _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + delete_source=True, + ) + == 1 + ) + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key).get("KeyCount", 0) == 0 + + archive_key = ( + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/replaced_or_suppressed/" + f"raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + ) + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key).get("KeyCount", 0) == 1 + + +@pytest.mark.s3 +def test_archive_assemblies_updated_no_delete( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Updated accessions are archived but originals remain.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"original-data") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + assert ( + _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-06", + archive_reason="updated", + delete_source=False, + ) + == 1 + ) + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key).get("KeyCount", 0) == 1 + + archive_key = 
f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/updated/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=archive_key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_archive_assemblies_multiple_releases_no_collision( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Archiving the same accession in different releases creates distinct folders.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v1-data") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + _archive_assemblies(str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated") + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v2-data") + _archive_assemblies(str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated") + + archive_key_1 = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/updated/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + archive_key_2 = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/updated/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + assert mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_1)["Body"].read() == b"v1-data" + assert mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_2)["Body"].read() == b"v2-data" + + +@pytest.mark.s3 +def test_archive_assemblies_dry_run(mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path) -> None: + """dry_run does not copy or delete anything.""" + accession = "GCF_000005845.2" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") + + assert ( + _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + delete_source=True, + dry_run=True, + ) + == 1 + ) + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key).get("KeyCount", 0) == 1 + + archive_prefix = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/" + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_prefix).get("KeyCount", 0) == 0 + + +@pytest.mark.s3 +def test_archive_assemblies_no_objects_skips( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Accessions with no existing S3 objects are silently skipped.""" + manifest = tmp_path / "updated.txt" + manifest.write_text("GCF_000001215.4\n") + assert _archive_assemblies(str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01") == 0 + + +@pytest.mark.s3 +def test_archive_assemblies_unknown_release_fallback( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """ncbi_release=None falls back to 'unknown' in the archive path.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + 
mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + assert _archive_assemblies(str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release=None) == 1 + + archive_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/unknown/unknown/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key).get("KeyCount", 0) == 1 + + +# ── Concurrent / multi-file archive (new behaviour) ───────────────────── + + +@pytest.mark.s3 +def test_archive_assemblies_multi_file_all_copied( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """All files for an accession are copied concurrently — none missed.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/" + file_names = [ + f"{accession}_genomic.fna.gz", + f"{accession}_protein.faa.gz", + f"{accession}_rna.fna.gz", + f"{accession}_assembly_report.txt", + f"{accession}_assembly_stats.txt", + ] + for fname in file_names: + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}", Body=fname.encode()) + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="updated", + delete_source=False, + ) + + assert archived == len(file_names) + archive_base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/updated/raw_data/GCF/000/001/215/{asm_dir}/" + for fname in file_names: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_archive_assemblies_multi_file_content_preserved( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Archive copies preserve byte-for-byte content of each file.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/" + files = { + f"{accession}_genomic.fna.gz": b"\x1f\x8bGENOMIC", + f"{accession}_protein.faa.gz": b"\x1f\x8bPROTEIN", + } + for fname, body in files.items(): + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}", Body=body) + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="updated", + delete_source=False, + ) + + archive_base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/updated/raw_data/GCF/000/001/215/{asm_dir}/" + for fname, original_body in files.items(): + obj = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{fname}") + assert obj["Body"].read() == original_body, f"Content mismatch for {fname}" + + +@pytest.mark.s3 +def test_archive_assemblies_multi_file_delete_all( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Batch delete removes ALL source files when delete_source=True.""" + accession = "GCF_000005845.2" + asm_dir = f"{accession}_ASM584v2" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{asm_dir}/" + file_names = [ + f"{accession}_genomic.fna.gz", + f"{accession}_protein.faa.gz", + 
f"{accession}_assembly_report.txt", + ] + for fname in file_names: + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}", Body=b"data") + + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-03", + archive_reason="replaced_or_suppressed", + delete_source=True, + ) + + assert archived == len(file_names) + # All sources deleted + for fname in file_names: + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=f"{base}{fname}") + assert result.get("KeyCount", 0) == 0, f"Source not deleted: {fname}" + # All archives present + archive_base = ( + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-03/replaced_or_suppressed/raw_data/GCF/000/005/845/{asm_dir}/" + ) + for fname in file_names: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +# ── Partial-archive idempotency ────────────────────────────────────────── + + +@pytest.mark.s3 +def test_archive_assemblies_partial_already_archived_overwritten( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Re-running archive after a partial run overwrites the already-archived files. + + Simulates a partial failure: file_a was archived, file_b was not. + The second run should archive both file_a (overwrite) and file_b. + """ + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/" + archive_base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/updated/raw_data/GCF/000/001/215/{asm_dir}/" + + file_a = f"{accession}_genomic.fna.gz" + file_b = f"{accession}_protein.faa.gz" + + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{file_a}", Body=b"new-genomic") + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{file_b}", Body=b"new-protein") + # Simulate partial prior run: file_a already archived with stale content + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{file_a}", Body=b"stale-genomic") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="updated", + delete_source=False, + ) + + assert archived == 2 # noqa: PLR2004 + # file_a should now have the current content (overwritten) + obj_a = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{file_a}") + assert obj_a["Body"].read() == b"new-genomic", "Re-run should overwrite stale archive" + # file_b should now be archived + obj_b = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{file_b}") + assert obj_b["Body"].read() == b"new-protein" + + +@pytest.mark.s3 +def test_archive_assemblies_partial_delete_resumes( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Re-running replaced_or_suppressed archive after partial copy+delete is safe. + + Simulates: file_a was copied+deleted, file_b was copied but NOT deleted, + file_c was not touched. The re-run finds only file_b and file_c present + (file_a is gone), archives both, and deletes both. 
+ """ + accession = "GCF_000005845.2" + asm_dir = f"{accession}_ASM584v2" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{asm_dir}/" + archive_base = ( + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-03/replaced_or_suppressed/raw_data/GCF/000/005/845/{asm_dir}/" + ) + + file_b = f"{accession}_protein.faa.gz" + file_c = f"{accession}_assembly_report.txt" + + # file_a already gone (deleted in first partial run) + # file_b present at source (not yet deleted from first partial run) + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{file_b}", Body=b"protein") + # file_c present at source (not touched at all) + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{file_c}", Body=b"report") + # file_a already at archive destination + mock_s3_client_no_checksum.put_object( + Bucket=TEST_BUCKET, Key=f"{archive_base}{accession}_genomic.fna.gz", Body=b"genomic" + ) + + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-03", + archive_reason="replaced_or_suppressed", + delete_source=True, + ) + + # Only the 2 remaining source files were archived + assert archived == 2 # noqa: PLR2004 + # Both now gone from source + for fname in (file_b, file_c): + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=f"{base}{fname}") + assert result.get("KeyCount", 0) == 0, f"Expected {fname} deleted" + # file_a archive still intact (not touched by re-run) + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{accession}_genomic.fna.gz") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_archive_assemblies_idempotent_updated_reruns_cleanly( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Running updated archive twice on the same data produces the same result.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/" + file_names = [f"{accession}_genomic.fna.gz", f"{accession}_protein.faa.gz"] + for fname in file_names: + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}", Body=b"content") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + archived_1 = _archive_assemblies( + str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated" + ) + archived_2 = _archive_assemblies( + str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated" + ) + + assert archived_1 == len(file_names) + assert archived_2 == len(file_names) + # Sources still present after both runs (delete_source=False) + for fname in file_names: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_archive_assemblies_multi_accession_manifest( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Multiple accessions in a single manifest are all archived.""" + accessions = [ + ("GCF_000001215.4", "GCF_000001215.4_Release_6", "GCF/000/001/215"), + ("GCF_000005845.2", "GCF_000005845.2_ASM584v2", "GCF/000/005/845"), + ] + for accession, asm_dir, path in accessions: + key = 
f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/{path}/{asm_dir}/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "updated.txt" + manifest.write_text("\n".join(acc for acc, _, _ in accessions) + "\n") + + archived = _archive_assemblies( + str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated" + ) + + assert archived == len(accessions) + for accession, asm_dir, path in accessions: + archive_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/updated/raw_data/{path}/{asm_dir}/{accession}_genomic.fna.gz" + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key) + assert result.get("KeyCount", 0) == 1, f"Archive missing for {accession}" + + +@pytest.mark.s3 +def test_archive_assemblies_dry_run_multi_file( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """dry_run with multiple files per accession makes no copies and no deletes.""" + accession = "GCF_000005845.2" + asm_dir = f"{accession}_ASM584v2" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{asm_dir}/" + file_names = [f"{accession}_genomic.fna.gz", f"{accession}_protein.faa.gz", f"{accession}_rna.fna.gz"] + for fname in file_names: + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}", Body=b"data") + + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + delete_source=True, + dry_run=True, + ) + + # Reported count matches + assert archived == len(file_names) + # No actual archive keys created + archive_prefix = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/" + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_prefix) + assert result.get("KeyCount", 0) == 0 + # Sources untouched + for fname in file_names: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_archive_assemblies_invalid_accession_skipped( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Malformed accession lines are skipped; valid ones still archived.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "mixed.txt" + manifest.write_text("NOT_AN_ACCESSION\n\n \n" + f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated" + ) + assert archived == 1 + + +# ── Concurrent / multi-file promotion (new behaviour) ──────────────────── + + +@pytest.mark.s3 +def test_promote_multi_file_all_land_at_final_path( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """All files for an assembly are promoted concurrently — none missed.""" + file_names = [ + f"{_ACC1}_genomic.fna.gz", + f"{_ACC1}_protein.faa.gz", + f"{_ACC1}_rna.fna.gz", + f"{_ACC1}_assembly_report.txt", + f"{_ACC1}_assembly_stats.txt", + ] + _stage(mock_s3_client_no_checksum, _STG1, {f: f.encode() for f in file_names}) + + report = promote_from_s3( + 
staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["promoted"] == len(file_names) + assert report["failed"] == 0 + for fname in file_names: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_multi_file_content_preserved( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Content at the final key is byte-identical to the staged content.""" + files = { + f"{_ACC1}_genomic.fna.gz": b"\x1f\x8bGENOMIC", + f"{_ACC1}_protein.faa.gz": b"\x1f\x8bPROTEIN", + f"{_ACC1}_rna.fna.gz": b"\x1f\x8bRNA", + } + _stage(mock_s3_client_no_checksum, _STG1, files) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + for fname, expected in files.items(): + obj = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{fname}") + assert obj["Body"].read() == expected, f"Content mismatch for {fname}" + + +@pytest.mark.s3 +def test_promote_md5_metadata_set_from_sidecar( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """MD5 metadata on the promoted object matches the .md5 sidecar value.""" + content = b"\x1f\x8bGENOMIC" + fname = f"{_ACC1}_genomic.fna.gz" + _stage(mock_s3_client_no_checksum, _STG1, {fname: content}, with_md5=True) + expected_md5 = hashlib.md5(content).hexdigest() # noqa: S324 + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{fname}") + assert resp["Metadata"].get("md5") == expected_md5 + + +@pytest.mark.s3 +def test_promote_no_sidecar_no_md5_metadata( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """A file staged without a .md5 sidecar is promoted but carries no md5 metadata.""" + fname = f"{_ACC1}_genomic.fna.gz" + _stage(mock_s3_client_no_checksum, _STG1, {fname: b"data"}, with_md5=False) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{fname}") + assert resp["Metadata"].get("md5") is None + + +@pytest.mark.s3 +def test_promote_staging_data_files_deleted_after_promote( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Staged data files are deleted from staging after a fully successful assembly promote.""" + files = { + f"{_ACC1}_genomic.fna.gz": b"genomic", + f"{_ACC1}_protein.faa.gz": b"protein", + } + staged_keys = _stage(mock_s3_client_no_checksum, _STG1, files) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + for key in staged_keys: + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key) + assert result.get("KeyCount", 0) == 0, f"Staged data file not deleted: {key}" + + +@pytest.mark.s3 +def test_promote_md5_sidecars_deleted_after_promote( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Staged .md5 sidecar files are deleted from staging after a successful promote.""" + files = { + f"{_ACC1}_genomic.fna.gz": b"genomic", + f"{_ACC1}_protein.faa.gz": b"protein", + } + staged_keys = _stage(mock_s3_client_no_checksum, _STG1, files, with_md5=True) + + 
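# A fully successful promote should clear the staged data files and their sidecars. + 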
promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + for key in staged_keys: + for sidecar_key in (f"{key}.md5", f"{key}.crc64nvme"): + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=sidecar_key) + assert result.get("KeyCount", 0) == 0, f"Sidecar not deleted: {sidecar_key}" + + +@pytest.mark.s3 +def test_promote_crc64nvme_sidecars_deleted_after_promote( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Staged .crc64nvme sidecar files are also batch-deleted after a successful promote.""" + fname = f"{_ACC1}_genomic.fna.gz" + _stage(mock_s3_client_no_checksum, _STG1, {fname: b"data"}, with_md5=True, with_crc64=True) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + staged_key = f"{_STG1}{fname}" + for sidecar_key in (f"{staged_key}.md5", f"{staged_key}.crc64nvme"): + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=sidecar_key) + assert result.get("KeyCount", 0) == 0, f"Sidecar not deleted: {sidecar_key}" + + +@pytest.mark.s3 +def test_promote_partial_failure_staging_not_cleaned( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """When one file in an assembly fails, NO staged files for that assembly are deleted. + + Preserving staging on partial failure lets an operator re-run without + re-staging and without losing the partially-promoted state. + """ + file_ok = f"{_ACC1}_genomic.fna.gz" + file_fail = f"{_ACC1}_protein.faa.gz" + _stage(mock_s3_client_no_checksum, _STG1, {file_ok: b"ok", file_fail: b"fail"}) + staged_ok = f"{_STG1}{file_ok}" + staged_fail = f"{_STG1}{file_fail}" + + # Make download_file raise for exactly the failing key + original_download = mock_s3_client_no_checksum.download_file + + def _download_one_fail(Bucket: str, Key: str, Filename: str, **kw: object) -> None: # noqa: N803 + if Key == staged_fail: + msg = "simulated download failure" + raise RuntimeError(msg) + return original_download(Bucket=Bucket, Key=Key, Filename=Filename, **kw) + + mock_s3_client_no_checksum.download_file = _download_one_fail + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["failed"] == 1 + # Staging files must still be present (cleanup skipped due to failure) + for key in (staged_ok, staged_fail): + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200, ( # noqa: PLR2004 + f"Expected staged file to survive partial failure: {key}" + ) + + +@pytest.mark.s3 +def test_promote_partial_failure_failed_count( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """report["failed"] reflects the number of files that could not be promoted.""" + file_names = [f"{_ACC1}_genomic.fna.gz", f"{_ACC1}_protein.faa.gz", f"{_ACC1}_rna.fna.gz"] + _stage(mock_s3_client_no_checksum, _STG1, {f: b"data" for f in file_names}) + + failing_key = f"{_STG1}{file_names[1]}" + original_download = mock_s3_client_no_checksum.download_file + + def _download_middle_fail(Bucket: str, Key: str, Filename: str, **kw: object) -> None: # noqa: N803 + if Key == failing_key: + msg = "simulated failure" + raise RuntimeError(msg) + return original_download(Bucket=Bucket, Key=Key, Filename=Filename, **kw) + + mock_s3_client_no_checksum.download_file = _download_middle_fail + + report = 
promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["failed"] == 1 + assert report["promoted"] == 2 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_two_assemblies_independent_cleanup( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """A fully successful assembly cleans up its staging even when another assembly partially fails. + + Assembly 1 fully succeeds → staging cleared. + Assembly 2 has one failing file → staging NOT cleared. + """ + # Assembly 1: two files, both succeed + _stage( + mock_s3_client_no_checksum, + _STG1, + {f"{_ACC1}_genomic.fna.gz": b"g1", f"{_ACC1}_protein.faa.gz": b"p1"}, + ) + # Assembly 2: two files, one will fail + _stage( + mock_s3_client_no_checksum, + _STG2, + {f"{_ACC2}_genomic.fna.gz": b"g2", f"{_ACC2}_protein.faa.gz": b"p2"}, + ) + failing_key = f"{_STG2}{_ACC2}_protein.faa.gz" + original_download = mock_s3_client_no_checksum.download_file + + def _patched(Bucket: str, Key: str, Filename: str, **kw: object) -> None: # noqa: N803 + if Key == failing_key: + msg = "simulated failure" + raise RuntimeError(msg) + return original_download(Bucket=Bucket, Key=Key, Filename=Filename, **kw) + + mock_s3_client_no_checksum.download_file = _patched + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["failed"] == 1 + + # Assembly 1 staging must be gone + for fname in (f"{_ACC1}_genomic.fna.gz", f"{_ACC1}_protein.faa.gz"): + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=f"{_STG1}{fname}") + assert result.get("KeyCount", 0) == 0, f"Assembly 1 staging should be cleaned: {fname}" + + # Assembly 2 staging must remain (partial failure) + for fname in (f"{_ACC2}_genomic.fna.gz", f"{_ACC2}_protein.faa.gz"): + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_STG2}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200, ( # noqa: PLR2004 + f"Assembly 2 staging must survive partial failure: {fname}" + ) + + +@pytest.mark.s3 +def test_promote_multi_assembly_all_succeed_all_cleaned( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Two assemblies both fully succeed → all staged files removed for both.""" + _stage(mock_s3_client_no_checksum, _STG1, {f"{_ACC1}_genomic.fna.gz": b"g1"}) + _stage(mock_s3_client_no_checksum, _STG2, {f"{_ACC2}_genomic.fna.gz": b"g2"}) + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["promoted"] == 2 # noqa: PLR2004 + assert report["failed"] == 0 + + for stg, fname, lkh in ( + (_STG1, f"{_ACC1}_genomic.fna.gz", _LKH1), + (_STG2, f"{_ACC2}_genomic.fna.gz", _LKH2), + ): + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=f"{stg}{fname}") + assert result.get("KeyCount", 0) == 0, f"Staging not cleaned: {fname}" + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{lkh}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_dry_run_multi_file_no_writes_no_cleanup( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """dry_run with multiple files writes nothing to final path and does not delete staging.""" + file_names = [f"{_ACC1}_genomic.fna.gz", f"{_ACC1}_protein.faa.gz", f"{_ACC1}_rna.fna.gz"] + staged_keys = 
_stage(mock_s3_client_no_checksum, _STG1, {f: f.encode() for f in file_names}) + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + dry_run=True, + ) + + assert report["promoted"] == len(file_names) + assert report["dry_run"] is True + + # Final path must be empty + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=_LKH1) + assert result.get("KeyCount", 0) == 0 + + # Staging keys must survive + for key in staged_keys: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200, f"Staging deleted during dry-run: {key}" # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_skips_non_raw_data_paths( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Files outside raw_data/ (e.g. download_report.json) are silently skipped.""" + # Stage a real data file alongside non-promotable files + _stage(mock_s3_client_no_checksum, _STG1, {f"{_ACC1}_genomic.fna.gz": b"data"}) + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{_STAGE_PREFIX}download_report.json", Body=b"{}") + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{_STAGE_PREFIX}logs/run.log", Body=b"logs") + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["promoted"] == 1 # only the .fna.gz + assert report["failed"] == 0 + + +@pytest.mark.s3 +def test_promote_idempotent_second_run_on_empty_staging( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Second promote run after staging has been cleaned promotes 0 files without error.""" + _stage(mock_s3_client_no_checksum, _STG1, {f"{_ACC1}_genomic.fna.gz": b"data"}) + + report1 = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + report2 = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report1["promoted"] == 1 + assert report2["promoted"] == 0 + assert report2["failed"] == 0 + + # Final key still present after second run + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{_ACC1}_genomic.fna.gz") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_multi_file_md5_per_file( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Each promoted file carries the MD5 matching its own content, not another file's.""" + files = { + f"{_ACC1}_genomic.fna.gz": b"GENOMIC_UNIQUE", + f"{_ACC1}_protein.faa.gz": b"PROTEIN_UNIQUE", + f"{_ACC1}_rna.fna.gz": b"RNA_UNIQUE", + } + _stage(mock_s3_client_no_checksum, _STG1, files, with_md5=True) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + for fname, content in files.items(): + expected_md5 = hashlib.md5(content).hexdigest() # noqa: S324 + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{fname}") + assert resp["Metadata"].get("md5") == expected_md5, f"Wrong MD5 on {fname}" diff --git a/tests/pipelines/test_ncbi_ftp_download.py b/tests/pipelines/test_ncbi_ftp_download.py new file mode 100644 index 00000000..99a9ebfd --- /dev/null +++ b/tests/pipelines/test_ncbi_ftp_download.py @@ -0,0 +1,576 @@ +"""Tests for pipelines.ncbi_ftp_download — settings, 
batch orchestration, CLI.""" + +import json +from collections.abc import Generator +from pathlib import Path +from unittest.mock import MagicMock, patch + +import boto3 +import pytest +from moto import mock_aws +from pydantic import ValidationError + +from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST +from cdm_data_loaders.pipelines.cts_defaults import INPUT_MOUNT, OUTPUT_MOUNT +from cdm_data_loaders.pipelines.ncbi_ftp_download import ( + DownloadSettings, + download_and_stage, + download_batch, +) +from cdm_data_loaders.utils.s3 import reset_s3_client + +_MOCK_STATS = { + "accession": "GCF_000001215.4", + "assembly_dir": "GCF_000001215.4_Release_6_plus_ISO1_MT", + "files_downloaded": 0, + "files_skipped_checksum_mismatch": 0, + "files_without_checksum": 0, +} + +_DEFAULT_THREADS = 4 +_CUSTOM_THREADS = 8 +_ALIAS_THREADS = 16 +_BOUNDARY_MIN = 1 +_BOUNDARY_MAX = 32 +_OVER_MAX = 64 +_CUSTOM_LIMIT = 100 +_ALIAS_LIMIT = 50 +_EXPECTED_ATTEMPTED = 2 + + +def make_settings(**kwargs: str | int) -> DownloadSettings: + """Generate a validated DownloadSettings object.""" + return DownloadSettings(_cli_parse_args=[], **kwargs) + + +# ── Settings defaults ──────────────────────────────────────────────────── + + +class TestDownloadSettingsDefaults: + """Test default settings.""" + + def test_manifest_default(self) -> None: + """Verify default manifest path uses INPUT_MOUNT.""" + s = make_settings() + assert s.manifest == f"{INPUT_MOUNT}/transfer_manifest.txt" + + def test_output_dir_default(self) -> None: + """Verify default output_dir uses OUTPUT_MOUNT.""" + s = make_settings() + assert s.output_dir == OUTPUT_MOUNT + + def test_threads_default(self) -> None: + """Verify default threads is 4.""" + s = make_settings() + assert s.threads == _DEFAULT_THREADS + + def test_ftp_host_default(self) -> None: + """Verify default ftp_host matches FTP_HOST constant.""" + s = make_settings() + assert s.ftp_host == FTP_HOST + + def test_limit_default_none(self) -> None: + """Verify default limit is None.""" + s = make_settings() + assert s.limit is None + + +# ── Settings all params ────────────────────────────────────────────────── + + +class TestDownloadSettingsAllParams: + """Test with all params set.""" + + def test_all_params(self) -> None: + """Verify all parameters are correctly set when provided.""" + s = make_settings( + manifest="/data/my_manifest.txt", + output_dir="/data/output", + threads=_CUSTOM_THREADS, + ftp_host="ftp.example.com", + limit=_CUSTOM_LIMIT, + ) + assert s.manifest == "/data/my_manifest.txt" + assert s.output_dir == "/data/output" + assert s.threads == _CUSTOM_THREADS + assert s.ftp_host == "ftp.example.com" + assert s.limit == _CUSTOM_LIMIT + + +# ── Settings aliases ───────────────────────────────────────────────────── + + +class TestDownloadSettingsAliases: + """Test CLI alias resolution.""" + + def test_manifest_alias_m(self) -> None: + """Verify 'm' alias resolves to manifest.""" + s = make_settings(m="/data/m.txt") + assert s.manifest == "/data/m.txt" + + def test_output_dir_alias(self) -> None: + """Verify 'output_dir' / 'output-dir' alias resolves to output_dir.""" + s = make_settings(output_dir="/data/o") + assert s.output_dir == "/data/o" + + def test_threads_alias_t(self) -> None: + """Verify 't' alias resolves to threads.""" + s = make_settings(t=_ALIAS_THREADS) + assert s.threads == _ALIAS_THREADS + + def test_limit_alias_l(self) -> None: + """Verify 'l' alias resolves to limit.""" + s = make_settings(l=_ALIAS_LIMIT) + assert s.limit == _ALIAS_LIMIT + + +# ── Settings validation 
────────────────────────────────── + + +class TestDownloadSettingsValidation: + """Test validation constraints.""" + + def test_threads_too_low(self) -> None: + """Verify threads=0 raises ValidationError.""" + with pytest.raises(ValidationError): + make_settings(threads=0) + + def test_threads_too_high(self) -> None: + """Verify threads above 32 raises ValidationError.""" + with pytest.raises(ValidationError): + make_settings(threads=_OVER_MAX) + + def test_threads_boundary_1(self) -> None: + """Verify threads=1 is accepted.""" + s = make_settings(threads=_BOUNDARY_MIN) + assert s.threads == _BOUNDARY_MIN + + def test_threads_boundary_32(self) -> None: + """Verify threads=32 is accepted.""" + s = make_settings(threads=_BOUNDARY_MAX) + assert s.threads == _BOUNDARY_MAX + + def test_limit_must_be_positive(self) -> None: + """Verify limit=0 raises ValidationError.""" + with pytest.raises(ValidationError): + make_settings(limit=0) + + +# ── download_batch ─────────────────────────────────────────────────────── + + +class TestDownloadBatch: + """Test download_batch with mocked internals.""" + + @pytest.fixture(autouse=True) + def _mock_ftp_pool(self) -> Generator[None, None, None]: + """Prevent real FTP connections from the ThreadLocalFTP pool.""" + mock_pool = MagicMock() + with patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP", return_value=mock_pool): + yield + + def test_reads_manifest_and_calls_download(self, tmp_path: Path) -> None: + """Verify manifest is read and download is called for each entry.""" + manifest = tmp_path / "manifest.txt" + manifest.write_text( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n" + "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\n" + ) + output = tmp_path / "output" + output.mkdir() + + mock_stats = {"accession": "test", "files_downloaded": 3} + with patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", return_value=mock_stats): + report = download_batch( + manifest_path=str(manifest), + output_dir=str(output), + threads=1, + ftp_host="ftp.example.com", + ) + + assert report["total_attempted"] == _EXPECTED_ATTEMPTED + assert report["succeeded"] == _EXPECTED_ATTEMPTED + assert report["failed"] == 0 + + def test_limit_truncates(self, tmp_path: Path) -> None: + """Verify limit parameter truncates the number of assemblies processed.""" + manifest = tmp_path / "manifest.txt" + manifest.write_text( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n" + "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\n" + ) + output = tmp_path / "output" + output.mkdir() + + mock_stats = {"accession": "test", "files_downloaded": 1} + with patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", return_value=mock_stats): + report = download_batch( + manifest_path=str(manifest), + output_dir=str(output), + threads=1, + limit=1, + ) + assert report["total_attempted"] == 1 + + def test_writes_report_json(self, tmp_path: Path) -> None: + """Verify download_report.json is written to the output directory.""" + manifest = tmp_path / "manifest.txt" + manifest.write_text("/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n") + output = tmp_path / "output" + output.mkdir() + + mock_stats = {"accession": "GCF_000001215.4", "files_downloaded": 5} + with patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", return_value=mock_stats): + download_batch(manifest_path=str(manifest), output_dir=str(output), threads=1) + + 
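# The batch writes download_report.json next to the downloaded data. + 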
report_file = output / "download_report.json" + assert report_file.exists() + report = json.loads(report_file.read_text()) + assert "timestamp" in report + assert report["succeeded"] == 1 + + def test_handles_download_failure(self, tmp_path: Path) -> None: + """Verify failed downloads are counted and do not crash the batch.""" + manifest = tmp_path / "manifest.txt" + manifest.write_text("/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n") + output = tmp_path / "output" + output.mkdir() + + with patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + side_effect=RuntimeError("connection lost"), + ): + report = download_batch(manifest_path=str(manifest), output_dir=str(output), threads=1) + + assert report["failed"] == 1 + assert report["succeeded"] == 0 + + +# ── Helpers shared by download_and_stage tests ─────────────────────────── + +_MANIFEST_CONTENT = ( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n" + "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\n" +) +_TEST_BUCKET = "test-bucket" +_STAGING_PREFIX = "staging/run1/" + + +def _make_moto_s3(): + """Return a moto-backed S3 client with the test bucket created.""" + client = boto3.client("s3", region_name="us-east-1") + client.create_bucket(Bucket=_TEST_BUCKET) + return client + + +# ── download_and_stage — manifest source ──────────────────────────────── + + +@pytest.mark.parametrize( + ("manifest_s3_key", "_use_local"), + [ + pytest.param("staging/input/transfer_manifest.txt", False, id="s3_source"), + pytest.param(None, True, id="local_source"), + ], +) +@mock_aws +def test_download_and_stage_manifest_source( + tmp_path: Path, + manifest_s3_key: str | None, + _use_local: bool, +) -> None: + """Assembly paths from the manifest are processed regardless of source (S3 or local).""" + reset_s3_client() + s3 = _make_moto_s3() + + manifest_local: Path | None = None + if manifest_s3_key is not None: + s3.put_object(Bucket=_TEST_BUCKET, Key=manifest_s3_key, Body=_MANIFEST_CONTENT.encode()) + else: + manifest_local = tmp_path / "manifest.txt" + manifest_local.write_text(_MANIFEST_CONTENT) + + called_paths: list[str] = [] + + def _fake_download(path, output_dir, **kwargs): # noqa: ARG001 + called_paths.append(path) + return _MOCK_STATS + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + side_effect=_fake_download, + ), + ): + download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_s3_key=manifest_s3_key, + manifest_local_path=manifest_local, + dry_run=True, + threads=1, + ) + + expected_paths = [line for line in _MANIFEST_CONTENT.splitlines() if line.strip()] + assert sorted(called_paths) == sorted(expected_paths) + + reset_s3_client() + + +# ── download_and_stage — exactly one source required ──────────────────── + + +@pytest.mark.parametrize( + ("s3_key", "local_path", "should_raise"), + [ + pytest.param("s3/key", "local/path", True, id="both_provided_raises"), + pytest.param(None, None, True, id="neither_provided_raises"), + pytest.param("s3/key", None, False, id="s3_only_ok"), + pytest.param(None, "local/path", False, id="local_only_ok"), + ], +) +@mock_aws +def 
test_download_and_stage_exactly_one_source_required( + tmp_path: Path, + s3_key: str | None, + local_path: str | Path | None, + should_raise: bool, +) -> None: + """ValueError is raised when both or neither manifest sources are given.""" + reset_s3_client() + + if should_raise: + with pytest.raises(ValueError, match="manifest"): + download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_s3_key=s3_key, + manifest_local_path=local_path, + ) + else: + s3 = _make_moto_s3() + # For s3_only: seed the object; for local_only: create the file + if s3_key is not None: + s3.put_object(Bucket=_TEST_BUCKET, Key=s3_key, Body=_MANIFEST_CONTENT.encode()) + if local_path is not None: + real_local = tmp_path / "manifest.txt" + real_local.write_text(_MANIFEST_CONTENT) + local_path = real_local + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + return_value=_MOCK_STATS, + ), + ): + result = download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_s3_key=s3_key, + manifest_local_path=local_path, + dry_run=True, + ) + assert result["succeeded"] == _EXPECTED_ATTEMPTED + + reset_s3_client() + + +# ── download_and_stage — uploads to staging ────────────────────────────── + + +@mock_aws +def test_download_and_stage_uploads_to_staging(tmp_path: Path) -> None: + """Files produced by download_assembly_to_local and download_report.json are all staged to S3.""" + reset_s3_client() + s3 = _make_moto_s3() + + manifest_local = tmp_path / "manifest.txt" + # Single assembly so the fake download writes exactly the files we expect + manifest_local.write_text("/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n") + + assembly_rel = "raw_data/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT" + + def _fake_download(path, output_dir, **kwargs): # noqa: ARG001 + asm_dir = Path(output_dir) / assembly_rel + asm_dir.mkdir(parents=True) + (asm_dir / "genomic.fna.gz").write_bytes(b"fasta_data") + (asm_dir / "genomic.fna.gz.md5").write_bytes(b"abc123") + return {**_MOCK_STATS, "files_downloaded": 2} + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + side_effect=_fake_download, + ), + ): + report = download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_local_path=manifest_local, + dry_run=False, + threads=1, + ) + + paginator = s3.get_paginator("list_objects_v2") + uploaded_keys = {obj["Key"] for page in paginator.paginate(Bucket=_TEST_BUCKET) for obj in page.get("Contents", [])} + + expected_keys = { + f"{_STAGING_PREFIX}{assembly_rel}/genomic.fna.gz", + f"{_STAGING_PREFIX}{assembly_rel}/genomic.fna.gz.md5", + f"{_STAGING_PREFIX}download_report.json", + } + assert uploaded_keys == expected_keys + assert report["staged_objects"] == len(expected_keys) + + reset_s3_client() + + +# ── download_and_stage — dry_run skips 
upload ──────────────────────────── + + +@mock_aws +def test_download_and_stage_dry_run_skips_upload(tmp_path: Path) -> None: + """dry_run=True leaves S3 empty and returns staged_objects=0.""" + reset_s3_client() + s3 = _make_moto_s3() + + manifest_local = tmp_path / "manifest.txt" + manifest_local.write_text(_MANIFEST_CONTENT) + + def _fake_download(path, output_dir, **kwargs): # noqa: ARG001 + asm_dir = Path(output_dir) / "raw_data/GCF/000/001/215/GCF_000001215.4" + asm_dir.mkdir(parents=True, exist_ok=True) + (asm_dir / "genomic.fna.gz").write_bytes(b"fasta") + return _MOCK_STATS + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + side_effect=_fake_download, + ), + ): + report = download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_local_path=manifest_local, + dry_run=True, + threads=1, + ) + + listed = s3.list_objects_v2(Bucket=_TEST_BUCKET) + assert listed.get("KeyCount", 0) == 0 + assert report["staged_objects"] == 0 + assert report["dry_run"] is True + + reset_s3_client() + + +# ── download_and_stage — limit forwarded ──────────────────────────────── + + +@pytest.mark.parametrize( + "limit", + [ + pytest.param(1, id="limit_1"), + pytest.param(10, id="limit_10"), + ], +) +@mock_aws +def test_download_and_stage_limit_forwarded(tmp_path: Path, limit: int) -> None: + """The limit parameter truncates the number of assemblies processed.""" + reset_s3_client() + s3 = _make_moto_s3() + + manifest_local = tmp_path / "manifest.txt" + manifest_local.write_text(_MANIFEST_CONTENT) + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + return_value=_MOCK_STATS, + ) as mock_dl, + ): + download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_local_path=manifest_local, + limit=limit, + dry_run=True, + ) + + # The manifest has 2 entries; limit caps how many were processed + expected_calls = min(limit, _EXPECTED_ATTEMPTED) + assert mock_dl.call_count == expected_calls + + reset_s3_client() + + +# ── download_and_stage — report shape ─────────────────────────────────── + + +@mock_aws +def test_download_and_stage_report_shape(tmp_path: Path) -> None: + """Return value contains all expected keys including staged_objects, staging_key_prefix, dry_run.""" + reset_s3_client() + s3 = _make_moto_s3() + + manifest_local = tmp_path / "manifest.txt" + manifest_local.write_text(_MANIFEST_CONTENT) + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + return_value=_MOCK_STATS, + ), + ): + report = 
download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_local_path=manifest_local, + dry_run=True, + ) + + for key in ("timestamp", "total_attempted", "succeeded", "failed", "failures", "assembly_stats"): + assert key in report + assert report["staged_objects"] == 0 + assert report["staging_key_prefix"] == _STAGING_PREFIX + assert report["dry_run"] is True + assert report["total_attempted"] == _EXPECTED_ATTEMPTED + assert report["succeeded"] == _EXPECTED_ATTEMPTED + + reset_s3_client() diff --git a/tests/s3_helpers.py b/tests/s3_helpers.py new file mode 100644 index 00000000..cf98c3d5 --- /dev/null +++ b/tests/s3_helpers.py @@ -0,0 +1,25 @@ +"""Shared S3 test helpers. + +# NOTE: Moto currently does not support CRC64NVME; remove this helper when it does. +""" + +import functools +from collections.abc import Callable +from typing import Any + + +def strip_checksum_algorithm(method: Callable[..., Any]) -> Callable[..., Any]: + """Wrap a boto3 S3 method to remove the ChecksumAlgorithm argument before calling moto. + + Moto does not implement CRC64NVME checksums, so any call that includes + ChecksumAlgorithm='CRC64NVME' would fail. This wrapper silently drops the + argument so the rest of the call proceeds normally against the moto backend. + """ + + @functools.wraps(method) + def wrapper(*args: Any, **kwargs: Any) -> Any: + """Remove the ChecksumAlgorithm argument from the call.""" + kwargs.pop("ChecksumAlgorithm", None) + return method(*args, **kwargs) + + return wrapper diff --git a/tests/utils/test_checksums.py b/tests/utils/test_checksums.py new file mode 100644 index 00000000..6c7bdbc6 --- /dev/null +++ b/tests/utils/test_checksums.py @@ -0,0 +1,76 @@ +"""Tests for utils.checksums module — MD5 and CRC64/NVME checksum utilities.""" + +import base64 +import hashlib +from pathlib import Path + +import pytest + +from cdm_data_loaders.utils.checksums import compute_md5, verify_md5 + +_EXPECTED_CRC64_BYTE_LEN = 8 + + +class TestComputeMd5: + """Test MD5 computation.""" + + def test_correct_hash(self, tmp_path: Path) -> None: + """Verify MD5 matches hashlib reference.""" + f = tmp_path / "test.bin" + f.write_bytes(b"Hello, World!") + assert compute_md5(f) == hashlib.md5(b"Hello, World!").hexdigest() # noqa: S324 + + def test_empty_file(self, tmp_path: Path) -> None: + """Verify MD5 of an empty file.""" + f = tmp_path / "empty" + f.write_bytes(b"") + assert compute_md5(f) == hashlib.md5(b"").hexdigest() # noqa: S324 + + def test_accepts_str_path(self, tmp_path: Path) -> None: + """Verify compute_md5 accepts a string path.""" + f = tmp_path / "test.txt" + f.write_bytes(b"data") + assert compute_md5(str(f)) == hashlib.md5(b"data").hexdigest() # noqa: S324 + + +class TestVerifyMd5: + """Test MD5 verification.""" + + def test_correct(self, tmp_path: Path) -> None: + """Verify True when MD5 matches.""" + f = tmp_path / "test.bin" + f.write_bytes(b"Hello, World!") + expected = hashlib.md5(b"Hello, World!").hexdigest() # noqa: S324 + assert verify_md5(f, expected) is True + + def test_incorrect(self, tmp_path: Path) -> None: + """Verify False when MD5 does not match.""" + f = tmp_path / "test.bin" + f.write_bytes(b"Hello, World!") + assert verify_md5(f, "0000000000000000") is False + + +class TestComputeCrc64nvme: + """Test CRC64/NVME computation (skipped if awscrt unavailable).""" + + @pytest.fixture(autouse=True) + def _skip_if_no_awscrt(self) -> None: + pytest.importorskip("awscrt") + + def test_returns_base64(self, tmp_path: Path) -> None: + """Verify CRC64/NVME 
returns an 8-byte base64 string.""" + from cdm_data_loaders.utils.checksums import compute_crc64nvme # noqa: PLC0415 + + f = tmp_path / "test.bin" + f.write_bytes(b"Hello, World!") + crc = compute_crc64nvme(f) + decoded = base64.b64decode(crc) + assert len(decoded) == _EXPECTED_CRC64_BYTE_LEN + + def test_deterministic(self, tmp_path: Path) -> None: + """Verify repeated calls return the same checksum.""" + from cdm_data_loaders.utils.checksums import compute_crc64nvme # noqa: PLC0415 + + f = tmp_path / "test.bin" + f.write_bytes(b"test data for checksum") + assert compute_crc64nvme(f) == compute_crc64nvme(f) diff --git a/tests/utils/test_ftp_client.py b/tests/utils/test_ftp_client.py new file mode 100644 index 00000000..385fd329 --- /dev/null +++ b/tests/utils/test_ftp_client.py @@ -0,0 +1,199 @@ +"""Tests for utils.ftp_client module — mock ftplib for keepalive, retry, thread-local.""" + +import socket +import time +from collections.abc import Callable +from ftplib import FTP, error_temp +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from cdm_data_loaders.utils.ftp_client import ( + ThreadLocalFTP, + _set_keepalive, + connect_ftp, + ftp_download_file, + ftp_list_dir, + ftp_noop_keepalive, + ftp_retrieve_text, +) + +_IDLE_SECONDS = 30 +_KEEPIDLE_VALUE = 60 +_KEEPALIVE_INTERVAL = 25 +_EXPECTED_RETRY_COUNT = 2 +_FTP_TIMEOUT = 30 +_ERR_421 = "421 timeout" + + +class TestSetKeepalive: + """Test TCP keepalive socket options.""" + + def test_sets_so_keepalive(self) -> None: + """Verify SO_KEEPALIVE is set on the socket.""" + mock_ftp = MagicMock(spec=FTP) + mock_sock = MagicMock() + mock_ftp.sock = mock_sock + _set_keepalive(mock_ftp) + mock_sock.setsockopt.assert_any_call(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) + + def test_sets_tcp_keepidle(self) -> None: + """Verify TCP_KEEPIDLE is set when available.""" + mock_ftp = MagicMock(spec=FTP) + mock_sock = MagicMock() + mock_ftp.sock = mock_sock + _set_keepalive(mock_ftp, idle=_KEEPIDLE_VALUE) + if hasattr(socket, "TCP_KEEPIDLE"): + mock_sock.setsockopt.assert_any_call(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, _KEEPIDLE_VALUE) + + +class TestConnectFtp: + """Test connect_ftp creates and configures an FTP connection.""" + + @patch("cdm_data_loaders.utils.ftp_client.FTP") + def test_connect_and_login(self, mock_ftp_cls: MagicMock) -> None: + """Verify FTP object is created, login called, and returned.""" + mock_ftp = MagicMock() + mock_ftp.sock = MagicMock() + mock_ftp_cls.return_value = mock_ftp + result = connect_ftp("ftp.example.com", timeout=_FTP_TIMEOUT) + mock_ftp_cls.assert_called_once_with("ftp.example.com", timeout=_FTP_TIMEOUT) + mock_ftp.login.assert_called_once() + assert result is mock_ftp + + +class TestFtpNoopKeepalive: + """Test NOOP keepalive logic.""" + + def test_sends_noop_when_idle(self) -> None: + """Verify NOOP is sent when idle exceeds interval.""" + mock_ftp = MagicMock(spec=FTP) + old_time = time.monotonic() - _IDLE_SECONDS + new_time = ftp_noop_keepalive(mock_ftp, old_time, interval=_KEEPALIVE_INTERVAL) + mock_ftp.sendcmd.assert_called_once_with("NOOP") + assert new_time > old_time + + def test_no_noop_when_recent(self) -> None: + """Verify no NOOP is sent when activity is recent.""" + mock_ftp = MagicMock(spec=FTP) + recent = time.monotonic() + result = ftp_noop_keepalive(mock_ftp, recent, interval=_KEEPALIVE_INTERVAL) + mock_ftp.sendcmd.assert_not_called() + assert result == recent + + +class TestFtpListDir: + """Test ftp_list_dir with retry.""" + + def 
test_returns_file_list(self) -> None: + """Verify file listing is returned correctly.""" + mock_ftp = MagicMock(spec=FTP) + + def fake_retrlines(_cmd: str, callback: Callable[[str], None]) -> None: + for name in ["file1.txt", "file2.gz"]: + callback(name) + + mock_ftp.retrlines.side_effect = fake_retrlines + result = ftp_list_dir(mock_ftp, "/some/path") + assert result == ["file1.txt", "file2.gz"] + mock_ftp.cwd.assert_called_once_with("/some/path") + + def test_retries_on_error_temp(self) -> None: + """Verify retry logic on FTP temporary errors.""" + mock_ftp = MagicMock(spec=FTP) + call_count = 0 + + def fake_retrlines(_cmd: str, callback: Callable[[str], None]) -> None: + nonlocal call_count + call_count += 1 + if call_count < _EXPECTED_RETRY_COUNT: + raise error_temp(_ERR_421) # noqa: S321 + callback("file.txt") + + mock_ftp.retrlines.side_effect = fake_retrlines + result = ftp_list_dir(mock_ftp, "/path", retries=3) + assert result == ["file.txt"] + assert call_count == _EXPECTED_RETRY_COUNT + + def test_raises_after_exhausted_retries(self) -> None: + """Verify error is raised after all retries are exhausted.""" + mock_ftp = MagicMock(spec=FTP) + mock_ftp.retrlines.side_effect = error_temp(_ERR_421) # noqa: S321 + with pytest.raises(error_temp): + ftp_list_dir(mock_ftp, "/path", retries=_EXPECTED_RETRY_COUNT) + + +class TestFtpDownloadFile: + """Test ftp_download_file with retry.""" + + def test_downloads_file(self, tmp_path: Path) -> None: + """Verify file is downloaded and written to disk.""" + mock_ftp = MagicMock(spec=FTP) + + def fake_retrbinary(_cmd: str, callback: Callable[[bytes], None]) -> None: + callback(b"file data") + + mock_ftp.retrbinary.side_effect = fake_retrbinary + local = tmp_path / "out.bin" + ftp_download_file(mock_ftp, "remote.bin", str(local)) + assert local.read_bytes() == b"file data" + + def test_retries_on_error_temp(self, tmp_path: Path) -> None: + """Verify download retries on FTP temporary errors.""" + mock_ftp = MagicMock(spec=FTP) + call_count = 0 + + def fake_retrbinary(_cmd: str, callback: Callable[[bytes], None]) -> None: + nonlocal call_count + call_count += 1 + if call_count < _EXPECTED_RETRY_COUNT: + msg = "421" + raise error_temp(msg) # noqa: S321 + callback(b"ok") + + mock_ftp.retrbinary.side_effect = fake_retrbinary + local = str(tmp_path / "out.bin") + ftp_download_file(mock_ftp, "remote.bin", local, retries=3) + assert call_count == _EXPECTED_RETRY_COUNT + + +class TestFtpRetrieveText: + """Test ftp_retrieve_text.""" + + def test_returns_content(self) -> None: + """Verify text content is retrieved and joined with newlines.""" + mock_ftp = MagicMock(spec=FTP) + + def fake_retrlines(_cmd: str, callback: Callable[[str], None]) -> None: + for line in ["line1", "line2"]: + callback(line) + + mock_ftp.retrlines.side_effect = fake_retrlines + result = ftp_retrieve_text(mock_ftp, "remote.txt") + assert result == "line1\nline2" + + +class TestThreadLocalFTP: + """Test thread-local FTP connection management.""" + + @patch("cdm_data_loaders.utils.ftp_client.connect_ftp") + def test_get_returns_same_connection(self, mock_connect: MagicMock) -> None: + """Verify get() returns the same FTP connection on repeated calls.""" + mock_ftp = MagicMock() + mock_connect.return_value = mock_ftp + pool = ThreadLocalFTP("ftp.example.com") + ftp1 = pool.get() + ftp2 = pool.get() + assert ftp1 is ftp2 + mock_connect.assert_called_once() + + @patch("cdm_data_loaders.utils.ftp_client.connect_ftp") + def test_close_all(self, mock_connect: MagicMock) -> None: + """Verify 
close_all() quits the FTP connection.""" + mock_ftp = MagicMock() + mock_connect.return_value = mock_ftp + pool = ThreadLocalFTP("ftp.example.com") + pool.get() + pool.close_all() + mock_ftp.quit.assert_called_once() diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index 0ba7d567..8c6c2f9b 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -1,9 +1,8 @@ """Tests for s3_utils.py using moto to mock AWS S3.""" -import functools import io import logging -from collections.abc import Callable, Generator +from collections.abc import Generator from pathlib import Path from typing import Any from unittest.mock import MagicMock, patch @@ -16,12 +15,14 @@ from requests.exceptions import HTTPError import cdm_data_loaders.utils.s3 as s3_utils +from tests.s3_helpers import strip_checksum_algorithm from cdm_data_loaders.utils.s3 import ( CDM_LAKE_BUCKET, DEFAULT_EXTRA_ARGS, copy_directory, copy_object, delete_object, + delete_objects, download_file, get_s3_client, head_object, @@ -51,6 +52,11 @@ } BUCKETS = [CDM_LAKE_BUCKET, ALT_BUCKET] +HTTP_STATUS_OK = 200 +HTTP_STATUS_NO_CONTENT = 204 +SIZE_HELLO = 5 +SIZE_DATA = 4 + @pytest.fixture def mock_s3_client() -> Generator[Any, Any]: @@ -318,7 +324,7 @@ def test_list_matching_objects_empty_for_missing_prefix( } -# TODO: use a single fixture for all these tests +# NOTE: These tests currently compose multiple fixtures explicitly for readability. @pytest.mark.parametrize("dir_path", EXPECTED_FILE_LIST.keys()) @pytest.mark.s3 def test_list_matching_objects_returns_more_than_1000_entries( @@ -344,13 +350,13 @@ def test_head_object_and_object_exists_true_and_false(mock_s3_client: Any, proto for bucket, file_list in FILES_IN_BUCKETS.items(): for f in file_list: output = head_object(f"{protocol}{bucket}/{f}") - assert output.get("ResponseMetadata", {}).get("HTTPStatusCode") == 200 + assert output is not None + assert isinstance(output["size"], int) assert object_exists(f"{protocol}{bucket}/{f}") is True nonexistent_file = f"{protocol}{bucket}/a-file-i-just-made-up.txt" assert object_exists(nonexistent_file) is False - with pytest.raises(ClientError, match=r"An error occurred \(404\) when calling the HeadObject operation"): - head_object(nonexistent_file) + assert head_object(nonexistent_file) is None @pytest.mark.parametrize("s3_path", ["absent", "dir_one", "dir_one/", "dir_one/file1.tnt"]) @@ -388,16 +394,14 @@ def test_upload_file_uses_custom_object_name(mock_s3_client: Any, sample_file: P @pytest.mark.s3 -def test_upload_file_skips_when_already_present( - mock_s3_client: Any, sample_file: Path, caplog: pytest.LogCaptureFixture -) -> None: - """Verify that uploading a file that already exists is skipped and returns True.""" +def test_upload_file_skips_when_already_present(mock_s3_client: Any, sample_file: Path) -> None: + """Verify that uploading a file that already exists is skipped, returns True, and leaves the object unchanged.""" mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}", Body=b"old") result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads") assert result is True - last_log_message = caplog.records[-1] - assert "File already present" in last_log_message.message - assert last_log_message.levelno == logging.INFO + # The existing object must not have been overwritten + obj = mock_s3_client.get_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}") + assert obj["Body"].read() == b"old" @pytest.mark.usefixtures("mock_s3_client") @@ -684,33 +688,15 @@ def 
test_upload_dir_raises_on_empty_destination(sample_dir: Path) -> None: upload_dir(sample_dir, "") -# FIXME: once moto supports CRC64NVME, this can be removed -def strip_checksum_algorithm(method: Callable): - """Wrap a boto3 S3 method to remove the ChecksumAlgorithm argument before calling moto. - - Moto does not implement CRC64NVME checksums, so any call that includes - ChecksumAlgorithm='CRC64NVME' would fail. This wrapper silently drops the - argument so the rest of the call proceeds normally against the moto backend. - """ - - @functools.wraps(method) - def wrapper(*args, **kwargs): - """Remove the ChecksumAlgorithm argument from the call.""" - kwargs.pop("ChecksumAlgorithm", None) - return method(*args, **kwargs) - - return wrapper - - @pytest.fixture -def mocked_s3_client_no_checksum(mock_s3_client: Any) -> Generator[Any, Any]: - """Yield the mocked S3 client with copy_object patched to strip ChecksumAlgorithm. +def mocked_s3_client_no_checksum(mock_s3_client: Any) -> Any: + """Return the mocked S3 client with copy_object patched to strip ChecksumAlgorithm. This works around the moto limitation of not supporting CRC64NVME checksums, allowing copy_object calls that include ChecksumAlgorithm to succeed. """ mock_s3_client.copy_object = strip_checksum_algorithm(mock_s3_client.copy_object) - yield mock_s3_client + return mock_s3_client # copy_object @@ -728,7 +714,7 @@ def test_copy_object(mocked_s3_client_no_checksum: Any, destination: str) -> Non obj = mocked_s3_client_no_checksum.get_object(Bucket=destination, Key="dst/path/to/file.txt") assert obj["Body"].read() == b"copy me" - assert response["ResponseMetadata"]["HTTPStatusCode"] == 200 + assert response["ResponseMetadata"]["HTTPStatusCode"] == HTTP_STATUS_OK @pytest.mark.s3 @@ -751,6 +737,16 @@ def test_copy_object_source_bucket_nonexistent() -> None: copy_object(s3_path, f"{CDM_LAKE_BUCKET}/a/different/path/to/file") +@pytest.mark.s3 +@pytest.mark.usefixtures("mock_s3_client") +def test_copy_object_source_object_nonexistent() -> None: + """Ensure that the code throws an error if the source object does not exist.""" + s3_path = f"{CDM_LAKE_BUCKET}/some/path/to/file" + assert object_exists(s3_path) is False + with pytest.raises(Exception, match="The specified key does not exist"): + copy_object(s3_path, f"{CDM_LAKE_BUCKET}/a/different/path/to/file") + + # copy_directory tests @@ -890,12 +886,127 @@ def test_delete_object_removes_object(mock_s3_client: Any, bucket: str, protocol resp = delete_object(s3_path) assert object_exists(s3_path) is False - assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == 204 + assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == HTTP_STATUS_NO_CONTENT # retry the deletion resp = delete_object(s3_path) assert object_exists(s3_path) is False - assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == 204 + assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == HTTP_STATUS_NO_CONTENT + + +# upload_file with metadata +@pytest.mark.parametrize("bucket", BUCKETS) +@pytest.mark.s3 +def test_upload_file_with_metadata_attaches_metadata(mock_s3_client: Any, sample_file: Path, bucket: str) -> None: + """Verify that upload_file with metadata stores user metadata on the uploaded object.""" + metadata = {"md5": "abc123", "source": "ncbi"} + result = upload_file(sample_file, f"{bucket}/uploads", tags=metadata) + assert result is True + + resp = mock_s3_client.head_object(Bucket=bucket, Key=f"uploads/{sample_file.name}") + assert resp["Metadata"]["md5"] == "abc123" + assert 
resp["Metadata"]["source"] == "ncbi" + + +@pytest.mark.s3 +def test_upload_file_with_metadata_custom_object_name(mock_s3_client: Any, sample_file: Path) -> None: + """Verify that the object_name parameter overrides the filename.""" + result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads", tags={"k": "v"}, object_name="renamed.txt") + assert result is True + obj = mock_s3_client.get_object(Bucket=CDM_LAKE_BUCKET, Key="uploads/renamed.txt") + assert obj["Body"].read() == b"hello s3" + + +@pytest.mark.s3 +def test_upload_file_with_metadata_overwrites_existing(mock_s3_client: Any, sample_file: Path) -> None: + """Verify that upload_file with metadata uploads even when the object already exists.""" + mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}", Body=b"old") + result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads", tags={"new": "true"}) + assert result is True + obj = mock_s3_client.get_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}") + assert obj["Body"].read() == b"hello s3" + + +@pytest.mark.usefixtures("mock_s3_client") +@pytest.mark.s3 +def test_upload_file_with_metadata_raises_on_empty_destination(sample_file: Path) -> None: + """Verify ValueError when destination_dir is empty.""" + with pytest.raises(ValueError, match="No destination directory"): + upload_file(sample_file, "", tags={"k": "v"}) + + +@pytest.mark.usefixtures("mock_s3_client") +@pytest.mark.parametrize("path_type", [str, Path]) +@pytest.mark.s3 +def test_upload_file_with_metadata_accepts_str_and_path(sample_file: Path, path_type: type[str] | type[Path]) -> None: + """Verify that upload_file with metadata accepts both str and Path.""" + result = upload_file(path_type(sample_file), f"{CDM_LAKE_BUCKET}/uploads", tags={}) + assert result is True + + +# head_object +@pytest.mark.s3 +def test_head_object_returns_info(mock_s3_client: Any) -> None: + """Verify that head_object returns size, metadata, and checksum fields.""" + mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key="info/file.txt", Body=b"hello", Metadata={"md5": "abc123"}) + result = head_object(f"{CDM_LAKE_BUCKET}/info/file.txt") + assert result is not None + assert result["size"] == SIZE_HELLO + assert result["metadata"]["md5"] == "abc123" + # moto may not populate CRC64NVME, but the key should be present + assert "checksum_crc64nvme" in result + + +@pytest.mark.s3 +@pytest.mark.usefixtures("mock_s3_client") +def test_head_object_returns_none_for_missing() -> None: + """Verify that head_object returns None for a non-existent object.""" + result = head_object(f"{CDM_LAKE_BUCKET}/does/not/exist.txt") + assert result is None + + +@pytest.mark.parametrize("protocol", ["", "s3://", "s3a://"]) +@pytest.mark.s3 +def test_head_object_with_protocols(mock_s3_client: Any, protocol: str) -> None: + """Verify that head_object handles all valid protocol prefixes.""" + mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key="proto/file.txt", Body=b"data") + result = head_object(f"{protocol}{CDM_LAKE_BUCKET}/proto/file.txt") + assert result is not None + assert result["size"] == SIZE_DATA + + +# copy_object +@pytest.mark.parametrize("destination", BUCKETS) +@pytest.mark.s3 +def test_copy_object_preserves_user_metadata(mocked_s3_client_no_checksum: Any, destination: str) -> None: + """copy_object preserves source user metadata (MetadataDirective=COPY default).""" + mocked_s3_client_no_checksum.put_object( + Bucket=CDM_LAKE_BUCKET, Key="src/file.txt", Body=b"archive me", Metadata={"md5": "abc123"} + ) + response = 
copy_object( + f"{CDM_LAKE_BUCKET}/src/file.txt", + f"{destination}/archive/file.txt", + ) + assert response["ResponseMetadata"]["HTTPStatusCode"] == HTTP_STATUS_OK + + # source user metadata is preserved (MetadataDirective=COPY) + resp = mocked_s3_client_no_checksum.head_object(Bucket=destination, Key="archive/file.txt") + assert resp["Metadata"].get("md5") == "abc123" + + # verify source still exists + assert object_exists(f"{CDM_LAKE_BUCKET}/src/file.txt") + + +@pytest.mark.s3 +def test_copy_object_preserves_content(mocked_s3_client_no_checksum: Any) -> None: + """Verify that the content of the copied object matches the original.""" + mocked_s3_client_no_checksum.put_object(Bucket=CDM_LAKE_BUCKET, Key="src/data.bin", Body=b"binary data") + copy_object( + f"{CDM_LAKE_BUCKET}/src/data.bin", + f"{CDM_LAKE_BUCKET}/dst/data.bin", + ) + obj = mocked_s3_client_no_checksum.get_object(Bucket=CDM_LAKE_BUCKET, Key="dst/data.bin") + assert obj["Body"].read() == b"binary data" # delete_object - bucket does not exist @@ -907,3 +1018,34 @@ def test_delete_object_no_such_bucket() -> None: assert object_exists(s3_path) is False with pytest.raises(Exception, match="The specified bucket does not exist"): delete_object(s3_path) + + +# delete_objects +@pytest.mark.s3 +def test_delete_objects_removes_all(mock_s3_client: Any) -> None: + """delete_objects removes every listed key in a single call.""" + keys = ["bulk/a.txt", "bulk/b.txt", "bulk/c.txt"] + for k in keys: + mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key=k, Body=b"data") + + errors = delete_objects(CDM_LAKE_BUCKET, keys) + + assert errors == [] + for k in keys: + assert object_exists(f"{CDM_LAKE_BUCKET}/{k}") is False + + +@pytest.mark.s3 +def test_delete_objects_empty_list_is_noop(mock_s3_client: Any) -> None: + """delete_objects with an empty list makes no API call and returns no errors.""" + mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key="keep/me.txt", Body=b"safe") + errors = delete_objects(CDM_LAKE_BUCKET, []) + assert errors == [] + assert object_exists(f"{CDM_LAKE_BUCKET}/keep/me.txt") is True + + +@pytest.mark.s3 +def test_delete_objects_nonexistent_keys_no_error(mock_s3_client: Any) -> None: + """Deleting keys that don't exist returns no errors (S3 delete is idempotent).""" + errors = delete_objects(CDM_LAKE_BUCKET, ["ghost/a.txt", "ghost/b.txt"]) + assert errors == []
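
For reference, the new import `from tests.s3_helpers import strip_checksum_algorithm` points at a module this diff does not show. A minimal sketch of `tests/s3_helpers.py`, assuming it simply relocates the wrapper deleted from `tests/utils/test_s3.py` above:

```python
"""Shared helpers for the S3 test suite (sketch of tests/s3_helpers.py, assuming a straight relocation)."""

import functools
from collections.abc import Callable


# FIXME: once moto supports CRC64NVME, this can be removed
def strip_checksum_algorithm(method: Callable) -> Callable:
    """Wrap a boto3 S3 method to remove the ChecksumAlgorithm argument before calling moto.

    Moto does not implement CRC64NVME checksums, so any call that includes
    ChecksumAlgorithm='CRC64NVME' would fail. This wrapper silently drops the
    argument so the rest of the call proceeds normally against the moto backend.
    """

    @functools.wraps(method)
    def wrapper(*args, **kwargs):
        """Drop the ChecksumAlgorithm argument, then delegate to the wrapped method."""
        kwargs.pop("ChecksumAlgorithm", None)
        return method(*args, **kwargs)

    return wrapper
```

Keeping the wrapper in a shared module lets both the `mocked_s3_client_no_checksum` fixture and any other moto-based test strip the unsupported argument without duplicating the workaround.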