From f7fc9d57c6459e0f679b4eebbdc5e96cee0b35d9 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 16 Apr 2026 15:18:29 -0700 Subject: [PATCH 01/76] draft ncbi scripts --- cdm_ncbi_fto.prompt.md | 292 ++++++++++ notebooks/ncbi_ftp_manifest.ipynb | 210 +++++++ notebooks/ncbi_ftp_promote.ipynb | 172 ++++++ pyproject.toml | 3 +- scripts/entrypoint.sh | 8 +- src/cdm_data_loaders/ncbi_ftp/__init__.py | 0 src/cdm_data_loaders/ncbi_ftp/assembly.py | 206 +++++++ src/cdm_data_loaders/ncbi_ftp/manifest.py | 471 ++++++++++++++++ src/cdm_data_loaders/ncbi_ftp/promote.py | 281 ++++++++++ .../pipelines/ncbi_ftp_download.py | 191 +++++++ src/cdm_data_loaders/utils/checksums.py | 55 ++ src/cdm_data_loaders/utils/ftp_client.py | 162 ++++++ src/cdm_data_loaders/utils/s3.py | 114 ++++ tests/ncbi_ftp/__init__.py | 0 tests/ncbi_ftp/conftest.py | 80 +++ tests/ncbi_ftp/test_assembly.py | 114 ++++ tests/ncbi_ftp/test_manifest.py | 516 ++++++++++++++++++ tests/ncbi_ftp/test_notebooks.py | 89 +++ tests/ncbi_ftp/test_promote.py | 278 ++++++++++ tests/pipelines/test_ncbi_ftp_download.py | 230 ++++++++ tests/utils/test_checksums.py | 76 +++ tests/utils/test_ftp_client.py | 199 +++++++ tests/utils/test_s3.py | 128 +++++ 23 files changed, 3872 insertions(+), 3 deletions(-) create mode 100644 cdm_ncbi_fto.prompt.md create mode 100644 notebooks/ncbi_ftp_manifest.ipynb create mode 100644 notebooks/ncbi_ftp_promote.ipynb create mode 100644 src/cdm_data_loaders/ncbi_ftp/__init__.py create mode 100644 src/cdm_data_loaders/ncbi_ftp/assembly.py create mode 100644 src/cdm_data_loaders/ncbi_ftp/manifest.py create mode 100644 src/cdm_data_loaders/ncbi_ftp/promote.py create mode 100644 src/cdm_data_loaders/pipelines/ncbi_ftp_download.py create mode 100644 src/cdm_data_loaders/utils/checksums.py create mode 100644 src/cdm_data_loaders/utils/ftp_client.py create mode 100644 tests/ncbi_ftp/__init__.py create mode 100644 tests/ncbi_ftp/conftest.py create mode 100644 tests/ncbi_ftp/test_assembly.py create mode 
100644 tests/ncbi_ftp/test_manifest.py create mode 100644 tests/ncbi_ftp/test_notebooks.py create mode 100644 tests/ncbi_ftp/test_promote.py create mode 100644 tests/pipelines/test_ncbi_ftp_download.py create mode 100644 tests/utils/test_checksums.py create mode 100644 tests/utils/test_ftp_client.py diff --git a/cdm_ncbi_fto.prompt.md b/cdm_ncbi_fto.prompt.md new file mode 100644 index 00000000..a1e52592 --- /dev/null +++ b/cdm_ncbi_fto.prompt.md @@ -0,0 +1,292 @@ +# Plan: Port NCBI Assembly Sync to cdm-data-loaders + +Port the 3-phase NCBI assembly transfer pipeline from this repo +([kbase-transfers](https://github.com/kbase/kbase-transfers)) on the `develop-ncbi-automation` branch into +[kbase/cdm-data-loaders](https://github.com/kbase/cdm-data-loaders/tree/develop) +(develop branch). + +- **Phase 2** (container download) becomes a new CTS entrypoint command. +- **Phases 1 and 3** become Jupyter notebooks in `notebooks/`. +- The existing cdm-data-loaders `utils/s3.py` gets new functions for metadata support + (existing functions are not modified). +- Tests use **moto** (matching cdm-data-loaders conventions). +- FTP logic stays as **ftplib**. +- New code lives in a dedicated `src/cdm_data_loaders/ncbi_ftp/` module, + separate from existing NCBI REST / refseq code. + +### Phase responsibilities + +Each phase has a deliberately narrow scope: + +| Phase | Input | Output | Responsibility | +|-------|-------|--------|----------------| +| 1 — Manifest | NCBI assembly summary + previous snapshot from S3 | `transfer_manifest.txt`, `removed_manifest.txt`, `diff_summary.json` | **All** filtering logic (prefix ranges, limits, diffing). Produces a final list of what to transfer and what to archive. | +| 2 — Download | `transfer_manifest.txt` (from input mount) | Downloaded files in output mount, preserving FTP directory structure (`GCF/000/001/.../assembly_dir/`) | Reads the manifest; downloads exactly those assemblies from NCBI FTP; verifies MD5; writes `.md5` sidecars.
No filtering, no S3 access. | +| 3 — Promote | Downloaded files in S3 staging prefix + `removed_manifest.txt` | Files at final Lakehouse paths; archived replaced assemblies | Syncs staging → final location. Archives replaced/suppressed assemblies. **Removes successfully-promoted entries from `transfer_manifest.txt`** so an interrupted Phase 2 can resume from where it left off. | + +--- + +## Background: the 3-phase pipeline + +The pipeline is documented in this repo's +[scripts/ncbi/README.md](README.md#semi-automated-transfer-pipeline): + +| Phase | Script (this repo) | Purpose | +|-------|-------------------|---------| +| 1 — Manifest | [`generate_transfer_manifest.py`](generate_transfer_manifest.py) | Diff NCBI assembly summary against previous snapshot; produce download + removal manifests | +| 2 — Download | [`container_download.py`](container_download.py) | Download assemblies from NCBI FTP, verify MD5, write `.md5` sidecars (runs in CTS container) | +| 3 — Promote | [`promote_staged_files.py`](promote_staged_files.py) | Copy staged files to final Lakehouse paths; archive replaced/suppressed assemblies | + +Supporting code in this repo: + +| File | What to port | +|------|-------------| +| [`download_genomes.py`](download_genomes.py) | FTP resilience patterns (TCP keepalive, NOOP pings, thread-local connections), file filters, accession path construction | +| [`../../kbase_transfers/minio_client.py`](../../kbase_transfers/minio_client.py) | Metadata-aware upload pattern (MD5 as user metadata, CRC64/NVME checksums) | +| [`../../tests/test_sync.py`](../../tests/test_sync.py) | Unit tests for parsing, diffing — port to moto-based tests | +| [`../../tests/test_minio_client.py`](../../tests/test_minio_client.py) | Integration test patterns for S3 operations | + +--- + +## Phase A: Extend cdm-data-loaders `utils/s3.py` + +The promote step needs to attach user-metadata (MD5) to uploads and read +checksums via HEAD. 
The existing +[`s3.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/utils/s3.py) +doesn't support custom metadata on upload or `head_object` with checksum +retrieval. + +**Prefer adding new functions over modifying existing ones to minimise +impact on other scripts.** + +### Steps + +1. **Add `upload_file_with_metadata()`** — new function that accepts + `local_file_path`, `destination_dir`, `metadata: dict[str, str]`, + optional `object_name`. Passes `Metadata` in `ExtraArgs` alongside the + existing `ChecksumAlgorithm: CRC64NVME`. Same upload logic as + `upload_file()` but with metadata support. + +2. **Add `head_object(s3_path)`** — new function returning dict with `size`, + `metadata`, `checksum_crc64nvme` (from `ChecksumCRC64NVME` header), or + `None` if 404. Uses `ChecksumMode='ENABLED'`. + +3. **Add `copy_object_with_metadata()`** — new function wrapping + `s3.copy_object()` that accepts `metadata` + `MetadataDirective='REPLACE'` + for archiving replaced assemblies with tags (`archive_reason`, `archive_date`, + `ncbi_last_release`). + +4. **Add moto-based tests** following the existing `mock_s3_client` fixture pattern + in [`tests/utils/test_s3.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/tests/utils/test_s3.py). + Use the existing `strip_checksum_algorithm` workaround for moto's CRC64NVME limitation. + +**Files modified:** +- `src/cdm_data_loaders/utils/s3.py` — add 3 new functions (no changes to existing functions) +- `tests/utils/test_s3.py` — add corresponding tests + +--- + +## Phase B: Create the `ncbi_ftp` module + +New module at `src/cdm_data_loaders/ncbi_ftp/`, separate from the +existing `ncbi_rest_api.py` pipeline and `refseq_pipeline/`. 
+ +``` +src/cdm_data_loaders/ncbi_ftp/ +├── __init__.py +├── ftp_client.py # FTP: connect, keepalive, list, download, retry +├── manifest.py # Phase 1: summary diffing, manifest generation, ALL filtering +├── download.py # Phase 2: CTS container download + MD5 verification +├── promote.py # Phase 3: sync staging → final, archive, trim manifest +├── checksums.py # MD5 verification, CRC64/NVME computation +└── settings.py # Pydantic settings (extends CtsDefaultSettings) +``` + +### Steps + +5. **`ftp_client.py`** — Port from [`download_genomes.py`](download_genomes.py) + and [`container_download.py`](container_download.py): + - `connect_ftp(host, timeout)` with TCP keepalive (`SO_KEEPALIVE`, `TCP_KEEPIDLE`, `TCP_KEEPINTVL`) + - `ftp_noop_keepalive(ftp, interval)` — NOOP sender for idle connections + - `ftp_list_dir(ftp, path)` — NLST wrapper with retry on `error_temp` + - `ftp_download_file(ftp, remote_path, local_path)` — `RETR` with retry + - Thread-local FTP connection management for parallel downloads + - Use `get_cdm_logger()` instead of print statements + +6. **`checksums.py`** — Port from [`download_genomes.py`](download_genomes.py): + - `compute_crc64nvme(file_path) -> str` — reads in 1MB chunks, returns base64-encoded 8-byte big-endian (uses `awscrt.checksums.crc64nvme`) + - `verify_md5(file_path, expected_md5) -> bool` + - `parse_md5_checksums_file(text) -> dict[str, str]` — parses NCBI `md5checksums.txt` + +7. **`settings.py`** — Pydantic settings following + [`cts_defaults.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/pipelines/cts_defaults.py) pattern: + - `DownloadSettings(CtsDefaultSettings)` — adds `manifest`, `threads`, `ftp_host` + - CLI-parseable with `AliasChoices`, `field_validator` for constraints + +8. 
**`manifest.py`** — Port from [`generate_transfer_manifest.py`](generate_transfer_manifest.py): + - `download_assembly_summary(ftp, database) -> str` + - `parse_assembly_summary(text) -> dict[str, AssemblyRecord]` + - `compute_diff(current, previous) -> DiffResult` — new/updated/replaced/suppressed/withdrawn + - `write_manifest(diff, output_dir)` — writes `transfer_manifest.txt`, `removed_manifest.txt`, `diff_summary.json` + - **All filtering logic lives here**: prefix-range filtering (`--prefix-from`, `--prefix-to`), `--limit`, any other subsetting + - The output `transfer_manifest.txt` is the final, filtered list — Phase 2 downloads exactly what's in it + +9. **`download.py`** — Port from [`container_download.py`](container_download.py). + **This phase is deliberately simple**: read manifest, download, verify. + - `run_download(settings: DownloadSettings)` — main CTS entry point + - Reads `transfer_manifest.txt` from input mount; each line is an FTP path + - `download_assembly(ftp, ftp_path, output_dir) -> DownloadResult` + - File filters: `_genomic.fna.gz`, `_genomic.gff.gz`, `_protein.faa.gz`, `_gene_ontology.gaf.gz`, `_assembly_report.txt`, `_assembly_stats.txt`, etc. + - MD5 verification (3 retries), `.md5` sidecar writing + - `ThreadPoolExecutor` for parallel downloads + - Output preserves FTP directory structure: `{GCF|GCA}/{000}/{001}/{215}/{assembly_dir}/` (same subfolder hierarchy as on the FTP server) + - Writes `download_report.json` summary + - `cli()` function using `run_cli(DownloadSettings, run_download)` from + [`core.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/pipelines/core.py) + - **No filtering or subsetting logic** — downloads exactly what's in the manifest + +10. 
**`promote.py`** — Port from [`promote_staged_files.py`](promote_staged_files.py): + - `run_promote(staging_prefix, removed_manifest, release_tag, manifest_path)` + - Walk staged files in S3 staging prefix, upload each to final Lakehouse path via `upload_file_with_metadata()` with MD5 from `.md5` sidecars + - Skip `.md5` and `.crc64nvme` sidecar files themselves + - Archive replaced/suppressed assemblies (from `removed_manifest.txt`): `copy_object_with_metadata()` to `archive/{release}/...` with metadata tags, then `delete_object()` + - **Manifest trimming for resumability**: after successfully promoting an assembly's files, remove that entry from `transfer_manifest.txt`. If Phase 2 is re-run after a partial failure, it only downloads the remaining entries. + - `--dry-run` support + +--- + +## Phase C: Notebooks for Phases 1 and 3 + +11. **`notebooks/ncbi_ftp_manifest.ipynb`** — Phase 1: + - Cell 1: imports + S3 client init (`get_s3_client()`) + - Cell 2: configure parameters (database, prefix-from/to, limit, dry-run) + - Cell 3: download current assembly summary from FTP + - Cell 4: load previous summary from S3 (or scan existing prefixes) + - Cell 5: compute diff, display summary + - Cell 6: apply filters (prefix range, limit), write manifest files, upload new summary to S3 + +12. **`notebooks/ncbi_ftp_promote.ipynb`** — Phase 3: + - Cell 1: imports + S3 client init + - Cell 2: configure parameters (staging prefix, removed manifest path, release tag, manifest path for trimming, dry-run) + - Cell 3: scan staged files, display summary + - Cell 4: promote files to final paths + - Cell 5: archive replaced/suppressed assemblies + - Cell 6: trim manifest (remove promoted entries), display promotion report + +--- + +## Phase D: Container integration (Phase 2) + +13. Register CLI entry point in `pyproject.toml`: + ```toml + [project.scripts] + ncbi_ftp_sync = "cdm_data_loaders.ncbi_ftp.download:cli" + ``` + +14. 
Add command to `scripts/entrypoint.sh`: + ```bash + ncbi_ftp_sync) + exec /usr/bin/tini -- uv run --no-sync ncbi_ftp_sync "$@" + ;; + ``` + +15. No Dockerfile changes needed (package installed via `uv sync`; entrypoint dispatches). + +--- + +## Phase E: Tests + +All tests use **moto** for S3 mocking. No live MinIO dependency in CI. + +``` +tests/ncbi_ftp/ +├── __init__.py +├── conftest.py # Mock FTP, sample manifests, assembly records +├── test_ftp_client.py # Mock ftplib: keepalive, retry, thread-local +├── test_checksums.py # MD5 verify, CRC64/NVME, md5checksums.txt parsing +├── test_manifest.py # Summary parsing, diff logic, filtering (port from test_sync.py) +├── test_download.py # Mock FTP + fs: filters, MD5 verify, sidecars, layout +├── test_promote.py # moto S3: upload with metadata, archive, manifest trimming, dry-run +└── test_settings.py # Pydantic validation (follow test_ncbi_rest_api.py) +``` + +### Steps + +16. **`test_checksums.py`** — `verify_md5` correct/incorrect, `parse_md5_checksums_file`, + `compute_crc64nvme` (skip if `awscrt` unavailable) + +17. **`test_manifest.py`** — Port relevant tests from this repo's + [`tests/test_sync.py`](../../tests/test_sync.py): `parse_assembly_summary`, + `compute_diff` (new/updated/replaced/suppressed/withdrawn), `write_manifest`, + prefix-range filtering, limit filtering + +18. **`test_ftp_client.py`** — Mock `ftplib.FTP`: keepalive options, retry on + `error_temp`, thread-local connections + +19. **`test_download.py`** — Mock FTP + filesystem: file filter logic, MD5 + verification, sidecar writing, directory layout preserves FTP structure, + `download_report.json` + +20. **`test_promote.py`** — moto `mock_s3_client` fixture: upload with metadata, + archive copy with tags, deletion of originals, **manifest trimming** (verify + promoted entries removed, remaining entries preserved), dry-run no side effects. + Use `strip_checksum_algorithm` workaround for CRC64NVME. + +21. 
**`test_settings.py`** — Follow + [`test_ncbi_rest_api.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/tests/pipelines/test_ncbi_rest_api.py) + pattern: defaults, all params, CLI variants, invalid values, boolean parsing + +--- + +## Phase F: Dependencies and CI + +22. Add `awscrt` to `pyproject.toml` if not already covered by `boto3[crt]`. + +23. All new tests run automatically in CI — no `requires_spark` marks needed. + ruff checks apply (120 char lines, py313 target). + +--- + +## Key reference patterns in cdm-data-loaders + +| Pattern | Where to find it | +|---------|-----------------| +| S3 utility functions + moto tests | [`utils/s3.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/utils/s3.py), [`tests/utils/test_s3.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/tests/utils/test_s3.py) | +| CTS settings base class | [`pipelines/cts_defaults.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/pipelines/cts_defaults.py) | +| `run_cli()` entry point | [`pipelines/core.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/pipelines/core.py) | +| Logger | [`utils/cdm_logger.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/utils/cdm_logger.py) | +| Settings test pattern | [`tests/pipelines/test_ncbi_rest_api.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/tests/pipelines/test_ncbi_rest_api.py) | +| Entrypoint dispatch | [`scripts/entrypoint.sh`](https://github.com/kbase/cdm-data-loaders/blob/develop/scripts/entrypoint.sh) | +| moto CRC64NVME workaround | `strip_checksum_algorithm()` in [`tests/utils/test_s3.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/tests/utils/test_s3.py) | + +--- + +## Verification + +1. `ruff check src/cdm_data_loaders/ncbi_ftp/ tests/ncbi_ftp/` — lint passes +2. `ruff format --check src/cdm_data_loaders/ncbi_ftp/ tests/ncbi_ftp/` — formatting passes +3. 
`uv run pytest tests/ncbi_ftp/ -v` — all unit tests pass +4. `uv run pytest tests/utils/test_s3.py -v` — new S3 function tests pass +5. Manual: build Docker image, run `ncbi_ftp_sync` with small manifest against local MinIO +6. Manual: run both notebooks against local MinIO for end-to-end verification + +--- + +## Decisions + +- **`ncbi_ftp` naming** — distinguishes bulk FTP file transfer from the existing NCBI REST API pipeline (`ncbi_rest_api.py`) and Spark-based refseq processing (`refseq_pipeline/`) +- **New functions in `s3.py`, not modified existing ones** — minimises impact on other scripts; avoids signature changes that could break callers +- **All filtering in Phase 1** — Phase 2 is a pure download-what's-in-the-manifest step; Phase 3 is a pure sync-and-archive step. Clean separation of concerns. +- **Manifest trimming in Phase 3** — enables resumable Phase 2 runs. After promoting files, Phase 3 removes those entries from `transfer_manifest.txt`. Re-running Phase 2 only downloads what's left. +- **Output preserves FTP directory structure** — Phase 2 writes files under the same `GCF/000/001/.../assembly_dir/` path used on the FTP server, making it trivial to correlate staged files with their NCBI source +- **moto for tests** — matches cdm-data-loaders conventions; fast, no Docker in CI. The `strip_checksum_algorithm` workaround handles the CRC64NVME gap. 
+- **ftplib over httpx** — NCBI FTP is the established bulk download protocol; existing keepalive/NOOP/retry patterns are proven +- **Notebooks for Phases 1 & 3** — interactive, judgement-requiring steps; natural fit for JupyterLab +- **Phase 2 as CTS command** — matches the entrypoint dispatch pattern and CTS mount contract + +## Excluded from scope + +- Frictionless `datapackage.json` descriptors (only in old monolithic `download_genomes.py`) +- `backfill_checksums.py` (legacy utility, not part of ongoing pipeline) +- `download_genomes.py` monolith (superseded by the 3-phase pipeline) +- Spark/Delta Lake integration (assembly sync is file-level, not data transformation) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb new file mode 100644 index 00000000..0cbb5295 --- /dev/null +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7a05af26", + "metadata": {}, + "source": [ + "# NCBI Assembly Manifest Generation (Phase 1)\n", + "\n", + "Downloads the current NCBI assembly summary from FTP, compares it against a\n", + "previous snapshot, and produces:\n", + "\n", + "- `transfer_manifest.txt` — assemblies to download in Phase 2\n", + "- `removed_manifest.txt` — assemblies to archive in Phase 3\n", + "- `diff_summary.json` — human-readable summary of changes\n", + "\n", + "All filtering (prefix range, limit) is applied here so downstream phases\n", + "receive a final, pre-filtered manifest." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "319383dc", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Imports and S3 client initialisation.\"\"\"\n", + "\n", + "from __future__ import annotations\n", + "\n", + "import json\n", + "from pathlib import Path\n", + "\n", + "from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST\n", + "from cdm_data_loaders.ncbi_ftp.manifest import (\n", + " AssemblyRecord,\n", + " compute_diff,\n", + " download_assembly_summary,\n", + " filter_by_prefix_range,\n", + " parse_assembly_summary,\n", + " write_diff_summary,\n", + " write_removed_manifest,\n", + " write_transfer_manifest,\n", + " write_updated_manifest,\n", + ")\n", + "from cdm_data_loaders.utils.s3 import get_s3_client, split_s3_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d8cdb6f", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Configure parameters.\"\"\"\n", + "\n", + "# Which NCBI database to sync: \"refseq\" or \"genbank\"\n", + "DATABASE = \"refseq\"\n", + "\n", + "# Accession prefix filtering (3-digit, inclusive). Set to None to skip.\n", + "PREFIX_FROM: str | None = None # e.g. \"000\"\n", + "PREFIX_TO: str | None = None # e.g. \"003\"\n", + "\n", + "# Maximum number of new/updated assemblies to include (None = unlimited)\n", + "LIMIT: int | None = None\n", + "\n", + "# S3 path to the previous assembly summary snapshot (set to None on first run)\n", + "PREVIOUS_SUMMARY_S3: str | None = None # e.g. \"s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/metadata/assembly_summary_refseq_prev.txt\"\n", + "\n", + "# S3 path where the new snapshot will be uploaded after diffing\n", + "SNAPSHOT_UPLOAD_S3: str | None = None # e.g. 
\"s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/metadata/assembly_summary_refseq_curr.txt\"\n", + "\n", + "# Local output directory for manifest files\n", + "OUTPUT_DIR = Path(\"output\")\n", + "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# FTP hostname (default is the standard NCBI FTP server)\n", + "FTP_HOSTNAME = FTP_HOST\n", + "\n", + "print(f\"Database: {DATABASE}\")\n", + "print(f\"Prefix range: {PREFIX_FROM} -> {PREFIX_TO}\")\n", + "print(f\"Limit: {LIMIT}\")\n", + "print(f\"Output dir: {OUTPUT_DIR}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b10c3aaf", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Download current assembly summary from NCBI FTP.\"\"\"\n", + "\n", + "raw_summary = download_assembly_summary(database=DATABASE, ftp_host=FTP_HOSTNAME)\n", + "current = parse_assembly_summary(raw_summary)\n", + "print(f\"Parsed {len(current)} assemblies from current {DATABASE} summary\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88954378", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Load previous summary from S3 (or start fresh).\"\"\"\n", + "\n", + "previous: dict[str, AssemblyRecord] | None = None\n", + "\n", + "if PREVIOUS_SUMMARY_S3:\n", + " s3 = get_s3_client()\n", + " bucket, key = split_s3_path(PREVIOUS_SUMMARY_S3)\n", + " resp = s3.get_object(Bucket=bucket, Key=key)\n", + " prev_text = resp[\"Body\"].read().decode(\"utf-8\")\n", + " previous = parse_assembly_summary(prev_text)\n", + " print(f\"Loaded {len(previous)} assemblies from previous snapshot\")\n", + "else:\n", + " print(\"No previous snapshot — all current 'latest' assemblies will be marked as new\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18482b3c", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Compute diff and apply filters.\"\"\"\n", + "\n", + "# Filter current assemblies by prefix range\n", + "filtered = 
filter_by_prefix_range(current, prefix_from=PREFIX_FROM, prefix_to=PREFIX_TO)\n", + "print(f\"After prefix filter: {len(filtered)} assemblies\")\n", + "\n", + "# Also filter previous if present\n", + "filtered_prev = filter_by_prefix_range(previous, prefix_from=PREFIX_FROM, prefix_to=PREFIX_TO) if previous else None\n", + "\n", + "# Compute diff\n", + "diff = compute_diff(filtered, previous_assemblies=filtered_prev)\n", + "\n", + "print(f\"New: {len(diff.new)}\")\n", + "print(f\"Updated: {len(diff.updated)}\")\n", + "print(f\"Replaced: {len(diff.replaced)}\")\n", + "print(f\"Suppressed: {len(diff.suppressed)}\")\n", + "print(f\"Total to transfer: {len(diff.new) + len(diff.updated)}\")\n", + "print(f\"Total to remove: {len(diff.replaced) + len(diff.suppressed)}\")\n", + "\n", + "# Apply limit if set\n", + "if LIMIT is not None:\n", + " original_new = len(diff.new)\n", + " original_updated = len(diff.updated)\n", + " combined = diff.new + diff.updated\n", + " limited = combined[:LIMIT]\n", + " diff.new = [a for a in diff.new if a in set(limited)]\n", + " diff.updated = [a for a in diff.updated if a in set(limited)]\n", + " print(f\"After limit ({LIMIT}): {len(diff.new)} new, {len(diff.updated)} updated\")\n", + " print(f\" (was {original_new} new, {original_updated} updated)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9e2b631", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Write manifest files and upload snapshot to S3.\"\"\"\n", + "\n", + "# Write transfer manifest\n", + "transfer_path = OUTPUT_DIR / \"transfer_manifest.txt\"\n", + "paths = write_transfer_manifest(diff, filtered, transfer_path, ftp_host=FTP_HOSTNAME)\n", + "print(f\"Transfer manifest: {len(paths)} entries -> {transfer_path}\")\n", + "\n", + "# Write removed manifest\n", + "removed_path = OUTPUT_DIR / \"removed_manifest.txt\"\n", + "removed = write_removed_manifest(diff, removed_path)\n", + "print(f\"Removed manifest: {len(removed)} entries -> 
{removed_path}\")\n", + "\n", + "# Write updated manifest (for Phase 3 pre-overwrite archiving)\n", + "updated_path = OUTPUT_DIR / \"updated_manifest.txt\"\n", + "updated = write_updated_manifest(diff, updated_path)\n", + "print(f\"Updated manifest: {len(updated)} entries -> {updated_path}\")\n", + "\n", + "# Write diff summary\n", + "summary_path = OUTPUT_DIR / \"diff_summary.json\"\n", + "summary = write_diff_summary(diff, summary_path, DATABASE, PREFIX_FROM, PREFIX_TO)\n", + "print(f\"Diff summary -> {summary_path}\")\n", + "print(json.dumps(summary[\"counts\"], indent=2))\n", + "\n", + "# Upload new snapshot to S3 for future diffing\n", + "if SNAPSHOT_UPLOAD_S3:\n", + " s3 = get_s3_client()\n", + " bucket, key = split_s3_path(SNAPSHOT_UPLOAD_S3)\n", + " s3.put_object(Bucket=bucket, Key=key, Body=raw_summary.encode(\"utf-8\"))\n", + " print(f\"Uploaded new snapshot to {SNAPSHOT_UPLOAD_S3}\")\n", + "else:\n", + " print(\"Skipping S3 snapshot upload (SNAPSHOT_UPLOAD_S3 not set)\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb new file mode 100644 index 00000000..de19c0e6 --- /dev/null +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -0,0 +1,172 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2eda19a9", + "metadata": {}, + "source": [ + "# NCBI Assembly Promote & Archive (Phase 3)\n", + "\n", + "Promotes staged assembly files from S3 staging (written by CTS Phase 2)\n", + "to their final Lakehouse paths, archives replaced/suppressed assemblies,\n", + "and trims the transfer manifest for resumability.\n", + "\n", + "Steps:\n", + "1. Configure staging prefix, removed manifest, updated manifest, and release tag\n", + "2. Scan staged files and display summary\n", + "3. Archive existing versions of updated assemblies (pre-overwrite)\n", + "4. Promote files to final paths with MD5 metadata\n", + "5. 
Archive replaced/suppressed assemblies\n", + "6. Trim manifest (remove promoted entries)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b736665", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Imports and S3 client initialisation.\"\"\"\n", + "\n", + "from __future__ import annotations\n", + "\n", + "import json\n", + "\n", + "from cdm_data_loaders.ncbi_ftp.promote import (\n", + " DEFAULT_PATH_PREFIX,\n", + " promote_from_s3,\n", + ")\n", + "from cdm_data_loaders.utils.s3 import get_s3_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b36a556c", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Configure parameters.\"\"\"\n", + "\n", + "# S3 bucket where staged files and final Lakehouse data live\n", + "BUCKET = \"cdm-lake\" # e.g. \"cdm-lake\"\n", + "\n", + "# Staging prefix written by CTS Phase 2 (from the CTS output mount)\n", + "STAGING_PREFIX = \"staging/run1/\" # e.g. \"staging/run1/\"\n", + "\n", + "# Local path to removed_manifest.txt from Phase 1 (or None to skip archiving)\n", + "REMOVED_MANIFEST: str | None = None # e.g. \"output/removed_manifest.txt\"\n", + "\n", + "# Local path to updated_manifest.txt from Phase 1 (or None to skip pre-overwrite archiving)\n", + "UPDATED_MANIFEST: str | None = None # e.g. \"output/updated_manifest.txt\"\n", + "\n", + "# NCBI release tag for archive metadata (e.g. \"2024-01\")\n", + "NCBI_RELEASE: str | None = None\n", + "\n", + "# S3 key of transfer_manifest.txt for trimming (or None to skip)\n", + "MANIFEST_S3_KEY: str | None = None # e.g. 
\"ncbi/transfer_manifest.txt\"\n", + "\n", + "# Final Lakehouse path prefix\n", + "PATH_PREFIX = DEFAULT_PATH_PREFIX\n", + "\n", + "# Dry-run mode — log actions without making changes\n", + "DRY_RUN = True\n", + "\n", + "print(f\"Updated manifest: {UPDATED_MANIFEST}\")\n", + "print(f\"NCBI release: {NCBI_RELEASE}\")\n", + "print(f\"Manifest S3 key: {MANIFEST_S3_KEY}\")\n", + "print(f\"Path prefix: {PATH_PREFIX}\")\n", + "\n", + "print(f\"Dry-run: {DRY_RUN}\")\n", + "print(f\"Path prefix: {PATH_PREFIX}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e521fd45", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Scan staged files and display summary.\"\"\"\n", + "\n", + "s3 = get_s3_client()\n", + "paginator = s3.get_paginator(\"list_objects_v2\")\n", + "\n", + "staged: list[str] = []\n", + "for page in paginator.paginate(Bucket=BUCKET, Prefix=STAGING_PREFIX):\n", + " staged.extend(obj[\"Key\"] for obj in page.get(\"Contents\", []))\n", + "\n", + "sidecars = [k for k in staged if k.endswith((\".md5\", \".crc64nvme\"))]\n", + "data_files = [k for k in staged if k not in set(sidecars)]\n", + "\n", + "print(f\"Staged objects: {len(staged)}\")\n", + "print(f\" Data files: {len(data_files)}\")\n", + "print(f\" Sidecars: {len(sidecars)}\")\n", + "\n", + "# Show first few data files\n", + "PREVIEW_COUNT = 10\n", + "for key in data_files[:PREVIEW_COUNT]:\n", + " print(f\" {key}\")\n", + "if len(data_files) > PREVIEW_COUNT:\n", + " print(f\" ... 
and {len(data_files) - PREVIEW_COUNT} more\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10a46367", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Promote staged files to final Lakehouse paths.\"\"\"\n", + "\n", + "report = promote_from_s3(\n", + " staging_prefix=STAGING_PREFIX,\n", + " bucket=BUCKET,\n", + " removed_manifest=REMOVED_MANIFEST,\n", + " updated_manifest=UPDATED_MANIFEST,\n", + " ncbi_release=NCBI_RELEASE,\n", + " manifest_path=MANIFEST_S3_KEY,\n", + " path_prefix=PATH_PREFIX,\n", + " dry_run=DRY_RUN,\n", + ")\n", + "\n", + "print(json.dumps(report, indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d18a1e0", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Display promotion report.\"\"\"\n", + "\n", + "print(\"=\" * 50)\n", + "print(\"PROMOTION REPORT\")\n", + "print(\"=\" * 50)\n", + "print(f\"Promoted: {report['promoted']}\")\n", + "print(f\"Archived: {report['archived']}\")\n", + "print(f\"Failed: {report['failed']}\")\n", + "print(f\"Dry-run: {report['dry_run']}\")\n", + "print(f\"Timestamp: {report['timestamp']}\")\n", + "\n", + "if report['failed'] > 0:\n", + " print(\"\\n⚠️ Some operations failed — check logs above for details.\")\n", + "\n", + "if report['dry_run']:\n", + " print(\"\\n📋 This was a dry-run. 
Set DRY_RUN = False and re-run to apply changes.\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index c90538a0..5898efde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ uniprot = "cdm_data_loaders.pipelines.uniprot_kb:cli" uniref = "cdm_data_loaders.pipelines.uniref:cli" ncbi_rest_api = "cdm_data_loaders.pipelines.ncbi_rest_api:cli" +ncbi_ftp_sync = "cdm_data_loaders.pipelines.ncbi_ftp_download:cli" [dependency-groups] dev = [ @@ -170,7 +171,7 @@ ignore = [ [tool.ruff.lint.per-file-ignores] "*.ipynb" = ["T201"] # ignore printing in notebooks -"tests/**/*.py" = ["S101", "T201", "FBT001", "FBT002"] # use of assert, booleans +"tests/**/*.py" = ["S101", "T201", "FBT001", "FBT002", "ARG002"] # use of assert, booleans, unused mock args "tests/utils/test_s3.py" = ["ANN401"] "**/__init__.py" = ["D104"] diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index 74d90521..ee087591 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -3,7 +3,7 @@ set -euo pipefail # Ensure at least one argument is provided if [ "$#" -eq 0 ]; then - echo "Usage: $0 {uniref|uniprot|ncbi_rest_api|xml_split|test} [args...]" + echo "Usage: $0 {uniref|uniprot|ncbi_rest_api|ncbi_ftp_sync|xml_split|test} [args...]" exit 1 fi @@ -27,6 +27,10 @@ case "$cmd" in # Run the NCBI datasets API importer exec /usr/bin/tini -- uv run --no-sync ncbi_rest_api "$@" ;; + ncbi_ftp_sync) + # Run the NCBI FTP assembly download pipeline (Phase 2) + exec /usr/bin/tini -- uv run --no-sync ncbi_ftp_sync "$@" + ;; test) # run the tests exec /usr/bin/tini -- uv run --no-sync pytest -m "not requires_spark" @@ -35,7 +39,7 @@ case "$cmd" in exec /usr/bin/tini -- /bin/bash ;; *) - echo "Error: unknown command '$cmd'; valid commands are 'uniref', 'uniprot', 'ncbi_rest_api', or 'xml_split'." 
def parse_md5_checksums_file(text: str) -> dict[str, str]:
    """Parse an NCBI ``md5checksums.txt`` file into a filename-to-hash mapping.

    Each non-blank line has the form ``<md5>  ./<filename>``. The hash and the
    filename are separated on the first run of whitespace, so the exact number
    of spaces (or a tab) between them does not matter. Lines that do not
    contain both fields are ignored.

    :param text: raw text of the md5checksums.txt file
    :return: dict mapping filename (without the leading ``./``) to MD5 hex digest
    """
    checksums: dict[str, str] = {}
    for raw_line in text.splitlines():
        # split() with no separator skips leading whitespace and treats any
        # whitespace run as a single delimiter.
        parts = raw_line.split(maxsplit=1)
        if len(parts) != 2:  # noqa: PLR2004
            continue
        md5_hash, filename = parts
        checksums[filename.strip().removeprefix("./")] = md5_hash
    return checksums
def _download_and_verify(  # noqa: PLR0913
    ftp: FTP,
    filename: str,
    dest_dir: Path,
    md5_checksums: dict[str, str],
    stats: dict[str, Any],
    last_activity: float,
) -> float:
    """Download one file, verify its MD5, and write a sidecar if valid.

    The file is fetched with ``RETR`` from the FTP connection's current
    directory into *dest_dir*. When *md5_checksums* has an entry for the file,
    the download is retried (up to three attempts in total) until the local
    MD5 matches; on success a ``<filename>.md5`` sidecar holding the expected
    digest is written next to it. If every attempt mismatches, the local file
    is removed and ``files_skipped_checksum_mismatch`` is incremented. Files
    without a known checksum are kept unverified and counted in
    ``files_without_checksum``.

    If the transfer itself raises, the partially written local file is removed
    before the exception propagates, so a truncated file is never left in the
    output directory.

    :param ftp: connected FTP client, already in the assembly directory
    :param filename: name of the remote file to fetch
    :param dest_dir: local directory to write into
    :param md5_checksums: mapping of filename to expected MD5 hex digest
    :param stats: mutable counters dict, updated in place
    :param last_activity: timestamp handed to the keep-alive helper
    :return: updated last-activity timestamp (``time.monotonic()`` of the last transfer)
    """
    max_attempts = 3
    last_activity = ftp_noop_keepalive(ftp, last_activity)
    local_file = dest_dir / filename
    expected_md5 = md5_checksums.get(filename)

    for attempt in range(1, max_attempts + 1):
        logger.debug(" Downloading %s (attempt %d/3)", filename, attempt)
        try:
            with local_file.open("wb") as f:
                ftp.retrbinary(f"RETR {filename}", f.write)
        except Exception:
            # Never leave a truncated file behind for later stages to pick up.
            local_file.unlink(missing_ok=True)
            raise
        last_activity = time.monotonic()

        if not expected_md5:
            # Nothing to verify against: keep the file, but record the gap.
            stats["files_without_checksum"] += 1
            break

        actual_md5 = compute_md5(str(local_file))
        if actual_md5 == expected_md5:
            logger.debug(" MD5 verified: %s", filename)
            (dest_dir / f"{filename}.md5").write_text(expected_md5)
            break

        logger.warning(
            " MD5 mismatch for %s: expected %s, got %s",
            filename,
            expected_md5,
            actual_md5,
        )
        if attempt == max_attempts:
            stats["files_skipped_checksum_mismatch"] += 1
            local_file.unlink(missing_ok=True)
            return last_activity

    stats["files_downloaded"] += 1
    return last_activity
+ + Creates a directory structure under *output_dir* matching the S3 layout, + downloads filtered files, verifies MD5 checksums, and writes ``.md5`` + sidecar files for downstream metadata. + + :param assembly_path: FTP directory path for the assembly + :param output_dir: base output directory + :param ftp_host: FTP hostname + :param ftp: optional existing FTP connection (caller manages lifecycle) + :return: dict with download statistics + """ + _database, assembly_dir, accession = parse_assembly_path(assembly_path) + rel_path = build_accession_path(assembly_dir) + dest_dir = Path(output_dir) / rel_path + dest_dir.mkdir(parents=True, exist_ok=True) + + logger.info("Downloading %s -> %s", accession, dest_dir) + + owns_ftp = ftp is None + if owns_ftp: + ftp = connect_ftp(ftp_host) + stats: dict[str, Any] = { + "accession": accession, + "assembly_dir": assembly_dir, + "files_downloaded": 0, + "files_skipped_checksum_mismatch": 0, + "files_without_checksum": 0, + } + + try: + ftp.cwd(assembly_path.rstrip("/")) + + files: list[str] = [] + ftp.retrlines("NLST", files.append) + + # Download and parse md5checksums.txt + md5_checksums: dict[str, str] = {} + if "md5checksums.txt" in files: + md5_text = ftp_retrieve_text(ftp, "md5checksums.txt") + md5_checksums = parse_md5_checksums_file(md5_text) + (dest_dir / "md5checksums.txt").write_text(md5_text) + stats["files_downloaded"] += 1 + + target_files = [f for f in files if any(f.endswith(s) for s in FILE_FILTERS)] + last_activity = time.monotonic() + + for filename in target_files: + last_activity = _download_and_verify(ftp, filename, dest_dir, md5_checksums, stats, last_activity) + + logger.info(" %s: %d files downloaded", accession, stats["files_downloaded"]) + + finally: + if owns_ftp: + with contextlib.suppress(Exception): + ftp.quit() + + return stats diff --git a/src/cdm_data_loaders/ncbi_ftp/manifest.py b/src/cdm_data_loaders/ncbi_ftp/manifest.py new file mode 100644 index 00000000..21aa084d --- /dev/null +++ 
@dataclass
class AssemblyRecord:
    """Parsed row from an NCBI assembly summary file.

    Field comments describe the values as populated by
    ``parse_assembly_summary`` in this module.
    """

    # Assembly accession, e.g. ``GCF_000001215.4`` (summary column 0).
    accession: str
    # Version status (summary column 10): "latest", "replaced" or "suppressed".
    status: str
    # Sequence release date, stored verbatim as text (summary column 14).
    seq_rel_date: str
    # FTP location of the assembly as given in the summary (column 19).
    ftp_path: str
    # Last path component of ``ftp_path`` (the assembly directory name).
    assembly_dir: str
def parse_assembly_summary(source: str | Path | list[str]) -> dict[str, AssemblyRecord]:
    """Parse an NCBI assembly summary into a dict of assembly records.

    Accepts a file path, raw text string, or list of lines. A ``str`` is
    treated as a path only when it contains no newline and names an existing
    file; otherwise it is parsed as raw text.

    Columns of interest (0-indexed):
        0: assembly_accession (e.g. GCF_000001215.4)
        10: version_status ("latest", "replaced", "suppressed")
        14: seq_rel_date
        19: ftp_path (full FTP URL or "na")

    Comment lines (starting with ``#``), rows with fewer than 20 columns and
    rows whose ftp_path is ``"na"`` are skipped. If an accession occurs more
    than once, the last row wins.

    :param source: file path, raw text, or list of lines
    :return: dict mapping accession to :class:`AssemblyRecord`
    """
    assemblies: dict[str, AssemblyRecord] = {}

    def _parse_lines(lines: Iterable[str]) -> None:
        reader = csv.reader(
            (line.rstrip("\n") for line in lines if not line.startswith("#")),
            delimiter="\t",
            # The summary is plain tab-separated text with no quoting
            # convention. With the default dialect a '"' at the start of a
            # field opens a quoted field that can swallow tabs and even the
            # following rows, so quote handling is disabled.
            quoting=csv.QUOTE_NONE,
        )
        for row in reader:
            if len(row) < 20:  # noqa: PLR2004
                continue
            accession = row[0]
            ftp_path = row[19]
            if ftp_path == "na":
                continue
            assemblies[accession] = AssemblyRecord(
                accession=accession,
                status=row[10],
                seq_rel_date=row[14],
                ftp_path=ftp_path,
                assembly_dir=ftp_path.rstrip("/").split("/")[-1],
            )

    if isinstance(source, Path) or (isinstance(source, str) and "\n" not in source and Path(source).is_file()):
        with Path(source).open() as f:
            _parse_lines(f)
    elif isinstance(source, list):
        _parse_lines(source)
    else:
        _parse_lines(source.splitlines(keepends=True))

    logger.info("Parsed %d assemblies from summary", len(assemblies))
    return assemblies
def compute_diff(  # noqa: PLR0912
    current: dict[str, AssemblyRecord],
    previous_assemblies: dict[str, AssemblyRecord] | None = None,
    previous_accessions: set[str] | None = None,
) -> DiffResult:
    """Compute the diff between current and previous assembly state.

    Classification of each accession in *current*:

    * status ``replaced`` / ``suppressed``: reported only if previously known
    * status ``latest`` and not previously known: ``new``
    * status ``latest``, previously known, and *previous_assemblies* is given:
      ``updated`` when ``seq_rel_date`` or ``assembly_dir`` changed
    * any other status: ignored

    Previously known accessions that are absent from *current* altogether
    (withdrawn) are reported as ``suppressed``.

    *previous_assemblies* takes precedence over *previous_accessions*; with
    neither, every ``latest`` assembly is ``new``. Updates cannot be detected
    from *previous_accessions* alone.

    :param current: the new NCBI summary (parsed)
    :param previous_assemblies: full parsed previous summary, or None if using fallback
    :param previous_accessions: set of known accessions (store-scan fallback)
    :return: diff result with new/updated/replaced/suppressed lists, each sorted
    """
    diff = DiffResult()

    if previous_assemblies is not None:
        known = set(previous_assemblies.keys())
    elif previous_accessions is not None:
        known = previous_accessions
    else:
        known = set()

    for acc, rec in current.items():
        if rec.status == "replaced":
            if acc in known:
                diff.replaced.append(acc)
            continue
        if rec.status == "suppressed":
            if acc in known:
                diff.suppressed.append(acc)
            continue
        if rec.status != "latest":
            continue

        if acc not in known:
            diff.new.append(acc)
        elif previous_assemblies is not None:
            prev = previous_assemblies.get(acc)
            if prev and (rec.seq_rel_date != prev.seq_rel_date or rec.assembly_dir != prev.assembly_dir):
                diff.updated.append(acc)

    # Accessions in previous but entirely absent from current (withdrawn).
    # Everything already in diff.suppressed came from `current`, so the two
    # groups cannot overlap and no per-item membership test on the growing
    # list is needed.
    withdrawn = {acc for acc in known if acc not in current}
    diff.suppressed.extend(withdrawn)

    diff.new.sort()
    diff.updated.sort()
    diff.replaced.sort()
    diff.suppressed.sort()
    return diff
``"tenant-general-warehouse/kbase/datasets/ncbi/"``) + :param ftp_host: NCBI FTP hostname + :return: filtered list of accessions that actually need downloading + """ + if not accessions: + return [] + + ftp = connect_ftp(ftp_host) + confirmed: list[str] = [] + pruned = 0 + last_activity = time.monotonic() + + try: + for acc in accessions: + rec = current_assemblies.get(acc) + if not rec: + confirmed.append(acc) + continue + + # Keep FTP alive between assemblies + last_activity = ftp_noop_keepalive(ftp, last_activity) + + # Download md5checksums.txt from FTP + ftp_dir = _ftp_dir_from_url(rec.ftp_path, ftp_host) + try: + md5_text = ftp_retrieve_text(ftp, ftp_dir.rstrip("/") + "/md5checksums.txt") + last_activity = time.monotonic() + ftp_checksums = parse_md5_checksums_file(md5_text) + except Exception: # noqa: BLE001 + logger.warning("Cannot fetch md5checksums.txt for %s, keeping in transfer list", acc) + confirmed.append(acc) + continue + + # Filter to files we'd actually download + target_checksums = { + fname: md5 + for fname, md5 in ftp_checksums.items() + if any(fname.endswith(suffix) for suffix in FILE_FILTERS) + } + + if not target_checksums: + confirmed.append(acc) + continue + + # Build S3 prefix for this assembly + s3_rel = build_accession_path(rec.assembly_dir) + + # Short-circuit: if any file differs or is missing, keep the assembly + needs_update = False + for fname, expected_md5 in target_checksums.items(): + s3_path = f"{bucket}/{path_prefix}{s3_rel}{fname}" + obj_info = head_object(s3_path) + + if obj_info is None: + needs_update = True + break + + s3_md5 = obj_info["metadata"].get("md5", "") + if s3_md5 != expected_md5: + logger.debug("MD5 mismatch for %s/%s: S3=%s FTP=%s", acc, fname, s3_md5, expected_md5) + needs_update = True + break + + if needs_update: + confirmed.append(acc) + else: + pruned += 1 + logger.debug("Pruned %s — all files match S3 checksums", acc) + finally: + with contextlib.suppress(Exception): + ftp.quit() + + logger.info( + 
def write_transfer_manifest(
    diff: DiffResult,
    current_assemblies: dict[str, AssemblyRecord],
    output_path: str | Path,
    ftp_host: str = FTP_HOST,
) -> list[str]:
    """Write the transfer manifest covering new and updated assemblies.

    One FTP directory path (always ending in ``/``) is written per line, in
    accession order, ready for the Phase 2 download. Accessions that have no
    record in *current_assemblies* are left out.

    :param diff: computed diff result
    :param current_assemblies: parsed current assembly summary
    :param output_path: path to write the manifest file
    :param ftp_host: FTP hostname for URL stripping
    :return: list of FTP paths written
    """
    candidates = sorted(diff.new + diff.updated)
    records = (current_assemblies.get(accession) for accession in candidates)
    paths: list[str] = [
        _ftp_dir_from_url(record.ftp_path, ftp_host).rstrip("/") + "/"
        for record in records
        if record
    ]

    manifest_text = "".join(f"{entry}\n" for entry in paths)
    with Path(output_path).open("w") as handle:
        handle.write(manifest_text)

    logger.info("Wrote %d entries to transfer manifest: %s", len(paths), output_path)
    return paths
def write_diff_summary(
    diff: DiffResult,
    output_path: str | Path,
    database: str,
    prefix_from: str | None = None,
    prefix_to: str | None = None,
) -> dict[str, Any]:
    """Serialise the diff as an indented JSON summary and return it.

    The summary records the database name, a UTC ISO-8601 timestamp, the
    prefix filter that was applied, per-category counts (plus transfer and
    removal totals) and the accession lists themselves.

    :param diff: computed diff result
    :param output_path: path to write the JSON file
    :param database: database name (``"refseq"`` or ``"genbank"``)
    :param prefix_from: lower bound of prefix filter (if any)
    :param prefix_to: upper bound of prefix filter (if any)
    :return: the summary dict that was written
    """
    n_new = len(diff.new)
    n_updated = len(diff.updated)
    n_replaced = len(diff.replaced)
    n_suppressed = len(diff.suppressed)

    counts: dict[str, int] = {
        "new": n_new,
        "updated": n_updated,
        "replaced": n_replaced,
        "suppressed": n_suppressed,
        "total_to_transfer": n_new + n_updated,
        "total_to_remove": n_replaced + n_suppressed,
    }
    accessions: dict[str, list[str]] = {
        "new": diff.new,
        "updated": diff.updated,
        "replaced": diff.replaced,
        "suppressed": diff.suppressed,
    }
    summary: dict[str, Any] = {
        "database": database,
        "timestamp": datetime.now(UTC).isoformat(),
        "prefix_range": {"from": prefix_from, "to": prefix_to},
        "counts": counts,
        "accessions": accessions,
    }

    Path(output_path).write_text(json.dumps(summary, indent=2))
    logger.info("Wrote diff summary to: %s", output_path)
    return summary
def promote_from_s3(  # noqa: PLR0913
    staging_prefix: str,
    bucket: str,
    removed_manifest: str | Path | None = None,
    updated_manifest: str | Path | None = None,
    ncbi_release: str | None = None,
    manifest_path: str | None = None,
    path_prefix: str = DEFAULT_PATH_PREFIX,
    *,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Promote files from an S3 staging prefix to the final Lakehouse path.

    Steps, in order:

    1. List every object under *staging_prefix* and separate data files from
       ``.md5`` / ``.crc64nvme`` sidecars.
    2. Archive the assemblies named in *updated_manifest* (sources kept) and
       *removed_manifest* (sources deleted) before anything is overwritten.
    3. For each staged data file under ``raw_data/``, download it to a temp
       file and re-upload it to ``path_prefix + <relative path>``, attaching
       the MD5 from its ``.md5`` sidecar as object metadata when present.
       ``download_report.json`` files and anything outside ``raw_data/`` are
       skipped.
    4. Trim the transfer manifest so a re-run of Phase 2 only fetches what is
       still missing. An accession is trimmed only when none of its files
       failed to promote; partially promoted assemblies stay in the manifest.

    A failure on one file is logged and counted; it does not abort the run.

    :param staging_prefix: S3 key prefix where CTS output was written
        (with or without a trailing ``/``)
    :param bucket: S3 bucket name
    :param removed_manifest: local path to the removed_manifest file
    :param updated_manifest: local path to the updated_manifest file
    :param ncbi_release: NCBI release version tag for archiving
    :param manifest_path: S3 path to transfer_manifest.txt for trimming
    :param path_prefix: Lakehouse path prefix for final locations
    :param dry_run: if True, log actions without side effects
    :return: report dict with ``timestamp``, ``promoted``, ``archived``,
        ``failed`` and ``dry_run`` keys
    """
    s3 = get_s3_client()
    paginator = s3.get_paginator("list_objects_v2")

    # Collect all objects under the staging prefix
    staged_objects: list[str] = []
    for page in paginator.paginate(Bucket=bucket, Prefix=staging_prefix):
        staged_objects.extend(obj["Key"] for obj in page.get("Contents", []))

    # Separate data files from sidecars
    sidecars = {k for k in staged_objects if k.endswith((".crc64nvme", ".md5"))}
    data_files = [k for k in staged_objects if k not in sidecars]

    logger.info("Found %d data files and %d sidecars in staging", len(data_files), len(sidecars))

    # Archive all affected assemblies BEFORE promoting or deleting
    archived = 0
    for manifest_file, reason, delete in [
        (updated_manifest, "updated", False),
        (removed_manifest, "replaced_or_suppressed", True),
    ]:
        if manifest_file and Path(str(manifest_file)).is_file():
            archived += _archive_assemblies(
                str(manifest_file),
                bucket=bucket,
                ncbi_release=ncbi_release,
                path_prefix=path_prefix,
                archive_reason=reason,
                delete_source=delete,
                dry_run=dry_run,
            )

    promoted = 0
    failed = 0
    promoted_accessions: set[str] = set()
    failed_accessions: set[str] = set()

    for staged_key in data_files:
        if staged_key.endswith("download_report.json"):
            continue

        # Tolerate a staging prefix given without its trailing slash; without
        # this every key would start with "/" and be skipped below.
        rel_path = staged_key[len(staging_prefix) :].lstrip("/")
        if not rel_path.startswith("raw_data/"):
            continue
        final_key = path_prefix + rel_path

        if dry_run:
            logger.info("[dry-run] would promote: %s -> %s", staged_key, final_key)
            promoted += 1
            continue

        # Resolve the accession up front so failures can be attributed to it.
        acc_match = re.search(r"(GC[AF]_\d{9}\.\d+)", staged_key)
        accession = acc_match.group(1) if acc_match else None

        try:
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                tmp_path = tmp.name
            try:
                s3.download_file(Bucket=bucket, Key=staged_key, Filename=tmp_path)

                # Read MD5 from sidecar
                metadata: dict[str, str] = {}
                md5_key = staged_key + ".md5"
                if md5_key in sidecars:
                    md5_obj = s3.get_object(Bucket=bucket, Key=md5_key)
                    metadata["md5"] = md5_obj["Body"].read().decode().strip()

                upload_file_with_metadata(
                    tmp_path,
                    f"{bucket}/{Path(final_key).parent}",
                    metadata=metadata,
                    object_name=Path(final_key).name,
                )
            finally:
                Path(tmp_path).unlink(missing_ok=True)
        except Exception:
            logger.exception("Failed to promote %s", staged_key)
            failed += 1
            if accession:
                failed_accessions.add(accession)
        else:
            promoted += 1
            if accession:
                promoted_accessions.add(accession)

    # Trim manifest for resumability. Only assemblies with no failed file are
    # removed; otherwise a partially promoted assembly would never be
    # downloaded again.
    fully_promoted = promoted_accessions - failed_accessions
    if manifest_path and fully_promoted and not dry_run:
        _trim_manifest(manifest_path, bucket, fully_promoted)

    report: dict[str, Any] = {
        "timestamp": datetime.now(UTC).isoformat(),
        "promoted": promoted,
        "archived": archived,
        "failed": failed,
        "dry_run": dry_run,
    }

    logger.info(
        "PROMOTE SUMMARY: %d promoted, %d archived, %d failed%s",
        promoted,
        archived,
        failed,
        " (dry-run)" if dry_run else "",
    )
    return report
When False (updated), the + originals remain in place to be overwritten by the promote step. + + :param manifest_path: local path to a manifest file (one accession per line) + :param bucket: S3 bucket name + :param ncbi_release: release tag used in the archive path + :param path_prefix: Lakehouse path prefix + :param archive_reason: metadata value describing why the object was archived + :param delete_source: if True, delete the source object after copying + :param dry_run: if True, log without making changes + :return: number of objects archived + """ + s3 = get_s3_client() + release_tag = ncbi_release or "unknown" + datestamp = datetime.now(UTC).strftime("%Y-%m-%d") + archived = 0 + + with Path(manifest_path).open() as f: + accessions = [line.strip() for line in f if line.strip()] + + for accession in accessions: + m = re.match(r"(GC[AF])_(\d{3})(\d{3})(\d{3})\.\d+", accession) + if not m: + logger.warning("Cannot parse accession for archival: %s", accession) + continue + + db = m.group(1) + p1, p2, p3 = m.group(2), m.group(3), m.group(4) + source_prefix = f"{path_prefix}raw_data/{db}/{p1}/{p2}/{p3}/" + + paginator = s3.get_paginator("list_objects_v2") + matching_keys: list[str] = [] + for page in paginator.paginate(Bucket=bucket, Prefix=source_prefix): + matching_keys.extend(obj["Key"] for obj in page.get("Contents", []) if accession in obj["Key"]) + + if not matching_keys: + logger.debug("No objects found for %s, skipping archive", accession) + continue + + for source_key in matching_keys: + rel = source_key[len(path_prefix) :] + archive_key = f"{path_prefix}archive/{release_tag}/{rel}" + + if dry_run: + logger.info("[dry-run] would archive: %s -> %s", source_key, archive_key) + archived += 1 + continue + + try: + copy_object_with_metadata( + f"{bucket}/{source_key}", + f"{bucket}/{archive_key}", + metadata={ + "ncbi_last_release": release_tag, + "archive_reason": archive_reason, + "archive_date": datestamp, + }, + ) + if delete_source: + 
def _trim_manifest(manifest_s3_path: str, bucket: str, promoted_accessions: set[str]) -> None:
    """Remove promoted accessions from the transfer manifest in S3.

    The manifest is downloaded to a temp file, filtered, and uploaded back to
    the same key. A line is dropped when it is blank or when a full accession
    token found in it (``GCA_``/``GCF_`` + 9 digits + ``.version``) is in
    *promoted_accessions*. Matching whole tokens rather than substrings keeps
    ``GCF_000001215.41`` in the manifest when only ``GCF_000001215.4`` was
    promoted.

    :param manifest_s3_path: S3 key of the transfer_manifest.txt
    :param bucket: S3 bucket name
    :param promoted_accessions: set of accessions that were successfully promoted
    """
    s3 = get_s3_client()
    accession_re = re.compile(r"GC[AF]_\d{9}\.\d+")

    def _is_promoted(line: str) -> bool:
        # Set lookups per token: O(1) per line instead of scanning every
        # promoted accession for every manifest line.
        return any(token in promoted_accessions for token in accession_re.findall(line))

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as tmp:
        tmp_path = tmp.name

    try:
        s3.download_file(Bucket=bucket, Key=manifest_s3_path, Filename=tmp_path)

        with Path(tmp_path).open() as f:
            lines = f.readlines()

        remaining = [line for line in lines if line.strip() and not _is_promoted(line)]

        with Path(tmp_path).open("w") as f:
            f.writelines(remaining)

        s3.upload_file(Filename=tmp_path, Bucket=bucket, Key=manifest_s3_path)
        logger.info(
            "Trimmed manifest: %d -> %d entries (%d promoted)",
            len(lines),
            len(remaining),
            len(lines) - len(remaining),
        )
    finally:
        Path(tmp_path).unlink(missing_ok=True)
+""" + +import json +import threading +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import UTC, datetime +from ftplib import error_temp +from pathlib import Path +from typing import Any + +from pydantic import AliasChoices, Field, field_validator +from pydantic_settings import CliSuppress + +from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST, download_assembly_to_local +from cdm_data_loaders.pipelines.core import run_cli +from cdm_data_loaders.pipelines.cts_defaults import INPUT_MOUNT, OUTPUT_MOUNT, CtsDefaultSettings +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.ftp_client import ThreadLocalFTP + +logger = get_cdm_logger() + + +# ── Settings ───────────────────────────────────────────────────────────── + + +class DownloadSettings(CtsDefaultSettings): + """Configuration for the NCBI FTP assembly download pipeline.""" + + manifest: str = Field( + default=f"{INPUT_MOUNT}/transfer_manifest.txt", + description="Path to the transfer manifest file listing FTP paths to download", + validation_alias=AliasChoices("m", "manifest"), + ) + output_dir: str = Field( + default=OUTPUT_MOUNT, + description="Output directory for downloaded assembly files", + validation_alias=AliasChoices("o", "output-dir", "output_dir"), + ) + threads: int = Field( + default=4, + ge=1, + le=32, + description="Number of parallel download threads", + validation_alias=AliasChoices("t", "threads"), + ) + ftp_host: str = Field( + default=FTP_HOST, + description="NCBI FTP hostname", + validation_alias=AliasChoices("ftp-host", "ftp_host"), + ) + limit: CliSuppress[int | None] = Field( + default=None, + ge=1, + description="Limit to first N assemblies (for testing)", + validation_alias=AliasChoices("l", "limit"), + ) + + @field_validator("threads") + @classmethod + def validate_threads(cls, v: int) -> int: + """Validate threads is within range. 
+ + :param v: number of threads + :raises ValueError: if out of range + :return: validated thread count + """ + if v < 1 or v > 32: # noqa: PLR2004 + msg = f"threads must be between 1 and 32, got {v}" + raise ValueError(msg) + return v + + +# ── Batch download ─────────────────────────────────────────────────────── + + +def download_batch( + manifest_path: str | Path, + output_dir: str | Path, + threads: int = 4, + ftp_host: str = FTP_HOST, + limit: int | None = None, +) -> dict[str, Any]: + """Download all assemblies listed in the manifest. + + :param manifest_path: path to the transfer manifest file + :param output_dir: base output directory + :param threads: number of parallel download threads + :param ftp_host: FTP hostname + :param limit: optional limit for testing + :return: report dict with overall stats + """ + with Path(manifest_path).open() as f: + assembly_paths = [line.strip() for line in f if line.strip() and not line.startswith("#")] + + if limit: + assembly_paths = assembly_paths[:limit] + + logger.info("Starting download of %d assemblies with %d threads", len(assembly_paths), threads) + + pool = ThreadLocalFTP(ftp_host) + lock = threading.Lock() + success_count = 0 + failed: list[dict[str, str]] = [] + all_stats: list[dict[str, Any]] = [] + + def _download_one(path: str) -> tuple[str, Exception | None]: + nonlocal success_count + last_error: Exception | None = None + for attempt in range(1, 4): + try: + stats = download_assembly_to_local(path, output_dir, ftp_host=ftp_host, ftp=pool.get()) + except error_temp as e: + last_error = e + if attempt < 3: # noqa: PLR2004 + logger.warning("Transient FTP error for %s, retry %d/3: %s", path, attempt, e) + time.sleep(5) + except Exception as e: # noqa: BLE001 + return path, e + else: + with lock: + success_count += 1 + all_stats.append(stats) + return path, None + return path, last_error + + try: + with ThreadPoolExecutor(max_workers=threads) as executor: + futures = {executor.submit(_download_one, p): p for 
p in assembly_paths} + for future in as_completed(futures): + path, error = future.result() + if error: + logger.error("FAILED: %s: %s", path, error) + with lock: + failed.append({"path": path, "error": str(error)}) + finally: + pool.close_all() + + report: dict[str, Any] = { + "timestamp": datetime.now(UTC).isoformat(), + "total_attempted": len(assembly_paths), + "succeeded": success_count, + "failed": len(failed), + "failures": failed, + "assembly_stats": all_stats, + } + + report_path = Path(output_dir) / "download_report.json" + report_path.parent.mkdir(parents=True, exist_ok=True) + with report_path.open("w") as f: + json.dump(report, f, indent=2) + logger.info("Download report written to: %s", report_path) + + logger.info( + "SUMMARY: %d attempted, %d succeeded, %d failed", + len(assembly_paths), + success_count, + len(failed), + ) + + return report + + +# ── CTS entry point ───────────────────────────────────────────────────── + + +def run_download(config: DownloadSettings) -> None: + """Main CTS entry point for Phase 2 download. + + :param config: validated download settings + """ + report = download_batch( + manifest_path=config.manifest, + output_dir=config.output_dir, + threads=config.threads, + ftp_host=config.ftp_host, + limit=config.limit, + ) + if report["failed"] > 0: + msg = f"Download completed with {report['failed']} failures" + raise RuntimeError(msg) + + +def cli() -> None: + """CLI entry point for ``ncbi_ftp_sync``.""" + run_cli(DownloadSettings, run_download) diff --git a/src/cdm_data_loaders/utils/checksums.py b/src/cdm_data_loaders/utils/checksums.py new file mode 100644 index 00000000..021098a6 --- /dev/null +++ b/src/cdm_data_loaders/utils/checksums.py @@ -0,0 +1,55 @@ +"""General-purpose file checksum utilities. + +Provides MD5 and CRC64/NVME checksum computation and verification for local +files. These are protocol-agnostic primitives used by download pipelines +and S3 metadata workflows. 
+""" + +import base64 +import hashlib +from pathlib import Path + +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger + +logger = get_cdm_logger() + + +def compute_md5(file_path: str | Path) -> str: + """Compute the MD5 hex digest of a file. + + :param file_path: path to the file + :return: lowercase hex MD5 string + """ + md5_hash = hashlib.md5() # noqa: S324 + with Path(file_path).open("rb") as f: + for chunk in iter(lambda: f.read(1 << 20), b""): + md5_hash.update(chunk) + return md5_hash.hexdigest() + + +def verify_md5(file_path: str | Path, expected_md5: str) -> bool: + """Verify a file's MD5 checksum against an expected value. + + :param file_path: path to the file + :param expected_md5: expected lowercase hex MD5 string + :return: True if the checksum matches + """ + return compute_md5(file_path) == expected_md5 + + +def compute_crc64nvme(file_path: str | Path) -> str: + """Compute the CRC64/NVME checksum of a file. + + Returns the base64-encoded string matching the format used by S3-native + checksums (``ChecksumCRC64NVME``). + + :param file_path: path to the file + :return: base64-encoded CRC64/NVME checksum + """ + from awscrt.checksums import crc64nvme as _crc64nvme # noqa: PLC0415 + + crc = 0 + with Path(file_path).open("rb") as f: + for chunk in iter(lambda: f.read(1 << 20), b""): + crc = _crc64nvme(chunk, crc) + return base64.b64encode(crc.to_bytes(8, byteorder="big")).decode() diff --git a/src/cdm_data_loaders/utils/ftp_client.py b/src/cdm_data_loaders/utils/ftp_client.py new file mode 100644 index 00000000..0f8409e7 --- /dev/null +++ b/src/cdm_data_loaders/utils/ftp_client.py @@ -0,0 +1,162 @@ +"""General-purpose FTP client utilities. + +Provides resilient FTP connections with TCP keepalive, NOOP pings, retry +on transient errors, and thread-local connection management for parallel +downloads. Protocol-agnostic — callers supply the FTP hostname. 
+""" + +import contextlib +import socket +import threading +import time +from ftplib import FTP, error_temp +from pathlib import Path + +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger + +logger = get_cdm_logger() + +DEFAULT_TIMEOUT = 60 + + +def connect_ftp(host: str, timeout: int = DEFAULT_TIMEOUT) -> FTP: + """Connect and log in to an FTP server with TCP keepalive enabled. + + :param host: FTP hostname + :param timeout: connection timeout in seconds + :return: logged-in FTP connection + """ + ftp = FTP(host, timeout=timeout) # noqa: S321 + ftp.login() + _set_keepalive(ftp) + return ftp + + +def _set_keepalive(ftp: FTP, idle: int = 30, interval: int = 10, count: int = 3) -> None: + """Enable TCP keepalive on the FTP control socket. + + Prevents idle-timeout disconnects (e.g. '421 No transfer timeout') when + the control connection sits idle during data transfers or checksum + verification. + """ + sock = ftp.sock + sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) + if hasattr(socket, "TCP_KEEPIDLE"): + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, idle) + if hasattr(socket, "TCP_KEEPINTVL"): + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, interval) + if hasattr(socket, "TCP_KEEPCNT"): + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, count) + + +def ftp_noop_keepalive(ftp: FTP, last_activity: float, interval: int = 25) -> float: + """Send NOOP if the connection has been idle longer than *interval* seconds. 
+ + :param ftp: active FTP connection + :param last_activity: monotonic timestamp of last FTP activity + :param interval: seconds of idle time before sending NOOP + :return: updated last-activity timestamp + """ + if time.monotonic() - last_activity > interval: + with contextlib.suppress(Exception): + ftp.sendcmd("NOOP") + return time.monotonic() + return last_activity + + +def ftp_list_dir(ftp: FTP, path: str, retries: int = 3) -> list[str]: + """List files in an FTP directory with retry on transient errors. + + :param ftp: active FTP connection + :param path: remote directory path + :param retries: number of retry attempts + :return: list of filenames + """ + ftp.cwd(path) + for attempt in range(1, retries + 1): + try: + files: list[str] = [] + ftp.retrlines("NLST", files.append) + except error_temp as e: + if attempt < retries: + logger.warning("Transient FTP error listing %s (attempt %d/%d): %s", path, attempt, retries, e) + time.sleep(2) + else: + raise + else: + return files + return [] # unreachable, but keeps type checkers happy + + +def ftp_download_file(ftp: FTP, remote_path: str, local_path: str, retries: int = 3) -> None: + """Download a single file from FTP with retry on transient errors. + + :param ftp: active FTP connection + :param remote_path: full remote file path + :param local_path: local destination path + :param retries: number of retry attempts + """ + for attempt in range(1, retries + 1): + try: + with Path(local_path).open("wb") as f: + ftp.retrbinary(f"RETR {remote_path}", f.write) + except error_temp as e: + if attempt < retries: + logger.warning( + "Transient FTP error downloading %s (attempt %d/%d): %s", remote_path, attempt, retries, e + ) + time.sleep(2) + else: + raise + else: + return + + +def ftp_retrieve_text(ftp: FTP, remote_path: str) -> str: + """Retrieve a text file from FTP, returning its content as a string. 
+ + :param ftp: active FTP connection + :param remote_path: full remote file path + :return: file content + """ + lines: list[str] = [] + ftp.retrlines(f"RETR {remote_path}", lines.append) + return "\n".join(lines) + + +class ThreadLocalFTP: + """Manage thread-local FTP connections for parallel downloads. + + Each thread gets its own FTP connection, created on first access. + Call :meth:`close_all` when done to cleanly shut down all connections. + """ + + def __init__(self, host: str, timeout: int = DEFAULT_TIMEOUT) -> None: + """Initialise with FTP host and timeout. + + :param host: FTP hostname (required — no default) + :param timeout: connection timeout in seconds + """ + self._host = host + self._timeout = timeout + self._local = threading.local() + self._lock = threading.Lock() + self._connections: list[FTP] = [] + + def get(self) -> FTP: + """Return the FTP connection for the current thread, creating one if needed.""" + ftp = getattr(self._local, "ftp", None) + if ftp is None: + ftp = connect_ftp(self._host, self._timeout) + self._local.ftp = ftp + with self._lock: + self._connections.append(ftp) + return ftp + + def close_all(self) -> None: + """Close all thread-local FTP connections.""" + with self._lock: + for ftp in self._connections: + with contextlib.suppress(Exception): + ftp.quit() + self._connections.clear() diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index 8253e2cd..6fc1696a 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -368,3 +368,117 @@ def delete_object(s3_path: str) -> dict[str, Any]: s3 = get_s3_client() (bucket, key) = split_s3_path(s3_path) return s3.delete_object(Bucket=bucket, Key=key) + + +def upload_file_with_metadata( + local_file_path: Path | str, + destination_dir: str, + metadata: dict[str, str], + object_name: str | None = None, +) -> bool: + """Upload a file to S3 with user-defined metadata and CRC64NVME checksum. 
+ + Unlike :func:`upload_file`, this function always uploads (no existence check) + and attaches the supplied *metadata* dict as S3 user metadata. + + :param local_file_path: file to upload + :type local_file_path: Path | str + :param destination_dir: path to the destination directory on s3, INCLUDING the bucket name + :type destination_dir: str + :param metadata: user metadata key/value pairs to attach to the object + :type metadata: dict[str, str] + :param object_name: S3 object name; defaults to the local filename + :type object_name: str | None + :return: True if the upload succeeded + :rtype: bool + """ + if isinstance(local_file_path, str): + local_file_path = Path(local_file_path) + + if not destination_dir: + msg = "No destination directory supplied for the file" + raise ValueError(msg) + + if not object_name: + object_name = local_file_path.name + + s3_path = f"{destination_dir.removesuffix('/')}/{object_name}" + s3 = get_s3_client() + (bucket, key) = split_s3_path(s3_path) + + extra_args = {**DEFAULT_EXTRA_ARGS, "Metadata": metadata} + + file_size = local_file_path.stat().st_size + with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: + s3.upload_file( + Filename=str(local_file_path), + Bucket=bucket, + Key=key, + Callback=pbar.update, + ExtraArgs=extra_args, + ) + return True + + +def head_object(s3_path: str) -> dict[str, Any] | None: + """Return metadata for an S3 object, or None if it does not exist. 
+ + The returned dict contains: + - ``size``: content length in bytes + - ``metadata``: user metadata dict + - ``checksum_crc64nvme``: CRC64NVME checksum string (if available) + + :param s3_path: path to the object on s3, INCLUDING the bucket name + :type s3_path: str + :return: dict with object info, or None if the object does not exist + :rtype: dict[str, Any] | None + """ + s3 = get_s3_client() + (bucket, key) = split_s3_path(s3_path) + try: + resp = s3.head_object(Bucket=bucket, Key=key, ChecksumMode="ENABLED") + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + return None + raise + return { + "size": resp["ContentLength"], + "metadata": resp.get("Metadata", {}), + "checksum_crc64nvme": resp.get("ChecksumCRC64NVME"), + } + + +def copy_object_with_metadata( + current_s3_path: str, + new_s3_path: str, + metadata: dict[str, str], +) -> dict[str, Any]: + """Copy an S3 object to a new location, replacing its user metadata. + + Uses ``MetadataDirective='REPLACE'`` so the destination object carries + exactly the supplied *metadata* rather than inheriting the source's metadata. + + A successful copy returns a response where + ``resp["ResponseMetadata"]["HTTPStatusCode"] == 200``. 
+ + :param current_s3_path: source path on s3, INCLUDING the bucket name + :type current_s3_path: str + :param new_s3_path: destination path on s3, INCLUDING the bucket name + :type new_s3_path: str + :param metadata: user metadata to set on the destination object + :type metadata: dict[str, str] + :return: dictionary containing response + :rtype: dict[str, Any] + """ + s3 = get_s3_client() + (current_bucket, current_key) = split_s3_path(current_s3_path) + (new_bucket, new_key) = split_s3_path(new_s3_path) + + return s3.copy_object( + CopySource={"Bucket": current_bucket, "Key": current_key}, + Bucket=new_bucket, + Key=new_key, + Metadata=metadata, + MetadataDirective="REPLACE", + **DEFAULT_EXTRA_ARGS, + ) diff --git a/tests/ncbi_ftp/__init__.py b/tests/ncbi_ftp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/ncbi_ftp/conftest.py b/tests/ncbi_ftp/conftest.py new file mode 100644 index 00000000..2c0923dd --- /dev/null +++ b/tests/ncbi_ftp/conftest.py @@ -0,0 +1,80 @@ +"""Shared fixtures for ncbi_ftp tests.""" + +import functools +from collections.abc import Callable, Generator +from unittest.mock import patch + +import boto3 +import botocore.client +import pytest +from moto import mock_aws + +import cdm_data_loaders.ncbi_ftp.promote as promote_mod +import cdm_data_loaders.utils.s3 as s3_utils +from cdm_data_loaders.utils.s3 import CDM_LAKE_BUCKET, reset_s3_client + +AWS_REGION = "us-east-1" +TEST_BUCKET = CDM_LAKE_BUCKET + + +# Minimal assembly_summary_refseq.txt content (tab-separated, 20+ columns) +SAMPLE_SUMMARY = ( + "# assembly_accession\tbioproject\tbiosample\twgs_master\trefseq_category\t" + "taxid\tspecies_taxid\torganism_name\tinfraspecific_name\tisolate\t" + "version_status\tassembly_level\trelease_type\tgenome_rep\tseq_rel_date\t" + "asm_name\t16\t17\t18\tftp_path\n" + "GCF_000001215.4\tPRJNA13812\tSAMN02803731\t\treference genome\t7227\t7227\t" + "Drosophila melanogaster\t\t\tlatest\tChromosome\tMajor\tFull\t2014/10/21\t" + 
"Release_6_plus_ISO1_MT\t\t\t\t" + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT\n" + "GCF_000001405.40\tPRJNA168\tna\t\treference genome\t9606\t9606\t" + "Homo sapiens\t\t\tlatest\tChromosome\tPatch\tFull\t2022/02/03\t" + "GRCh38.p14\t\t\t\t" + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14\n" + "GCF_000005845.2\tPRJNA57779\tSAMN02604091\t\trepresentative genome\t511145\t562\t" + "Escherichia coli\t\t\treplaced\tComplete Genome\tMajor\tFull\t2013/09/26\t" + "ASM584v2\t\t\t\t" + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2\n" + "GCF_000009999.1\tPRJNA999\tSAMN999\t\tna\t0\t0\t" + "Test organism\t\t\tsuppressed\tScaffold\tMajor\tFull\t2010/01/01\t" + "ASM999v1\t\t\t\t" + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/999/GCF_000009999.1_ASM999v1\n" + "GCF_000099999.1\tPRJNA888\tSAMN888\t\tna\t0\t0\t" + "Test organism 2\t\t\tlatest\tContig\tMajor\tFull\t2023/06/15\t" + "ASM9999v1\t\t\t\tna\n" +) + + +def strip_checksum_algorithm(method: Callable) -> Callable: + """Wrap a boto3 S3 method to remove ChecksumAlgorithm (moto CRC64NVME workaround).""" + + @functools.wraps(method) + def wrapper(*args: object, **kwargs: object) -> object: + kwargs.pop("ChecksumAlgorithm", None) # type: ignore[arg-type] + return method(*args, **kwargs) + + return wrapper + + +@pytest.fixture +def mock_s3_client() -> Generator[botocore.client.BaseClient]: + """Yield a mocked S3 client with the CDM Lake bucket created.""" + with mock_aws(): + client = boto3.client("s3", region_name=AWS_REGION) + client.create_bucket(Bucket=TEST_BUCKET) + + reset_s3_client() + with ( + patch.object(s3_utils, "get_s3_client", return_value=client), + patch.object(promote_mod, "get_s3_client", return_value=client), + ): + yield client + reset_s3_client() + + +@pytest.fixture +def mock_s3_client_no_checksum(mock_s3_client: botocore.client.BaseClient) -> botocore.client.BaseClient: + 
"""Mocked S3 client with copy_object and upload_file patched to strip ChecksumAlgorithm.""" + mock_s3_client.copy_object = strip_checksum_algorithm(mock_s3_client.copy_object) # type: ignore[method-assign] + mock_s3_client.upload_file = strip_checksum_algorithm(mock_s3_client.upload_file) # type: ignore[method-assign] + return mock_s3_client diff --git a/tests/ncbi_ftp/test_assembly.py b/tests/ncbi_ftp/test_assembly.py new file mode 100644 index 00000000..8aa24b1c --- /dev/null +++ b/tests/ncbi_ftp/test_assembly.py @@ -0,0 +1,114 @@ +"""Tests for ncbi_ftp.assembly module — path helpers, file filtering, checksum parsing.""" + +import pytest + +from cdm_data_loaders.ncbi_ftp.assembly import ( + FILE_FILTERS, + build_accession_path, + parse_assembly_path, + parse_md5_checksums_file, +) + +_EXPECTED_TWO_ENTRIES = 2 + + +# ── Path helpers ───────────────────────────────────────────────────────── + + +class TestBuildAccessionPath: + """Test output directory path construction from assembly names.""" + + def test_basic(self) -> None: + """Verify standard GCF accession path construction.""" + result = build_accession_path("GCF_000001215.4_Release_6_plus_ISO1_MT") + assert result == "raw_data/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/" + + def test_gca_prefix(self) -> None: + """Verify GCA prefix path construction.""" + result = build_accession_path("GCA_012345678.1_ASM1234v1") + assert result == "raw_data/GCA/012/345/678/GCA_012345678.1_ASM1234v1/" + + def test_invalid_raises(self) -> None: + """Verify ValueError on invalid assembly name.""" + with pytest.raises(ValueError, match="Cannot parse"): + build_accession_path("invalid_name") + + +class TestParseAssemblyPath: + """Test FTP path parsing.""" + + def test_basic(self) -> None: + """Verify db, assembly_dir, and accession are parsed correctly.""" + path = "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/" + _db, _assembly_dir, accession = parse_assembly_path(path) + assert _db == "GCF" + 
assert _assembly_dir == "GCF_000001215.4_Release_6_plus_ISO1_MT" + assert accession == "GCF_000001215.4" + + def test_without_trailing_slash(self) -> None: + """Verify parsing works without trailing slash.""" + path = "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT" + _db, _assembly_dir, accession = parse_assembly_path(path) + assert accession == "GCF_000001215.4" + + def test_invalid_raises(self) -> None: + """Verify ValueError on invalid path.""" + with pytest.raises(ValueError, match="Cannot parse"): + parse_assembly_path("/random/path/") + + +# ── FILE_FILTERS sanity ───────────────────────────────────────────────── + + +class TestFileFilters: + """Sanity checks for the file suffix filter list.""" + + def test_not_empty(self) -> None: + """Verify FILE_FILTERS is not empty.""" + assert len(FILE_FILTERS) > 0 + + def test_all_start_with_underscore(self) -> None: + """Verify all filter patterns start with an underscore.""" + for f in FILE_FILTERS: + assert f.startswith("_"), f"Filter should start with underscore: {f}" + + def test_genomic_fna_included(self) -> None: + """Verify _genomic.fna.gz is in the filter list.""" + assert "_genomic.fna.gz" in FILE_FILTERS + + def test_assembly_report_included(self) -> None: + """Verify _assembly_report.txt is in the filter list.""" + assert "_assembly_report.txt" in FILE_FILTERS + + +# ── parse_md5_checksums_file ───────────────────────────────────────────── + + +class TestParseMd5ChecksumsFile: + """Test NCBI md5checksums.txt parsing.""" + + def test_basic(self) -> None: + """Verify parsing of standard md5checksums.txt format.""" + text = "abc123 ./GCF_000001215.4_genomic.fna.gz\ndef456 ./GCF_000001215.4_genomic.gff.gz\n" + result = parse_md5_checksums_file(text) + assert result == { + "GCF_000001215.4_genomic.fna.gz": "abc123", + "GCF_000001215.4_genomic.gff.gz": "def456", + } + + def test_no_leading_dot_slash(self) -> None: + """Verify parsing works without leading ./ prefix.""" + text = "abc123 
GCF_000001215.4_genomic.fna.gz\n" + result = parse_md5_checksums_file(text) + assert result == {"GCF_000001215.4_genomic.fna.gz": "abc123"} + + def test_empty(self) -> None: + """Verify empty or whitespace-only input returns empty dict.""" + assert parse_md5_checksums_file("") == {} + assert parse_md5_checksums_file(" \n \n") == {} + + def test_blank_lines_ignored(self) -> None: + """Verify blank lines between entries are skipped.""" + text = "abc123 file1.txt\n\n\ndef456 file2.txt\n" + result = parse_md5_checksums_file(text) + assert len(result) == _EXPECTED_TWO_ENTRIES diff --git a/tests/ncbi_ftp/test_manifest.py b/tests/ncbi_ftp/test_manifest.py new file mode 100644 index 00000000..61db7e5a --- /dev/null +++ b/tests/ncbi_ftp/test_manifest.py @@ -0,0 +1,516 @@ +"""Tests for ncbi_ftp.manifest module — assembly summary parsing, diff, filtering, writing.""" + +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +from cdm_data_loaders.ncbi_ftp.manifest import ( + DiffResult, + _ftp_dir_from_url, + accession_prefix, + compute_diff, + filter_by_prefix_range, + get_latest_assembly_paths, + parse_assembly_summary, + verify_transfer_candidates, + write_diff_summary, + write_removed_manifest, + write_transfer_manifest, + write_updated_manifest, +) + +from .conftest import SAMPLE_SUMMARY + +_EXPECTED_ENTRIES = 4 +_EXPECTED_TWO = 2 +_EXPECTED_TOTAL_TRANSFER = 2 + + +# ── parse_assembly_summary ─────────────────────────────────────────────── + + +class TestParseAssemblySummary: + """Test assembly summary parsing.""" + + def test_parse_basic(self) -> None: + """Verify basic parsing returns expected number of assemblies.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + assert len(assemblies) == _EXPECTED_ENTRIES + assert "GCF_000001215.4" in assemblies + assert "GCF_000005845.2" in assemblies + assert "GCF_000099999.1" not in assemblies # ftp_path == "na" + + def test_parse_status(self) -> None: + """Verify status field is parsed 
correctly.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + assert assemblies["GCF_000001215.4"].status == "latest" + assert assemblies["GCF_000005845.2"].status == "replaced" + assert assemblies["GCF_000009999.1"].status == "suppressed" + + def test_parse_seq_rel_date(self) -> None: + """Verify seq_rel_date field is parsed correctly.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + assert assemblies["GCF_000001215.4"].seq_rel_date == "2014/10/21" + + def test_parse_assembly_dir(self) -> None: + """Verify assembly_dir is extracted from the FTP path.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + assert assemblies["GCF_000001215.4"].assembly_dir == "GCF_000001215.4_Release_6_plus_ISO1_MT" + + def test_parse_ftp_path(self) -> None: + """Verify full FTP path is stored.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + assert assemblies["GCF_000001215.4"].ftp_path == ( + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT" + ) + + def test_parse_empty(self) -> None: + """Verify empty or comment-only input returns empty dict.""" + assemblies = parse_assembly_summary("# comment only\n") + assert len(assemblies) == 0 + + def test_parse_skips_comments(self) -> None: + """Verify comment lines are not included in results.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + for acc in assemblies: + assert acc.startswith("GCF_") + + def test_parse_from_file(self, tmp_path: Path) -> None: + """Verify parsing from a file path object.""" + f = tmp_path / "summary.tsv" + f.write_text(SAMPLE_SUMMARY) + assemblies = parse_assembly_summary(f) + assert len(assemblies) == _EXPECTED_ENTRIES + + def test_parse_from_file_str(self, tmp_path: Path) -> None: + """Verify parsing from a string file path.""" + f = tmp_path / "summary.tsv" + f.write_text(SAMPLE_SUMMARY) + assemblies = parse_assembly_summary(str(f)) + assert len(assemblies) == _EXPECTED_ENTRIES + + def test_parse_from_list_of_lines(self) 
-> None: + """Verify parsing from a list of lines.""" + lines = SAMPLE_SUMMARY.splitlines(keepends=True) + assemblies = parse_assembly_summary(lines) + assert len(assemblies) == _EXPECTED_ENTRIES + + +# ── get_latest_assembly_paths ──────────────────────────────────────────── + + +class TestGetLatestAssemblyPaths: + """Test extraction of FTP paths for latest assemblies.""" + + def test_only_latest(self) -> None: + """Verify only assemblies with status 'latest' are returned.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + paths = get_latest_assembly_paths(assemblies) + accessions = [acc for acc, _ in paths] + assert "GCF_000001215.4" in accessions + assert "GCF_000001405.40" in accessions + assert "GCF_000005845.2" not in accessions # replaced + assert "GCF_000009999.1" not in accessions # suppressed + + def test_path_conversion(self) -> None: + """Verify HTTPS paths are converted to FTP-relative paths.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + paths = dict(get_latest_assembly_paths(assemblies)) + assert paths["GCF_000001215.4"] == "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/" + + def test_paths_end_with_slash(self) -> None: + """Verify all returned paths end with a trailing slash.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + for _, path in get_latest_assembly_paths(assemblies): + assert path.endswith("/") + + def test_empty(self) -> None: + """Verify empty input returns empty list.""" + assemblies = parse_assembly_summary("# empty\n") + assert get_latest_assembly_paths(assemblies) == [] + + +# ── compute_diff ───────────────────────────────────────────────────────── + + +class TestComputeDiff: + """Test diff computation between current and previous assembly state.""" + + def test_all_new_no_previous(self) -> None: + """Verify all latest assemblies are marked new when no previous state.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions=set()) + 
assert "GCF_000001215.4" in diff.new + assert "GCF_000001405.40" in diff.new + assert "GCF_000005845.2" not in diff.new # replaced + + def test_nothing_new_when_all_known(self) -> None: + """Verify no new assemblies when all are already known.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + known = {"GCF_000001215.4", "GCF_000001405.40"} + diff = compute_diff(current, previous_accessions=known) + assert len(diff.new) == 0 + + def test_detects_updated_seq_rel_date(self) -> None: + """Verify assemblies with changed seq_rel_date are marked updated.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + previous = parse_assembly_summary(SAMPLE_SUMMARY) + previous["GCF_000001215.4"].seq_rel_date = "2010/01/01" + diff = compute_diff(current, previous_assemblies=previous) + assert "GCF_000001215.4" in diff.updated + + def test_detects_replaced(self) -> None: + """Verify assemblies with status 'replaced' are detected.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions={"GCF_000005845.2"}) + assert "GCF_000005845.2" in diff.replaced + + def test_detects_suppressed(self) -> None: + """Verify assemblies with status 'suppressed' are detected.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions={"GCF_000009999.1"}) + assert "GCF_000009999.1" in diff.suppressed + + def test_detects_withdrawn(self) -> None: + """Accessions in previous but entirely absent from current.""" + current = parse_assembly_summary("# empty\n") + diff = compute_diff(current, previous_accessions={"GCF_000001215.4"}) + assert "GCF_000001215.4" in diff.suppressed + + def test_scan_store_fallback(self) -> None: + """Verify known accessions are not marked as new.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions={"GCF_000001215.4"}) + assert "GCF_000001215.4" not in diff.new + assert "GCF_000001405.40" in diff.new + + def 
test_results_are_sorted(self) -> None: + """Verify diff results are sorted alphabetically.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions=set()) + assert diff.new == sorted(diff.new) + + +# ── accession_prefix & filter_by_prefix_range ──────────────────────────── + + +class TestPrefixFiltering: + """Test prefix extraction and range filtering.""" + + def test_accession_prefix(self) -> None: + """Verify 3-digit prefix extraction from accessions.""" + assert accession_prefix("GCF_000001215.4") == "000" + assert accession_prefix("GCF_123456789.1") == "123" + assert accession_prefix("invalid") is None + + def test_filter_range_inclusive(self) -> None: + """Verify prefix range filter is inclusive.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + filtered = filter_by_prefix_range(assemblies, "000", "000") + assert len(filtered) == len(assemblies) + + def test_filter_excludes_out_of_range(self) -> None: + """Verify assemblies outside the prefix range are excluded.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + filtered = filter_by_prefix_range(assemblies, "001", "999") + assert len(filtered) == 0 + + def test_no_filter_returns_all(self) -> None: + """Verify no prefix range returns all assemblies.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + filtered = filter_by_prefix_range(assemblies) + assert len(filtered) == len(assemblies) + + +# ── Manifest writing ──────────────────────────────────────────────────── + + +class TestManifestWriting: + """Test manifest file writing.""" + + def test_write_transfer_manifest(self, tmp_path: Path) -> None: + """Verify transfer manifest file is written correctly.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions=set()) + manifest_file = tmp_path / "transfer.txt" + paths = write_transfer_manifest(diff, current, manifest_file) + assert len(paths) > 0 + lines = [line.strip() for line in 
manifest_file.read_text().splitlines() if line.strip()] + assert len(lines) == len(paths) + for line in lines: + assert line.startswith("/genomes/") + assert line.endswith("/") + + def test_write_removed_manifest(self, tmp_path: Path) -> None: + """Verify removed manifest lists replaced and suppressed accessions.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions={"GCF_000005845.2", "GCF_000009999.1"}) + removed_file = tmp_path / "removed.txt" + removed = write_removed_manifest(diff, removed_file) + assert len(removed) == _EXPECTED_TWO + lines = [line.strip() for line in removed_file.read_text().splitlines() if line.strip()] + assert len(lines) == _EXPECTED_TWO + + def test_write_updated_manifest(self, tmp_path: Path) -> None: + """Verify updated manifest lists only updated accessions.""" + diff = DiffResult(new=["GCF_000001215.4"], updated=["GCF_000005845.2", "GCF_000001405.40"]) + updated_file = tmp_path / "updated.txt" + updated = write_updated_manifest(diff, updated_file) + assert len(updated) == _EXPECTED_TWO + lines = [line.strip() for line in updated_file.read_text().splitlines() if line.strip()] + assert len(lines) == _EXPECTED_TWO + # Should be sorted + assert lines[0] == "GCF_000001405.40" + assert lines[1] == "GCF_000005845.2" + + def test_write_diff_summary(self, tmp_path: Path) -> None: + """Verify diff summary JSON is written with correct counts.""" + diff = DiffResult(new=["a"], updated=["b"], replaced=["c"], suppressed=[]) + summary_file = tmp_path / "summary.json" + summary = write_diff_summary(diff, summary_file, "refseq", "000", "003") + assert summary["counts"]["new"] == 1 + assert summary["counts"]["total_to_transfer"] == _EXPECTED_TOTAL_TRANSFER + assert summary["prefix_range"]["from"] == "000" + + loaded = json.loads(summary_file.read_text()) + assert loaded["database"] == "refseq" + + +# ── _ftp_dir_from_url ─────────────────────────────────────────────────── + + +class TestFtpDirFromUrl: 
+ """Test FTP URL to directory path conversion.""" + + def test_https_url(self) -> None: + """Verify https:// URLs are converted to FTP paths.""" + url = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6" + assert _ftp_dir_from_url(url) == "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6" + + def test_ftp_url(self) -> None: + """Verify ftp:// URLs are converted to FTP paths.""" + url = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6" + assert _ftp_dir_from_url(url) == "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6" + + def test_bare_path(self) -> None: + """Verify bare paths are returned unchanged.""" + path = "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6" + assert _ftp_dir_from_url(path) == path + + def test_custom_ftp_host(self) -> None: + """Verify custom FTP host is stripped from ftp:// URLs.""" + url = "ftp://custom.host.example.com/genomes/all/GCF/000/001/215" + assert _ftp_dir_from_url(url, ftp_host="custom.host.example.com") == "/genomes/all/GCF/000/001/215" + + +# ── verify_transfer_candidates ─────────────────────────────────────────── + + +_MD5_CHECKSUMS_TXT = ( + "d41d8cd98f00b204e9800998ecf8427e ./GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna.gz\n" + "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4 ./GCF_000001215.4_Release_6_plus_ISO1_MT_protein.faa.gz\n" + "ffffffffffffffffffffffffffffffff ./GCF_000001215.4_Release_6_plus_ISO1_MT_assembly_report.txt\n" + "0000000000000000000000000000dead ./GCF_000001215.4_Release_6_plus_ISO1_MT_README.txt\n" +) + + +class TestVerifyTransferCandidates: + """Test S3 checksum verification to prune transfer candidates.""" + + def _assemblies(self) -> dict: + return parse_assembly_summary(SAMPLE_SUMMARY) + + @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") + @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) + @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") + def 
test_prunes_when_all_match( + self, + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + ) -> None: + """Assemblies where every file matches S3 are pruned from the list.""" + mock_connect.return_value = MagicMock() + + def head_side_effect(s3_path: str) -> dict | None: + if "_genomic.fna.gz" in s3_path: + return { + "size": 100, + "metadata": {"md5": "d41d8cd98f00b204e9800998ecf8427e"}, + "checksum_crc64nvme": None, + } + if "_protein.faa.gz" in s3_path: + return { + "size": 100, + "metadata": {"md5": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4"}, + "checksum_crc64nvme": None, + } + if "_assembly_report.txt" in s3_path: + return { + "size": 100, + "metadata": {"md5": "ffffffffffffffffffffffffffffffff"}, + "checksum_crc64nvme": None, + } + return None + + mock_head.side_effect = head_side_effect + result = verify_transfer_candidates( + ["GCF_000001215.4"], + self._assemblies(), + "cdm-lake", + "tenant-general-warehouse/kbase/datasets/ncbi/", + ) + assert result == [] + + @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") + @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) + @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") + def test_keeps_when_md5_differs( + self, + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + ) -> None: + """Assembly is kept when at least one file has a different MD5.""" + mock_connect.return_value = MagicMock() + mock_head.return_value = {"size": 100, "metadata": {"md5": "WRONG"}, "checksum_crc64nvme": None} + + result = verify_transfer_candidates( + ["GCF_000001215.4"], + self._assemblies(), + "cdm-lake", + "tenant-general-warehouse/kbase/datasets/ncbi/", + ) + assert result == ["GCF_000001215.4"] + + @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object", return_value=None) + @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) + @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") + def 
test_keeps_when_s3_object_missing( + self, + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + ) -> None: + """Assembly is kept when at least one file doesn't exist in S3.""" + mock_connect.return_value = MagicMock() + + result = verify_transfer_candidates( + ["GCF_000001215.4"], + self._assemblies(), + "cdm-lake", + "tenant-general-warehouse/kbase/datasets/ncbi/", + ) + assert result == ["GCF_000001215.4"] + + @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") + @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) + @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") + def test_keeps_when_s3_has_no_md5_metadata( + self, + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + ) -> None: + """Assembly is kept when S3 object exists but has no md5 metadata.""" + mock_connect.return_value = MagicMock() + mock_head.return_value = {"size": 100, "metadata": {}, "checksum_crc64nvme": None} + + result = verify_transfer_candidates( + ["GCF_000001215.4"], + self._assemblies(), + "cdm-lake", + "tenant-general-warehouse/kbase/datasets/ncbi/", + ) + assert result == ["GCF_000001215.4"] + + @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", side_effect=Exception("FTP error")) + @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") + def test_keeps_when_ftp_fails(self, mock_connect: MagicMock, mock_retrieve: MagicMock) -> None: + """Assembly is kept (conservative) when md5checksums.txt cannot be fetched.""" + mock_connect.return_value = MagicMock() + + result = verify_transfer_candidates( + ["GCF_000001215.4"], + self._assemblies(), + "cdm-lake", + "tenant-general-warehouse/kbase/datasets/ncbi/", + ) + assert result == ["GCF_000001215.4"] + + @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") + def test_empty_input(self, mock_connect: MagicMock) -> None: + """Empty accession list returns empty result without connecting.""" + result = 
verify_transfer_candidates([], {}, "cdm-lake", "prefix/") + assert result == [] + + @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") + def test_unknown_accession_kept(self, mock_connect: MagicMock) -> None: + """Accessions not in assemblies dict are kept (conservative).""" + mock_connect.return_value = MagicMock() + result = verify_transfer_candidates(["GCF_999999999.1"], {}, "cdm-lake", "prefix/") + assert result == ["GCF_999999999.1"] + + @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object", return_value=None) + @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) + @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") + def test_short_circuits_on_first_mismatch( + self, + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + ) -> None: + """Verification stops checking after the first missing/mismatched file.""" + mock_connect.return_value = MagicMock() + + verify_transfer_candidates( + ["GCF_000001215.4"], + self._assemblies(), + "cdm-lake", + "tenant-general-warehouse/kbase/datasets/ncbi/", + ) + assert mock_head.call_count == 1 + + @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") + @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) + @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") + def test_mixed_candidates( + self, + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + ) -> None: + """Verify a mix of matching and non-matching assemblies.""" + mock_connect.return_value = MagicMock() + + def head_side_effect(s3_path: str) -> dict | None: + # GCF_000001215.4 assembly dir → all match; GCF_000001405.40 → missing + if "GCF_000001215.4_Release_6_plus_ISO1_MT/" in s3_path: + if "_genomic.fna.gz" in s3_path: + return { + "size": 1, + "metadata": {"md5": "d41d8cd98f00b204e9800998ecf8427e"}, + "checksum_crc64nvme": None, + } + if "_protein.faa.gz" in s3_path: + return { + "size": 1, + "metadata": 
{"md5": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4"}, + "checksum_crc64nvme": None, + } + if "_assembly_report.txt" in s3_path: + return { + "size": 1, + "metadata": {"md5": "ffffffffffffffffffffffffffffffff"}, + "checksum_crc64nvme": None, + } + return None + + mock_head.side_effect = head_side_effect + result = verify_transfer_candidates( + ["GCF_000001215.4", "GCF_000001405.40"], + self._assemblies(), + "cdm-lake", + "tenant-general-warehouse/kbase/datasets/ncbi/", + ) + assert result == ["GCF_000001405.40"] diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py new file mode 100644 index 00000000..6935c46d --- /dev/null +++ b/tests/ncbi_ftp/test_notebooks.py @@ -0,0 +1,89 @@ +"""Smoke tests for NCBI FTP notebooks — syntax and import validation.""" + +import ast +import json +from pathlib import Path + +import pytest + +from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST # noqa: F401 +from cdm_data_loaders.ncbi_ftp.manifest import ( # noqa: F401 + AssemblyRecord, + compute_diff, + download_assembly_summary, + filter_by_prefix_range, + parse_assembly_summary, + write_diff_summary, + write_removed_manifest, + write_transfer_manifest, + write_updated_manifest, +) +from cdm_data_loaders.ncbi_ftp.promote import ( + DEFAULT_PATH_PREFIX, + promote_from_s3, +) +from cdm_data_loaders.utils.s3 import get_s3_client, split_s3_path # noqa: F401 + +NOTEBOOKS_DIR = Path(__file__).resolve().parents[2] / "notebooks" + +NCBI_NOTEBOOKS = [ + "ncbi_ftp_manifest.ipynb", + "ncbi_ftp_promote.ipynb", +] + + +def _extract_code_cells(notebook_path: Path) -> list[str]: + """Extract source code from all code cells in a notebook. 
+ + :param notebook_path: path to the .ipynb file + :return: list of source code strings, one per code cell + """ + with notebook_path.open() as f: + nb = json.load(f) + return [ + "".join(cell.get("source", [])) + for cell in nb.get("cells", []) + if cell.get("cell_type") == "code" + ] + + +@pytest.mark.parametrize("notebook", NCBI_NOTEBOOKS) +class TestNotebookSyntax: + """Validate that every code cell in each notebook is syntactically valid Python.""" + + def test_all_cells_parse(self, notebook: str) -> None: + """Verify every code cell compiles without SyntaxError.""" + path = NOTEBOOKS_DIR / notebook + assert path.exists(), f"Notebook not found: {path}" + cells = _extract_code_cells(path) + assert len(cells) > 0, f"No code cells found in {notebook}" + for i, source in enumerate(cells, 1): + try: + ast.parse(source, filename=f"{notebook}:cell{i}") + except SyntaxError as exc: + pytest.fail(f"{notebook} cell {i} has a syntax error: {exc}") + + def test_no_empty_code_cells(self, notebook: str) -> None: + """Verify no code cell is completely empty.""" + path = NOTEBOOKS_DIR / notebook + cells = _extract_code_cells(path) + for i, source in enumerate(cells, 1): + assert source.strip(), f"{notebook} cell {i} is empty" + + +class TestManifestNotebookImports: + """Verify that all imports in the manifest notebook resolve.""" + + def test_imports_resolve(self) -> None: + """All manifest notebook imports are verified at module load time above.""" + assert callable(download_assembly_summary) + assert callable(write_updated_manifest) + + +class TestPromoteNotebookImports: + """Verify that all imports in the promote notebook resolve.""" + + def test_imports_resolve(self) -> None: + """All promote notebook imports are verified at module load time above.""" + assert callable(promote_from_s3) + assert isinstance(DEFAULT_PATH_PREFIX, str) diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py new file mode 100644 index 00000000..bd1d7f0f --- /dev/null +++ 
b/tests/ncbi_ftp/test_promote.py @@ -0,0 +1,278 @@ +"""Tests for ncbi_ftp.promote module — S3 promote, archive, manifest trimming.""" + +from pathlib import Path + +import botocore.client +import pytest + +from cdm_data_loaders.ncbi_ftp.promote import ( + DEFAULT_PATH_PREFIX, + _archive_assemblies, + _trim_manifest, + promote_from_s3, +) +from tests.ncbi_ftp.conftest import TEST_BUCKET + + +@pytest.mark.s3 +class TestPromoteFromS3: + """Test promote_from_s3 with moto-mocked S3.""" + + def _stage_files(self, s3_client: botocore.client.BaseClient, prefix: str) -> None: + """Upload sample staged files to mock S3.""" + for key in [ + f"{prefix}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz", + f"{prefix}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz.md5", + f"{prefix}download_report.json", + ]: + body = b"md5hash123" if key.endswith(".md5") else b"data" + s3_client.put_object(Bucket=TEST_BUCKET, Key=key, Body=body) + + def test_dry_run_no_writes(self, mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: + """Verify dry_run does not write any objects.""" + prefix = "staging/run1/" + self._stage_files(mock_s3_client_no_checksum, prefix) + + report = promote_from_s3( + staging_prefix=prefix, + bucket=TEST_BUCKET, + dry_run=True, + ) + assert report["promoted"] == 1 + assert report["dry_run"] is True + + # Final path should NOT exist + final_key = ( + f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" + ) + resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=final_key) + assert resp.get("KeyCount", 0) == 0 + + def test_promotes_with_metadata(self, mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: + """Verify objects are promoted with MD5 metadata attached.""" + prefix = "staging/run1/" + self._stage_files(mock_s3_client_no_checksum, prefix) + + report = promote_from_s3( + staging_prefix=prefix, + 
bucket=TEST_BUCKET, + ) + assert report["promoted"] == 1 + assert report["failed"] == 0 + + # Check final object exists with metadata + final_key = ( + f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" + ) + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=final_key) + assert resp["Metadata"].get("md5") == "md5hash123" + + def test_skips_download_report(self, mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: + """Verify download_report.json is not promoted.""" + prefix = "staging/run1/" + self._stage_files(mock_s3_client_no_checksum, prefix) + + report = promote_from_s3(staging_prefix=prefix, bucket=TEST_BUCKET) + # Only the .fna.gz data file, not download_report.json + assert report["promoted"] == 1 + + +@pytest.mark.s3 +class TestTrimManifest: + """Test _trim_manifest removes promoted accessions from S3 manifest.""" + + def test_trims_promoted(self, mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: + """Verify promoted accessions are removed from manifest.""" + manifest_key = "manifests/transfer_manifest.txt" + manifest_body = ( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6/\n" + "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\n" + ) + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=manifest_key, Body=manifest_body.encode()) + + _trim_manifest(manifest_key, TEST_BUCKET, {"GCF_000001215.4"}) + + resp = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=manifest_key) + remaining = resp["Body"].read().decode() + assert "GCF_000001215.4" not in remaining + assert "GCF_000001405.40" in remaining + + def test_trims_all(self, mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: + """Verify all entries can be trimmed leaving an empty manifest.""" + manifest_key = "manifests/transfer_manifest.txt" + manifest_body = "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6/\n" + 
mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=manifest_key, Body=manifest_body.encode()) + + _trim_manifest(manifest_key, TEST_BUCKET, {"GCF_000001215.4"}) + + resp = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=manifest_key) + remaining = resp["Body"].read().decode().strip() + assert remaining == "" + + +@pytest.mark.s3 +class TestArchiveAssemblies: + """Test _archive_assemblies with moto-mocked S3.""" + + def test_archives_and_deletes_removed( + self, mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path + ) -> None: + """Verify removed accessions are archived and originals deleted.""" + accession = "GCF_000005845.2" + key = f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") + + count = _archive_assemblies( + str(manifest), + bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + delete_source=True, + ) + assert count == 1 + + # Original should be deleted + resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key) + assert resp.get("KeyCount", 0) == 0 + + # Archived copy should exist + archive_key = ( + f"{DEFAULT_PATH_PREFIX}archive/2024-01/" + f"raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + ) + resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key) + assert resp.get("KeyCount", 0) == 1 + + def test_archives_updated_without_deleting( + self, mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path + ) -> None: + """Verify updated accessions are archived but originals remain.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + key = f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + 
mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"original-data") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + count = _archive_assemblies( + str(manifest), + bucket=TEST_BUCKET, + ncbi_release="2024-06", + archive_reason="updated", + delete_source=False, + ) + assert count == 1 + + # Original still exists (promote will overwrite it) + resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key) + assert resp.get("KeyCount", 0) == 1 + + # Archived copy exists with correct metadata + archive_key = ( + f"{DEFAULT_PATH_PREFIX}archive/2024-06/" + f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + ) + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=archive_key) + assert resp["Metadata"]["archive_reason"] == "updated" + assert resp["Metadata"]["ncbi_last_release"] == "2024-06" + + def test_multiple_releases_no_collision( + self, mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path + ) -> None: + """Verify archiving the same accession in different releases creates distinct folders.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + key = f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v1-data") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + # First archive: release 2024-01 + _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated") + + # Simulate promote overwriting source + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v2-data") + + # Second archive: release 2024-06 + _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated") + + archive_key_1 = ( + f"{DEFAULT_PATH_PREFIX}archive/2024-01/" + 
f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + ) + archive_key_2 = ( + f"{DEFAULT_PATH_PREFIX}archive/2024-06/" + f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + ) + resp1 = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_1) + assert resp1["Body"].read() == b"v1-data" + + resp2 = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_2) + assert resp2["Body"].read() == b"v2-data" + + def test_dry_run_no_side_effects( + self, mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path + ) -> None: + """Verify dry_run does not copy or delete anything.""" + accession = "GCF_000005845.2" + key = f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") + + count = _archive_assemblies( + str(manifest), + bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + delete_source=True, + dry_run=True, + ) + assert count == 1 + + # Original still exists + resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key) + assert resp.get("KeyCount", 0) == 1 + + # No archive created + archive_prefix = f"{DEFAULT_PATH_PREFIX}archive/2024-01/" + resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_prefix) + assert resp.get("KeyCount", 0) == 0 + + def test_no_existing_objects_skips( + self, mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path + ) -> None: + """Verify accessions with no existing S3 objects are silently skipped.""" + manifest = tmp_path / "updated.txt" + manifest.write_text("GCF_000001215.4\n") + + count = _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-01") + assert count == 0 + + def test_unknown_release_fallback( + self, mock_s3_client_no_checksum: 
botocore.client.BaseClient, tmp_path: Path + ) -> None: + """Verify ncbi_release=None falls back to 'unknown'.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + key = f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + count = _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release=None) + assert count == 1 + + archive_key = ( + f"{DEFAULT_PATH_PREFIX}archive/unknown/" + f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + ) + resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key) + assert resp.get("KeyCount", 0) == 1 diff --git a/tests/pipelines/test_ncbi_ftp_download.py b/tests/pipelines/test_ncbi_ftp_download.py new file mode 100644 index 00000000..1dfc0997 --- /dev/null +++ b/tests/pipelines/test_ncbi_ftp_download.py @@ -0,0 +1,230 @@ +"""Tests for pipelines.ncbi_ftp_download — settings, batch orchestration, CLI.""" + +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from pydantic import ValidationError + +from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST +from cdm_data_loaders.pipelines.cts_defaults import INPUT_MOUNT, OUTPUT_MOUNT +from cdm_data_loaders.pipelines.ncbi_ftp_download import DownloadSettings, download_batch + +_DEFAULT_THREADS = 4 +_CUSTOM_THREADS = 8 +_ALIAS_THREADS = 16 +_BOUNDARY_MIN = 1 +_BOUNDARY_MAX = 32 +_OVER_MAX = 64 +_CUSTOM_LIMIT = 100 +_ALIAS_LIMIT = 50 +_EXPECTED_ATTEMPTED = 2 + + +def make_settings(**kwargs: str | int) -> DownloadSettings: + """Generate a validated DownloadSettings object.""" + return DownloadSettings(_cli_parse_args=[], **kwargs) + + +# ── Settings defaults ──────────────────────────────────────────────────── + + +class TestDownloadSettingsDefaults: + """Test default 
settings.""" + + def test_manifest_default(self) -> None: + """Verify default manifest path uses INPUT_MOUNT.""" + s = make_settings() + assert s.manifest == f"{INPUT_MOUNT}/transfer_manifest.txt" + + def test_output_dir_default(self) -> None: + """Verify default output_dir uses OUTPUT_MOUNT.""" + s = make_settings() + assert s.output_dir == OUTPUT_MOUNT + + def test_threads_default(self) -> None: + """Verify default threads is 4.""" + s = make_settings() + assert s.threads == _DEFAULT_THREADS + + def test_ftp_host_default(self) -> None: + """Verify default ftp_host matches FTP_HOST constant.""" + s = make_settings() + assert s.ftp_host == FTP_HOST + + def test_limit_default_none(self) -> None: + """Verify default limit is None.""" + s = make_settings() + assert s.limit is None + + +# ── Settings all params ────────────────────────────────────────────────── + + +class TestDownloadSettingsAllParams: + """Test with all params set.""" + + def test_all_params(self) -> None: + """Verify all parameters are correctly set when provided.""" + s = make_settings( + manifest="/data/my_manifest.txt", + output_dir="/data/output", + threads=_CUSTOM_THREADS, + ftp_host="ftp.example.com", + limit=_CUSTOM_LIMIT, + ) + assert s.manifest == "/data/my_manifest.txt" + assert s.output_dir == "/data/output" + assert s.threads == _CUSTOM_THREADS + assert s.ftp_host == "ftp.example.com" + assert s.limit == _CUSTOM_LIMIT + + +# ── Settings aliases ───────────────────────────────────────────────────── + + +class TestDownloadSettingsAliases: + """Test CLI alias resolution.""" + + def test_manifest_alias_m(self) -> None: + """Verify 'm' alias resolves to manifest.""" + s = make_settings(m="/data/m.txt") + assert s.manifest == "/data/m.txt" + + def test_output_dir_alias_o(self) -> None: + """Verify 'o' alias resolves to output_dir.""" + s = make_settings(o="/data/o") + assert s.output_dir == "/data/o" + + def test_threads_alias_t(self) -> None: + """Verify 't' alias resolves to threads.""" + s = 
make_settings(t=_ALIAS_THREADS) + assert s.threads == _ALIAS_THREADS + + def test_limit_alias_l(self) -> None: + """Verify 'l' alias resolves to limit.""" + s = make_settings(l=_ALIAS_LIMIT) + assert s.limit == _ALIAS_LIMIT + + +# ── Settings validation ────────────────────────────────────────────────── + + +class TestDownloadSettingsValidation: + """Test validation constraints.""" + + def test_threads_too_low(self) -> None: + """Verify threads=0 raises ValidationError.""" + with pytest.raises(ValidationError): + make_settings(threads=0) + + def test_threads_too_high(self) -> None: + """Verify threads above 32 raises ValidationError.""" + with pytest.raises(ValidationError): + make_settings(threads=_OVER_MAX) + + def test_threads_boundary_1(self) -> None: + """Verify threads=1 is accepted.""" + s = make_settings(threads=_BOUNDARY_MIN) + assert s.threads == _BOUNDARY_MIN + + def test_threads_boundary_32(self) -> None: + """Verify threads=32 is accepted.""" + s = make_settings(threads=_BOUNDARY_MAX) + assert s.threads == _BOUNDARY_MAX + + def test_limit_must_be_positive(self) -> None: + """Verify limit=0 raises ValidationError.""" + with pytest.raises(ValidationError): + make_settings(limit=0) + + +# ── download_batch ─────────────────────────────────────────────────────── + + +class TestDownloadBatch: + """Test download_batch with mocked internals.""" + + @pytest.fixture(autouse=True) + def _mock_ftp_pool(self) -> None: + """Prevent real FTP connections from the ThreadLocalFTP pool.""" + mock_pool = MagicMock() + with patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP", return_value=mock_pool): + yield + + def test_reads_manifest_and_calls_download(self, tmp_path: Path) -> None: + """Verify manifest is read and download is called for each entry.""" + manifest = tmp_path / "manifest.txt" + manifest.write_text( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n" + "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\n" + ) 
+ output = tmp_path / "output" + output.mkdir() + + mock_stats = {"accession": "test", "files_downloaded": 3} + with patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", return_value=mock_stats): + report = download_batch( + manifest_path=str(manifest), + output_dir=str(output), + threads=1, + ftp_host="ftp.example.com", + ) + + assert report["total_attempted"] == _EXPECTED_ATTEMPTED + assert report["succeeded"] == _EXPECTED_ATTEMPTED + assert report["failed"] == 0 + + def test_limit_truncates(self, tmp_path: Path) -> None: + """Verify limit parameter truncates the number of assemblies processed.""" + manifest = tmp_path / "manifest.txt" + manifest.write_text( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n" + "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\n" + ) + output = tmp_path / "output" + output.mkdir() + + mock_stats = {"accession": "test", "files_downloaded": 1} + with patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", return_value=mock_stats): + report = download_batch( + manifest_path=str(manifest), + output_dir=str(output), + threads=1, + limit=1, + ) + assert report["total_attempted"] == 1 + + def test_writes_report_json(self, tmp_path: Path) -> None: + """Verify download_report.json is written to the output directory.""" + manifest = tmp_path / "manifest.txt" + manifest.write_text("/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n") + output = tmp_path / "output" + output.mkdir() + + mock_stats = {"accession": "GCF_000001215.4", "files_downloaded": 5} + with patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", return_value=mock_stats): + download_batch(manifest_path=str(manifest), output_dir=str(output), threads=1) + + report_file = output / "download_report.json" + assert report_file.exists() + report = json.loads(report_file.read_text()) + assert "timestamp" in report + assert report["succeeded"] == 1 + + 
def test_handles_download_failure(self, tmp_path: Path) -> None: + """Verify failed downloads are counted and do not crash the batch.""" + manifest = tmp_path / "manifest.txt" + manifest.write_text("/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n") + output = tmp_path / "output" + output.mkdir() + + with patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + side_effect=RuntimeError("connection lost"), + ): + report = download_batch(manifest_path=str(manifest), output_dir=str(output), threads=1) + + assert report["failed"] == 1 + assert report["succeeded"] == 0 diff --git a/tests/utils/test_checksums.py b/tests/utils/test_checksums.py new file mode 100644 index 00000000..6c7bdbc6 --- /dev/null +++ b/tests/utils/test_checksums.py @@ -0,0 +1,76 @@ +"""Tests for utils.checksums module — MD5 and CRC64/NVME checksum utilities.""" + +import base64 +import hashlib +from pathlib import Path + +import pytest + +from cdm_data_loaders.utils.checksums import compute_md5, verify_md5 + +_EXPECTED_CRC64_BYTE_LEN = 8 + + +class TestComputeMd5: + """Test MD5 computation.""" + + def test_correct_hash(self, tmp_path: Path) -> None: + """Verify MD5 matches hashlib reference.""" + f = tmp_path / "test.bin" + f.write_bytes(b"Hello, World!") + assert compute_md5(f) == hashlib.md5(b"Hello, World!").hexdigest() # noqa: S324 + + def test_empty_file(self, tmp_path: Path) -> None: + """Verify MD5 of an empty file.""" + f = tmp_path / "empty" + f.write_bytes(b"") + assert compute_md5(f) == hashlib.md5(b"").hexdigest() # noqa: S324 + + def test_accepts_str_path(self, tmp_path: Path) -> None: + """Verify compute_md5 accepts a string path.""" + f = tmp_path / "test.txt" + f.write_bytes(b"data") + assert compute_md5(str(f)) == hashlib.md5(b"data").hexdigest() # noqa: S324 + + +class TestVerifyMd5: + """Test MD5 verification.""" + + def test_correct(self, tmp_path: Path) -> None: + """Verify True when MD5 matches.""" + f = tmp_path / "test.bin" 
+ f.write_bytes(b"Hello, World!") + expected = hashlib.md5(b"Hello, World!").hexdigest() # noqa: S324 + assert verify_md5(f, expected) is True + + def test_incorrect(self, tmp_path: Path) -> None: + """Verify False when MD5 does not match.""" + f = tmp_path / "test.bin" + f.write_bytes(b"Hello, World!") + assert verify_md5(f, "0000000000000000") is False + + +class TestComputeCrc64nvme: + """Test CRC64/NVME computation (skipped if awscrt unavailable).""" + + @pytest.fixture(autouse=True) + def _skip_if_no_awscrt(self) -> None: + pytest.importorskip("awscrt") + + def test_returns_base64(self, tmp_path: Path) -> None: + """Verify CRC64/NVME returns an 8-byte base64 string.""" + from cdm_data_loaders.utils.checksums import compute_crc64nvme # noqa: PLC0415 + + f = tmp_path / "test.bin" + f.write_bytes(b"Hello, World!") + crc = compute_crc64nvme(f) + decoded = base64.b64decode(crc) + assert len(decoded) == _EXPECTED_CRC64_BYTE_LEN + + def test_deterministic(self, tmp_path: Path) -> None: + """Verify repeated calls return the same checksum.""" + from cdm_data_loaders.utils.checksums import compute_crc64nvme # noqa: PLC0415 + + f = tmp_path / "test.bin" + f.write_bytes(b"test data for checksum") + assert compute_crc64nvme(f) == compute_crc64nvme(f) diff --git a/tests/utils/test_ftp_client.py b/tests/utils/test_ftp_client.py new file mode 100644 index 00000000..385fd329 --- /dev/null +++ b/tests/utils/test_ftp_client.py @@ -0,0 +1,199 @@ +"""Tests for utils.ftp_client module — mock ftplib for keepalive, retry, thread-local.""" + +import socket +import time +from collections.abc import Callable +from ftplib import FTP, error_temp +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from cdm_data_loaders.utils.ftp_client import ( + ThreadLocalFTP, + _set_keepalive, + connect_ftp, + ftp_download_file, + ftp_list_dir, + ftp_noop_keepalive, + ftp_retrieve_text, +) + +_IDLE_SECONDS = 30 +_KEEPIDLE_VALUE = 60 +_KEEPALIVE_INTERVAL = 25 
+_EXPECTED_RETRY_COUNT = 2 +_FTP_TIMEOUT = 30 +_ERR_421 = "421 timeout" + + +class TestSetKeepalive: + """Test TCP keepalive socket options.""" + + def test_sets_so_keepalive(self) -> None: + """Verify SO_KEEPALIVE is set on the socket.""" + mock_ftp = MagicMock(spec=FTP) + mock_sock = MagicMock() + mock_ftp.sock = mock_sock + _set_keepalive(mock_ftp) + mock_sock.setsockopt.assert_any_call(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) + + def test_sets_tcp_keepidle(self) -> None: + """Verify TCP_KEEPIDLE is set when available.""" + mock_ftp = MagicMock(spec=FTP) + mock_sock = MagicMock() + mock_ftp.sock = mock_sock + _set_keepalive(mock_ftp, idle=_KEEPIDLE_VALUE) + if hasattr(socket, "TCP_KEEPIDLE"): + mock_sock.setsockopt.assert_any_call(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, _KEEPIDLE_VALUE) + + +class TestConnectFtp: + """Test connect_ftp creates and configures an FTP connection.""" + + @patch("cdm_data_loaders.utils.ftp_client.FTP") + def test_connect_and_login(self, mock_ftp_cls: MagicMock) -> None: + """Verify FTP object is created, login called, and returned.""" + mock_ftp = MagicMock() + mock_ftp.sock = MagicMock() + mock_ftp_cls.return_value = mock_ftp + result = connect_ftp("ftp.example.com", timeout=_FTP_TIMEOUT) + mock_ftp_cls.assert_called_once_with("ftp.example.com", timeout=_FTP_TIMEOUT) + mock_ftp.login.assert_called_once() + assert result is mock_ftp + + +class TestFtpNoopKeepalive: + """Test NOOP keepalive logic.""" + + def test_sends_noop_when_idle(self) -> None: + """Verify NOOP is sent when idle exceeds interval.""" + mock_ftp = MagicMock(spec=FTP) + old_time = time.monotonic() - _IDLE_SECONDS + new_time = ftp_noop_keepalive(mock_ftp, old_time, interval=_KEEPALIVE_INTERVAL) + mock_ftp.sendcmd.assert_called_once_with("NOOP") + assert new_time > old_time + + def test_no_noop_when_recent(self) -> None: + """Verify no NOOP is sent when activity is recent.""" + mock_ftp = MagicMock(spec=FTP) + recent = time.monotonic() + result = 
ftp_noop_keepalive(mock_ftp, recent, interval=_KEEPALIVE_INTERVAL) + mock_ftp.sendcmd.assert_not_called() + assert result == recent + + +class TestFtpListDir: + """Test ftp_list_dir with retry.""" + + def test_returns_file_list(self) -> None: + """Verify file listing is returned correctly.""" + mock_ftp = MagicMock(spec=FTP) + + def fake_retrlines(_cmd: str, callback: Callable[[str], None]) -> None: + for name in ["file1.txt", "file2.gz"]: + callback(name) + + mock_ftp.retrlines.side_effect = fake_retrlines + result = ftp_list_dir(mock_ftp, "/some/path") + assert result == ["file1.txt", "file2.gz"] + mock_ftp.cwd.assert_called_once_with("/some/path") + + def test_retries_on_error_temp(self) -> None: + """Verify retry logic on FTP temporary errors.""" + mock_ftp = MagicMock(spec=FTP) + call_count = 0 + + def fake_retrlines(_cmd: str, callback: Callable[[str], None]) -> None: + nonlocal call_count + call_count += 1 + if call_count < _EXPECTED_RETRY_COUNT: + raise error_temp(_ERR_421) # noqa: S321 + callback("file.txt") + + mock_ftp.retrlines.side_effect = fake_retrlines + result = ftp_list_dir(mock_ftp, "/path", retries=3) + assert result == ["file.txt"] + assert call_count == _EXPECTED_RETRY_COUNT + + def test_raises_after_exhausted_retries(self) -> None: + """Verify error is raised after all retries are exhausted.""" + mock_ftp = MagicMock(spec=FTP) + mock_ftp.retrlines.side_effect = error_temp(_ERR_421) # noqa: S321 + with pytest.raises(error_temp): + ftp_list_dir(mock_ftp, "/path", retries=_EXPECTED_RETRY_COUNT) + + +class TestFtpDownloadFile: + """Test ftp_download_file with retry.""" + + def test_downloads_file(self, tmp_path: Path) -> None: + """Verify file is downloaded and written to disk.""" + mock_ftp = MagicMock(spec=FTP) + + def fake_retrbinary(_cmd: str, callback: Callable[[bytes], None]) -> None: + callback(b"file data") + + mock_ftp.retrbinary.side_effect = fake_retrbinary + local = tmp_path / "out.bin" + ftp_download_file(mock_ftp, "remote.bin", 
str(local)) + assert local.read_bytes() == b"file data" + + def test_retries_on_error_temp(self, tmp_path: Path) -> None: + """Verify download retries on FTP temporary errors.""" + mock_ftp = MagicMock(spec=FTP) + call_count = 0 + + def fake_retrbinary(_cmd: str, callback: Callable[[bytes], None]) -> None: + nonlocal call_count + call_count += 1 + if call_count < _EXPECTED_RETRY_COUNT: + msg = "421" + raise error_temp(msg) # noqa: S321 + callback(b"ok") + + mock_ftp.retrbinary.side_effect = fake_retrbinary + local = str(tmp_path / "out.bin") + ftp_download_file(mock_ftp, "remote.bin", local, retries=3) + assert call_count == _EXPECTED_RETRY_COUNT + + +class TestFtpRetrieveText: + """Test ftp_retrieve_text.""" + + def test_returns_content(self) -> None: + """Verify text content is retrieved and joined with newlines.""" + mock_ftp = MagicMock(spec=FTP) + + def fake_retrlines(_cmd: str, callback: Callable[[str], None]) -> None: + for line in ["line1", "line2"]: + callback(line) + + mock_ftp.retrlines.side_effect = fake_retrlines + result = ftp_retrieve_text(mock_ftp, "remote.txt") + assert result == "line1\nline2" + + +class TestThreadLocalFTP: + """Test thread-local FTP connection management.""" + + @patch("cdm_data_loaders.utils.ftp_client.connect_ftp") + def test_get_returns_same_connection(self, mock_connect: MagicMock) -> None: + """Verify get() returns the same FTP connection on repeated calls.""" + mock_ftp = MagicMock() + mock_connect.return_value = mock_ftp + pool = ThreadLocalFTP("ftp.example.com") + ftp1 = pool.get() + ftp2 = pool.get() + assert ftp1 is ftp2 + mock_connect.assert_called_once() + + @patch("cdm_data_loaders.utils.ftp_client.connect_ftp") + def test_close_all(self, mock_connect: MagicMock) -> None: + """Verify close_all() quits the FTP connection.""" + mock_ftp = MagicMock() + mock_connect.return_value = mock_ftp + pool = ThreadLocalFTP("ftp.example.com") + pool.get() + pool.close_all() + mock_ftp.quit.assert_called_once() diff --git 
a/tests/utils/test_s3.py b/tests/utils/test_s3.py index 7d6398d2..26292a16 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -16,15 +16,18 @@ CDM_LAKE_BUCKET, DEFAULT_EXTRA_ARGS, copy_object, + copy_object_with_metadata, delete_object, download_file, get_s3_client, + head_object, list_matching_objects, object_exists, reset_s3_client, split_s3_path, upload_dir, upload_file, + upload_file_with_metadata, ) AWS_REGION = "us-east-1" @@ -595,3 +598,128 @@ def test_delete_object_removes_object(mock_s3_client: Any, bucket: str, protocol resp = delete_object(s3_path) assert object_exists(s3_path) is False assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == 204 + + +# upload_file_with_metadata +@pytest.mark.parametrize("bucket", BUCKETS) +@pytest.mark.s3 +def test_upload_file_with_metadata_attaches_metadata(mock_s3_client: Any, sample_file: Path, bucket: str) -> None: + """Verify that upload_file_with_metadata stores user metadata on the uploaded object.""" + metadata = {"md5": "abc123", "source": "ncbi"} + result = upload_file_with_metadata(sample_file, f"{bucket}/uploads", metadata=metadata) + assert result is True + + resp = mock_s3_client.head_object(Bucket=bucket, Key=f"uploads/{sample_file.name}") + assert resp["Metadata"]["md5"] == "abc123" + assert resp["Metadata"]["source"] == "ncbi" + + +@pytest.mark.s3 +def test_upload_file_with_metadata_custom_object_name(mock_s3_client: Any, sample_file: Path) -> None: + """Verify that the object_name parameter overrides the filename.""" + result = upload_file_with_metadata( + sample_file, f"{CDM_LAKE_BUCKET}/uploads", metadata={"k": "v"}, object_name="renamed.txt" + ) + assert result is True + obj = mock_s3_client.get_object(Bucket=CDM_LAKE_BUCKET, Key="uploads/renamed.txt") + assert obj["Body"].read() == b"hello s3" + + +@pytest.mark.s3 +def test_upload_file_with_metadata_overwrites_existing(mock_s3_client: Any, sample_file: Path) -> None: + """Verify that upload_file_with_metadata uploads even 
when the object already exists.""" + mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}", Body=b"old") + result = upload_file_with_metadata(sample_file, f"{CDM_LAKE_BUCKET}/uploads", metadata={"new": "true"}) + assert result is True + obj = mock_s3_client.get_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}") + assert obj["Body"].read() == b"hello s3" + + +@pytest.mark.usefixtures("mock_s3_client") +@pytest.mark.s3 +def test_upload_file_with_metadata_raises_on_empty_destination(sample_file: Path) -> None: + """Verify ValueError when destination_dir is empty.""" + with pytest.raises(ValueError, match="No destination directory"): + upload_file_with_metadata(sample_file, "", metadata={"k": "v"}) + + +@pytest.mark.usefixtures("mock_s3_client") +@pytest.mark.parametrize("path_type", [str, Path]) +@pytest.mark.s3 +def test_upload_file_with_metadata_accepts_str_and_path(sample_file: Path, path_type: type[str] | type[Path]) -> None: + """Verify that upload_file_with_metadata accepts both str and Path.""" + result = upload_file_with_metadata(path_type(sample_file), f"{CDM_LAKE_BUCKET}/uploads", metadata={}) + assert result is True + + +# head_object +@pytest.mark.s3 +def test_head_object_returns_info(mock_s3_client: Any) -> None: + """Verify that head_object returns size, metadata, and checksum fields.""" + mock_s3_client.put_object( + Bucket=CDM_LAKE_BUCKET, Key="info/file.txt", Body=b"hello", Metadata={"md5": "abc123"} + ) + result = head_object(f"{CDM_LAKE_BUCKET}/info/file.txt") + assert result is not None + assert result["size"] == 5 + assert result["metadata"]["md5"] == "abc123" + # moto may not populate CRC64NVME, but the key should be present + assert "checksum_crc64nvme" in result + + +@pytest.mark.s3 +def test_head_object_returns_none_for_missing(mock_s3_client: Any) -> None: + """Verify that head_object returns None for a non-existent object.""" + result = head_object(f"{CDM_LAKE_BUCKET}/does/not/exist.txt") + assert 
result is None + + +@pytest.mark.parametrize("protocol", ["", "s3://", "s3a://"]) +@pytest.mark.s3 +def test_head_object_with_protocols(mock_s3_client: Any, protocol: str) -> None: + """Verify that head_object handles all valid protocol prefixes.""" + mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key="proto/file.txt", Body=b"data") + result = head_object(f"{protocol}{CDM_LAKE_BUCKET}/proto/file.txt") + assert result is not None + assert result["size"] == 4 + + +# copy_object_with_metadata +@pytest.mark.parametrize("destination", BUCKETS) +@pytest.mark.s3 +def test_copy_object_with_metadata_replaces_metadata( + mocked_s3_client_no_checksum: Any, destination: str +) -> None: + """Verify that copy_object_with_metadata copies and replaces metadata.""" + mocked_s3_client_no_checksum.put_object( + Bucket=CDM_LAKE_BUCKET, Key="src/file.txt", Body=b"archive me", Metadata={"old_key": "old_val"} + ) + new_metadata = {"archive_reason": "replaced", "archive_date": "2026-04-16"} + response = copy_object_with_metadata( + f"{CDM_LAKE_BUCKET}/src/file.txt", + f"{destination}/archive/file.txt", + metadata=new_metadata, + ) + assert response["ResponseMetadata"]["HTTPStatusCode"] == 200 + + # verify the destination has the new metadata, not the old + resp = mocked_s3_client_no_checksum.head_object(Bucket=destination, Key="archive/file.txt") + assert resp["Metadata"]["archive_reason"] == "replaced" + assert resp["Metadata"]["archive_date"] == "2026-04-16" + assert "old_key" not in resp["Metadata"] + + # verify source still exists + assert object_exists(f"{CDM_LAKE_BUCKET}/src/file.txt") + + +@pytest.mark.s3 +def test_copy_object_with_metadata_preserves_content(mocked_s3_client_no_checksum: Any) -> None: + """Verify that the content of the copied object matches the original.""" + mocked_s3_client_no_checksum.put_object(Bucket=CDM_LAKE_BUCKET, Key="src/data.bin", Body=b"binary data") + copy_object_with_metadata( + f"{CDM_LAKE_BUCKET}/src/data.bin", + 
f"{CDM_LAKE_BUCKET}/dst/data.bin", + metadata={"tag": "value"}, + ) + obj = mocked_s3_client_no_checksum.get_object(Bucket=CDM_LAKE_BUCKET, Key="dst/data.bin") + assert obj["Body"].read() == b"binary data" From fa5fa972c10ac64c31fde7169e53e4a069302af3 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 16 Apr 2026 16:02:35 -0700 Subject: [PATCH 02/76] add local minio testing --- README.md | 41 +++ pyproject.toml | 3 +- tests/integration/__init__.py | 0 tests/integration/conftest.py | 251 +++++++++++++++++++ tests/integration/test_download_e2e.py | 129 ++++++++++ tests/integration/test_full_pipeline.py | 216 ++++++++++++++++ tests/integration/test_manifest_e2e.py | 211 ++++++++++++++++ tests/integration/test_promote_e2e.py | 320 ++++++++++++++++++++++++ 8 files changed, 1170 insertions(+), 1 deletion(-) create mode 100644 tests/integration/__init__.py create mode 100644 tests/integration/conftest.py create mode 100644 tests/integration/test_download_e2e.py create mode 100644 tests/integration/test_full_pipeline.py create mode 100644 tests/integration/test_manifest_e2e.py create mode 100644 tests/integration/test_promote_e2e.py diff --git a/README.md b/README.md index 98aba015..2273f299 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,47 @@ To generate coverage for the tests, run The standard python `coverage` package is used and coverage can be generated as html or other formats by changing the parameters. +#### Integration tests (MinIO + NCBI FTP) + +End-to-end integration tests for the NCBI assembly pipeline live in `tests/integration/`. They exercise the full flow — manifest diffing, FTP download, S3 promote/archive — against a locally running [MinIO](https://min.io/) container and the real NCBI FTP server. + +**Requirements:** +- Docker (for MinIO) +- Network access to `ftp.ncbi.nlm.nih.gov` + +**1. 
Start MinIO locally:** + +```sh +docker run -d \ + --name minio \ + -p 9000:9000 \ + -p 9001:9001 \ + -e MINIO_ROOT_USER=minioadmin \ + -e MINIO_ROOT_PASSWORD=minioadmin \ + minio/minio server /data --console-address ":9001" +``` + +**2. Run the integration tests:** + +```sh +> uv run pytest tests/integration/ -m integration -v +``` + +Tests are automatically skipped when MinIO is not reachable, so the default `uv run pytest` will never fail due to a missing MinIO instance. + +**3. Inspect results:** + +Buckets are **not** cleaned up after tests. Browse the MinIO console at [http://localhost:9001](http://localhost:9001) (login: `minioadmin` / `minioadmin`) to inspect the final state of each test bucket. Each test method creates its own bucket (e.g. `integ-test-promote-dry-run`). + +**4. Stop MinIO when done:** + +```sh +docker stop minio && docker rm minio +``` + +> **Note:** These tests download real assemblies from NCBI FTP and are inherently slow (~30–60s per assembly). They are also marked `slow_test` so you can exclude them independently: `uv run pytest -m "not slow_test"`. + + ## Loading genomes, contigs, and features The [genome loader](src/cdm_data_loaders/parsers/genome_loader.py) can be used to load and integrate data from related GFF and FASTA files. Currently, the loader requires a GFF file and two FASTA files (one for amino acid seqs, one for nucleic acid seqs) for each genome. 
The list of files to be processed should be specified in the genome paths file, which has the following format: diff --git a/pyproject.toml b/pyproject.toml index 5898efde..de5e8154 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -172,6 +172,7 @@ ignore = [ [tool.ruff.lint.per-file-ignores] "*.ipynb" = ["T201"] # ignore printing in notebooks "tests/**/*.py" = ["S101", "T201", "FBT001", "FBT002", "ARG002"] # use of assert, booleans, unused mock args +"tests/integration/**/*.py" = ["S101", "T201", "FBT001", "FBT002", "ARG002", "ANN401"] "tests/utils/test_s3.py" = ["ANN401"] "**/__init__.py" = ["D104"] @@ -192,7 +193,7 @@ log_cli = true log_cli_level = "INFO" log_level = "INFO" addopts = ["-v"] -markers = ["requires_spark: must be run in an environment where spark is available", "s3: tests that mock s3 interactions", "slow_test: does what it says on the tin"] +markers = ["requires_spark: must be run in an environment where spark is available", "s3: tests that mock s3 interactions", "slow_test: does what it says on the tin", "integration: end-to-end tests requiring a running MinIO instance and network access"] # environment settings for running tests [tool.pytest_env] diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 00000000..1d90faa6 --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,251 @@ +"""Shared fixtures and helpers for MinIO-backed integration tests. + +Integration tests are auto-skipped when MinIO is not reachable. Each test +method gets its own bucket (derived from the test node name) that is emptied +on re-run but **never deleted** after the test — this lets developers inspect +the final state of the object store via the MinIO console. 
+""" + +from __future__ import annotations + +import functools +import hashlib +import os +import re +from pathlib import Path +from typing import TYPE_CHECKING, Any +from unittest.mock import patch + +import boto3 +import botocore.client +import pytest + +import cdm_data_loaders.ncbi_ftp.manifest as manifest_mod +import cdm_data_loaders.ncbi_ftp.promote as promote_mod +import cdm_data_loaders.utils.s3 as s3_utils +from cdm_data_loaders.ncbi_ftp.assembly import build_accession_path +from cdm_data_loaders.utils.s3 import reset_s3_client + +if TYPE_CHECKING: + from collections.abc import Callable + +# ── MinIO connection defaults ─────────────────────────────────────────── + +MINIO_ENDPOINT_URL = os.environ.get("MINIO_ENDPOINT_URL", "http://localhost:9000") +MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin") +MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin") + +# Maximum length of a bucket name per S3/DNS spec +_MAX_BUCKET_LEN = 63 + + +# ── MinIO reachability check ──────────────────────────────────────────── + +_minio_available: bool | None = None + + +def _minio_reachable() -> bool: + """Return True if the MinIO endpoint accepts connections.""" + try: + client = boto3.client( + "s3", + endpoint_url=MINIO_ENDPOINT_URL, + aws_access_key_id=MINIO_ACCESS_KEY, + aws_secret_access_key=MINIO_SECRET_KEY, + ) + client.list_buckets() + except Exception: # noqa: BLE001 + return False + return True + + +def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None: # noqa: ARG001 + """Auto-skip ``@pytest.mark.integration`` tests when MinIO is unreachable.""" + global _minio_available # noqa: PLW0603 + if _minio_available is None: + _minio_available = _minio_reachable() + if _minio_available: + return + skip_marker = pytest.mark.skip(reason="MinIO not reachable — skipping integration tests") + for item in items: + if "integration" in item.keywords: + item.add_marker(skip_marker) + + +# ── CRC64NVME workaround 
─────────────────────────────────────────────── + + +def strip_checksum_algorithm(method: Callable) -> Callable: + """Wrap a boto3 S3 method to remove ``ChecksumAlgorithm`` if unsupported.""" + + @functools.wraps(method) + def wrapper(*args: object, **kwargs: object) -> object: + kwargs.pop("ChecksumAlgorithm", None) # type: ignore[arg-type] + return method(*args, **kwargs) + + return wrapper + + +# ── Fixtures ──────────────────────────────────────────────────────────── + + +@pytest.fixture(scope="session") +def minio_s3_client() -> botocore.client.BaseClient: + """Session-scoped real boto3 S3 client pointed at the local MinIO instance. + + Patches ``get_s3_client`` on every module that uses it so internal calls + are transparently routed to MinIO. + """ + client = boto3.client( + "s3", + endpoint_url=MINIO_ENDPOINT_URL, + aws_access_key_id=MINIO_ACCESS_KEY, + aws_secret_access_key=MINIO_SECRET_KEY, + ) + + # MinIO may not support CRC64NVME — strip to be safe + client.upload_file = strip_checksum_algorithm(client.upload_file) # type: ignore[method-assign] + client.copy_object = strip_checksum_algorithm(client.copy_object) # type: ignore[method-assign] + + reset_s3_client() + with ( + patch.object(s3_utils, "get_s3_client", return_value=client), + patch.object(promote_mod, "get_s3_client", return_value=client), + patch.object(manifest_mod, "head_object", wraps=s3_utils.head_object), + patch.object(s3_utils, "_s3_client", client), + ): + yield client + reset_s3_client() + + +def _bucket_name_from_node(node_id: str) -> str: + """Derive a DNS-compliant S3 bucket name from a pytest node ID. + + :param node_id: e.g. ``tests/integration/test_promote_e2e.py::test_dry_run`` + :return: e.g. 
``integ-test-dry-run`` + """ + # Extract test function name from the node ID + parts = node_id.split("::") + name = parts[-1] if parts else node_id + # Lowercase, replace non-alphanumeric with hyphens, collapse multiples + name = re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-") + name = f"integ-{name}" + if len(name) > _MAX_BUCKET_LEN: + # Truncate but keep it unique via a short hash suffix + suffix = hashlib.md5(name.encode()).hexdigest()[:6] # noqa: S324 + name = f"{name[: _MAX_BUCKET_LEN - 7]}-{suffix}" + return name + + +@pytest.fixture +def test_bucket(minio_s3_client: botocore.client.BaseClient, request: pytest.FixtureRequest) -> str: + """Create a per-test-method bucket in MinIO and return its name. + + On re-run, any existing objects are deleted first so the test starts clean. + The bucket is **not** deleted after the test. + """ + bucket = _bucket_name_from_node(request.node.nodeid) + s3 = minio_s3_client + + try: + s3.head_bucket(Bucket=bucket) + # Bucket exists — empty it for a clean run + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + for obj in page.get("Contents", []): + s3.delete_object(Bucket=bucket, Key=obj["Key"]) + except s3.exceptions.NoSuchBucket: + s3.create_bucket(Bucket=bucket) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("404", "NoSuchBucket"): + s3.create_bucket(Bucket=bucket) + else: + raise + + return bucket + + +# ── Helpers ───────────────────────────────────────────────────────────── + + +def stage_files_to_minio( + s3: botocore.client.BaseClient, + bucket: str, + local_dir: str | Path, + staging_prefix: str, +) -> list[str]: + """Upload a local directory tree to a MinIO staging prefix. + + :param s3: boto3 S3 client + :param bucket: target bucket + :param local_dir: local root directory to upload + :param staging_prefix: S3 key prefix (e.g. 
``"staging/run1/"``) + :return: list of S3 keys uploaded + """ + local_dir = Path(local_dir) + keys: list[str] = [] + for path in sorted(local_dir.rglob("*")): + if path.is_dir(): + continue + rel = path.relative_to(local_dir) + key = f"{staging_prefix.rstrip('/')}/{rel}" + s3.upload_file(Filename=str(path), Bucket=bucket, Key=key) + keys.append(key) + return keys + + +def seed_lakehouse( # noqa: PLR0913 + s3: botocore.client.BaseClient, + bucket: str, + accession: str, + files: dict[str, str | bytes], + path_prefix: str, + assembly_dir: str | None = None, +) -> list[str]: + """Seed assembly files at the final Lakehouse path in MinIO. + + :param s3: boto3 S3 client + :param bucket: target bucket + :param accession: assembly accession (e.g. ``"GCF_000001215.4"``) + :param files: mapping of filename → content (str or bytes) + :param path_prefix: Lakehouse prefix (e.g. ``"tenant-general-warehouse/…/ncbi/"``) + :param assembly_dir: full assembly dir name; if None, uses ``accession`` + :return: list of S3 keys created + """ + adir = assembly_dir or accession + rel = build_accession_path(adir) + keys: list[str] = [] + for fname, content in files.items(): + key = f"{path_prefix}{rel}{fname}" + body = content.encode() if isinstance(content, str) else content + md5 = hashlib.md5(body).hexdigest() # noqa: S324 + s3.put_object(Bucket=bucket, Key=key, Body=body, Metadata={"md5": md5}) + keys.append(key) + return keys + + +def list_all_keys(s3: botocore.client.BaseClient, bucket: str, prefix: str = "") -> list[str]: + """List all object keys in a bucket under a prefix. 
+ + :param s3: boto3 S3 client + :param bucket: bucket name + :param prefix: optional key prefix filter + :return: sorted list of keys + """ + keys: list[str] = [] + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + keys.extend(obj["Key"] for obj in page.get("Contents", [])) + return sorted(keys) + + +def get_object_metadata(s3: botocore.client.BaseClient, bucket: str, key: str) -> dict[str, Any]: + """Return the user metadata dict for an S3 object. + + :param s3: boto3 S3 client + :param bucket: bucket name + :param key: object key + :return: metadata dict + """ + resp = s3.head_object(Bucket=bucket, Key=key) + return resp.get("Metadata", {}) diff --git a/tests/integration/test_download_e2e.py b/tests/integration/test_download_e2e.py new file mode 100644 index 00000000..b527de96 --- /dev/null +++ b/tests/integration/test_download_e2e.py @@ -0,0 +1,129 @@ +"""End-to-end tests for Phase 2 — FTP download of assemblies. + +These tests download real (small) assemblies from the NCBI FTP server. +Marked ``integration`` and ``slow_test``; auto-skipped when MinIO is +unreachable. +""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + from pathlib import Path + +from cdm_data_loaders.ncbi_ftp.manifest import ( + compute_diff, + download_assembly_summary, + filter_by_prefix_range, + parse_assembly_summary, + write_transfer_manifest, +) +from cdm_data_loaders.pipelines.ncbi_ftp_download import download_batch + +# Use same stable prefix as manifest tests +STABLE_PREFIX = "900" + + +def _manifest_for_one_assembly(tmp_path: Path) -> tuple[Path, str]: + """Create a transfer manifest containing exactly one FTP path. + + Returns ``(manifest_path, accession)`` for the first latest assembly + in the stable prefix range. 
+ """ + raw = download_assembly_summary(database="refseq") + full = parse_assembly_summary(raw) + filtered = filter_by_prefix_range(full, prefix_from=STABLE_PREFIX, prefix_to=STABLE_PREFIX) + diff = compute_diff(filtered, previous_assemblies=None) + + assert len(diff.new) > 0, f"No new assemblies in prefix {STABLE_PREFIX}" + + manifest_path = tmp_path / "transfer_manifest.txt" + write_transfer_manifest(diff, filtered, manifest_path) + + return manifest_path, diff.new[0] + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestDownloadSmallBatch: + """Download a single assembly from NCBI FTP and verify local output.""" + + def test_download_small_batch(self, tmp_path: Path) -> None: + """Download one assembly and verify directory structure and report.""" + manifest_path, _acc = _manifest_for_one_assembly(tmp_path) + + output_dir = tmp_path / "output" + output_dir.mkdir() + + report = download_batch( + manifest_path=str(manifest_path), + output_dir=str(output_dir), + threads=1, + limit=1, + ) + + assert report["succeeded"] >= 1 + assert report["failed"] == 0 + + # Verify directory structure exists + raw_data = output_dir / "raw_data" + assert raw_data.exists(), "Expected raw_data/ directory in output" + + # Should have at least one assembly directory with files + assembly_dirs = list(raw_data.rglob("GCF_*")) + assert len(assembly_dirs) > 0, "Expected at least one assembly directory" + + # Check for .md5 sidecar files + md5_files = list(raw_data.rglob("*.md5")) + assert len(md5_files) > 0, "Expected .md5 sidecar files" + + # Check download report + report_file = output_dir / "download_report.json" + assert report_file.exists() + with report_file.open() as f: + saved_report = json.load(f) + assert saved_report["succeeded"] >= 1 + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestDownloadResumeIncomplete: + """Verify download handles re-runs when some files are already present.""" + + def test_download_resume(self, tmp_path: Path) -> None: + 
"""Re-running download on the same manifest succeeds without errors.""" + manifest_path, _acc = _manifest_for_one_assembly(tmp_path) + + output_dir = tmp_path / "output" + output_dir.mkdir() + + # First download + report1 = download_batch( + manifest_path=str(manifest_path), + output_dir=str(output_dir), + threads=1, + limit=1, + ) + assert report1["succeeded"] >= 1 + + files_after_first = set(output_dir.rglob("*")) + + # Second download — same manifest + report2 = download_batch( + manifest_path=str(manifest_path), + output_dir=str(output_dir), + threads=1, + limit=1, + ) + + # Should succeed without errors (files overwritten or skipped) + assert report2["succeeded"] >= 1 + assert report2["failed"] == 0 + + # All original files should still exist + files_after_second = set(output_dir.rglob("*")) + assert files_after_first.issubset(files_after_second) diff --git a/tests/integration/test_full_pipeline.py b/tests/integration/test_full_pipeline.py new file mode 100644 index 00000000..b98bf342 --- /dev/null +++ b/tests/integration/test_full_pipeline.py @@ -0,0 +1,216 @@ +"""End-to-end tests for the full NCBI assembly pipeline (Phase 1 → 2 → 3). + +Exercises the entire flow: download summary from real NCBI FTP, compute diff, +download a single assembly, stage in MinIO, promote to final Lakehouse path. + +Marked ``integration`` and ``slow_test``; auto-skipped when MinIO is +unreachable. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + from pathlib import Path + +from cdm_data_loaders.ncbi_ftp.manifest import ( + AssemblyRecord, + compute_diff, + download_assembly_summary, + filter_by_prefix_range, + parse_assembly_summary, + write_removed_manifest, + write_transfer_manifest, + write_updated_manifest, +) +from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_PATH_PREFIX, promote_from_s3 +from cdm_data_loaders.pipelines.ncbi_ftp_download import download_batch + +from .conftest import get_object_metadata, list_all_keys, stage_files_to_minio + +STABLE_PREFIX = "900" +STAGING_PREFIX = "staging/run1/" +PATH_PREFIX = DEFAULT_PATH_PREFIX + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestFullPipelineSmallBatch: + """Run the complete pipeline for a single assembly: diff → download → promote.""" + + def test_full_pipeline_small_batch( + self, + minio_s3_client: object, + test_bucket: str, + tmp_path: Path, + ) -> None: + """Single assembly flows through all three phases into MinIO.""" + s3 = minio_s3_client + + # ── Phase 1: Manifest generation ──────────────────────────────── + raw = download_assembly_summary(database="refseq") + full = parse_assembly_summary(raw) + filtered = filter_by_prefix_range(full, prefix_from=STABLE_PREFIX, prefix_to=STABLE_PREFIX) + + diff = compute_diff(filtered, previous_assemblies=None) + assert len(diff.new) > 0, f"No new assemblies in prefix {STABLE_PREFIX}" + + manifest_path = tmp_path / "transfer_manifest.txt" + write_transfer_manifest(diff, filtered, manifest_path) + + # ── Phase 2: Download one assembly from real FTP ──────────────── + output_dir = tmp_path / "output" + output_dir.mkdir() + + report = download_batch( + manifest_path=str(manifest_path), + output_dir=str(output_dir), + threads=1, + limit=1, + ) + assert report["succeeded"] >= 1 + assert report["failed"] == 0 + + # ── Upload local output to MinIO staging 
──────────────────────── + keys = stage_files_to_minio(s3, test_bucket, output_dir, STAGING_PREFIX) + assert len(keys) > 0, "Expected files staged to MinIO" + + # ── Phase 3: Promote from staging to final path ───────────────── + promote_report = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + path_prefix=PATH_PREFIX, + ) + assert promote_report["promoted"] >= 1 + assert promote_report["failed"] == 0 + + # ── Verify final state ────────────────────────────────────────── + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) >= 1, "Expected files at final Lakehouse path" + + # At least one file should have MD5 metadata + has_md5 = False + for key in final_keys: + meta = get_object_metadata(s3, test_bucket, key) + if meta.get("md5"): + has_md5 = True + break + assert has_md5, "Expected at least one file with MD5 metadata" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestFullPipelineIncrementalSync: + """Run the pipeline twice to test incremental sync with archival.""" + + def test_full_pipeline_incremental( + self, + minio_s3_client: object, + test_bucket: str, + tmp_path: Path, + ) -> None: + """Second sync archives the old version and promotes the new one.""" + s3 = minio_s3_client + + # ── First sync: Phase 1 → 2 → 3 ──────────────────────────────── + raw = download_assembly_summary(database="refseq") + full = parse_assembly_summary(raw) + filtered = filter_by_prefix_range(full, prefix_from=STABLE_PREFIX, prefix_to=STABLE_PREFIX) + + diff1 = compute_diff(filtered, previous_assemblies=None) + assert len(diff1.new) > 0, f"No new assemblies in prefix {STABLE_PREFIX}" + + manifest1 = tmp_path / "transfer_manifest_1.txt" + write_transfer_manifest(diff1, filtered, manifest1) + + output1 = tmp_path / "output1" + output1.mkdir() + report1 = download_batch(str(manifest1), str(output1), threads=1, limit=1) + assert report1["succeeded"] >= 1 + + stage_files_to_minio(s3, test_bucket, output1, 
STAGING_PREFIX) + + # Upload manifest to MinIO for trimming + manifest_key = "ncbi/transfer_manifest.txt" + s3.upload_file(Filename=str(manifest1), Bucket=test_bucket, Key=manifest_key) + + promote1 = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + manifest_path=manifest_key, + path_prefix=PATH_PREFIX, + ) + assert promote1["promoted"] >= 1 + + first_sync_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(first_sync_keys) >= 1 + + # ── Second sync: Manufacture "previous" with a tweak ──────────── + # Treat first-sync state as "previous", but modify one assembly's + # seq_rel_date so it shows up as "updated". + previous: dict[str, AssemblyRecord] = {} + for acc, rec in filtered.items(): + previous[acc] = AssemblyRecord( + accession=rec.accession, + status=rec.status, + seq_rel_date=rec.seq_rel_date, + ftp_path=rec.ftp_path, + assembly_dir=rec.assembly_dir, + ) + + # Pick the first accession that was actually downloaded + downloaded_acc = diff1.new[0] + if downloaded_acc in previous: + previous[downloaded_acc].seq_rel_date = "1999/01/01" + + diff2 = compute_diff(filtered, previous_assemblies=previous) + + # The modified assembly should appear as "updated" + if downloaded_acc in previous: + assert downloaded_acc in diff2.updated, f"Expected {downloaded_acc} in updated list" + + manifest2 = tmp_path / "transfer_manifest_2.txt" + write_transfer_manifest(diff2, filtered, manifest2) + + updated_manifest = tmp_path / "updated_manifest.txt" + write_updated_manifest(diff2, updated_manifest) + + removed_manifest = tmp_path / "removed_manifest.txt" + write_removed_manifest(diff2, removed_manifest) + + # Phase 2 — re-download the updated assembly + output2 = tmp_path / "output2" + output2.mkdir() + report2 = download_batch(str(manifest2), str(output2), threads=1, limit=1) + assert report2["succeeded"] >= 1 + + # Clean staging and re-stage + staging_keys = list_all_keys(s3, test_bucket, STAGING_PREFIX) + for key in 
staging_keys: + s3.delete_object(Bucket=test_bucket, Key=key) + stage_files_to_minio(s3, test_bucket, output2, STAGING_PREFIX) + + # Phase 3 — promote with archival + promote2 = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + updated_manifest=str(updated_manifest), + ncbi_release="test-incremental", + path_prefix=PATH_PREFIX, + ) + assert promote2["failed"] == 0 + + # Verify archive exists + archive_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "archive/test-incremental/") + if promote2["archived"] > 0: + assert len(archive_keys) >= 1 + for key in archive_keys: + meta = get_object_metadata(s3, test_bucket, key) + assert meta.get("archive_reason") == "updated" + + # Final Lakehouse path should still have files + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) >= 1 diff --git a/tests/integration/test_manifest_e2e.py b/tests/integration/test_manifest_e2e.py new file mode 100644 index 00000000..a8c22c02 --- /dev/null +++ b/tests/integration/test_manifest_e2e.py @@ -0,0 +1,211 @@ +"""End-to-end tests for Phase 1 — manifest generation and diffing. + +These tests hit the real NCBI FTP server (with tight prefix filters) and +optionally use MinIO for checksum verification. Marked ``integration`` +and ``slow_test``; auto-skipped when MinIO is unreachable. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +from cdm_data_loaders.ncbi_ftp.assembly import FILE_FILTERS, FTP_HOST, build_accession_path, parse_md5_checksums_file +from cdm_data_loaders.ncbi_ftp.manifest import ( + AssemblyRecord, + compute_diff, + download_assembly_summary, + filter_by_prefix_range, + parse_assembly_summary, + verify_transfer_candidates, + write_diff_summary, + write_removed_manifest, + write_transfer_manifest, + write_updated_manifest, +) +from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_PATH_PREFIX +from cdm_data_loaders.utils.ftp_client import connect_ftp, ftp_retrieve_text + +if TYPE_CHECKING: + from pathlib import Path + +# Use a high-numbered prefix range that typically has only a handful of +# assemblies, keeping FTP traffic minimal. +STABLE_PREFIX = "900" + + +# ── Helpers ───────────────────────────────────────────────────────────── + + +def _download_and_filter() -> tuple[dict[str, AssemblyRecord], dict[str, AssemblyRecord]]: + """Download the current refseq summary and filter to the stable prefix range. + + Returns ``(full_parsed, filtered)``. 
+ """ + raw = download_assembly_summary(database="refseq") + full = parse_assembly_summary(raw) + filtered = filter_by_prefix_range(full, prefix_from=STABLE_PREFIX, prefix_to=STABLE_PREFIX) + return full, filtered + + +# ── Tests ─────────────────────────────────────────────────────────────── + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestFreshSyncNoPrevious: + """Phase 1 with no previous snapshot — everything is 'new'.""" + + def test_fresh_sync_no_previous(self, tmp_path: Path) -> None: + """All assemblies in range appear as new when there is no previous snapshot.""" + _full, filtered = _download_and_filter() + assert len(filtered) > 0, f"Expected assemblies in prefix {STABLE_PREFIX}" + + diff = compute_diff(filtered, previous_assemblies=None) + + # With no previous, every *latest* assembly is new + latest_count = sum(1 for r in filtered.values() if r.status == "latest") + assert len(diff.new) == latest_count + assert len(diff.updated) == 0 + assert len(diff.replaced) == 0 + assert len(diff.suppressed) == 0 + + # Write manifests + transfer_path = tmp_path / "transfer_manifest.txt" + removed_path = tmp_path / "removed_manifest.txt" + updated_path = tmp_path / "updated_manifest.txt" + summary_path = tmp_path / "diff_summary.json" + + paths = write_transfer_manifest(diff, filtered, transfer_path) + removed = write_removed_manifest(diff, removed_path) + updated = write_updated_manifest(diff, updated_path) + write_diff_summary(diff, summary_path, "refseq", STABLE_PREFIX, STABLE_PREFIX) + + assert len(paths) == latest_count + assert len(removed) == 0 + assert len(updated) == 0 + assert transfer_path.exists() + assert summary_path.exists() + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestIncrementalDiffSyntheticPrevious: + """Phase 1 incremental diff with a manufactured 'previous' snapshot.""" + + def test_incremental_diff(self, tmp_path: Path) -> None: + """Detects new, updated, replaced, and suppressed assemblies correctly.""" + 
_full, filtered = _download_and_filter() + latest = {a: r for a, r in filtered.items() if r.status == "latest"} + assert len(latest) >= 2, f"Need >=2 latest assemblies in prefix {STABLE_PREFIX}" # noqa: PLR2004 + + accs = sorted(latest.keys()) + + # Build synthetic previous: copy current, then mutate + previous: dict[str, AssemblyRecord] = {} + for acc, rec in filtered.items(): + previous[acc] = AssemblyRecord( + accession=rec.accession, + status=rec.status, + seq_rel_date=rec.seq_rel_date, + ftp_path=rec.ftp_path, + assembly_dir=rec.assembly_dir, + ) + + # Remove the first latest → should appear as "new" in diff + new_acc = accs[0] + del previous[new_acc] + + # Modify seq_rel_date of the second latest → should appear as "updated" + updated_acc = accs[1] + previous[updated_acc].seq_rel_date = "1999/01/01" + + # Add a fake accession to previous that is not in current → "suppressed" + fake_suppressed = "GCF_900999999.1" + previous[fake_suppressed] = AssemblyRecord( + accession=fake_suppressed, + status="latest", + seq_rel_date="2020/01/01", + ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/999/999/GCF_900999999.1_FakeAsm", + assembly_dir="GCF_900999999.1_FakeAsm", + ) + + diff = compute_diff(filtered, previous_assemblies=previous) + + assert new_acc in diff.new + assert updated_acc in diff.updated + assert fake_suppressed in diff.suppressed + + # Write and verify manifests + transfer_path = tmp_path / "transfer_manifest.txt" + removed_path = tmp_path / "removed_manifest.txt" + updated_path = tmp_path / "updated_manifest.txt" + + paths = write_transfer_manifest(diff, filtered, transfer_path) + removed = write_removed_manifest(diff, removed_path) + updated_list = write_updated_manifest(diff, updated_path) + + assert len(paths) >= 2 # noqa: PLR2004 # at least the new + updated + assert fake_suppressed in removed + assert updated_acc in updated_list + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestVerifyTransferCandidatesPrunes: + 
"""verify_transfer_candidates should prune assemblies already in the store.""" + + def test_prunes_existing_matching_md5( + self, + minio_s3_client: object, + test_bucket: str, + ) -> None: + """Assemblies with matching MD5 metadata in MinIO are pruned from the transfer list.""" + _full, filtered = _download_and_filter() + latest = {a: r for a, r in filtered.items() if r.status == "latest"} + if not latest: + pytest.skip(f"No latest assemblies in prefix {STABLE_PREFIX}") + + # Pick one assembly to pre-seed in MinIO with correct checksums + acc = next(iter(sorted(latest))) + rec = latest[acc] + ftp_dir = rec.ftp_path.replace("https://ftp.ncbi.nlm.nih.gov", "") + + # Fetch the real md5checksums.txt from FTP + ftp = connect_ftp(FTP_HOST) + try: + md5_text = ftp_retrieve_text(ftp, ftp_dir.rstrip("/") + "/md5checksums.txt") + finally: + ftp.quit() + + checksums = parse_md5_checksums_file(md5_text) + + # Seed MinIO with dummy files that have the right MD5 metadata + rel = build_accession_path(rec.assembly_dir) + s3 = minio_s3_client + path_prefix = DEFAULT_PATH_PREFIX + for fname, md5 in checksums.items(): + if any(fname.endswith(suffix) for suffix in FILE_FILTERS): + key = f"{path_prefix}{rel}{fname}" + s3.put_object( + Bucket=test_bucket, + Key=key, + Body=b"placeholder", + Metadata={"md5": md5}, + ) + + # verify_transfer_candidates should prune the seeded assembly + candidates = sorted(latest.keys()) + result = verify_transfer_candidates( + candidates, + filtered, + bucket=test_bucket, + path_prefix=path_prefix, + ) + + assert acc not in result, f"Expected {acc} to be pruned (MD5 matches)" + # Other candidates without seeded data should remain + remaining_candidates = [c for c in candidates if c != acc] + for c in remaining_candidates: + assert c in result, f"Expected {c} to remain (not seeded)" diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py new file mode 100644 index 00000000..10670598 --- /dev/null +++ 
b/tests/integration/test_promote_e2e.py @@ -0,0 +1,320 @@ +"""End-to-end tests for Phase 3 — promote and archive in MinIO. + +Pre-stages fake assembly files in MinIO and exercises ``promote_from_s3`` +with various combinations of manifests, archive operations, dry-run mode, +manifest trimming, and incomplete staging. + +Marked ``integration`` and ``slow_test``; auto-skipped when MinIO is +unreachable. Each test method gets its own bucket. +""" + +from __future__ import annotations + +import hashlib +from typing import TYPE_CHECKING + +import pytest + +from cdm_data_loaders.ncbi_ftp.assembly import build_accession_path +from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_PATH_PREFIX, promote_from_s3 + +from .conftest import get_object_metadata, list_all_keys, seed_lakehouse + +if TYPE_CHECKING: + from pathlib import Path + +# Fake assembly details used across tests +ACCESSION_A = "GCF_900000001.1" +ASSEMBLY_DIR_A = "GCF_900000001.1_FakeAssemblyA" +ACCESSION_B = "GCF_900000002.1" +ASSEMBLY_DIR_B = "GCF_900000002.1_FakeAssemblyB" +ACCESSION_C = "GCF_900000003.1" +ASSEMBLY_DIR_C = "GCF_900000003.1_FakeAssemblyC" + +STAGING_PREFIX = "staging/run1/" +PATH_PREFIX = DEFAULT_PATH_PREFIX + +# Fake file contents for staging +FAKE_GENOMIC = b">seq1\nATCGATCG\n" +FAKE_PROTEIN = b">prot1\nMKKL\n" + + +def _md5(data: bytes) -> str: + return hashlib.md5(data).hexdigest() # noqa: S324 + + +def _stage_assembly( + s3: object, + bucket: str, + assembly_dir: str, +) -> None: + """Stage a fake assembly with data files and .md5 sidecars under the staging prefix.""" + rel = build_accession_path(assembly_dir) + base = f"{STAGING_PREFIX}{rel}" + + files = { + f"{assembly_dir}_genomic.fna.gz": FAKE_GENOMIC, + f"{assembly_dir}_protein.faa.gz": FAKE_PROTEIN, + } + + for fname, content in files.items(): + key = f"{base}{fname}" + s3.put_object(Bucket=bucket, Key=key, Body=content) + # Write .md5 sidecar + md5_key = f"{key}.md5" + s3.put_object(Bucket=bucket, Key=md5_key, 
Body=_md5(content).encode()) + + +def _write_manifest(tmp_path: Path, accessions: list[str], name: str) -> Path: + """Write a manifest file (one accession per line).""" + path = tmp_path / name + path.write_text("\n".join(accessions) + "\n") + return path + + +# ── Tests ─────────────────────────────────────────────────────────────── + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteFromStaging: + """Promote staged files to final Lakehouse paths.""" + + def test_promote_from_staging(self, minio_s3_client: object, test_bucket: str) -> None: + """Staged files appear at the final Lakehouse path with MD5 metadata.""" + s3 = minio_s3_client + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + + report = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + path_prefix=PATH_PREFIX, + ) + + assert report["promoted"] >= 2 # noqa: PLR2004 # genomic + protein + assert report["failed"] == 0 + assert report["dry_run"] is False + + # Verify files at final path + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) >= 2 # noqa: PLR2004 + + # Verify MD5 metadata is set + for key in final_keys: + meta = get_object_metadata(s3, test_bucket, key) + assert "md5" in meta, f"Missing md5 metadata on {key}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteIdempotent: + """Promoting the same staging data twice should succeed without errors.""" + + def test_promote_idempotent(self, minio_s3_client: object, test_bucket: str) -> None: + """Second promote succeeds and produces the same final state.""" + s3 = minio_s3_client + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + + report1 = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + path_prefix=PATH_PREFIX, + ) + keys_after_first = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + + report2 = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + path_prefix=PATH_PREFIX, + ) + 
keys_after_second = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + + assert report1["failed"] == 0 + assert report2["failed"] == 0 + assert report2["promoted"] >= 1 + assert keys_after_first == keys_after_second + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteArchiveUpdated: + """Archive existing assemblies before overwriting with updated versions.""" + + def test_archive_updated(self, minio_s3_client: object, test_bucket: str, tmp_path: Path) -> None: + """Updated assemblies are archived before being overwritten.""" + s3 = minio_s3_client + + # Seed "old" version at the final Lakehouse path + old_files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": "old genomic content", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": "old protein content", + } + seed_lakehouse(s3, test_bucket, ACCESSION_A, old_files, PATH_PREFIX, ASSEMBLY_DIR_A) + + # Stage "new" version + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + + updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") + + report = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + updated_manifest=str(updated_manifest), + ncbi_release="2024-01", + path_prefix=PATH_PREFIX, + ) + + assert report["archived"] >= 2 # noqa: PLR2004 + assert report["promoted"] >= 2 # noqa: PLR2004 + assert report["failed"] == 0 + + # Verify archive exists + archive_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "archive/2024-01/") + assert len(archive_keys) >= 2 # noqa: PLR2004 + + # Verify archive metadata + for key in archive_keys: + meta = get_object_metadata(s3, test_bucket, key) + assert meta.get("archive_reason") == "updated" + assert meta.get("ncbi_last_release") == "2024-01" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteArchiveRemoved: + """Archive and delete replaced/suppressed assemblies.""" + + def test_archive_removed(self, minio_s3_client: object, test_bucket: str, tmp_path: Path) -> None: + """Removed assemblies are archived 
and source objects are deleted.""" + s3 = minio_s3_client + + # Seed assemblies at final path + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": "content to archive", + } + seed_lakehouse(s3, test_bucket, ACCESSION_A, files, PATH_PREFIX, ASSEMBLY_DIR_A) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + + # Stage something (even empty staging is fine — promote won't find data files for this accession) + report = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + removed_manifest=str(removed_manifest), + ncbi_release="2024-01", + path_prefix=PATH_PREFIX, + ) + + assert report["archived"] >= 1 + assert report["failed"] == 0 + + # Verify archive exists + archive_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "archive/2024-01/") + assert len(archive_keys) >= 1 + + # Verify archive metadata + for key in archive_keys: + meta = get_object_metadata(s3, test_bucket, key) + assert meta.get("archive_reason") == "replaced_or_suppressed" + + # Verify source objects are deleted + rel = build_accession_path(ASSEMBLY_DIR_A) + source_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + rel) + assert len(source_keys) == 0, f"Expected source objects deleted, found: {source_keys}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteDryRun: + """Dry-run mode should not create any objects.""" + + def test_promote_dry_run(self, minio_s3_client: object, test_bucket: str) -> None: + """Dry-run logs actions but creates no objects at the final path.""" + s3 = minio_s3_client + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + + report = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + path_prefix=PATH_PREFIX, + dry_run=True, + ) + + assert report["dry_run"] is True + assert report["promoted"] >= 1 + + # No objects should exist at the final path + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) == 0, f"Dry-run should not create objects, 
found: {final_keys}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteTrimsManifest: + """Manifest trimming removes promoted accessions.""" + + def test_trims_manifest(self, minio_s3_client: object, test_bucket: str, tmp_path: Path) -> None: + """Transfer manifest in MinIO is trimmed to exclude promoted accessions.""" + s3 = minio_s3_client + + # Upload a transfer manifest with 3 entries to MinIO + manifest_key = "ncbi/transfer_manifest.txt" + manifest_lines = [ + "/genomes/all/GCF/900/000/001/GCF_900000001.1_FakeAssemblyA/\n", + "/genomes/all/GCF/900/000/002/GCF_900000002.1_FakeAssemblyB/\n", + "/genomes/all/GCF/900/000/003/GCF_900000003.1_FakeAssemblyC/\n", + ] + s3.put_object(Bucket=test_bucket, Key=manifest_key, Body="".join(manifest_lines).encode()) + + # Stage only assemblies A and B (not C) + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_B) + + report = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + manifest_path=manifest_key, + path_prefix=PATH_PREFIX, + ) + + assert report["failed"] == 0 + + # Read back the manifest from MinIO + resp = s3.get_object(Bucket=test_bucket, Key=manifest_key) + remaining = resp["Body"].read().decode() + remaining_lines = [line.strip() for line in remaining.strip().splitlines() if line.strip()] + + # Only C should remain (A and B were promoted) + assert len(remaining_lines) == 1, f"Expected 1 remaining entry, got {len(remaining_lines)}: {remaining_lines}" + assert "GCF_900000003" in remaining_lines[0] + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteIncompleteStaging: + """Incomplete staging (sidecar only, no data) should not promote anything.""" + + def test_incomplete_staging(self, minio_s3_client: object, test_bucket: str) -> None: + """Only .md5 sidecars staged → nothing promoted.""" + s3 = minio_s3_client + + # Stage only .md5 sidecars (no data files) + rel = build_accession_path(ASSEMBLY_DIR_A) + base 
= f"{STAGING_PREFIX}{rel}" + fname = f"{ASSEMBLY_DIR_A}_genomic.fna.gz" + md5_key = f"{base}{fname}.md5" + s3.put_object(Bucket=test_bucket, Key=md5_key, Body=_md5(FAKE_GENOMIC).encode()) + + report = promote_from_s3( + staging_prefix=STAGING_PREFIX, + bucket=test_bucket, + path_prefix=PATH_PREFIX, + ) + + # .md5 files are sidecars and should not be promoted as data + assert report["promoted"] == 0 + assert report["failed"] == 0 + + # No objects at final path + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) == 0 From f4d838ad067fa22ae89dd3382dfdc13138f3679d Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Fri, 17 Apr 2026 09:20:49 -0700 Subject: [PATCH 03/76] add NCBI end-to-end testing instructions --- README.md | 2 +- docs/ncbi_ftp_e2e_walkthrough.md | 355 ++++++++++++++++++++++++++++++ notebooks/ncbi_ftp_manifest.ipynb | 33 +++ tests/integration/conftest.py | 24 +- 4 files changed, 390 insertions(+), 24 deletions(-) create mode 100644 docs/ncbi_ftp_e2e_walkthrough.md diff --git a/README.md b/README.md index 2273f299..6d6e643e 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,7 @@ docker run -d \ -p 9001:9001 \ -e MINIO_ROOT_USER=minioadmin \ -e MINIO_ROOT_PASSWORD=minioadmin \ - minio/minio server /data --console-address ":9001" + minio/minio:RELEASE.2025-02-28T09-55-16Z server /data --console-address ":9001" ``` **2. Run the integration tests:** diff --git a/docs/ncbi_ftp_e2e_walkthrough.md b/docs/ncbi_ftp_e2e_walkthrough.md new file mode 100644 index 00000000..733345be --- /dev/null +++ b/docs/ncbi_ftp_e2e_walkthrough.md @@ -0,0 +1,355 @@ +# NCBI FTP Pipeline — Local End-to-End Walkthrough + +Step-by-step instructions for running a small (≤ 10 assembly) end-to-end sync +of NCBI RefSeq records against a local MinIO container. The walkthrough uses +the two existing Jupyter notebooks for Phases 1 and 3, and the project's Docker +image for the Phase 2 download step. 
+ +> **Prerequisites:** +> - Docker or Podman +> - [uv](https://docs.astral.sh/uv/) (for running notebooks locally) +> - Network access to `ftp.ncbi.nlm.nih.gov` + +--- + +## Architecture overview + +``` + Phase 1 (notebook) Phase 2 (container) Phase 3 (notebook) +┌────────────────────┐ ┌───────────────────────┐ ┌──────────────────────┐ +│ Manifest notebook │ │ ncbi_ftp_sync CLI │ │ Promote notebook │ +│ ─ download FTP │────▶│ ─ read manifest │────▶│ ─ promote staged │ +│ assembly summary │ │ ─ parallel FTP DL │ │ files to Lakehouse │ +│ ─ diff against │ │ ─ MD5 verify │ │ ─ archive old ver. │ +│ previous │ │ ─ write .md5 sidecars │ │ ─ trim manifest │ +│ ─ write manifests │ └──────────┬────────────┘ └──────────────────────┘ +└────────────────────┘ │ + local volume + mounted into + the container + │ + ▼ + ┌────────────────────┐ + │ MinIO (S3-compat) │ + │ localhost:9000 │ + └────────────────────┘ +``` + +--- + +## 1. Start MinIO + +```sh +docker run -d \ + --name minio \ + -p 9000:9000 \ + -p 9001:9001 \ + -e MINIO_ROOT_USER=minioadmin \ + -e MINIO_ROOT_PASSWORD=minioadmin \ + minio/minio:RELEASE.2025-02-28T09-55-16Z server /data --console-address ":9001" +``` + +Create a test bucket via the [MinIO console](http://localhost:9001) +(login: `minioadmin` / `minioadmin`), or from the command line: + +```sh +# Using the AWS CLI (or `mc` if installed) +aws --endpoint-url http://localhost:9000 \ + s3 mb s3://cdm-lake \ + --no-sign-request 2>/dev/null || true + +# Alternatively, using the MinIO client: +# mc alias set local http://localhost:9000 minioadmin minioadmin +# mc mb local/cdm-lake +``` + +--- + +## 2. Phase 1 — Generate manifests (notebook) + +Open `notebooks/ncbi_ftp_manifest.ipynb` in JupyterLab or VS Code. 
+ +### Constants to change (Cell 3) + +| Constant | Walkthrough value | Why | +|-----------------------|----------------------------------|---------------------------------------------------------| +| `DATABASE` | `"refseq"` | keep as-is | +| `PREFIX_FROM` | `"900"` | high-numbered prefix → few assemblies, fast diffing | +| `PREFIX_TO` | `"900"` | single prefix bucket | +| `LIMIT` | `10` | cap to 10 assemblies | +| `PREVIOUS_SUMMARY_S3` | `None` | first run — everything is "new" | +| `SNAPSHOT_UPLOAD_S3` | `None` | skip S3 upload for local testing | +| `OUTPUT_DIR` | `Path("output")` | keep as-is (local directory) | + +### Run the notebook + +Execute all cells in order. After Cell 7 finishes you should see files in +`output/`: + +``` +output/ + transfer_manifest.txt # ≤ 10 FTP directory paths + removed_manifest.txt # empty on first run + updated_manifest.txt # empty on first run + diff_summary.json # counts of new/updated/replaced/suppressed +``` + +Inspect `transfer_manifest.txt` — each line is an FTP directory path like: + +``` +/genomes/all/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly +``` + +### Optional: upload manifests to S3 for CTS + +Cell 7 optionally uploads the manifests to an S3 staging prefix so that CTS +can stage them into the container. For local testing, set +`STAGING_S3_PREFIX = None` (the default) and copy the manifest manually in +Step 3b below. + +If you are testing against MinIO and want to exercise the S3 upload path: + +```python +STAGING_S3_PREFIX = "s3://cdm-lake/staging/run1/" +``` + +> **Tip:** If you re-run later with `PREVIOUS_SUMMARY_S3` pointing at a +> snapshot from a prior run you will see `updated`, `replaced`, and +> `suppressed` entries in the diff. + +--- + +## 3. Phase 2 — Download assemblies (container) + +Phase 2 uses the `ncbi_ftp_sync` CLI, which is the container's built-in entry +point for parallel FTP downloads. + +> **CTS (CDM Task Service):** In production, Phase 2 runs as a CTS job. 
+> CTS stages input files from S3 into the container's filesystem mount +> (`/input_dir`) and copies container output back to S3 (`/output_dir`). +> The container itself never receives S3 credentials. +> See [cdm-task-service](https://github.com/kbase/cdm-task-service) for details. + +For local testing without a CTS instance we run the container directly with +Docker (or Podman), mounting the manifest produced in Phase 1 as input and a +local staging directory as output. + +### 3a. Build the container image + +```sh +# From the repository root +docker build -t cdm-data-loaders . +``` + +### 3b. Prepare local directories + +```sh +mkdir -p staging +cp output/transfer_manifest.txt staging/ +``` + +### 3c. Run the download + +```sh +docker run --rm \ + -v "$(pwd)/staging:/input:ro" \ + -v "$(pwd)/staging:/output" \ + cdm-data-loaders ncbi_ftp_sync \ + --manifest /input/transfer_manifest.txt \ + --output-dir /output \ + --threads 2 \ + --limit 10 +``` + +| Flag | Purpose | +|-----------------|-----------------------------------------------------------| +| `--manifest` | Path to the transfer manifest inside the container | +| `--output-dir` | Where downloads land (mounted from host `staging/`) | +| `--threads` | Parallel FTP connections (2 is polite for testing) | +| `--limit` | Redundant safety cap (already limited in Phase 1) | + +After the container exits, `staging/` will contain: + +``` +staging/ + raw_data/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly/ + GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz + GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz.md5 + GCF_900000615.1_PRJEB7657_assembly_protein.faa.gz + GCF_900000615.1_PRJEB7657_assembly_protein.faa.gz.md5 + ... + download_report.json +``` + +Each data file has a `.md5` sidecar containing the hex digest verified against +the FTP server's `md5checksums.txt`. 
+ +> **Without Docker:** You can also run the CLI directly if you have the project +> installed locally: +> +> ```sh +> uv run ncbi_ftp_sync \ +> --manifest output/transfer_manifest.txt \ +> --output-dir staging \ +> --threads 2 --limit 10 +> ``` + +### 3d. Upload staged files to MinIO + +The download step writes to the local filesystem. To feed Phase 3 we need +to upload the staged files into MinIO under a staging prefix: + +```sh +aws --endpoint-url http://localhost:9000 \ + s3 cp staging/raw_data/ s3://cdm-lake/staging/run1/raw_data/ \ + --recursive \ + --no-sign-request +``` + +Verify the upload: + +```sh +aws --endpoint-url http://localhost:9000 \ + s3 ls s3://cdm-lake/staging/run1/ \ + --recursive --no-sign-request | head -20 +``` + +--- + +## 4. Phase 3 — Promote & archive (notebook) + +Open `notebooks/ncbi_ftp_promote.ipynb`. + +### Constants to change (Cell 3) + +| Constant | Walkthrough value | Why | +|---------------------|------------------------------------------------------|---------------------------------------------| +| `BUCKET` | `"cdm-lake"` | matches the bucket created in Step 1 | +| `STAGING_PREFIX` | `"staging/run1/"` | matches the upload prefix from Step 3d | +| `REMOVED_MANIFEST` | `None` | nothing to remove on first run | +| `UPDATED_MANIFEST` | `None` | nothing to archive on first run | +| `NCBI_RELEASE` | `None` | no release tag needed for local testing | +| `MANIFEST_S3_KEY` | `None` | skip manifest trimming | +| `PATH_PREFIX` | `"tenant-general-warehouse/kbase/datasets/ncbi/"` | keep default | +| `DRY_RUN` | `True` | **start with dry-run!** | + +### Initialise the S3 client for MinIO + +The notebook calls `get_s3_client()` which, by default, tries to import +credentials from `berdl_notebook_utils`. For local MinIO you need to +initialise the client manually **before** running Cell 4. 
Insert a new cell +after Cell 2 (Imports) with: + +```python +from cdm_data_loaders.utils.s3 import get_s3_client, reset_s3_client + +reset_s3_client() # clear any cached client +get_s3_client({ + "endpoint_url": "http://localhost:9000", + "aws_access_key_id": "minioadmin", + "aws_secret_access_key": "minioadmin", +}) +``` + +### Run the notebook + +1. Execute all cells. With `DRY_RUN = True` the promote step will log what it + *would* do without moving any objects. +2. Review the report in Cell 6. +3. If the dry-run looks correct, set `DRY_RUN = False` in Cell 3 and re-run + from Cell 5. + +After promotion the final Lakehouse layout in MinIO will look like: + +``` +cdm-lake/ + tenant-general-warehouse/kbase/datasets/ncbi/ + raw_data/GCF/900/000/615/GCF_900000615.1_.../ + GCF_900000615.1_..._genomic.fna.gz (with md5 in user metadata) + GCF_900000615.1_..._protein.faa.gz + ... +``` + +--- + +## 5. Inspect results in MinIO + +Browse the [MinIO console](http://localhost:9001) or use the CLI: + +```sh +# List final Lakehouse objects +aws --endpoint-url http://localhost:9000 \ + s3 ls s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/raw_data/ \ + --recursive --no-sign-request | head -20 + +# Check user metadata (md5) on a specific object +aws --endpoint-url http://localhost:9000 \ + s3api head-object \ + --bucket cdm-lake \ + --key "tenant-general-warehouse/kbase/datasets/ncbi/raw_data/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly/GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz" \ + --no-sign-request | jq .Metadata +``` + +--- + +## 6. Incremental run (second sync) + +To exercise the diff/update/archive logic, repeat the pipeline with a +previous snapshot: + +1. **Phase 1:** Set `PREVIOUS_SUMMARY_S3` to an S3 path where you upload the + raw summary from the first run, or save the `raw_summary` string from Cell 4 + to a local file and pass it via `parse_assembly_summary(Path("prev.txt"))`. +2. 
**Phase 1:** The diff will now show `updated`, `replaced`, and + `suppressed` entries (if any changed between runs). +3. **Phase 2:** Download the new manifest. +4. **Phase 3:** Set `REMOVED_MANIFEST` and `UPDATED_MANIFEST` to the paths + from Phase 1. Updated assemblies will be archived before overwrite; + removed assemblies will be archived and deleted. + +--- + +## 7. Cleanup + +```sh +# Stop and remove MinIO +docker stop minio && docker rm minio + +# Remove local staging data +rm -rf staging/ output/ +``` + +--- + +## Troubleshooting + +| Symptom | Cause | Fix | +|---------|-------|-----| +| `berdl_notebook_utils` import error in notebook | Missing local MinIO client init | Add the `get_s3_client({...})` cell described in Step 4 | +| `connect_ftp() timeout` | NCBI FTP may be slow or rate-limited | Retry; reduce `--threads` to 1 | +| `CRC64NVME` errors uploading to MinIO | MinIO version too old (needs ≥ `2025-02-07`) | Pin to `minio/minio:RELEASE.2025-02-28T09-55-16Z` or newer | +| Phase 3 shows 0 promoted | Staging prefix doesn't match or bucket is wrong | Verify `STAGING_PREFIX` matches the S3 upload path from Step 3d | +| Container can't reach FTP | Docker network isolation | Use `--network host` or ensure DNS resolution works inside the container | + +--- + +## Reference: file filters + +Phase 2 downloads only files matching these suffixes (defined in +`cdm_data_loaders.ncbi_ftp.assembly.FILE_FILTERS`): + +| Suffix | Content | +|--------|---------| +| `_genomic.fna.gz` | Genome nucleotide sequences | +| `_genomic.gff.gz` | Genome annotations (GFF3) | +| `_protein.faa.gz` | Protein sequences | +| `_gene_ontology.gaf.gz` | GO annotations | +| `_assembly_report.txt` | Assembly metadata | +| `_assembly_stats.txt` | Assembly statistics | +| `_assembly_regions.txt` | Assembly regions | +| `_ani_contam_ranges.tsv` | ANI contamination ranges | +| `_gene_expression_counts.txt.gz` | Gene expression counts | +| `_normalized_gene_expression_counts.txt.gz` | Normalised 
expression counts | + +Plus the per-assembly `md5checksums.txt` which is always downloaded for +integrity verification. diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 0cbb5295..3c22c072 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -198,6 +198,39 @@ "else:\n", " print(\"Skipping S3 snapshot upload (SNAPSHOT_UPLOAD_S3 not set)\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "112c497a", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Upload manifests to S3 for CTS input staging (optional).\"\"\"\n", + "\n", + "# S3 prefix where CTS will read input files from.\n", + "# Set to None to skip upload (local-only testing).\n", + "STAGING_S3_PREFIX: str | None = None # e.g. \"s3://cdm-lake/staging/run1/\"\n", + "\n", + "if STAGING_S3_PREFIX:\n", + " from cdm_data_loaders.utils.s3 import upload_file_with_metadata\n", + "\n", + " s3 = get_s3_client()\n", + " bucket, prefix = split_s3_path(STAGING_S3_PREFIX)\n", + " prefix = prefix.rstrip(\"/\") + \"/\"\n", + "\n", + " for manifest in [\"transfer_manifest.txt\", \"removed_manifest.txt\",\n", + " \"updated_manifest.txt\", \"diff_summary.json\"]:\n", + " local_path = OUTPUT_DIR / manifest\n", + " if local_path.exists():\n", + " key = f\"{prefix}{manifest}\"\n", + " upload_file_with_metadata(s3, str(local_path), bucket, key)\n", + " print(f\"Uploaded {manifest} -> s3://{bucket}/{key}\")\n", + "\n", + " print(f\"\\nManifests staged for CTS at s3://{bucket}/{prefix}\")\n", + "else:\n", + " print(\"Skipping S3 manifest upload (STAGING_S3_PREFIX not set)\")\n" + ] } ], "metadata": { diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 1d90faa6..33e53519 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -8,12 +8,11 @@ from __future__ import annotations -import functools import hashlib import os import re from pathlib import Path -from typing import 
TYPE_CHECKING, Any +from typing import Any from unittest.mock import patch import boto3 @@ -26,9 +25,6 @@ from cdm_data_loaders.ncbi_ftp.assembly import build_accession_path from cdm_data_loaders.utils.s3 import reset_s3_client -if TYPE_CHECKING: - from collections.abc import Callable - # ── MinIO connection defaults ─────────────────────────────────────────── MINIO_ENDPOINT_URL = os.environ.get("MINIO_ENDPOINT_URL", "http://localhost:9000") @@ -72,20 +68,6 @@ def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item item.add_marker(skip_marker) -# ── CRC64NVME workaround ─────────────────────────────────────────────── - - -def strip_checksum_algorithm(method: Callable) -> Callable: - """Wrap a boto3 S3 method to remove ``ChecksumAlgorithm`` if unsupported.""" - - @functools.wraps(method) - def wrapper(*args: object, **kwargs: object) -> object: - kwargs.pop("ChecksumAlgorithm", None) # type: ignore[arg-type] - return method(*args, **kwargs) - - return wrapper - - # ── Fixtures ──────────────────────────────────────────────────────────── @@ -103,10 +85,6 @@ def minio_s3_client() -> botocore.client.BaseClient: aws_secret_access_key=MINIO_SECRET_KEY, ) - # MinIO may not support CRC64NVME — strip to be safe - client.upload_file = strip_checksum_algorithm(client.upload_file) # type: ignore[method-assign] - client.copy_object = strip_checksum_algorithm(client.copy_object) # type: ignore[method-assign] - reset_s3_client() with ( patch.object(s3_utils, "get_s3_client", return_value=client), From 5c6c547e1c58251a78c4d0c518038147286bc068 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Fri, 17 Apr 2026 14:21:03 -0700 Subject: [PATCH 04/76] debug and cleanup --- Dockerfile | 4 +- docs/ncbi_ftp_e2e_walkthrough.md | 185 ++++++++++++------ notebooks/ncbi_ftp_manifest.ipynb | 153 +++++++++++---- notebooks/ncbi_ftp_promote.ipynb | 98 +++++++--- scripts/s3_local.py | 127 ++++++++++++ src/cdm_data_loaders/ncbi_ftp/manifest.py | 41 ++-- 
src/cdm_data_loaders/ncbi_ftp/promote.py | 73 ++++--- .../pipelines/ncbi_ftp_download.py | 2 +- tests/integration/test_full_pipeline.py | 20 +- tests/integration/test_manifest_e2e.py | 6 +- tests/integration/test_promote_e2e.py | 42 ++-- tests/ncbi_ftp/test_manifest.py | 47 ++++- tests/ncbi_ftp/test_notebooks.py | 4 +- tests/ncbi_ftp/test_promote.py | 34 ++-- tests/pipelines/test_ncbi_ftp_download.py | 6 +- 15 files changed, 612 insertions(+), 230 deletions(-) create mode 100755 scripts/s3_local.py diff --git a/Dockerfile b/Dockerfile index 35a2085c..a218a8bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -54,7 +54,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Place executables in the environment at the front of the path ENV PATH="/app/.venv/bin:$PATH" -COPY --chmod=+x ./scripts/entrypoint.sh /app/ +RUN chmod +x ./scripts/entrypoint.sh # Use the non-root user to run our application USER nonroot -ENTRYPOINT ["./entrypoint.sh"] +ENTRYPOINT ["./scripts/entrypoint.sh"] diff --git a/docs/ncbi_ftp_e2e_walkthrough.md b/docs/ncbi_ftp_e2e_walkthrough.md index 733345be..40b467a7 100644 --- a/docs/ncbi_ftp_e2e_walkthrough.md +++ b/docs/ncbi_ftp_e2e_walkthrough.md @@ -24,15 +24,54 @@ image for the Phase 2 download step. │ previous │ │ ─ write .md5 sidecars │ │ ─ trim manifest │ │ ─ write manifests │ └──────────┬────────────┘ └──────────────────────┘ └────────────────────┘ │ - local volume - mounted into - the container - │ - ▼ - ┌────────────────────┐ - │ MinIO (S3-compat) │ - │ localhost:9000 │ - └────────────────────┘ + local volume + mounted into + the container +``` + +--- + +## Path anatomy + +All S3 paths in this pipeline compose from a small set of variables. +Understanding this decomposition is the key to configuring the notebooks. 
+ +### Path formats used + +| Format | Example | Description | +|--------|---------|-------------| +| **s3:// URI** | `s3://cdm-lake/staging/run1/` | Full URI with scheme + bucket + key | +| **bucket name** | `cdm-lake` | Just the bucket, no scheme | +| **S3 key prefix** | `tenant-general-warehouse/kbase/datasets/ncbi/` | Path within a bucket (no scheme, no bucket) | +| **S3 object key** | `staging/transfer_manifest.txt` | Single object key within a bucket | +| **local path** | `output/removed_manifest.txt` | Filesystem path on the host | + +### Lakehouse object (final location) + +``` +s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{filename} + └── bucket ──┘ └── key prefix ──────┘└── build_accession_path() ────────────────────────┘ +``` + +Example: +``` +s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/raw_data/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly/GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz +``` + +### Staging object (Phase 2 output) + +``` +s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{filename} + └── bucket ──┘ └── key prefix ────┘└── build_accession_path() ────────────────────────┘ +``` + +### Local output (Phase 1) + +``` +{OUTPUT_DIR}/transfer_manifest.txt +{OUTPUT_DIR}/removed_manifest.txt +{OUTPUT_DIR}/updated_manifest.txt +{OUTPUT_DIR}/diff_summary.json ``` --- @@ -50,17 +89,12 @@ docker run -d \ ``` Create a test bucket via the [MinIO console](http://localhost:9001) -(login: `minioadmin` / `minioadmin`), or from the command line: +(login: `minioadmin` / `minioadmin`), or from the command line using the +included `scripts/s3_local.py` helper (requires no extra installs — only +`boto3` which is already a project dependency): ```sh -# Using the AWS CLI (or `mc` if installed) -aws --endpoint-url http://localhost:9000 \ - s3 mb s3://cdm-lake \ - --no-sign-request 2>/dev/null || true - -# Alternatively, using the MinIO client: -# mc alias 
set local http://localhost:9000 minioadmin minioadmin -# mc mb local/cdm-lake +uv run python scripts/s3_local.py mb s3://cdm-lake ``` --- @@ -71,15 +105,40 @@ Open `notebooks/ncbi_ftp_manifest.ipynb` in JupyterLab or VS Code. ### Constants to change (Cell 3) -| Constant | Walkthrough value | Why | -|-----------------------|----------------------------------|---------------------------------------------------------| -| `DATABASE` | `"refseq"` | keep as-is | -| `PREFIX_FROM` | `"900"` | high-numbered prefix → few assemblies, fast diffing | -| `PREFIX_TO` | `"900"` | single prefix bucket | -| `LIMIT` | `10` | cap to 10 assemblies | -| `PREVIOUS_SUMMARY_S3` | `None` | first run — everything is "new" | -| `SNAPSHOT_UPLOAD_S3` | `None` | skip S3 upload for local testing | -| `OUTPUT_DIR` | `Path("output")` | keep as-is (local directory) | +| Constant | Walkthrough value | Format | Why | +|-----------------------|----------------------------------|--------|---------------------------------------------------------| +| `DATABASE` | `"refseq"` | string | keep as-is | +| `PREFIX_FROM` | `"900"` | string | high-numbered prefix → few assemblies, fast diffing | +| `PREFIX_TO` | `"900"` | string | single prefix bucket | +| `LIMIT` | `10` | int | cap to 10 assemblies | +| `PREVIOUS_SUMMARY_URI` | `None` | s3:// URI | first run — everything is "new" | +| `SNAPSHOT_UPLOAD_URI` | `None` | s3:// URI | skip S3 upload for local testing | +| `STORE_BUCKET` | `"cdm-lake"` (or `None`) | bucket name | set to prune assemblies already in the Lakehouse | +| `STORE_KEY_PREFIX` | `"tenant-general-warehouse/kbase/datasets/ncbi/"` | S3 key prefix | default Lakehouse path prefix | +| `OUTPUT_DIR` | `Path("output")` | local path | keep as-is (local directory) | + +### Initialise the S3 client for MinIO + +If you set `PREVIOUS_SUMMARY_URI`, `SNAPSHOT_UPLOAD_URI`, `STORE_BUCKET`, +or `STAGING_URI` to point at your local MinIO, you must initialise +the S3 client **before** running the cells that use 
them. Insert a new cell +after Cell 1 (Imports) with: + +```python +from cdm_data_loaders.utils.s3 import get_s3_client, reset_s3_client + +reset_s3_client() +get_s3_client({ + "endpoint_url": "http://localhost:9000", + "aws_access_key_id": "minioadmin", + "aws_secret_access_key": "minioadmin", +}) +``` + +If all four of these S3 variables are `None` (purely local testing), this cell can +be skipped — though on repeat runs you should set `STORE_BUCKET` so +assemblies already promoted to the Lakehouse are pruned from the transfer +manifest. ### Run the notebook @@ -104,16 +163,16 @@ Inspect `transfer_manifest.txt` — each line is an FTP directory path like: Cell 7 optionally uploads the manifests to an S3 staging prefix so that CTS can stage them into the container. For local testing, set -`STAGING_S3_PREFIX = None` (the default) and copy the manifest manually in +`STAGING_URI = None` (the default) and copy the manifest manually in Step 3b below. If you are testing against MinIO and want to exercise the S3 upload path: ```python -STAGING_S3_PREFIX = "s3://cdm-lake/staging/run1/" +STAGING_URI = "s3://cdm-lake/staging/run1/" ``` -> **Tip:** If you re-run later with `PREVIOUS_SUMMARY_S3` pointing at a +> **Tip:** If you re-run later with `PREVIOUS_SUMMARY_URI` pointing at a > snapshot from a prior run you will see `updated`, `replaced`, and > `suppressed` entries in the diff. @@ -144,16 +203,17 @@ docker build -t cdm-data-loaders . ### 3b. Prepare local directories ```sh -mkdir -p staging -cp output/transfer_manifest.txt staging/ +mkdir -p notebooks/staging +cp notebooks/output/transfer_manifest.txt notebooks/staging/ ``` ### 3c.
Run the download ```sh docker run --rm \ - -v "$(pwd)/staging:/input:ro" \ - -v "$(pwd)/staging:/output" \ + --userns=keep-id \ + -v "$(pwd)/notebooks/staging:/input:ro" \ + -v "$(pwd)/notebooks/staging:/output" \ cdm-data-loaders ncbi_ftp_sync \ --manifest /input/transfer_manifest.txt \ --output-dir /output \ @@ -161,6 +221,10 @@ docker run --rm \ --limit 10 ``` +> **Note:** `--userns=keep-id` maps your host UID into the container so +> bind-mount writes work with Podman's rootless mode. If you use Docker +> instead, replace it with `--user "$(id -u):$(id -g)"`. + | Flag | Purpose | |-----------------|-----------------------------------------------------------| | `--manifest` | Path to the transfer manifest inside the container | @@ -168,7 +232,7 @@ docker run --rm \ | `--threads` | Parallel FTP connections (2 is polite for testing) | | `--limit` | Redundant safety cap (already limited in Phase 1) | -After the container exits, `staging/` will contain: +After the container exits, `notebooks/staging/` will contain: ``` staging/ @@ -189,7 +253,7 @@ the FTP server's `md5checksums.txt`. > > ```sh > uv run ncbi_ftp_sync \ -> --manifest output/transfer_manifest.txt \ +> --manifest notebooks/output/transfer_manifest.txt \ > --output-dir staging \ > --threads 2 --limit 10 > ``` @@ -200,18 +264,13 @@ The download step writes to the local filesystem. To feed Phase 3 we need to upload the staged files into MinIO under a staging prefix: ```sh -aws --endpoint-url http://localhost:9000 \ - s3 cp staging/raw_data/ s3://cdm-lake/staging/run1/raw_data/ \ - --recursive \ - --no-sign-request +uv run python scripts/s3_local.py cp notebooks/staging/raw_data/ s3://cdm-lake/staging/run1/raw_data/ ``` Verify the upload: ```sh -aws --endpoint-url http://localhost:9000 \ - s3 ls s3://cdm-lake/staging/run1/ \ - --recursive --no-sign-request | head -20 +uv run python scripts/s3_local.py ls s3://cdm-lake/staging/run1/ ``` --- @@ -222,16 +281,16 @@ Open `notebooks/ncbi_ftp_promote.ipynb`. 
### Constants to change (Cell 3) -| Constant | Walkthrough value | Why | -|---------------------|------------------------------------------------------|---------------------------------------------| -| `BUCKET` | `"cdm-lake"` | matches the bucket created in Step 1 | -| `STAGING_PREFIX` | `"staging/run1/"` | matches the upload prefix from Step 3d | -| `REMOVED_MANIFEST` | `None` | nothing to remove on first run | -| `UPDATED_MANIFEST` | `None` | nothing to archive on first run | -| `NCBI_RELEASE` | `None` | no release tag needed for local testing | -| `MANIFEST_S3_KEY` | `None` | skip manifest trimming | -| `PATH_PREFIX` | `"tenant-general-warehouse/kbase/datasets/ncbi/"` | keep default | -| `DRY_RUN` | `True` | **start with dry-run!** | +| Constant | Walkthrough value | Format | Why | +|-------------------------|------------------------------------------------------|--------|---------------------------------------------| +| `STORE_BUCKET` | `"cdm-lake"` | bucket name | matches the bucket created in Step 1 | +| `STAGING_KEY_PREFIX` | `"staging/run1/"` | S3 key prefix | matches the upload prefix from Step 3d | +| `REMOVED_MANIFEST_PATH` | `None` | local path | nothing to remove on first run | +| `UPDATED_MANIFEST_PATH` | `None` | local path | nothing to archive on first run | +| `NCBI_RELEASE` | `None` | string | no release tag needed for local testing | +| `MANIFEST_S3_KEY` | `None` | S3 object key | skip manifest trimming | +| `LAKEHOUSE_KEY_PREFIX` | `"tenant-general-warehouse/kbase/datasets/ncbi/"` | S3 key prefix | keep default | +| `DRY_RUN` | `True` | bool | **start with dry-run!** | ### Initialise the S3 client for MinIO @@ -257,7 +316,7 @@ get_s3_client({ *would* do without moving any objects. 2. Review the report in Cell 6. 3. If the dry-run looks correct, set `DRY_RUN = False` in Cell 3 and re-run - from Cell 5. + from Cell 3. 
After promotion the final Lakehouse layout in MinIO will look like: @@ -278,16 +337,12 @@ Browse the [MinIO console](http://localhost:9001) or use the CLI: ```sh # List final Lakehouse objects -aws --endpoint-url http://localhost:9000 \ - s3 ls s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/raw_data/ \ - --recursive --no-sign-request | head -20 +uv run python scripts/s3_local.py ls \ + s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/raw_data/ # Check user metadata (md5) on a specific object -aws --endpoint-url http://localhost:9000 \ - s3api head-object \ - --bucket cdm-lake \ - --key "tenant-general-warehouse/kbase/datasets/ncbi/raw_data/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly/GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz" \ - --no-sign-request | jq .Metadata +uv run python scripts/s3_local.py head \ + s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/raw_data/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly/GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz ``` --- @@ -297,13 +352,13 @@ aws --endpoint-url http://localhost:9000 \ To exercise the diff/update/archive logic, repeat the pipeline with a previous snapshot: -1. **Phase 1:** Set `PREVIOUS_SUMMARY_S3` to an S3 path where you upload the +1. **Phase 1:** Set `PREVIOUS_SUMMARY_URI` to an S3 path where you upload the raw summary from the first run, or save the `raw_summary` string from Cell 4 to a local file and pass it via `parse_assembly_summary(Path("prev.txt"))`. 2. **Phase 1:** The diff will now show `updated`, `replaced`, and `suppressed` entries (if any changed between runs). 3. **Phase 2:** Download the new manifest. -4. **Phase 3:** Set `REMOVED_MANIFEST` and `UPDATED_MANIFEST` to the paths +4. **Phase 3:** Set `REMOVED_MANIFEST_PATH` and `UPDATED_MANIFEST_PATH` to the paths from Phase 1. Updated assemblies will be archived before overwrite; removed assemblies will be archived and deleted.
@@ -328,7 +383,7 @@ rm -rf staging/ output/ | `berdl_notebook_utils` import error in notebook | Missing local MinIO client init | Add the `get_s3_client({...})` cell described in Step 4 | | `connect_ftp() timeout` | NCBI FTP may be slow or rate-limited | Retry; reduce `--threads` to 1 | | `CRC64NVME` errors uploading to MinIO | MinIO version too old (needs ≥ `2025-02-07`) | Pin to `minio/minio:RELEASE.2025-02-28T09-55-16Z` or newer | -| Phase 3 shows 0 promoted | Staging prefix doesn't match or bucket is wrong | Verify `STAGING_PREFIX` matches the S3 upload path from Step 3d | +| Phase 3 shows 0 promoted | Staging prefix doesn't match or bucket is wrong | Verify `STAGING_KEY_PREFIX` matches the S3 upload path from Step 3d | | Container can't reach FTP | Docker network isolation | Use `--network host` or ensure DNS resolution works inside the container | --- diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 3c22c072..20ee5016 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -15,7 +15,28 @@ "- `diff_summary.json` — human-readable summary of changes\n", "\n", "All filtering (prefix range, limit) is applied here so downstream phases\n", - "receive a final, pre-filtered manifest." + "receive a final, pre-filtered manifest.\n", + "\n", + "Optionally verifies candidates against the S3 Lakehouse (`STORE_BUCKET`) so\n", + "assemblies that were already downloaded and promoted are pruned from the\n", + "transfer manifest." 
+ ] + }, + { + "cell_type": "markdown", + "id": "d0d3063c", + "metadata": {}, + "source": [ + "## Path formats quick reference\n", + "\n", + "| Suffix in variable name | Format | Example |\n", + "|-------------------------|--------|---------|\n", + "| `_URI` | `s3://bucket/key/…` | `s3://cdm-lake/staging/run1/` |\n", + "| `_BUCKET` | bucket name only | `cdm-lake` |\n", + "| `_KEY_PREFIX` | S3 key prefix (no scheme/bucket) | `tenant-general-warehouse/kbase/datasets/ncbi/` |\n", + "| `_DIR` / `_PATH` | local filesystem path | `output/removed_manifest.txt` |\n", + "\n", + "Lakehouse object: `s3://{STORE_BUCKET}/{STORE_KEY_PREFIX}raw_data/…/{filename}`" ] }, { @@ -60,28 +81,37 @@ "DATABASE = \"refseq\"\n", "\n", "# Accession prefix filtering (3-digit, inclusive). Set to None to skip.\n", - "PREFIX_FROM: str | None = None # e.g. \"000\"\n", - "PREFIX_TO: str | None = None # e.g. \"003\"\n", + "PREFIX_FROM: str | None = \"900\" # e.g. \"000\"\n", + "PREFIX_TO: str | None = \"900\" # e.g. \"003\"\n", "\n", "# Maximum number of new/updated assemblies to include (None = unlimited)\n", - "LIMIT: int | None = None\n", + "LIMIT: int | None = 10\n", "\n", - "# S3 path to the previous assembly summary snapshot (set to None on first run)\n", - "PREVIOUS_SUMMARY_S3: str | None = None # e.g. \"s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/metadata/assembly_summary_refseq_prev.txt\"\n", + "# Previous assembly summary snapshot\n", + "# format: s3:// URI (e.g. \"s3://cdm-lake/.../assembly_summary_refseq_prev.txt\")\n", + "PREVIOUS_SUMMARY_URI: str | None = None\n", "\n", - "# S3 path where the new snapshot will be uploaded after diffing\n", - "SNAPSHOT_UPLOAD_S3: str | None = None # e.g. 
\"s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/metadata/assembly_summary_refseq_curr.txt\"\n", + "# S3 location where the new snapshot will be uploaded after diffing\n", + "# format: s3:// URI\n", + "SNAPSHOT_UPLOAD_URI: str | None = None\n", + "\n", + "# Verify candidates against the S3 Lakehouse — prune assemblies already present.\n", + "# Set STORE_BUCKET to your bucket name to enable, or None to skip.\n", + "# STORE_KEY_PREFIX should point to the directory containing `raw_data/`.\n", + "# format: bucket name (no s3:// scheme)\n", + "STORE_BUCKET: str | None = \"cdm-lake\"\n", + "# format: S3 key prefix within STORE_BUCKET (no scheme, no bucket)\n", + "STORE_KEY_PREFIX = \"tenant-general-warehouse/kbase/datasets/ncbi/\"\n", "\n", "# Local output directory for manifest files\n", + "# format: local directory path\n", "OUTPUT_DIR = Path(\"output\")\n", "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", "\n", - "# FTP hostname (default is the standard NCBI FTP server)\n", - "FTP_HOSTNAME = FTP_HOST\n", - "\n", "print(f\"Database: {DATABASE}\")\n", "print(f\"Prefix range: {PREFIX_FROM} -> {PREFIX_TO}\")\n", "print(f\"Limit: {LIMIT}\")\n", + "print(f\"Verify against S3: {STORE_BUCKET or 'disabled'}\")\n", "print(f\"Output dir: {OUTPUT_DIR}\")" ] }, @@ -94,7 +124,7 @@ "source": [ "\"\"\"Download current assembly summary from NCBI FTP.\"\"\"\n", "\n", - "raw_summary = download_assembly_summary(database=DATABASE, ftp_host=FTP_HOSTNAME)\n", + "raw_summary = download_assembly_summary(database=DATABASE, ftp_host=FTP_HOST)\n", "current = parse_assembly_summary(raw_summary)\n", "print(f\"Parsed {len(current)} assemblies from current {DATABASE} summary\")" ] @@ -110,9 +140,9 @@ "\n", "previous: dict[str, AssemblyRecord] | None = None\n", "\n", - "if PREVIOUS_SUMMARY_S3:\n", + "if PREVIOUS_SUMMARY_URI:\n", " s3 = get_s3_client()\n", - " bucket, key = split_s3_path(PREVIOUS_SUMMARY_S3)\n", + " bucket, key = split_s3_path(PREVIOUS_SUMMARY_URI)\n", " resp = 
s3.get_object(Bucket=bucket, Key=key)\n", " prev_text = resp[\"Body\"].read().decode(\"utf-8\")\n", " previous = parse_assembly_summary(prev_text)\n", @@ -128,7 +158,7 @@ "metadata": {}, "outputs": [], "source": [ - "\"\"\"Compute diff and apply filters.\"\"\"\n", + "\"\"\"Compute diff and apply prefix filter.\"\"\"\n", "\n", "# Filter current assemblies by prefix range\n", "filtered = filter_by_prefix_range(current, prefix_from=PREFIX_FROM, prefix_to=PREFIX_TO)\n", @@ -145,16 +175,52 @@ "print(f\"Replaced: {len(diff.replaced)}\")\n", "print(f\"Suppressed: {len(diff.suppressed)}\")\n", "print(f\"Total to transfer: {len(diff.new) + len(diff.updated)}\")\n", - "print(f\"Total to remove: {len(diff.replaced) + len(diff.suppressed)}\")\n", + "print(f\"Total to remove: {len(diff.replaced) + len(diff.suppressed)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91ad314a", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Verify candidates against S3 Lakehouse, then apply LIMIT.\n", + "\n", + "Verification (optional): for each candidate, fetch md5checksums.txt from\n", + "NCBI FTP and compare against md5 metadata on existing S3 objects.\n", + "Assemblies already present with matching checksums are pruned.\n", + "\n", + "LIMIT is applied *after* verification so the cap counts only assemblies\n", + "that genuinely need downloading.\n", + "\"\"\"\n", + "\n", + "# ── Verify against Lakehouse ──\n", + "if STORE_BUCKET:\n", + " from cdm_data_loaders.ncbi_ftp.manifest import verify_transfer_candidates\n", + "\n", + " candidates = diff.new + diff.updated\n", + " print(f\"Verifying {len(candidates)} candidates against s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} ...\")\n", + " confirmed = set(verify_transfer_candidates(\n", + " candidates, filtered, STORE_BUCKET, STORE_KEY_PREFIX, ftp_host=FTP_HOST,\n", + " ))\n", + " before = len(diff.new) + len(diff.updated)\n", + " diff.new = [a for a in diff.new if a in confirmed]\n", + " diff.updated = [a for a in 
diff.updated if a in confirmed]\n", + " after = len(diff.new) + len(diff.updated)\n", + " print(f\"Verified: {after} need downloading, {before - after} pruned (already in store)\")\n", + "else:\n", + " print(\"Skipping S3 verification (STORE_BUCKET not set)\")\n", "\n", - "# Apply limit if set\n", + "# ── Apply LIMIT ──\n", "if LIMIT is not None:\n", " original_new = len(diff.new)\n", " original_updated = len(diff.updated)\n", " combined = diff.new + diff.updated\n", " limited = combined[:LIMIT]\n", - " diff.new = [a for a in diff.new if a in set(limited)]\n", - " diff.updated = [a for a in diff.updated if a in set(limited)]\n", + " limited_set = set(limited)\n", + " diff.new = [a for a in diff.new if a in limited_set]\n", + " diff.updated = [a for a in diff.updated if a in limited_set]\n", " print(f\"After limit ({LIMIT}): {len(diff.new)} new, {len(diff.updated)} updated\")\n", " print(f\" (was {original_new} new, {original_updated} updated)\")" ] @@ -170,7 +236,7 @@ "\n", "# Write transfer manifest\n", "transfer_path = OUTPUT_DIR / \"transfer_manifest.txt\"\n", - "paths = write_transfer_manifest(diff, filtered, transfer_path, ftp_host=FTP_HOSTNAME)\n", + "paths = write_transfer_manifest(diff, filtered, transfer_path, ftp_host=FTP_HOST)\n", "print(f\"Transfer manifest: {len(paths)} entries -> {transfer_path}\")\n", "\n", "# Write removed manifest\n", @@ -190,13 +256,13 @@ "print(json.dumps(summary[\"counts\"], indent=2))\n", "\n", "# Upload new snapshot to S3 for future diffing\n", - "if SNAPSHOT_UPLOAD_S3:\n", + "if SNAPSHOT_UPLOAD_URI:\n", " s3 = get_s3_client()\n", - " bucket, key = split_s3_path(SNAPSHOT_UPLOAD_S3)\n", + " bucket, key = split_s3_path(SNAPSHOT_UPLOAD_URI)\n", " s3.put_object(Bucket=bucket, Key=key, Body=raw_summary.encode(\"utf-8\"))\n", - " print(f\"Uploaded new snapshot to {SNAPSHOT_UPLOAD_S3}\")\n", + " print(f\"Uploaded new snapshot to {SNAPSHOT_UPLOAD_URI}\")\n", "else:\n", - " print(\"Skipping S3 snapshot upload (SNAPSHOT_UPLOAD_S3 not 
set)\")" + " print(\"Skipping S3 snapshot upload (SNAPSHOT_UPLOAD_URI not set)\")" ] }, { @@ -206,17 +272,22 @@ "metadata": {}, "outputs": [], "source": [ - "\"\"\"Upload manifests to S3 for CTS input staging (optional).\"\"\"\n", + "\"\"\"Upload manifests to S3 for CTS input staging (optional).\n", "\n", - "# S3 prefix where CTS will read input files from.\n", - "# Set to None to skip upload (local-only testing).\n", - "STAGING_S3_PREFIX: str | None = None # e.g. \"s3://cdm-lake/staging/run1/\"\n", + "Note: STAGING_URI is a full s3:// URI. The promote notebook splits this into\n", + "STORE_BUCKET + STAGING_KEY_PREFIX (separate bucket and key prefix parameters).\n", "\n", - "if STAGING_S3_PREFIX:\n", - " from cdm_data_loaders.utils.s3 import upload_file_with_metadata\n", + "This is for local testing. The CTS will stage the container's input folder in production.\n", + "\"\"\"\n", "\n", + "# S3 location where CTS will read input files from.\n", + "# Set to None to skip upload (local-only testing).\n", + "# format: s3:// URI (e.g. 
\"s3://cdm-lake/staging/run1/\")\n", + "STAGING_URI: str | None = \"s3://cdm-lake/staging/run1/input/\"\n", + "\n", + "if STAGING_URI:\n", " s3 = get_s3_client()\n", - " bucket, prefix = split_s3_path(STAGING_S3_PREFIX)\n", + " bucket, prefix = split_s3_path(STAGING_URI)\n", " prefix = prefix.rstrip(\"/\") + \"/\"\n", "\n", " for manifest in [\"transfer_manifest.txt\", \"removed_manifest.txt\",\n", @@ -224,18 +295,32 @@ " local_path = OUTPUT_DIR / manifest\n", " if local_path.exists():\n", " key = f\"{prefix}{manifest}\"\n", - " upload_file_with_metadata(s3, str(local_path), bucket, key)\n", + " s3.upload_file(Filename=str(local_path), Bucket=bucket, Key=key)\n", " print(f\"Uploaded {manifest} -> s3://{bucket}/{key}\")\n", "\n", " print(f\"\\nManifests staged for CTS at s3://{bucket}/{prefix}\")\n", "else:\n", - " print(\"Skipping S3 manifest upload (STAGING_S3_PREFIX not set)\")\n" + " print(\"Skipping S3 manifest upload (STAGING_URI not set)\")" ] } ], "metadata": { + "kernelspec": { + "display_name": "cdm-data-loaders (3.13.11)", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" } }, "nbformat": 4, diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb index de19c0e6..5dd46312 100644 --- a/notebooks/ncbi_ftp_promote.ipynb +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -20,6 +20,24 @@ "6. 
Trim manifest (remove promoted entries)" ] }, + { + "cell_type": "markdown", + "id": "2f98c43e", + "metadata": {}, + "source": [ + "## Path formats quick reference\n", + "\n", + "| Suffix in variable name | Format | Example |\n", + "|-------------------------|--------|---------|\n", + "| `_BUCKET` | bucket name only | `cdm-lake` |\n", + "| `_KEY_PREFIX` | S3 key prefix (no scheme/bucket) | `staging/run1/` |\n", + "| `_S3_KEY` | S3 object key (no scheme/bucket) | `staging/transfer_manifest.txt` |\n", + "| `_PATH` | local filesystem path | `output/removed_manifest.txt` |\n", + "\n", + "Lakehouse object: `s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/…/{filename}`\n", + "Staging object: `s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/…/{filename}`" + ] + }, { "cell_type": "code", "execution_count": null, @@ -34,7 +52,7 @@ "import json\n", "\n", "from cdm_data_loaders.ncbi_ftp.promote import (\n", - " DEFAULT_PATH_PREFIX,\n", + " DEFAULT_LAKEHOUSE_KEY_PREFIX,\n", " promote_from_s3,\n", ")\n", "from cdm_data_loaders.utils.s3 import get_s3_client" @@ -47,39 +65,51 @@ "metadata": {}, "outputs": [], "source": [ - "\"\"\"Configure parameters.\"\"\"\n", + "\"\"\"Configure parameters.\n", + "\n", + "Path layout (how variables compose into a full S3 object path):\n", + " s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{file}\n", + " s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/{assembly_dir}/{file}\n", + "\"\"\"\n", "\n", "# S3 bucket where staged files and final Lakehouse data live\n", - "BUCKET = \"cdm-lake\" # e.g. \"cdm-lake\"\n", + "# format: bucket name (no s3:// scheme)\n", + "STORE_BUCKET = \"cdm-lake\"\n", "\n", - "# Staging prefix written by CTS Phase 2 (from the CTS output mount)\n", - "STAGING_PREFIX = \"staging/run1/\" # e.g. 
\"staging/run1/\"\n", + "# Staging prefix written by CTS Phase 2\n", + "# format: S3 key prefix within STORE_BUCKET (no scheme, no bucket)\n", + "STAGING_KEY_PREFIX = \"staging/run1/\"\n", "\n", "# Local path to removed_manifest.txt from Phase 1 (or None to skip archiving)\n", - "REMOVED_MANIFEST: str | None = None # e.g. \"output/removed_manifest.txt\"\n", + "# format: local file path\n", + "REMOVED_MANIFEST_PATH: str | None = None # e.g. \"output/removed_manifest.txt\"\n", "\n", "# Local path to updated_manifest.txt from Phase 1 (or None to skip pre-overwrite archiving)\n", - "UPDATED_MANIFEST: str | None = None # e.g. \"output/updated_manifest.txt\"\n", + "# format: local file path\n", + "UPDATED_MANIFEST_PATH: str | None = None # e.g. \"output/updated_manifest.txt\"\n", "\n", "# NCBI release tag for archive metadata (e.g. \"2024-01\")\n", "NCBI_RELEASE: str | None = None\n", "\n", - "# S3 key of transfer_manifest.txt for trimming (or None to skip)\n", - "MANIFEST_S3_KEY: str | None = None # e.g. \"ncbi/transfer_manifest.txt\"\n", + "# S3 key of transfer_manifest.txt for trimming after promotion (or None to skip).\n", + "# Only needed if the manifest was uploaded to S3 (e.g. via the staging cell in Phase 1).\n", + "# format: S3 object key within STORE_BUCKET (no scheme, no bucket)\n", + "MANIFEST_S3_KEY: str | None = \"staging/run1/input/transfer_manifest.txt\" # e.g. 
\"staging/transfer_manifest.txt\"\n", "\n", "# Final Lakehouse path prefix\n", - "PATH_PREFIX = DEFAULT_PATH_PREFIX\n", + "# format: S3 key prefix within STORE_BUCKET (no scheme, no bucket)\n", + "LAKEHOUSE_KEY_PREFIX = DEFAULT_LAKEHOUSE_KEY_PREFIX\n", "\n", "# Dry-run mode — log actions without making changes\n", - "DRY_RUN = True\n", - "\n", - "print(f\"Updated manifest: {UPDATED_MANIFEST}\")\n", - "print(f\"NCBI release: {NCBI_RELEASE}\")\n", - "print(f\"Manifest S3 key: {MANIFEST_S3_KEY}\")\n", - "print(f\"Path prefix: {PATH_PREFIX}\")\n", - "\n", - "print(f\"Dry-run: {DRY_RUN}\")\n", - "print(f\"Path prefix: {PATH_PREFIX}\")" + "DRY_RUN = False\n", + "\n", + "print(f\"Bucket: {STORE_BUCKET}\")\n", + "print(f\"Staging key prefix: {STAGING_KEY_PREFIX}\")\n", + "print(f\"Updated manifest: {UPDATED_MANIFEST_PATH}\")\n", + "print(f\"NCBI release: {NCBI_RELEASE}\")\n", + "print(f\"Manifest S3 key: {MANIFEST_S3_KEY}\")\n", + "print(f\"Lakehouse prefix: {LAKEHOUSE_KEY_PREFIX}\")\n", + "print(f\"Dry-run: {DRY_RUN}\")" ] }, { @@ -95,7 +125,7 @@ "paginator = s3.get_paginator(\"list_objects_v2\")\n", "\n", "staged: list[str] = []\n", - "for page in paginator.paginate(Bucket=BUCKET, Prefix=STAGING_PREFIX):\n", + "for page in paginator.paginate(Bucket=STORE_BUCKET, Prefix=STAGING_KEY_PREFIX):\n", " staged.extend(obj[\"Key\"] for obj in page.get(\"Contents\", []))\n", "\n", "sidecars = [k for k in staged if k.endswith((\".md5\", \".crc64nvme\"))]\n", @@ -123,13 +153,13 @@ "\"\"\"Promote staged files to final Lakehouse paths.\"\"\"\n", "\n", "report = promote_from_s3(\n", - " staging_prefix=STAGING_PREFIX,\n", - " bucket=BUCKET,\n", - " removed_manifest=REMOVED_MANIFEST,\n", - " updated_manifest=UPDATED_MANIFEST,\n", + " staging_key_prefix=STAGING_KEY_PREFIX,\n", + " bucket=STORE_BUCKET,\n", + " removed_manifest_path=REMOVED_MANIFEST_PATH,\n", + " updated_manifest_path=UPDATED_MANIFEST_PATH,\n", " ncbi_release=NCBI_RELEASE,\n", - " manifest_path=MANIFEST_S3_KEY,\n", - " 
path_prefix=PATH_PREFIX,\n", + " manifest_s3_key=MANIFEST_S3_KEY,\n", + " lakehouse_key_prefix=LAKEHOUSE_KEY_PREFIX,\n", " dry_run=DRY_RUN,\n", ")\n", "\n", @@ -163,8 +193,22 @@ } ], "metadata": { + "kernelspec": { + "display_name": "cdm-data-loaders (3.13.11)", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" } }, "nbformat": 4, diff --git a/scripts/s3_local.py b/scripts/s3_local.py new file mode 100755 index 00000000..65f80396 --- /dev/null +++ b/scripts/s3_local.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# ruff: noqa: T201, EM101, EM102, TRY003, D103 +"""Thin S3 CLI for local MinIO testing (no aws-cli install required). + +Usage (all commands assume ``uv run`` from the repo root): + + uv run python scripts/s3_local.py mb s3://cdm-lake + uv run python scripts/s3_local.py cp staging/raw_data/ s3://cdm-lake/staging/run1/raw_data/ + uv run python scripts/s3_local.py ls s3://cdm-lake/staging/run1/ + uv run python scripts/s3_local.py head s3://cdm-lake/some/key.gz + +Environment variables (with defaults for the walkthrough): + + MINIO_ENDPOINT_URL http://localhost:9000 + MINIO_ACCESS_KEY minioadmin + MINIO_SECRET_KEY minioadmin +""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path + +import boto3 + + +def _client() -> boto3.client: + return boto3.client( + "s3", + endpoint_url=os.environ.get("MINIO_ENDPOINT_URL", "http://localhost:9000"), + aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"), + aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"), + ) + + +def _split(uri: str) -> tuple[str, str]: + """Split ``s3://bucket/key`` into ``(bucket, key)``.""" + if not uri.startswith("s3://"): + raise SystemExit(f"Expected 
s3:// URI, got: {uri}") + parts = uri[5:].split("/", 1) + return parts[0], parts[1] if len(parts) > 1 else "" + + +# ── subcommands ───────────────────────────────────────────────────────── + + +def cmd_mb(args: list[str]) -> None: + """Create a bucket: ``mb s3://bucket``.""" + if not args: + raise SystemExit("Usage: s3_local.py mb s3://BUCKET") + bucket, _ = _split(args[0]) + s3 = _client() + try: + s3.head_bucket(Bucket=bucket) + print(f"Bucket already exists: {bucket}") + except Exception: # noqa: BLE001 + s3.create_bucket(Bucket=bucket) + print(f"Created bucket: {bucket}") + + +def cmd_cp(args: list[str]) -> None: + """Recursive upload: ``cp LOCAL_DIR s3://bucket/prefix/``.""" + if len(args) < 2: # noqa: PLR2004 + raise SystemExit("Usage: s3_local.py cp LOCAL_DIR s3://BUCKET/PREFIX/") + local_dir = Path(args[0]) + bucket, prefix = _split(args[1]) + prefix = prefix.rstrip("/") + "/" if prefix else "" + s3 = _client() + count = 0 + for path in sorted(local_dir.rglob("*")): + if path.is_dir(): + continue + rel = path.relative_to(local_dir) + key = f"{prefix}{rel}" + s3.upload_file(Filename=str(path), Bucket=bucket, Key=key) + count += 1 + print(f" {key}") + print(f"Uploaded {count} files to s3://{bucket}/{prefix}") + + +def cmd_ls(args: list[str]) -> None: + """List objects: ``ls s3://bucket/prefix/ [--limit N]``.""" + if not args: + raise SystemExit("Usage: s3_local.py ls s3://BUCKET/PREFIX/ [--limit N]") + bucket, prefix = _split(args[0]) + limit = 20 + if "--limit" in args: + idx = args.index("--limit") + limit = int(args[idx + 1]) + s3 = _client() + paginator = s3.get_paginator("list_objects_v2") + shown = 0 + for page in paginator.paginate(Bucket=bucket, Prefix=prefix): + for obj in page.get("Contents", []): + print(f" {obj['Size']:>10} {obj['Key']}") + shown += 1 + if shown >= limit: + return + + +def cmd_head(args: list[str]) -> None: + """Show metadata: ``head s3://bucket/key``.""" + if not args: + raise SystemExit("Usage: s3_local.py head 
s3://BUCKET/KEY") + bucket, key = _split(args[0]) + s3 = _client() + resp = s3.head_object(Bucket=bucket, Key=key) + meta = resp.get("Metadata", {}) + print(json.dumps(meta, indent=2)) + + +# ── dispatch ──────────────────────────────────────────────────────────── + +COMMANDS = {"mb": cmd_mb, "cp": cmd_cp, "ls": cmd_ls, "head": cmd_head} + + +def main() -> None: + if len(sys.argv) < 2 or sys.argv[1] not in COMMANDS: # noqa: PLR2004 + cmds = ", ".join(COMMANDS) + raise SystemExit(f"Usage: s3_local.py <{cmds}> [args ...]\n\n{__doc__}") + COMMANDS[sys.argv[1]](sys.argv[2:]) + + +if __name__ == "__main__": + main() diff --git a/src/cdm_data_loaders/ncbi_ftp/manifest.py b/src/cdm_data_loaders/ncbi_ftp/manifest.py index 21aa084d..e67c2185 100644 --- a/src/cdm_data_loaders/ncbi_ftp/manifest.py +++ b/src/cdm_data_loaders/ncbi_ftp/manifest.py @@ -26,7 +26,7 @@ ) from cdm_data_loaders.utils.cdm_logger import get_cdm_logger from cdm_data_loaders.utils.ftp_client import connect_ftp, ftp_noop_keepalive, ftp_retrieve_text -from cdm_data_loaders.utils.s3 import head_object +from cdm_data_loaders.utils.s3 import get_s3_client, head_object logger = get_cdm_logger() @@ -264,7 +264,7 @@ def verify_transfer_candidates( accessions: list[str], current_assemblies: dict[str, AssemblyRecord], bucket: str, - path_prefix: str, + key_prefix: str, ftp_host: str = FTP_HOST, ) -> list[str]: """Verify which transfer candidates actually need downloading. @@ -280,16 +280,18 @@ def verify_transfer_candidates( :param accessions: list of candidate accessions (new + updated from diff) :param current_assemblies: parsed current assembly summary :param bucket: S3 bucket name - :param path_prefix: Lakehouse path prefix (e.g. 
``"tenant-general-warehouse/kbase/datasets/ncbi/"``) + :param key_prefix: S3 key prefix for the Lakehouse dataset root :param ftp_host: NCBI FTP hostname :return: filtered list of accessions that actually need downloading """ if not accessions: return [] - ftp = connect_ftp(ftp_host) + s3 = get_s3_client() + ftp: Any = None # lazily connected only when needed confirmed: list[str] = [] pruned = 0 + skipped_missing = 0 last_activity = time.monotonic() try: @@ -299,10 +301,24 @@ def verify_transfer_candidates( confirmed.append(acc) continue - # Keep FTP alive between assemblies + # Build S3 prefix for this assembly + s3_rel = build_accession_path(rec.assembly_dir) + s3_prefix = f"{key_prefix}{s3_rel}" + + # Quick check: does *anything* exist under this prefix? + resp = s3.list_objects_v2(Bucket=bucket, Prefix=s3_prefix, MaxKeys=1) + if resp.get("KeyCount", 0) == 0: + # Nothing in the store — definitely needs downloading + confirmed.append(acc) + skipped_missing += 1 + continue + + # Objects exist — need FTP md5 checksums to decide + if ftp is None: + ftp = connect_ftp(ftp_host) + last_activity = ftp_noop_keepalive(ftp, last_activity) - # Download md5checksums.txt from FTP ftp_dir = _ftp_dir_from_url(rec.ftp_path, ftp_host) try: md5_text = ftp_retrieve_text(ftp, ftp_dir.rstrip("/") + "/md5checksums.txt") @@ -324,13 +340,10 @@ def verify_transfer_candidates( confirmed.append(acc) continue - # Build S3 prefix for this assembly - s3_rel = build_accession_path(rec.assembly_dir) - # Short-circuit: if any file differs or is missing, keep the assembly needs_update = False for fname, expected_md5 in target_checksums.items(): - s3_path = f"{bucket}/{path_prefix}{s3_rel}{fname}" + s3_path = f"{bucket}/{s3_prefix}{fname}" obj_info = head_object(s3_path) if obj_info is None: @@ -349,12 +362,14 @@ def verify_transfer_candidates( pruned += 1 logger.debug("Pruned %s — all files match S3 checksums", acc) finally: - with contextlib.suppress(Exception): - ftp.quit() + if ftp is not 
None: + with contextlib.suppress(Exception): + ftp.quit() logger.info( - "Checksum verification: %d confirmed, %d pruned (of %d candidates)", + "Checksum verification: %d confirmed (%d missing from store), %d pruned (of %d candidates)", len(confirmed), + skipped_missing, pruned, len(accessions), ) diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 37f1fa82..52d9e57f 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -12,6 +12,8 @@ from pathlib import Path from typing import Any +import botocore.exceptions + from cdm_data_loaders.utils.cdm_logger import get_cdm_logger from cdm_data_loaders.utils.s3 import ( copy_object_with_metadata, @@ -22,20 +24,20 @@ logger = get_cdm_logger() -DEFAULT_PATH_PREFIX = "tenant-general-warehouse/kbase/datasets/ncbi/" +DEFAULT_LAKEHOUSE_KEY_PREFIX = "tenant-general-warehouse/kbase/datasets/ncbi/" # ── Promote from S3 staging prefix ────────────────────────────────────── def promote_from_s3( # noqa: PLR0913 - staging_prefix: str, + staging_key_prefix: str, bucket: str, - removed_manifest: str | Path | None = None, - updated_manifest: str | Path | None = None, + removed_manifest_path: str | Path | None = None, + updated_manifest_path: str | Path | None = None, ncbi_release: str | None = None, - manifest_path: str | None = None, - path_prefix: str = DEFAULT_PATH_PREFIX, + manifest_s3_key: str | None = None, + lakehouse_key_prefix: str = DEFAULT_LAKEHOUSE_KEY_PREFIX, *, dry_run: bool = False, ) -> dict[str, Any]: @@ -44,13 +46,13 @@ def promote_from_s3( # noqa: PLR0913 Downloads each file to a temp location and re-uploads to the final path with MD5 metadata from ``.md5`` sidecar files. 
- :param staging_prefix: S3 key prefix where CTS output was written + :param staging_key_prefix: S3 key prefix where CTS output was written :param bucket: S3 bucket name - :param removed_manifest: local path to the removed_manifest file - :param updated_manifest: local path to the updated_manifest file + :param removed_manifest_path: local path to the removed_manifest file + :param updated_manifest_path: local path to the updated_manifest file :param ncbi_release: NCBI release version tag for archiving - :param manifest_path: S3 path to transfer_manifest.txt for trimming - :param path_prefix: Lakehouse path prefix for final locations + :param manifest_s3_key: S3 object key for transfer_manifest.txt (for trimming) + :param lakehouse_key_prefix: S3 key prefix for final Lakehouse locations :param dry_run: if True, log actions without side effects :return: report dict with counts """ @@ -62,7 +64,7 @@ def promote_from_s3( # noqa: PLR0913 # Collect all objects under the staging prefix staged_objects: list[str] = [] - for page in paginator.paginate(Bucket=bucket, Prefix=staging_prefix): + for page in paginator.paginate(Bucket=bucket, Prefix=staging_key_prefix): staged_objects.extend(obj["Key"] for obj in page.get("Contents", [])) # Separate data files from sidecars @@ -74,15 +76,15 @@ def promote_from_s3( # noqa: PLR0913 # Archive all affected assemblies BEFORE promoting or deleting archived = 0 for manifest_file, reason, delete in [ - (updated_manifest, "updated", False), - (removed_manifest, "replaced_or_suppressed", True), + (updated_manifest_path, "updated", False), + (removed_manifest_path, "replaced_or_suppressed", True), ]: if manifest_file and Path(str(manifest_file)).is_file(): archived += _archive_assemblies( str(manifest_file), bucket=bucket, ncbi_release=ncbi_release, - path_prefix=path_prefix, + lakehouse_key_prefix=lakehouse_key_prefix, archive_reason=reason, delete_source=delete, dry_run=dry_run, @@ -94,10 +96,10 @@ def promote_from_s3( # noqa: PLR0913 if 
staged_key.endswith("download_report.json"): continue - rel_path = staged_key[len(staging_prefix) :] + rel_path = staged_key[len(staging_key_prefix) :] if not rel_path.startswith("raw_data/"): continue - final_key = path_prefix + rel_path + final_key = lakehouse_key_prefix + rel_path if dry_run: logger.info("[dry-run] would promote: %s -> %s", staged_key, final_key) @@ -137,8 +139,8 @@ def promote_from_s3( # noqa: PLR0913 failed += 1 # Trim manifest for resumability - if manifest_path and promoted_accessions and not dry_run: - _trim_manifest(manifest_path, bucket, promoted_accessions) + if manifest_s3_key and promoted_accessions and not dry_run: + _trim_manifest(manifest_s3_key, bucket, promoted_accessions) report: dict[str, Any] = { "timestamp": datetime.now(UTC).isoformat(), @@ -162,10 +164,10 @@ def promote_from_s3( # noqa: PLR0913 def _archive_assemblies( # noqa: PLR0913 - manifest_path: str, + manifest_local_path: str, bucket: str, ncbi_release: str | None = None, - path_prefix: str = DEFAULT_PATH_PREFIX, + lakehouse_key_prefix: str = DEFAULT_LAKEHOUSE_KEY_PREFIX, archive_reason: str = "unknown", *, delete_source: bool = False, @@ -178,10 +180,10 @@ def _archive_assemblies( # noqa: PLR0913 objects are deleted after copying. When False (updated), the originals remain in place to be overwritten by the promote step. 
- :param manifest_path: local path to a manifest file (one accession per line) + :param manifest_local_path: local path to a manifest file (one accession per line) :param bucket: S3 bucket name :param ncbi_release: release tag used in the archive path - :param path_prefix: Lakehouse path prefix + :param lakehouse_key_prefix: S3 key prefix for the Lakehouse dataset root :param archive_reason: metadata value describing why the object was archived :param delete_source: if True, delete the source object after copying :param dry_run: if True, log without making changes @@ -192,7 +194,7 @@ def _archive_assemblies( # noqa: PLR0913 datestamp = datetime.now(UTC).strftime("%Y-%m-%d") archived = 0 - with Path(manifest_path).open() as f: + with Path(manifest_local_path).open() as f: accessions = [line.strip() for line in f if line.strip()] for accession in accessions: @@ -203,7 +205,7 @@ def _archive_assemblies( # noqa: PLR0913 db = m.group(1) p1, p2, p3 = m.group(2), m.group(3), m.group(4) - source_prefix = f"{path_prefix}raw_data/{db}/{p1}/{p2}/{p3}/" + source_prefix = f"{lakehouse_key_prefix}raw_data/{db}/{p1}/{p2}/{p3}/" paginator = s3.get_paginator("list_objects_v2") matching_keys: list[str] = [] @@ -215,8 +217,8 @@ def _archive_assemblies( # noqa: PLR0913 continue for source_key in matching_keys: - rel = source_key[len(path_prefix) :] - archive_key = f"{path_prefix}archive/{release_tag}/{rel}" + rel = source_key[len(lakehouse_key_prefix) :] + archive_key = f"{lakehouse_key_prefix}archive/{release_tag}/{rel}" if dry_run: logger.info("[dry-run] would archive: %s -> %s", source_key, archive_key) @@ -247,10 +249,10 @@ def _archive_assemblies( # noqa: PLR0913 # ── Manifest trimming ─────────────────────────────────────────────────── -def _trim_manifest(manifest_s3_path: str, bucket: str, promoted_accessions: set[str]) -> None: +def _trim_manifest(manifest_s3_key: str, bucket: str, promoted_accessions: set[str]) -> None: """Remove promoted accessions from the transfer manifest 
in S3. - :param manifest_s3_path: S3 key of the transfer_manifest.txt + :param manifest_s3_key: S3 object key of the transfer_manifest.txt :param bucket: S3 bucket name :param promoted_accessions: set of accessions that were successfully promoted """ @@ -260,7 +262,16 @@ def _trim_manifest(manifest_s3_path: str, bucket: str, promoted_accessions: set[ tmp_path = tmp.name try: - s3.download_file(Bucket=bucket, Key=manifest_s3_path, Filename=tmp_path) + try: + s3.download_file(Bucket=bucket, Key=manifest_s3_key, Filename=tmp_path) + except s3.exceptions.NoSuchKey: + logger.warning("Manifest not found in S3 (s3://%s/%s) — skipping trim", bucket, manifest_s3_key) + return + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + logger.warning("Manifest not found in S3 (s3://%s/%s) — skipping trim", bucket, manifest_s3_key) + return + raise with Path(tmp_path).open() as f: lines = f.readlines() @@ -270,7 +281,7 @@ def _trim_manifest(manifest_s3_path: str, bucket: str, promoted_accessions: set[ with Path(tmp_path).open("w") as f: f.writelines(remaining) - s3.upload_file(Filename=tmp_path, Bucket=bucket, Key=manifest_s3_path) + s3.upload_file(Filename=tmp_path, Bucket=bucket, Key=manifest_s3_key) logger.info( "Trimmed manifest: %d -> %d entries (%d promoted)", len(lines), diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 6508fc1f..616afee4 100644 --- a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -40,7 +40,7 @@ class DownloadSettings(CtsDefaultSettings): output_dir: str = Field( default=OUTPUT_MOUNT, description="Output directory for downloaded assembly files", - validation_alias=AliasChoices("o", "output-dir", "output_dir"), + validation_alias=AliasChoices("output-dir", "output_dir"), ) threads: int = Field( default=4, diff --git a/tests/integration/test_full_pipeline.py 
b/tests/integration/test_full_pipeline.py index b98bf342..2d1faab7 100644 --- a/tests/integration/test_full_pipeline.py +++ b/tests/integration/test_full_pipeline.py @@ -26,14 +26,14 @@ write_transfer_manifest, write_updated_manifest, ) -from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_PATH_PREFIX, promote_from_s3 +from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 from cdm_data_loaders.pipelines.ncbi_ftp_download import download_batch from .conftest import get_object_metadata, list_all_keys, stage_files_to_minio STABLE_PREFIX = "900" STAGING_PREFIX = "staging/run1/" -PATH_PREFIX = DEFAULT_PATH_PREFIX +PATH_PREFIX = DEFAULT_LAKEHOUSE_KEY_PREFIX @pytest.mark.integration @@ -80,9 +80,9 @@ def test_full_pipeline_small_batch( # ── Phase 3: Promote from staging to final path ───────────────── promote_report = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - path_prefix=PATH_PREFIX, + lakehouse_key_prefix=PATH_PREFIX, ) assert promote_report["promoted"] >= 1 assert promote_report["failed"] == 0 @@ -138,10 +138,10 @@ def test_full_pipeline_incremental( s3.upload_file(Filename=str(manifest1), Bucket=test_bucket, Key=manifest_key) promote1 = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - manifest_path=manifest_key, - path_prefix=PATH_PREFIX, + manifest_s3_key=manifest_key, + lakehouse_key_prefix=PATH_PREFIX, ) assert promote1["promoted"] >= 1 @@ -195,11 +195,11 @@ def test_full_pipeline_incremental( # Phase 3 — promote with archival promote2 = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - updated_manifest=str(updated_manifest), + updated_manifest_path=str(updated_manifest), ncbi_release="test-incremental", - path_prefix=PATH_PREFIX, + lakehouse_key_prefix=PATH_PREFIX, ) assert promote2["failed"] == 0 diff --git 
a/tests/integration/test_manifest_e2e.py b/tests/integration/test_manifest_e2e.py index a8c22c02..df25d410 100644 --- a/tests/integration/test_manifest_e2e.py +++ b/tests/integration/test_manifest_e2e.py @@ -24,7 +24,7 @@ write_transfer_manifest, write_updated_manifest, ) -from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_PATH_PREFIX +from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX from cdm_data_loaders.utils.ftp_client import connect_ftp, ftp_retrieve_text if TYPE_CHECKING: @@ -184,7 +184,7 @@ def test_prunes_existing_matching_md5( # Seed MinIO with dummy files that have the right MD5 metadata rel = build_accession_path(rec.assembly_dir) s3 = minio_s3_client - path_prefix = DEFAULT_PATH_PREFIX + path_prefix = DEFAULT_LAKEHOUSE_KEY_PREFIX for fname, md5 in checksums.items(): if any(fname.endswith(suffix) for suffix in FILE_FILTERS): key = f"{path_prefix}{rel}{fname}" @@ -201,7 +201,7 @@ def test_prunes_existing_matching_md5( candidates, filtered, bucket=test_bucket, - path_prefix=path_prefix, + key_prefix=path_prefix, ) assert acc not in result, f"Expected {acc} to be pruned (MD5 matches)" diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index 10670598..69fa3999 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -16,7 +16,7 @@ import pytest from cdm_data_loaders.ncbi_ftp.assembly import build_accession_path -from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_PATH_PREFIX, promote_from_s3 +from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 from .conftest import get_object_metadata, list_all_keys, seed_lakehouse @@ -32,7 +32,7 @@ ASSEMBLY_DIR_C = "GCF_900000003.1_FakeAssemblyC" STAGING_PREFIX = "staging/run1/" -PATH_PREFIX = DEFAULT_PATH_PREFIX +PATH_PREFIX = DEFAULT_LAKEHOUSE_KEY_PREFIX # Fake file contents for staging FAKE_GENOMIC = b">seq1\nATCGATCG\n" @@ -86,9 +86,9 @@ def 
test_promote_from_staging(self, minio_s3_client: object, test_bucket: str) - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) report = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - path_prefix=PATH_PREFIX, + lakehouse_key_prefix=PATH_PREFIX, ) assert report["promoted"] >= 2 # noqa: PLR2004 # genomic + protein @@ -116,16 +116,16 @@ def test_promote_idempotent(self, minio_s3_client: object, test_bucket: str) -> _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) report1 = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - path_prefix=PATH_PREFIX, + lakehouse_key_prefix=PATH_PREFIX, ) keys_after_first = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") report2 = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - path_prefix=PATH_PREFIX, + lakehouse_key_prefix=PATH_PREFIX, ) keys_after_second = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") @@ -157,11 +157,11 @@ def test_archive_updated(self, minio_s3_client: object, test_bucket: str, tmp_pa updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") report = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - updated_manifest=str(updated_manifest), + updated_manifest_path=str(updated_manifest), ncbi_release="2024-01", - path_prefix=PATH_PREFIX, + lakehouse_key_prefix=PATH_PREFIX, ) assert report["archived"] >= 2 # noqa: PLR2004 @@ -198,11 +198,11 @@ def test_archive_removed(self, minio_s3_client: object, test_bucket: str, tmp_pa # Stage something (even empty staging is fine — promote won't find data files for this accession) report = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - removed_manifest=str(removed_manifest), + removed_manifest_path=str(removed_manifest), 
ncbi_release="2024-01", - path_prefix=PATH_PREFIX, + lakehouse_key_prefix=PATH_PREFIX, ) assert report["archived"] >= 1 @@ -234,9 +234,9 @@ def test_promote_dry_run(self, minio_s3_client: object, test_bucket: str) -> Non _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) report = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - path_prefix=PATH_PREFIX, + lakehouse_key_prefix=PATH_PREFIX, dry_run=True, ) @@ -271,10 +271,10 @@ def test_trims_manifest(self, minio_s3_client: object, test_bucket: str, tmp_pat _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_B) report = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - manifest_path=manifest_key, - path_prefix=PATH_PREFIX, + manifest_s3_key=manifest_key, + lakehouse_key_prefix=PATH_PREFIX, ) assert report["failed"] == 0 @@ -306,9 +306,9 @@ def test_incomplete_staging(self, minio_s3_client: object, test_bucket: str) -> s3.put_object(Bucket=test_bucket, Key=md5_key, Body=_md5(FAKE_GENOMIC).encode()) report = promote_from_s3( - staging_prefix=STAGING_PREFIX, + staging_key_prefix=STAGING_PREFIX, bucket=test_bucket, - path_prefix=PATH_PREFIX, + lakehouse_key_prefix=PATH_PREFIX, ) # .md5 files are sidecars and should not be promoted as data diff --git a/tests/ncbi_ftp/test_manifest.py b/tests/ncbi_ftp/test_manifest.py index 61db7e5a..dac0400f 100644 --- a/tests/ncbi_ftp/test_manifest.py +++ b/tests/ncbi_ftp/test_manifest.py @@ -314,12 +314,27 @@ def test_custom_ftp_host(self) -> None: ) +def _mock_s3_with_objects() -> MagicMock: + """Return a mock S3 client whose list_objects_v2 always reports objects exist.""" + client = MagicMock() + client.list_objects_v2.return_value = {"KeyCount": 1} + return client + + +def _mock_s3_empty() -> MagicMock: + """Return a mock S3 client whose list_objects_v2 reports no objects.""" + client = MagicMock() + client.list_objects_v2.return_value = {"KeyCount": 0} + return client + + 
class TestVerifyTransferCandidates: """Test S3 checksum verification to prune transfer candidates.""" def _assemblies(self) -> dict: return parse_assembly_summary(SAMPLE_SUMMARY) + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") @@ -328,6 +343,7 @@ def test_prunes_when_all_match( mock_connect: MagicMock, mock_retrieve: MagicMock, mock_head: MagicMock, + mock_s3: MagicMock, ) -> None: """Assemblies where every file matches S3 are pruned from the list.""" mock_connect.return_value = MagicMock() @@ -362,6 +378,7 @@ def head_side_effect(s3_path: str) -> dict | None: ) assert result == [] + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") @@ -370,6 +387,7 @@ def test_keeps_when_md5_differs( mock_connect: MagicMock, mock_retrieve: MagicMock, mock_head: MagicMock, + mock_s3: MagicMock, ) -> None: """Assembly is kept when at least one file has a different MD5.""" mock_connect.return_value = MagicMock() @@ -383,6 +401,7 @@ def test_keeps_when_md5_differs( ) assert result == ["GCF_000001215.4"] + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object", return_value=None) @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") @@ -391,6 +410,7 @@ def test_keeps_when_s3_object_missing( mock_connect: MagicMock, mock_retrieve: MagicMock, mock_head: MagicMock, + mock_s3: MagicMock, 
) -> None: """Assembly is kept when at least one file doesn't exist in S3.""" mock_connect.return_value = MagicMock() @@ -403,6 +423,7 @@ def test_keeps_when_s3_object_missing( ) assert result == ["GCF_000001215.4"] + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") @@ -411,6 +432,7 @@ def test_keeps_when_s3_has_no_md5_metadata( mock_connect: MagicMock, mock_retrieve: MagicMock, mock_head: MagicMock, + mock_s3: MagicMock, ) -> None: """Assembly is kept when S3 object exists but has no md5 metadata.""" mock_connect.return_value = MagicMock() @@ -424,9 +446,10 @@ def test_keeps_when_s3_has_no_md5_metadata( ) assert result == ["GCF_000001215.4"] + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", side_effect=Exception("FTP error")) @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_keeps_when_ftp_fails(self, mock_connect: MagicMock, mock_retrieve: MagicMock) -> None: + def test_keeps_when_ftp_fails(self, mock_connect: MagicMock, mock_retrieve: MagicMock, mock_s3: MagicMock) -> None: """Assembly is kept (conservative) when md5checksums.txt cannot be fetched.""" mock_connect.return_value = MagicMock() @@ -451,6 +474,7 @@ def test_unknown_accession_kept(self, mock_connect: MagicMock) -> None: result = verify_transfer_candidates(["GCF_999999999.1"], {}, "cdm-lake", "prefix/") assert result == ["GCF_999999999.1"] + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object", return_value=None) @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) 
@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") @@ -459,6 +483,7 @@ def test_short_circuits_on_first_mismatch( mock_connect: MagicMock, mock_retrieve: MagicMock, mock_head: MagicMock, + mock_s3: MagicMock, ) -> None: """Verification stops checking after the first missing/mismatched file.""" mock_connect.return_value = MagicMock() @@ -471,6 +496,7 @@ def test_short_circuits_on_first_mismatch( ) assert mock_head.call_count == 1 + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") @@ -479,6 +505,7 @@ def test_mixed_candidates( mock_connect: MagicMock, mock_retrieve: MagicMock, mock_head: MagicMock, + mock_s3: MagicMock, ) -> None: """Verify a mix of matching and non-matching assemblies.""" mock_connect.return_value = MagicMock() @@ -514,3 +541,21 @@ def head_side_effect(s3_path: str) -> dict | None: "tenant-general-warehouse/kbase/datasets/ncbi/", ) assert result == ["GCF_000001405.40"] + + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_empty()) + @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") + def test_skips_ftp_when_folder_missing_from_store( + self, + mock_connect: MagicMock, + mock_s3: MagicMock, + ) -> None: + """Accessions with no objects in S3 are confirmed without FTP round-trip.""" + result = verify_transfer_candidates( + ["GCF_000001215.4"], + self._assemblies(), + "cdm-lake", + "tenant-general-warehouse/kbase/datasets/ncbi/", + ) + assert result == ["GCF_000001215.4"] + # FTP should never have been connected (lazy init) + mock_connect.assert_not_called() diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py index 6935c46d..fdd4a372 100644 --- a/tests/ncbi_ftp/test_notebooks.py +++ b/tests/ncbi_ftp/test_notebooks.py @@ -19,7 
+19,7 @@ write_updated_manifest, ) from cdm_data_loaders.ncbi_ftp.promote import ( - DEFAULT_PATH_PREFIX, + DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3, ) from cdm_data_loaders.utils.s3 import get_s3_client, split_s3_path # noqa: F401 @@ -86,4 +86,4 @@ class TestPromoteNotebookImports: def test_imports_resolve(self) -> None: """All promote notebook imports are verified at module load time above.""" assert callable(promote_from_s3) - assert isinstance(DEFAULT_PATH_PREFIX, str) + assert isinstance(DEFAULT_LAKEHOUSE_KEY_PREFIX, str) diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index bd1d7f0f..a2a98e2b 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -6,7 +6,7 @@ import pytest from cdm_data_loaders.ncbi_ftp.promote import ( - DEFAULT_PATH_PREFIX, + DEFAULT_LAKEHOUSE_KEY_PREFIX, _archive_assemblies, _trim_manifest, promote_from_s3, @@ -34,7 +34,7 @@ def test_dry_run_no_writes(self, mock_s3_client_no_checksum: botocore.client.Bas self._stage_files(mock_s3_client_no_checksum, prefix) report = promote_from_s3( - staging_prefix=prefix, + staging_key_prefix=prefix, bucket=TEST_BUCKET, dry_run=True, ) @@ -43,7 +43,7 @@ def test_dry_run_no_writes(self, mock_s3_client_no_checksum: botocore.client.Bas # Final path should NOT exist final_key = ( - f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" ) resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=final_key) assert resp.get("KeyCount", 0) == 0 @@ -54,7 +54,7 @@ def test_promotes_with_metadata(self, mock_s3_client_no_checksum: botocore.clien self._stage_files(mock_s3_client_no_checksum, prefix) report = promote_from_s3( - staging_prefix=prefix, + staging_key_prefix=prefix, bucket=TEST_BUCKET, ) assert report["promoted"] == 1 @@ -62,7 +62,7 @@ def 
test_promotes_with_metadata(self, mock_s3_client_no_checksum: botocore.clien # Check final object exists with metadata final_key = ( - f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" ) resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=final_key) assert resp["Metadata"].get("md5") == "md5hash123" @@ -72,7 +72,7 @@ def test_skips_download_report(self, mock_s3_client_no_checksum: botocore.client prefix = "staging/run1/" self._stage_files(mock_s3_client_no_checksum, prefix) - report = promote_from_s3(staging_prefix=prefix, bucket=TEST_BUCKET) + report = promote_from_s3(staging_key_prefix=prefix, bucket=TEST_BUCKET) # Only the .fna.gz data file, not download_report.json assert report["promoted"] == 1 @@ -119,7 +119,7 @@ def test_archives_and_deletes_removed( ) -> None: """Verify removed accessions are archived and originals deleted.""" accession = "GCF_000005845.2" - key = f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") manifest = tmp_path / "removed.txt" @@ -140,7 +140,7 @@ def test_archives_and_deletes_removed( # Archived copy should exist archive_key = ( - f"{DEFAULT_PATH_PREFIX}archive/2024-01/" + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/" f"raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" ) resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key) @@ -152,7 +152,7 @@ def test_archives_updated_without_deleting( """Verify updated accessions are archived but originals remain.""" accession = "GCF_000001215.4" asm_dir = f"{accession}_Release_6" - key = 
f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"original-data") manifest = tmp_path / "updated.txt" @@ -173,7 +173,7 @@ def test_archives_updated_without_deleting( # Archived copy exists with correct metadata archive_key = ( - f"{DEFAULT_PATH_PREFIX}archive/2024-06/" + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/" f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" ) resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=archive_key) @@ -186,7 +186,7 @@ def test_multiple_releases_no_collision( """Verify archiving the same accession in different releases creates distinct folders.""" accession = "GCF_000001215.4" asm_dir = f"{accession}_Release_6" - key = f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v1-data") manifest = tmp_path / "updated.txt" @@ -202,11 +202,11 @@ def test_multiple_releases_no_collision( _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated") archive_key_1 = ( - f"{DEFAULT_PATH_PREFIX}archive/2024-01/" + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/" f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" ) archive_key_2 = ( - f"{DEFAULT_PATH_PREFIX}archive/2024-06/" + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/" f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" ) resp1 = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_1) @@ -220,7 +220,7 @@ def test_dry_run_no_side_effects( ) -> None: """Verify dry_run does not copy or delete anything.""" accession = "GCF_000005845.2" - key = 
f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") manifest = tmp_path / "removed.txt" @@ -241,7 +241,7 @@ def test_dry_run_no_side_effects( assert resp.get("KeyCount", 0) == 1 # No archive created - archive_prefix = f"{DEFAULT_PATH_PREFIX}archive/2024-01/" + archive_prefix = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/" resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_prefix) assert resp.get("KeyCount", 0) == 0 @@ -261,7 +261,7 @@ def test_unknown_release_fallback( """Verify ncbi_release=None falls back to 'unknown'.""" accession = "GCF_000001215.4" asm_dir = f"{accession}_Release_6" - key = f"{DEFAULT_PATH_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") manifest = tmp_path / "updated.txt" @@ -271,7 +271,7 @@ def test_unknown_release_fallback( assert count == 1 archive_key = ( - f"{DEFAULT_PATH_PREFIX}archive/unknown/" + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/unknown/" f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" ) resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key) diff --git a/tests/pipelines/test_ncbi_ftp_download.py b/tests/pipelines/test_ncbi_ftp_download.py index 1dfc0997..90df299f 100644 --- a/tests/pipelines/test_ncbi_ftp_download.py +++ b/tests/pipelines/test_ncbi_ftp_download.py @@ -92,9 +92,9 @@ def test_manifest_alias_m(self) -> None: s = make_settings(m="/data/m.txt") assert s.manifest == "/data/m.txt" - def test_output_dir_alias_o(self) -> None: - """Verify 'o' alias resolves to output_dir.""" - s = 
make_settings(o="/data/o") + def test_output_dir_alias(self) -> None: + """Verify 'output_dir' / 'output-dir' alias resolves to output_dir.""" + s = make_settings(output_dir="/data/o") assert s.output_dir == "/data/o" def test_threads_alias_t(self) -> None: From e5bc2f6b74a1c69a742b6edc3e83f0e235607a89 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Fri, 17 Apr 2026 14:56:19 -0700 Subject: [PATCH 05/76] cleanup and formatting --- cdm_ncbi_fto.prompt.md | 292 ------------------------------ notebooks/ncbi_ftp_manifest.ipynb | 15 +- notebooks/ncbi_ftp_promote.ipynb | 4 +- tests/ncbi_ftp/test_notebooks.py | 6 +- tests/ncbi_ftp/test_promote.py | 8 +- tests/utils/test_s3.py | 10 +- 6 files changed, 18 insertions(+), 317 deletions(-) delete mode 100644 cdm_ncbi_fto.prompt.md diff --git a/cdm_ncbi_fto.prompt.md b/cdm_ncbi_fto.prompt.md deleted file mode 100644 index a1e52592..00000000 --- a/cdm_ncbi_fto.prompt.md +++ /dev/null @@ -1,292 +0,0 @@ -# Plan: Port NCBI Assembly Sync to cdm-data-loaders - -Port the 3-phase NCBI assembly transfer pipeline from this repo -([kbase-transfers](https://github.com/kbase/kbase-transfers)) on the `develop-ncbi-automation` branchinto -[kbase/cdm-data-loaders](https://github.com/kbase/cdm-data-loaders/tree/develop) -(develop branch). - -- **Phase 2** (container download) becomes a new CTS entrypoint command. -- **Phases 1 and 3** become Jupyter notebooks in `notebooks/`. -- The existing cdm-data-loaders `utils/s3.py` gets new functions for metadata support - (existing functions are not modified). -- Tests use **moto** (matching cdm-data-loaders conventions). -- FTP logic stays as **ftplib**. -- New code lives in a dedicated `src/cdm_data_loaders/ncbi_ftp/` module, - separate from existing NCBI REST / refseq code. 
- -### Phase responsibilities - -Each phase has a deliberately narrow scope: - -| Phase | Input | Output | Responsibility | -|-------|-------|--------|----------------| -| 1 — Manifest | NCBI assembly summary + previous snapshot from S3 | `transfer_manifest.txt`, `removed_manifest.txt`, `diff_summary.json` | **All** filtering logic (prefix ranges, limits, diffing). Produces a final list of what to transfer and what to archive. | -| 2 — Download | `transfer_manifest.txt` (from input mount) | Downloaded files in output mount, preserving FTP directory structure (`GCF/000/001/.../assembly_dir/`) | Reads the manifest; downloads exactly those assemblies from NCBI FTP; verifies MD5; writes `.md5` sidecars. No filtering, no S3 access. | -| 3 — Promote | Downloaded files in S3 staging prefix + `removed_manifest.txt` | Files at final Lakehouse paths; archived replaced assemblies | Syncs staging → final location. Archives replaced/suppressed assemblies. **Removes successfully-promoted entries from `transfer_manifest.txt`** so an interrupted Phase 2 can resume from where it left off. 
| - ---- - -## Background: the 3-phase pipeline - -The pipeline is documented in this repo's -[scripts/ncbi/README.md](README.md#semi-automated-transfer-pipeline): - -| Phase | Script (this repo) | Purpose | -|-------|-------------------|---------| -| 1 — Manifest | [`generate_transfer_manifest.py`](generate_transfer_manifest.py) | Diff NCBI assembly summary against previous snapshot; produce download + removal manifests | -| 2 — Download | [`container_download.py`](container_download.py) | Download assemblies from NCBI FTP, verify MD5, write `.md5` sidecars (runs in CTS container) | -| 3 — Promote | [`promote_staged_files.py`](promote_staged_files.py) | Copy staged files to final Lakehouse paths; archive replaced/suppressed assemblies | - -Supporting code in this repo: - -| File | What to port | -|------|-------------| -| [`download_genomes.py`](download_genomes.py) | FTP resilience patterns (TCP keepalive, NOOP pings, thread-local connections), file filters, accession path construction | -| [`../../kbase_transfers/minio_client.py`](../../kbase_transfers/minio_client.py) | Metadata-aware upload pattern (MD5 as user metadata, CRC64/NVME checksums) | -| [`../../tests/test_sync.py`](../../tests/test_sync.py) | Unit tests for parsing, diffing — port to moto-based tests | -| [`../../tests/test_minio_client.py`](../../tests/test_minio_client.py) | Integration test patterns for S3 operations | - ---- - -## Phase A: Extend cdm-data-loaders `utils/s3.py` - -The promote step needs to attach user-metadata (MD5) to uploads and read -checksums via HEAD. The existing -[`s3.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/utils/s3.py) -doesn't support custom metadata on upload or `head_object` with checksum -retrieval. - -**Prefer adding new functions over modifying existing ones to minimise -impact on other scripts.** - -### Steps - -1. 
**Add `upload_file_with_metadata()`** — new function that accepts - `local_file_path`, `destination_dir`, `metadata: dict[str, str]`, - optional `object_name`. Passes `Metadata` in `ExtraArgs` alongside the - existing `ChecksumAlgorithm: CRC64NVME`. Same upload logic as - `upload_file()` but with metadata support. - -2. **Add `head_object(s3_path)`** — new function returning dict with `size`, - `metadata`, `checksum_crc64nvme` (from `ChecksumCRC64NVME` header), or - `None` if 404. Uses `ChecksumMode='ENABLED'`. - -3. **Add `copy_object_with_metadata()`** — new function wrapping - `s3.copy_object()` that accepts `metadata` + `MetadataDirective='REPLACE'` - for archiving replaced assemblies with tags (`archive_reason`, `archive_date`, - `ncbi_last_release`). - -4. **Add moto-based tests** following the existing `mock_s3_client` fixture pattern - in [`tests/utils/test_s3.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/tests/utils/test_s3.py). - Use the existing `strip_checksum_algorithm` workaround for moto's CRC64NVME limitation. - -**Files modified:** -- `src/cdm_data_loaders/utils/s3.py` — add 3 new functions (no changes to existing functions) -- `tests/utils/test_s3.py` — add corresponding tests - ---- - -## Phase B: Create the `ncbi_ftp` module - -New module at `src/cdm_data_loaders/ncbi_ftp/`, separate from the -existing `ncbi_rest_api.py` pipeline and `refseq_pipeline/`. - -``` -src/cdm_data_loaders/ncbi_ftp/ -├── __init__.py -├── ftp_client.py # FTP: connect, keepalive, list, download, retry -├── manifest.py # Phase 1: summary diffing, manifest generation, ALL filtering -├── download.py # Phase 2: CTS container download + MD5 verification -├── promote.py # Phase 3: sync staging → final, archive, trim manifest -├── checksums.py # MD5 verification, CRC64/NVME computation -└── settings.py # Pydantic settings (extends CtsDefaultSettings) -``` - -### Steps - -5. 
**`ftp_client.py`** — Port from [`download_genomes.py`](download_genomes.py) - and [`container_download.py`](container_download.py): - - `connect_ftp(host, timeout)` with TCP keepalive (`SO_KEEPALIVE`, `TCP_KEEPIDLE`, `TCP_KEEPINTVL`) - - `ftp_noop_keepalive(ftp, interval)` — NOOP sender for idle connections - - `ftp_list_dir(ftp, path)` — NLST wrapper with retry on `error_temp` - - `ftp_download_file(ftp, remote_path, local_path)` — `RETR` with retry - - Thread-local FTP connection management for parallel downloads - - Use `get_cdm_logger()` instead of print statements - -6. **`checksums.py`** — Port from [`download_genomes.py`](download_genomes.py): - - `compute_crc64nvme(file_path) -> str` — reads in 1MB chunks, returns base64-encoded 8-byte big-endian (uses `awscrt.checksums.crc64nvme`) - - `verify_md5(file_path, expected_md5) -> bool` - - `parse_md5_checksums_file(text) -> dict[str, str]` — parses NCBI `md5checksums.txt` - -7. **`settings.py`** — Pydantic settings following - [`cts_defaults.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/pipelines/cts_defaults.py) pattern: - - `DownloadSettings(CtsDefaultSettings)` — adds `manifest`, `threads`, `ftp_host` - - CLI-parseable with `AliasChoices`, `field_validator` for constraints - -8. **`manifest.py`** — Port from [`generate_transfer_manifest.py`](generate_transfer_manifest.py): - - `download_assembly_summary(ftp, database) -> str` - - `parse_assembly_summary(text) -> dict[str, AssemblyRecord]` - - `compute_diff(current, previous) -> DiffResult` — new/updated/replaced/suppressed/withdrawn - - `write_manifest(diff, output_dir)` — writes `transfer_manifest.txt`, `removed_manifest.txt`, `diff_summary.json` - - **All filtering logic lives here**: prefix-range filtering (`--prefix-from`, `--prefix-to`), `--limit`, any other subsetting - - The output `transfer_manifest.txt` is the final, filtered list — Phase 2 downloads exactly what's in it - -9. 
**`download.py`** — Port from [`container_download.py`](container_download.py). - **This phase is deliberately simple**: read manifest, download, verify. - - `run_download(settings: DownloadSettings)` — main CTS entry point - - Reads `transfer_manifest.txt` from input mount; each line is an FTP path - - `download_assembly(ftp, ftp_path, output_dir) -> DownloadResult` - - File filters: `_genomic.fna.gz`, `_genomic.gff.gz`, `_protein.faa.gz`, `_gene_ontology.gaf.gz`, `_assembly_report.txt`, `_assembly_stats.txt`, etc. - - MD5 verification (3 retries), `.md5` sidecar writing - - `ThreadPoolExecutor` for parallel downloads - - Output preserves FTP directory structure: `{GCF|GCA}/{000}/{001}/{215}/{assembly_dir}/` (same subfolder hierarchy as on the FTP server) - - Writes `download_report.json` summary - - `cli()` function using `run_cli(DownloadSettings, run_download)` from - [`core.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/pipelines/core.py) - - **No filtering or subsetting logic** — downloads exactly what's in the manifest - -10. **`promote.py`** — Port from [`promote_staged_files.py`](promote_staged_files.py): - - `run_promote(staging_prefix, removed_manifest, release_tag, manifest_path)` - - Walk staged files in S3 staging prefix, upload each to final Lakehouse path via `upload_file_with_metadata()` with MD5 from `.md5` sidecars - - Skip `.md5` and `.crc64nvme` sidecar files themselves - - Archive replaced/suppressed assemblies (from `removed_manifest.txt`): `copy_object_with_metadata()` to `archive/{release}/...` with metadata tags, then `delete_object()` - - **Manifest trimming for resumability**: after successfully promoting an assembly's files, remove that entry from `transfer_manifest.txt`. If Phase 2 is re-run after a partial failure, it only downloads the remaining entries. - - `--dry-run` support - ---- - -## Phase C: Notebooks for Phases 1 and 3 - -11. 
**`notebooks/ncbi_ftp_manifest.ipynb`** — Phase 1: - - Cell 1: imports + S3 client init (`get_s3_client()`) - - Cell 2: configure parameters (database, prefix-from/to, limit, dry-run) - - Cell 3: download current assembly summary from FTP - - Cell 4: load previous summary from S3 (or scan existing prefixes) - - Cell 5: compute diff, display summary - - Cell 6: apply filters (prefix range, limit), write manifest files, upload new summary to S3 - -12. **`notebooks/ncbi_ftp_promote.ipynb`** — Phase 3: - - Cell 1: imports + S3 client init - - Cell 2: configure parameters (staging prefix, removed manifest path, release tag, manifest path for trimming, dry-run) - - Cell 3: scan staged files, display summary - - Cell 4: promote files to final paths - - Cell 5: archive replaced/suppressed assemblies - - Cell 6: trim manifest (remove promoted entries), display promotion report - ---- - -## Phase D: Container integration (Phase 2) - -13. Register CLI entry point in `pyproject.toml`: - ```toml - [project.scripts] - ncbi_ftp_sync = "cdm_data_loaders.ncbi_ftp.download:cli" - ``` - -14. Add command to `scripts/entrypoint.sh`: - ```bash - ncbi_ftp_sync) - exec /usr/bin/tini -- uv run --no-sync ncbi_ftp_sync "$@" - ;; - ``` - -15. No Dockerfile changes needed (package installed via `uv sync`; entrypoint dispatches). - ---- - -## Phase E: Tests - -All tests use **moto** for S3 mocking. No live MinIO dependency in CI. 
- -``` -tests/ncbi_ftp/ -├── __init__.py -├── conftest.py # Mock FTP, sample manifests, assembly records -├── test_ftp_client.py # Mock ftplib: keepalive, retry, thread-local -├── test_checksums.py # MD5 verify, CRC64/NVME, md5checksums.txt parsing -├── test_manifest.py # Summary parsing, diff logic, filtering (port from test_sync.py) -├── test_download.py # Mock FTP + fs: filters, MD5 verify, sidecars, layout -├── test_promote.py # moto S3: upload with metadata, archive, manifest trimming, dry-run -└── test_settings.py # Pydantic validation (follow test_ncbi_rest_api.py) -``` - -### Steps - -16. **`test_checksums.py`** — `verify_md5` correct/incorrect, `parse_md5_checksums_file`, - `compute_crc64nvme` (skip if `awscrt` unavailable) - -17. **`test_manifest.py`** — Port relevant tests from this repo's - [`tests/test_sync.py`](../../tests/test_sync.py): `parse_assembly_summary`, - `compute_diff` (new/updated/replaced/suppressed/withdrawn), `write_manifest`, - prefix-range filtering, limit filtering - -18. **`test_ftp_client.py`** — Mock `ftplib.FTP`: keepalive options, retry on - `error_temp`, thread-local connections - -19. **`test_download.py`** — Mock FTP + filesystem: file filter logic, MD5 - verification, sidecar writing, directory layout preserves FTP structure, - `download_report.json` - -20. **`test_promote.py`** — moto `mock_s3_client` fixture: upload with metadata, - archive copy with tags, deletion of originals, **manifest trimming** (verify - promoted entries removed, remaining entries preserved), dry-run no side effects. - Use `strip_checksum_algorithm` workaround for CRC64NVME. - -21. **`test_settings.py`** — Follow - [`test_ncbi_rest_api.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/tests/pipelines/test_ncbi_rest_api.py) - pattern: defaults, all params, CLI variants, invalid values, boolean parsing - ---- - -## Phase F: Dependencies and CI - -22. Add `awscrt` to `pyproject.toml` if not already covered by `boto3[crt]`. - -23. 
All new tests run automatically in CI — no `requires_spark` marks needed. - ruff checks apply (120 char lines, py313 target). - ---- - -## Key reference patterns in cdm-data-loaders - -| Pattern | Where to find it | -|---------|-----------------| -| S3 utility functions + moto tests | [`utils/s3.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/utils/s3.py), [`tests/utils/test_s3.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/tests/utils/test_s3.py) | -| CTS settings base class | [`pipelines/cts_defaults.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/pipelines/cts_defaults.py) | -| `run_cli()` entry point | [`pipelines/core.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/pipelines/core.py) | -| Logger | [`utils/cdm_logger.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/src/cdm_data_loaders/utils/cdm_logger.py) | -| Settings test pattern | [`tests/pipelines/test_ncbi_rest_api.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/tests/pipelines/test_ncbi_rest_api.py) | -| Entrypoint dispatch | [`scripts/entrypoint.sh`](https://github.com/kbase/cdm-data-loaders/blob/develop/scripts/entrypoint.sh) | -| moto CRC64NVME workaround | `strip_checksum_algorithm()` in [`tests/utils/test_s3.py`](https://github.com/kbase/cdm-data-loaders/blob/develop/tests/utils/test_s3.py) | - ---- - -## Verification - -1. `ruff check src/cdm_data_loaders/ncbi_ftp/ tests/ncbi_ftp/` — lint passes -2. `ruff format --check src/cdm_data_loaders/ncbi_ftp/ tests/ncbi_ftp/` — formatting passes -3. `uv run pytest tests/ncbi_ftp/ -v` — all unit tests pass -4. `uv run pytest tests/utils/test_s3.py -v` — new S3 function tests pass -5. Manual: build Docker image, run `ncbi_ftp_sync` with small manifest against local MinIO -6. 
Manual: run both notebooks against local MinIO for end-to-end verification - ---- - -## Decisions - -- **`ncbi_ftp` naming** — distinguishes bulk FTP file transfer from the existing NCBI REST API pipeline (`ncbi_rest_api.py`) and Spark-based refseq processing (`refseq_pipeline/`) -- **New functions in `s3.py`, not modified existing ones** — minimises impact on other scripts; avoids signature changes that could break callers -- **All filtering in Phase 1** — Phase 2 is a pure download-what's-in-the-manifest step; Phase 3 is a pure sync-and-archive step. Clean separation of concerns. -- **Manifest trimming in Phase 3** — enables resumable Phase 2 runs. After promoting files, Phase 3 removes those entries from `transfer_manifest.txt`. Re-running Phase 2 only downloads what's left. -- **Output preserves FTP directory structure** — Phase 2 writes files under the same `GCF/000/001/.../assembly_dir/` path used on the FTP server, making it trivial to correlate staged files with their NCBI source -- **moto for tests** — matches cdm-data-loaders conventions; fast, no Docker in CI. The `strip_checksum_algorithm` workaround handles the CRC64NVME gap. 
-- **ftplib over httpx** — NCBI FTP is the established bulk download protocol; existing keepalive/NOOP/retry patterns are proven -- **Notebooks for Phases 1 & 3** — interactive, judgement-requiring steps; natural fit for JupyterLab -- **Phase 2 as CTS command** — matches the entrypoint dispatch pattern and CTS mount contract - -## Excluded from scope - -- Frictionless `datapackage.json` descriptors (only in old monolithic `download_genomes.py`) -- `backfill_checksums.py` (legacy utility, not part of ongoing pipeline) -- `download_genomes.py` monolith (superseded by the 3-phase pipeline) -- Spark/Delta Lake integration (assembly sync is file-level, not data transformation) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 20ee5016..e59c6043 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -201,9 +201,15 @@ "\n", " candidates = diff.new + diff.updated\n", " print(f\"Verifying {len(candidates)} candidates against s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} ...\")\n", - " confirmed = set(verify_transfer_candidates(\n", - " candidates, filtered, STORE_BUCKET, STORE_KEY_PREFIX, ftp_host=FTP_HOST,\n", - " ))\n", + " confirmed = set(\n", + " verify_transfer_candidates(\n", + " candidates,\n", + " filtered,\n", + " STORE_BUCKET,\n", + " STORE_KEY_PREFIX,\n", + " ftp_host=FTP_HOST,\n", + " )\n", + " )\n", " before = len(diff.new) + len(diff.updated)\n", " diff.new = [a for a in diff.new if a in confirmed]\n", " diff.updated = [a for a in diff.updated if a in confirmed]\n", @@ -290,8 +296,7 @@ " bucket, prefix = split_s3_path(STAGING_URI)\n", " prefix = prefix.rstrip(\"/\") + \"/\"\n", "\n", - " for manifest in [\"transfer_manifest.txt\", \"removed_manifest.txt\",\n", - " \"updated_manifest.txt\", \"diff_summary.json\"]:\n", + " for manifest in [\"transfer_manifest.txt\", \"removed_manifest.txt\", \"updated_manifest.txt\", \"diff_summary.json\"]:\n", " local_path = OUTPUT_DIR / manifest\n", " if 
local_path.exists():\n", " key = f\"{prefix}{manifest}\"\n", diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb index 5dd46312..1da31ab2 100644 --- a/notebooks/ncbi_ftp_promote.ipynb +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -184,10 +184,10 @@ "print(f\"Dry-run: {report['dry_run']}\")\n", "print(f\"Timestamp: {report['timestamp']}\")\n", "\n", - "if report['failed'] > 0:\n", + "if report[\"failed\"] > 0:\n", " print(\"\\n⚠️ Some operations failed — check logs above for details.\")\n", "\n", - "if report['dry_run']:\n", + "if report[\"dry_run\"]:\n", " print(\"\\n📋 This was a dry-run. Set DRY_RUN = False and re-run to apply changes.\")" ] } diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py index fdd4a372..ba19c2a9 100644 --- a/tests/ncbi_ftp/test_notebooks.py +++ b/tests/ncbi_ftp/test_notebooks.py @@ -40,11 +40,7 @@ def _extract_code_cells(notebook_path: Path) -> list[str]: """ with notebook_path.open() as f: nb = json.load(f) - return [ - "".join(cell.get("source", [])) - for cell in nb.get("cells", []) - if cell.get("cell_type") == "code" - ] + return ["".join(cell.get("source", [])) for cell in nb.get("cells", []) if cell.get("cell_type") == "code"] @pytest.mark.parametrize("notebook", NCBI_NOTEBOOKS) diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index a2a98e2b..fa9898d0 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -42,9 +42,7 @@ def test_dry_run_no_writes(self, mock_s3_client_no_checksum: botocore.client.Bas assert report["dry_run"] is True # Final path should NOT exist - final_key = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" - ) + final_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=final_key) assert 
resp.get("KeyCount", 0) == 0 @@ -61,9 +59,7 @@ def test_promotes_with_metadata(self, mock_s3_client_no_checksum: botocore.clien assert report["failed"] == 0 # Check final object exists with metadata - final_key = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" - ) + final_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=final_key) assert resp["Metadata"].get("md5") == "md5hash123" diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index 26292a16..cb544a4a 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -559,7 +559,7 @@ def mocked_s3_client_no_checksum(mock_s3_client: Any) -> Generator[Any, Any]: allowing copy_object calls that include ChecksumAlgorithm to succeed. """ mock_s3_client.copy_object = strip_checksum_algorithm(mock_s3_client.copy_object) - yield mock_s3_client + return mock_s3_client # copy_object @@ -656,9 +656,7 @@ def test_upload_file_with_metadata_accepts_str_and_path(sample_file: Path, path_ @pytest.mark.s3 def test_head_object_returns_info(mock_s3_client: Any) -> None: """Verify that head_object returns size, metadata, and checksum fields.""" - mock_s3_client.put_object( - Bucket=CDM_LAKE_BUCKET, Key="info/file.txt", Body=b"hello", Metadata={"md5": "abc123"} - ) + mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key="info/file.txt", Body=b"hello", Metadata={"md5": "abc123"}) result = head_object(f"{CDM_LAKE_BUCKET}/info/file.txt") assert result is not None assert result["size"] == 5 @@ -687,9 +685,7 @@ def test_head_object_with_protocols(mock_s3_client: Any, protocol: str) -> None: # copy_object_with_metadata @pytest.mark.parametrize("destination", BUCKETS) @pytest.mark.s3 -def test_copy_object_with_metadata_replaces_metadata( - mocked_s3_client_no_checksum: Any, destination: str -) -> None: +def 
test_copy_object_with_metadata_replaces_metadata(mocked_s3_client_no_checksum: Any, destination: str) -> None: """Verify that copy_object_with_metadata copies and replaces metadata.""" mocked_s3_client_no_checksum.put_object( Bucket=CDM_LAKE_BUCKET, Key="src/file.txt", Body=b"archive me", Metadata={"old_key": "old_val"} From de8d325eabdbd314d565e1984fb11bee7c0812df Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Fri, 17 Apr 2026 15:06:19 -0700 Subject: [PATCH 06/76] formatting --- src/cdm_data_loaders/ncbi_ftp/manifest.py | 2 +- src/cdm_data_loaders/utils/s3.py | 28 ++++++++++---------- tests/utils/test_s3.py | 31 ++++++++++++++--------- 3 files changed, 33 insertions(+), 28 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/manifest.py b/src/cdm_data_loaders/ncbi_ftp/manifest.py index e67c2185..f6a9fce1 100644 --- a/src/cdm_data_loaders/ncbi_ftp/manifest.py +++ b/src/cdm_data_loaders/ncbi_ftp/manifest.py @@ -260,7 +260,7 @@ def _ftp_dir_from_url(ftp_url: str, ftp_host: str = FTP_HOST) -> str: # ── Checksum verification against S3 store ─────────────────────────────── -def verify_transfer_candidates( +def verify_transfer_candidates( # noqa: PLR0912, PLR0915 accessions: list[str], current_assemblies: dict[str, AssemblyRecord], bucket: str, diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index 6fc1696a..c3897904 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -35,7 +35,7 @@ def get_s3_client(args: dict[str, str] | None = None) -> botocore.client.BaseCli if not args: try: - from berdl_notebook_utils.berdl_settings import get_settings + from berdl_notebook_utils.berdl_settings import get_settings # noqa: PLC0415 settings = get_settings() args = { @@ -44,9 +44,7 @@ def get_s3_client(args: dict[str, str] | None = None) -> botocore.client.BaseCli "aws_secret_access_key": settings.MINIO_SECRET_KEY, } except (ModuleNotFoundError, ImportError, NameError) as e: - print(e) - raise - except 
Exception: + print(e) # noqa: T201 raise required_args = ["endpoint_url", "aws_access_key_id", "aws_secret_access_key"] @@ -139,10 +137,10 @@ def object_exists(s3_path: str) -> bool: (bucket, key) = split_s3_path(s3_path) try: s3.head_object(Bucket=bucket, Key=key) - except Exception as e: + except botocore.exceptions.ClientError as e: error_string = str(e) if not error_string.startswith("An error occurred (404) when calling the HeadObject operation: Not Found"): - print(f"Error performing head operation on s3 object: {e!s}") + print(f"Error performing head operation on s3 object: {e!s}") # noqa: T201 return False return True @@ -175,7 +173,7 @@ def upload_file( s3_path = f"{destination_dir.removesuffix('/')}/{object_name}" if object_exists(s3_path): - print(f"File already present: {s3_path}") + print(f"File already present: {s3_path}") # noqa: T201 return True s3 = get_s3_client() @@ -184,7 +182,7 @@ def upload_file( # Upload the file file_size = local_file_path.stat().st_size with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: - print(f"uploading {local_file_path!s} to {s3_path}") + print(f"uploading {local_file_path!s} to {s3_path}") # noqa: T201 try: s3.upload_file( Filename=str(local_file_path), @@ -193,8 +191,8 @@ def upload_file( Callback=pbar.update, ExtraArgs=DEFAULT_EXTRA_ARGS, ) - except Exception as e: - print(f"Error uploading to s3: {e!s}") + except (botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError) as e: + print(f"Error uploading to s3: {e!s}") # noqa: T201 return False return True @@ -219,8 +217,8 @@ def download_file(s3_path: str, local_file_path: str | Path, version_id: str | N if not parent_dir.is_dir(): try: parent_dir.mkdir(parents=True, exist_ok=False) - except Exception as e: - print(f"Could not save s3 file to {local_file_path}: {e!s}") + except OSError as e: + print(f"Could not save s3 file to {local_file_path}: {e!s}") # noqa: T201 raise s3 = get_s3_client() @@ -232,12 +230,12 @@ 
def download_file(s3_path: str, local_file_path: str | Path, version_id: str | N # Get the object size try: object_size = s3.head_object(**kwargs)["ContentLength"] - except Exception as e: + except botocore.exceptions.ClientError as e: error_string = str(e) if error_string.startswith("An error occurred (404) when calling the HeadObject operation: Not Found"): - print(f"File not found: {s3_path}") + print(f"File not found: {s3_path}") # noqa: T201 else: - print(f"Error downloading {s3_path}: {e!s}") + print(f"Error downloading {s3_path}: {e!s}") # noqa: T201 raise extra_args = {"VersionId": version_id} if version_id is not None else None diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index cb544a4a..9050448f 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -47,6 +47,11 @@ } BUCKETS = [CDM_LAKE_BUCKET, ALT_BUCKET] +HTTP_STATUS_OK = 200 +HTTP_STATUS_NO_CONTENT = 204 +SIZE_HELLO = 5 +SIZE_DATA = 4 + @pytest.fixture def mock_s3_client() -> Generator[Any, Any]: @@ -307,7 +312,7 @@ def test_list_matching_objects_empty_for_missing_prefix( } -# TODO: use a single fixture for all these tests +# NOTE: These tests currently compose multiple fixtures explicitly for readability. 
@pytest.mark.parametrize("dir_path", EXPECTED_FILE_LIST.keys()) @pytest.mark.s3 def test_list_matching_objects_returns_more_than_1000_entries( @@ -478,7 +483,8 @@ def test_download_file_does_not_clobber_existing_file_to_mkdir(mock_s3_client: A @pytest.mark.s3 -def test_download_file_does_not_exist(mock_s3_client: Any, tmp_path: Path, capsys: pytest.CaptureFixture) -> None: +@pytest.mark.usefixtures("mock_s3_client") +def test_download_file_does_not_exist(tmp_path: Path, capsys: pytest.CaptureFixture) -> None: """Ensure that attempting to download a file that does not exist raises an error.""" bucket = BUCKETS[0] key = "to/the/door.txt" @@ -533,8 +539,8 @@ def test_upload_dir_raises_on_empty_destination(sample_dir: Path) -> None: upload_dir(sample_dir, "") -# FIXME: once moto supports CRC64NVME, this can be removed -def strip_checksum_algorithm(method: Callable): +# NOTE: Moto currently does not support CRC64NVME; remove this helper when it does. +def strip_checksum_algorithm(method: Callable[..., Any]) -> Callable[..., Any]: """Wrap a boto3 S3 method to remove the ChecksumAlgorithm argument before calling moto. 
Moto does not implement CRC64NVME checksums, so any call that includes @@ -543,7 +549,7 @@ def strip_checksum_algorithm(method: Callable): """ @functools.wraps(method) - def wrapper(*args, **kwargs): + def wrapper(*args: Any, **kwargs: Any) -> Any: """Remove the ChecksumAlgorithm argument from the call.""" kwargs.pop("ChecksumAlgorithm", None) return method(*args, **kwargs) @@ -577,7 +583,7 @@ def test_copy_file(mocked_s3_client_no_checksum: Any, destination: str) -> None: obj = mocked_s3_client_no_checksum.get_object(Bucket=destination, Key="dst/path/to/file.txt") assert obj["Body"].read() == b"copy me" - assert response["ResponseMetadata"]["HTTPStatusCode"] == 200 + assert response["ResponseMetadata"]["HTTPStatusCode"] == HTTP_STATUS_OK # delete_object @@ -592,12 +598,12 @@ def test_delete_object_removes_object(mock_s3_client: Any, bucket: str, protocol resp = delete_object(s3_path) assert object_exists(s3_path) is False - assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == 204 + assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == HTTP_STATUS_NO_CONTENT # retry the deletion resp = delete_object(s3_path) assert object_exists(s3_path) is False - assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == 204 + assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == HTTP_STATUS_NO_CONTENT # upload_file_with_metadata @@ -659,14 +665,15 @@ def test_head_object_returns_info(mock_s3_client: Any) -> None: mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key="info/file.txt", Body=b"hello", Metadata={"md5": "abc123"}) result = head_object(f"{CDM_LAKE_BUCKET}/info/file.txt") assert result is not None - assert result["size"] == 5 + assert result["size"] == SIZE_HELLO assert result["metadata"]["md5"] == "abc123" # moto may not populate CRC64NVME, but the key should be present assert "checksum_crc64nvme" in result @pytest.mark.s3 -def test_head_object_returns_none_for_missing(mock_s3_client: Any) -> None: 
+@pytest.mark.usefixtures("mock_s3_client") +def test_head_object_returns_none_for_missing() -> None: """Verify that head_object returns None for a non-existent object.""" result = head_object(f"{CDM_LAKE_BUCKET}/does/not/exist.txt") assert result is None @@ -679,7 +686,7 @@ def test_head_object_with_protocols(mock_s3_client: Any, protocol: str) -> None: mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key="proto/file.txt", Body=b"data") result = head_object(f"{protocol}{CDM_LAKE_BUCKET}/proto/file.txt") assert result is not None - assert result["size"] == 4 + assert result["size"] == SIZE_DATA # copy_object_with_metadata @@ -696,7 +703,7 @@ def test_copy_object_with_metadata_replaces_metadata(mocked_s3_client_no_checksu f"{destination}/archive/file.txt", metadata=new_metadata, ) - assert response["ResponseMetadata"]["HTTPStatusCode"] == 200 + assert response["ResponseMetadata"]["HTTPStatusCode"] == HTTP_STATUS_OK # verify the destination has the new metadata, not the old resp = mocked_s3_client_no_checksum.head_object(Bucket=destination, Key="archive/file.txt") From 3f0e6040ecdf3ca0d983f6c8209cb5e7b0582b50 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 11:50:06 -0700 Subject: [PATCH 07/76] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/ncbi_ftp/test_notebooks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py index ba19c2a9..07b764e1 100644 --- a/tests/ncbi_ftp/test_notebooks.py +++ b/tests/ncbi_ftp/test_notebooks.py @@ -72,6 +72,8 @@ class TestManifestNotebookImports: def test_imports_resolve(self) -> None: """All manifest notebook imports are verified at module load time above.""" + assert isinstance(FTP_HOST, str) + assert FTP_HOST assert callable(download_assembly_summary) assert callable(write_updated_manifest) From 
4f93d5f0685b3e72522fa905d3576e9f028da24d Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 11:51:14 -0700 Subject: [PATCH 08/76] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/ncbi_ftp/test_notebooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py index 07b764e1..d73b850c 100644 --- a/tests/ncbi_ftp/test_notebooks.py +++ b/tests/ncbi_ftp/test_notebooks.py @@ -22,7 +22,7 @@ DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3, ) -from cdm_data_loaders.utils.s3 import get_s3_client, split_s3_path # noqa: F401 +from cdm_data_loaders.utils.s3 import split_s3_path # noqa: F401 NOTEBOOKS_DIR = Path(__file__).resolve().parents[2] / "notebooks" From 399a00d0ba69dc4838b2a1e75e888b7e4f765ba7 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 11:51:50 -0700 Subject: [PATCH 09/76] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- scripts/entrypoint.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index ee087591..b5409887 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -1,9 +1,11 @@ #!/usr/bin/env bash set -euo pipefail +supported_commands="xml_split|uniref|uniprot|ncbi_rest_api|ncbi_ftp_sync|test|bash" + # Ensure at least one argument is provided if [ "$#" -eq 0 ]; then - echo "Usage: $0 {uniref|uniprot|ncbi_rest_api|ncbi_ftp_sync|xml_split|test} [args...]" + echo "Usage: $0 {$supported_commands} [args...]" exit 1 fi @@ -39,7 +41,7 @@ case "$cmd" in exec /usr/bin/tini -- /bin/bash ;; *) - echo "Error: unknown command '$cmd'; valid commands are 'uniref', 'uniprot', 'ncbi_rest_api', 'ncbi_ftp_sync', or 'xml_split'." 
>&2 + echo "Error: unknown command '$cmd'; valid commands are {$supported_commands}." >&2 exit 1 ;; esac From fee2ab4acb380179838b2ac94b663cb9583ade52 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 11:52:52 -0700 Subject: [PATCH 10/76] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/cdm_data_loaders/ncbi_ftp/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/manifest.py b/src/cdm_data_loaders/ncbi_ftp/manifest.py index f6a9fce1..ae587c28 100644 --- a/src/cdm_data_loaders/ncbi_ftp/manifest.py +++ b/src/cdm_data_loaders/ncbi_ftp/manifest.py @@ -251,7 +251,7 @@ def compute_diff( # noqa: PLR0912 def _ftp_dir_from_url(ftp_url: str, ftp_host: str = FTP_HOST) -> str: """Convert an FTP URL from the assembly summary to an FTP directory path.""" if ftp_url.startswith("https://"): - return ftp_url.replace("https://ftp.ncbi.nlm.nih.gov", "") + return ftp_url.replace(f"https://{ftp_host}", "") if ftp_url.startswith("ftp://"): return ftp_url.replace(f"ftp://{ftp_host}", "") return ftp_url From 5233d9dd9ca30b74c525a9dbe8d6a7dc96f055e7 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 11:53:44 -0700 Subject: [PATCH 11/76] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/cdm_data_loaders/ncbi_ftp/promote.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 52d9e57f..ccf6bc92 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -58,13 +58,14 @@ def promote_from_s3( # noqa: PLR0913 """ s3 = get_s3_client() paginator = s3.get_paginator("list_objects_v2") + normalized_staging_key_prefix = staging_key_prefix.rstrip("/") + "/" promoted = 0 failed = 0 # Collect 
all objects under the staging prefix staged_objects: list[str] = [] - for page in paginator.paginate(Bucket=bucket, Prefix=staging_key_prefix): + for page in paginator.paginate(Bucket=bucket, Prefix=normalized_staging_key_prefix): staged_objects.extend(obj["Key"] for obj in page.get("Contents", [])) # Separate data files from sidecars @@ -96,7 +97,7 @@ def promote_from_s3( # noqa: PLR0913 if staged_key.endswith("download_report.json"): continue - rel_path = staged_key[len(staging_key_prefix) :] + rel_path = staged_key[len(normalized_staging_key_prefix) :] if not rel_path.startswith("raw_data/"): continue final_key = lakehouse_key_prefix + rel_path From e5b58f24ff7f9e474867a014c2ffd0493a2112c1 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 11:54:29 -0700 Subject: [PATCH 12/76] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/cdm_data_loaders/ncbi_ftp/promote.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index ccf6bc92..1763aba3 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -9,7 +9,7 @@ import re import tempfile from datetime import UTC, datetime -from pathlib import Path +from pathlib import Path, PurePosixPath from typing import Any import botocore.exceptions @@ -120,11 +120,12 @@ def promote_from_s3( # noqa: PLR0913 md5_obj = s3.get_object(Bucket=bucket, Key=md5_key) metadata["md5"] = md5_obj["Body"].read().decode().strip() + final_key_path = PurePosixPath(final_key) upload_file_with_metadata( tmp_path, - f"{bucket}/{Path(final_key).parent}", + f"{bucket}/{final_key_path.parent}", metadata=metadata, - object_name=Path(final_key).name, + object_name=final_key_path.name, ) promoted += 1 From 64ead4c4d39889e7b26706df804ae7caf747de81 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 
2026 11:55:18 -0700 Subject: [PATCH 13/76] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/cdm_data_loaders/utils/s3.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index c3897904..65942b20 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -387,7 +387,7 @@ def upload_file_with_metadata( :type metadata: dict[str, str] :param object_name: S3 object name; defaults to the local filename :type object_name: str | None - :return: True if the upload succeeded + :return: True if the upload succeeded, otherwise False :rtype: bool """ if isinstance(local_file_path, str): @@ -407,14 +407,17 @@ def upload_file_with_metadata( extra_args = {**DEFAULT_EXTRA_ARGS, "Metadata": metadata} file_size = local_file_path.stat().st_size - with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: - s3.upload_file( - Filename=str(local_file_path), - Bucket=bucket, - Key=key, - Callback=pbar.update, - ExtraArgs=extra_args, - ) + try: + with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: + s3.upload_file( + Filename=str(local_file_path), + Bucket=bucket, + Key=key, + Callback=pbar.update, + ExtraArgs=extra_args, + ) + except (botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError): + return False return True From 9fba2c5ad4f4d0d6a148843b8135c553fff50a48 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 11:55:45 -0700 Subject: [PATCH 14/76] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- scripts/s3_local.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/s3_local.py b/scripts/s3_local.py index 65f80396..5802a133 100755 --- 
a/scripts/s3_local.py +++ b/scripts/s3_local.py @@ -24,9 +24,10 @@ from pathlib import Path import boto3 +from botocore.client import BaseClient -def _client() -> boto3.client: +def _client() -> BaseClient: return boto3.client( "s3", endpoint_url=os.environ.get("MINIO_ENDPOINT_URL", "http://localhost:9000"), From c89ad37603da7a483b8d7048665e67f0eca47ef1 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 12:02:16 -0700 Subject: [PATCH 15/76] address copilot comments --- tests/ncbi_ftp/test_notebooks.py | 1 + tests/utils/test_s3.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py index d73b850c..1ff8f0e4 100644 --- a/tests/ncbi_ftp/test_notebooks.py +++ b/tests/ncbi_ftp/test_notebooks.py @@ -74,6 +74,7 @@ def test_imports_resolve(self) -> None: """All manifest notebook imports are verified at module load time above.""" assert isinstance(FTP_HOST, str) assert FTP_HOST + assert AssemblyRecord is not None assert callable(download_assembly_summary) assert callable(write_updated_manifest) diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index 9050448f..83a6efc6 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -558,8 +558,8 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: @pytest.fixture -def mocked_s3_client_no_checksum(mock_s3_client: Any) -> Generator[Any, Any]: - """Yield the mocked S3 client with copy_object patched to strip ChecksumAlgorithm. +def mocked_s3_client_no_checksum(mock_s3_client: Any) -> Any: + """Return the mocked S3 client with copy_object patched to strip ChecksumAlgorithm. This works around the moto limitation of not supporting CRC64NVME checksums, allowing copy_object calls that include ChecksumAlgorithm to succeed. 
From 9a5e2fe213e803ff02098101e6317e7eb13e058b Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 12:03:41 -0700 Subject: [PATCH 16/76] increased timeout for trivy action --- .github/workflows/trivy.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/trivy.yaml b/.github/workflows/trivy.yaml index aa6735f4..ef7e0a1c 100644 --- a/.github/workflows/trivy.yaml +++ b/.github/workflows/trivy.yaml @@ -49,6 +49,7 @@ jobs: template: "@/contrib/sarif.tpl" output: "trivy-results.sarif" severity: "CRITICAL,HIGH" + timeout: "15m" - name: Upload Trivy scan results to GitHub Security tab uses: github/codeql-action/upload-sarif@v3 From de77b144fff4b1e1fc32cf484c3afc44d0fa2e9e Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 12:04:14 -0700 Subject: [PATCH 17/76] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/ncbi_ftp/test_notebooks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py index 1ff8f0e4..3a8c1cb2 100644 --- a/tests/ncbi_ftp/test_notebooks.py +++ b/tests/ncbi_ftp/test_notebooks.py @@ -76,6 +76,7 @@ def test_imports_resolve(self) -> None: assert FTP_HOST assert AssemblyRecord is not None assert callable(download_assembly_summary) + assert callable(compute_diff) assert callable(write_updated_manifest) From 7d9ced1faff4d8ec041a22aa7b0b8e574cd6d855 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 12:04:46 -0700 Subject: [PATCH 18/76] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/ncbi_ftp/test_notebooks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py index 3a8c1cb2..6d2e3758 100644 --- a/tests/ncbi_ftp/test_notebooks.py 
+++ b/tests/ncbi_ftp/test_notebooks.py @@ -87,3 +87,4 @@ def test_imports_resolve(self) -> None: """All promote notebook imports are verified at module load time above.""" assert callable(promote_from_s3) assert isinstance(DEFAULT_LAKEHOUSE_KEY_PREFIX, str) + assert callable(split_s3_path) From 16dfda3b18432c07aff78c0c9ab9c773d50ef7f0 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 12:11:36 -0700 Subject: [PATCH 19/76] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/cdm_data_loaders/ncbi_ftp/promote.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 1763aba3..6faf8c79 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -121,12 +121,17 @@ def promote_from_s3( # noqa: PLR0913 metadata["md5"] = md5_obj["Body"].read().decode().strip() final_key_path = PurePosixPath(final_key) - upload_file_with_metadata( + upload_succeeded = upload_file_with_metadata( tmp_path, f"{bucket}/{final_key_path.parent}", metadata=metadata, object_name=final_key_path.name, ) + if not upload_succeeded: + logger.error("Failed to upload promoted file %s to %s", staged_key, final_key) + failed += 1 + continue + promoted += 1 # Track promoted accession for manifest trimming From 521978d0855e7d4052f9a740743ed5269dc52f43 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 12:12:24 -0700 Subject: [PATCH 20/76] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- tests/integration/conftest.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 33e53519..95e60008 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -17,6 +17,7 @@ import boto3 
import botocore.client +import botocore.config import pytest import cdm_data_loaders.ncbi_ftp.manifest as manifest_mod @@ -48,6 +49,11 @@ def _minio_reachable() -> bool: endpoint_url=MINIO_ENDPOINT_URL, aws_access_key_id=MINIO_ACCESS_KEY, aws_secret_access_key=MINIO_SECRET_KEY, + config=botocore.config.Config( + connect_timeout=1, + read_timeout=1, + retries={"max_attempts": 1}, + ), ) client.list_buckets() except Exception: # noqa: BLE001 From 8f73674106da5a1fde8cde0dd0861bb293007c26 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 12:13:15 -0700 Subject: [PATCH 21/76] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/cdm_data_loaders/pipelines/ncbi_ftp_download.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 616afee4..30a4b61b 100644 --- a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -15,7 +15,6 @@ from typing import Any from pydantic import AliasChoices, Field, field_validator -from pydantic_settings import CliSuppress from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST, download_assembly_to_local from cdm_data_loaders.pipelines.core import run_cli @@ -54,7 +53,7 @@ class DownloadSettings(CtsDefaultSettings): description="NCBI FTP hostname", validation_alias=AliasChoices("ftp-host", "ftp_host"), ) - limit: CliSuppress[int | None] = Field( + limit: int | None = Field( default=None, ge=1, description="Limit to first N assemblies (for testing)", From 1e8f363d7dc93b02e3645c0ce105c77eb41c19c5 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 12:42:48 -0700 Subject: [PATCH 22/76] add progress bar to manifest verification --- notebooks/ncbi_ftp_manifest.ipynb | 16 ++++++++++++++-- src/cdm_data_loaders/ncbi_ftp/manifest.py | 19 +++++++++++++++++-- 
2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index e59c6043..1608f4e2 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -198,9 +198,18 @@ "# ── Verify against Lakehouse ──\n", "if STORE_BUCKET:\n", " from cdm_data_loaders.ncbi_ftp.manifest import verify_transfer_candidates\n", + " from tqdm.notebook import tqdm\n", "\n", " candidates = diff.new + diff.updated\n", - " print(f\"Verifying {len(candidates)} candidates against s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} ...\")\n", + " total = len(candidates)\n", + " print(f\"Verifying {total} candidates against s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} ...\")\n", + "\n", + " progress = tqdm(total=total, unit=\"assembly\", desc=\"Verifying checksums\")\n", + "\n", + " def _update_progress(done: int, _total: int, acc: str) -> None:\n", + " progress.update(1)\n", + " progress.set_postfix(acc=acc, refresh=False)\n", + "\n", " confirmed = set(\n", " verify_transfer_candidates(\n", " candidates,\n", @@ -208,8 +217,11 @@ " STORE_BUCKET,\n", " STORE_KEY_PREFIX,\n", " ftp_host=FTP_HOST,\n", + " progress_callback=_update_progress,\n", " )\n", " )\n", + " progress.close()\n", + "\n", " before = len(diff.new) + len(diff.updated)\n", " diff.new = [a for a in diff.new if a in confirmed]\n", " diff.updated = [a for a in diff.updated if a in confirmed]\n", @@ -228,7 +240,7 @@ " diff.new = [a for a in diff.new if a in limited_set]\n", " diff.updated = [a for a in diff.updated if a in limited_set]\n", " print(f\"After limit ({LIMIT}): {len(diff.new)} new, {len(diff.updated)} updated\")\n", - " print(f\" (was {original_new} new, {original_updated} updated)\")" + " print(f\" (was {original_new} new, {original_updated} updated)\")\n" ] }, { diff --git a/src/cdm_data_loaders/ncbi_ftp/manifest.py b/src/cdm_data_loaders/ncbi_ftp/manifest.py index ae587c28..62960ad1 100644 --- a/src/cdm_data_loaders/ncbi_ftp/manifest.py +++ 
b/src/cdm_data_loaders/ncbi_ftp/manifest.py @@ -12,7 +12,7 @@ import json import re import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass, field from datetime import UTC, datetime from pathlib import Path @@ -266,6 +266,7 @@ def verify_transfer_candidates( # noqa: PLR0912, PLR0915 bucket: str, key_prefix: str, ftp_host: str = FTP_HOST, + progress_callback: Callable[[int, int, str], None] | None = None, ) -> list[str]: """Verify which transfer candidates actually need downloading. @@ -282,6 +283,9 @@ def verify_transfer_candidates( # noqa: PLR0912, PLR0915 :param bucket: S3 bucket name :param key_prefix: S3 key prefix for the Lakehouse dataset root :param ftp_host: NCBI FTP hostname + :param progress_callback: optional callable invoked after each accession is + processed with ``(done, total, accession)`` so callers can display a + progress bar. ``done`` is the 1-based count of completed accessions. :return: filtered list of accessions that actually need downloading """ if not accessions: @@ -295,10 +299,12 @@ def verify_transfer_candidates( # noqa: PLR0912, PLR0915 last_activity = time.monotonic() try: - for acc in accessions: + for done, acc in enumerate(accessions, start=1): rec = current_assemblies.get(acc) if not rec: confirmed.append(acc) + if progress_callback is not None: + progress_callback(done, len(accessions), acc) continue # Build S3 prefix for this assembly @@ -311,6 +317,8 @@ def verify_transfer_candidates( # noqa: PLR0912, PLR0915 # Nothing in the store — definitely needs downloading confirmed.append(acc) skipped_missing += 1 + if progress_callback is not None: + progress_callback(done, len(accessions), acc) continue # Objects exist — need FTP md5 checksums to decide @@ -327,6 +335,8 @@ def verify_transfer_candidates( # noqa: PLR0912, PLR0915 except Exception: # noqa: BLE001 logger.warning("Cannot fetch md5checksums.txt for %s, keeping in transfer list", acc) 
confirmed.append(acc) + if progress_callback is not None: + progress_callback(done, len(accessions), acc) continue # Filter to files we'd actually download @@ -338,6 +348,8 @@ def verify_transfer_candidates( # noqa: PLR0912, PLR0915 if not target_checksums: confirmed.append(acc) + if progress_callback is not None: + progress_callback(done, len(accessions), acc) continue # Short-circuit: if any file differs or is missing, keep the assembly @@ -361,6 +373,9 @@ def verify_transfer_candidates( # noqa: PLR0912, PLR0915 else: pruned += 1 logger.debug("Pruned %s — all files match S3 checksums", acc) + + if progress_callback is not None: + progress_callback(done, len(accessions), acc) finally: if ftp is not None: with contextlib.suppress(Exception): From 7326641a34555676d9a0a07af9e6a2bbff8b17ea Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 14:58:34 -0700 Subject: [PATCH 23/76] add synthetic assembly summary generation option --- docs/ncbi_ftp_e2e_walkthrough.md | 38 +++++ notebooks/ncbi_ftp_manifest.ipynb | 86 +++++++++-- src/cdm_data_loaders/ncbi_ftp/manifest.py | 106 +++++++++++++ tests/integration/test_manifest_e2e.py | 87 +++++++++++ tests/ncbi_ftp/test_manifest.py | 180 ++++++++++++++++++++++ 5 files changed, 486 insertions(+), 11 deletions(-) diff --git a/docs/ncbi_ftp_e2e_walkthrough.md b/docs/ncbi_ftp_e2e_walkthrough.md index 40b467a7..ffce0e3e 100644 --- a/docs/ncbi_ftp_e2e_walkthrough.md +++ b/docs/ncbi_ftp_e2e_walkthrough.md @@ -140,6 +140,44 @@ be skipped — though on repeat runs you should set `STORE_BUCKET` so assemblies already promoted to the Lakehouse are pruned from the transfer manifest. +### Optional: Bootstrap from existing store (Cell 5) + +If you have a pre-populated S3 store but lack a baseline assembly summary, +you can scan the store to generate a synthetic baseline. This is especially +useful for large stores (100K+ assemblies) where verifying against FTP +checksums would take days. 
+ +**When to use this:** +- First run against an existing, pre-populated store +- You want to start diffing without waiting for checksum verification +- You don't have a previous assembly summary snapshot to compare against + +**How it works:** +1. Set `SCAN_STORE = True` in Cell 5 +2. The notebook scans all objects under `s3://{STORE_BUCKET}/{STORE_KEY_PREFIX}` +3. For each unique assembly found, it extracts the accession and uses the + earliest object `LastModified` as a conservative `seq_rel_date` +4. It saves the synthetic summary to `LOCAL_SYNTHETIC_SUMMARY` (default: + `output/synthetic_summary_from_store.txt`) +5. This becomes the baseline for diffing; subsequent runs can load this + file as `PREVIOUS_SUMMARY_URI` + +**Example (for a 500K-assembly store):** +```python +SCAN_STORE = True +STORE_BUCKET = "cdm-lake" +STORE_KEY_PREFIX = "tenant-general-warehouse/kbase/datasets/ncbi/" +LOCAL_SYNTHETIC_SUMMARY = Path("output/synthetic_summary_from_store.txt") + +# After running Cell 5, upload the result to S3 for future runs: +# aws s3 cp output/synthetic_summary_from_store.txt s3://cdm-lake/assembly_summaries/synthetic_base.txt +# Then in future runs, set: +# PREVIOUS_SUMMARY_URI = "s3://cdm-lake/assembly_summaries/synthetic_base.txt" +``` + +**Performance:** Scanning typically takes 5–10 minutes for 500K assemblies +(vs. ~6 days of checksum verification). + ### Run the notebook Execute all cells in order. 
After Cell 7 finishes you should see files in diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 1608f4e2..34e42355 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -132,23 +132,87 @@ { "cell_type": "code", "execution_count": null, - "id": "88954378", + "id": "ceb5af5a", "metadata": {}, "outputs": [], "source": [ - "\"\"\"Load previous summary from S3 (or start fresh).\"\"\"\n", + "\"\"\"Optional: Bootstrap baseline by scanning current store (if no previous summary available).\n", "\n", - "previous: dict[str, AssemblyRecord] | None = None\n", + "If you have a pre-populated S3 store but no previous assembly summary snapshot,\n", + "you can scan the store to generate a synthetic summary. This becomes the baseline\n", + "for the diff.\n", "\n", - "if PREVIOUS_SUMMARY_URI:\n", - " s3 = get_s3_client()\n", - " bucket, key = split_s3_path(PREVIOUS_SUMMARY_URI)\n", - " resp = s3.get_object(Bucket=bucket, Key=key)\n", - " prev_text = resp[\"Body\"].read().decode(\"utf-8\")\n", - " previous = parse_assembly_summary(prev_text)\n", - " print(f\"Loaded {len(previous)} assemblies from previous snapshot\")\n", + "Set SCAN_STORE=True below to enable. The scan will:\n", + " 1. List all objects under STORE_BUCKET/STORE_KEY_PREFIX\n", + " 2. Extract accessions and use earliest LastModified as seq_rel_date (conservative)\n", + " 3. Build AssemblyRecord for each assembly found\n", + " 4. Save to LOCAL_SYNTHETIC_SUMMARY for re-use in future runs\n", + "\n", + "Typical use case: First run against 500K+ existing assemblies. 
Scanning takes\n", + "~5 minutes instead of ~6 days of checksum verification.\n", + "\"\"\"\n", + "\n", + "SCAN_STORE = False # Set to True to scan your store\n", + "LOCAL_SYNTHETIC_SUMMARY = Path(\"output/synthetic_summary_from_store.txt\")\n", + "\n", + "if SCAN_STORE and STORE_BUCKET:\n", + " from cdm_data_loaders.ncbi_ftp.manifest import scan_store_to_synthetic_summary\n", + " from tqdm.notebook import tqdm\n", + "\n", + " print(f\"Scanning s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} for existing assemblies ...\")\n", + " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\")\n", + "\n", + " def _track_scan(count: int, acc: str) -> None:\n", + " progress.update(1)\n", + " progress.set_postfix(acc=acc, refresh=False)\n", + "\n", + " synthetic = scan_store_to_synthetic_summary(STORE_BUCKET, STORE_KEY_PREFIX, progress_callback=_track_scan)\n", + " progress.close()\n", + "\n", + " print(f\"Found {len(synthetic)} assemblies in store\")\n", + "\n", + " # Save synthetic summary to file for future runs\n", + " with LOCAL_SYNTHETIC_SUMMARY.open(\"w\") as f:\n", + " for acc in sorted(synthetic.keys()):\n", + " rec = synthetic[acc]\n", + " f.write(\n", + " f\"{rec.accession}\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t{rec.status}\\t.\\t.\\t.\\t{rec.seq_rel_date}\\t.\\t.\\t.\\t.\\t{rec.ftp_path}\\t.\\n\"\n", + " )\n", + " print(f\"Saved synthetic summary to {LOCAL_SYNTHETIC_SUMMARY}\")\n", + "\n", + " # Use it as the previous baseline\n", + " previous = synthetic\n", "else:\n", - " print(\"No previous snapshot — all current 'latest' assemblies will be marked as new\")" + " if SCAN_STORE:\n", + " print(\"SCAN_STORE=True but STORE_BUCKET not set. Skipping.\")\n", + " print(\"Skipping store scan (SCAN_STORE=False). 
Will load/use PREVIOUS_SUMMARY_URI instead.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88954378", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Load previous summary from S3 (or from synthetic scan, or start fresh).\n", + "\n", + "If you ran the store scan in the previous cell, SCAN_STORE=True above will have\n", + "set `previous` already. Otherwise, try to load from PREVIOUS_SUMMARY_URI.\n", + "\"\"\"\n", + "\n", + "if \"previous\" not in locals() or previous is None:\n", + " # Store scan didn't run, or was skipped. Try to load from S3.\n", + " if PREVIOUS_SUMMARY_URI:\n", + " s3 = get_s3_client()\n", + " bucket, key = split_s3_path(PREVIOUS_SUMMARY_URI)\n", + " resp = s3.get_object(Bucket=bucket, Key=key)\n", + " prev_text = resp[\"Body\"].read().decode(\"utf-8\")\n", + " previous = parse_assembly_summary(prev_text)\n", + " print(f\"Loaded {len(previous)} assemblies from previous snapshot\")\n", + " else:\n", + " print(\"No previous snapshot and SCAN_STORE=False — all current 'latest' assemblies will be marked as new\")\n", + " previous = None\n" ] }, { diff --git a/src/cdm_data_loaders/ncbi_ftp/manifest.py b/src/cdm_data_loaders/ncbi_ftp/manifest.py index 62960ad1..ebce8dce 100644 --- a/src/cdm_data_loaders/ncbi_ftp/manifest.py +++ b/src/cdm_data_loaders/ncbi_ftp/manifest.py @@ -257,6 +257,112 @@ def _ftp_dir_from_url(ftp_url: str, ftp_host: str = FTP_HOST) -> str: return ftp_url +# ── Synthetic summary from S3 store scan ──────────────────────────────── + + +def _extract_accession_from_s3_key(key: str) -> str | None: + """Extract the assembly accession from an S3 object key. + + Looks for the pattern GCF_######.# or GCA_######.# in the key path. + + :param key: S3 object key + :return: accession (e.g. 
"GCF_000001215.4") or None if not found + """ + m = re.search(r"(GC[AF]_\d{3}\d{6}\.\d+)", key) + return m.group(1) if m else None + + +def _extract_assembly_dir_from_s3_key(key: str) -> str | None: + """Extract the assembly directory name from an S3 object key. + + The assembly directory is the path component that begins with the accession + and contains assembly metadata (e.g. "GCF_000001215.4_Release_6_plus_ISO1_MT"). + + :param key: S3 object key + :return: assembly directory name or None if not found + """ + # Match accession followed by underscore and then capture until next / + m = re.search(r"(GC[AF]_\d{3}\d{6}\.\d+[^/]*)/", key) + return m.group(1) if m else None + + +def scan_store_to_synthetic_summary( + bucket: str, + key_prefix: str, + progress_callback: Callable[[int, str], None] | None = None, +) -> dict[str, AssemblyRecord]: + """Scan S3 store and build a synthetic assembly summary from existing objects. + + This function is useful when bootstrapping a diff against an existing, + pre-populated S3 store that lacks a baseline assembly summary. + + For each assembly found in the store: + - Extracts the accession and assembly directory name from S3 paths + - Uses the earliest ``LastModified`` timestamp across all files in that + assembly as the synthetic ``seq_rel_date`` (conservative estimate) + - Creates an ``AssemblyRecord`` with ``status="latest"`` + + The function paginates through S3 to handle large stores efficiently. 
+ + :param bucket: S3 bucket name + :param key_prefix: S3 key prefix (all objects under this prefix are scanned) + :param progress_callback: optional callable invoked after each accession is + processed with ``(count, accession)`` where count is the running total + of unique accessions found + :return: dict mapping accession to ``AssemblyRecord`` + """ + s3 = get_s3_client() + assemblies: dict[str, AssemblyRecord] = {} + processed_count = 0 + + try: + paginator = s3.get_paginator("list_objects_v2") + pages = paginator.paginate(Bucket=bucket, Prefix=key_prefix) + + for page in pages: + for obj in page.get("Contents", []): + acc = _extract_accession_from_s3_key(obj["Key"]) + assembly_dir = _extract_assembly_dir_from_s3_key(obj["Key"]) + + if not acc or not assembly_dir: + continue + + # Convert LastModified to NCBI date format (YYYY/MM/DD) + last_modified = obj["LastModified"] + # Handle both aware and naive datetimes + if last_modified.tzinfo is None: + last_modified = last_modified.replace(tzinfo=UTC) + obj_date_str = last_modified.strftime("%Y/%m/%d") + + if acc not in assemblies: + # First object for this accession; store it + assemblies[acc] = AssemblyRecord( + accession=acc, + status="latest", + seq_rel_date=obj_date_str, + ftp_path="", # synthesized; empty as it's not from FTP + assembly_dir=assembly_dir, + ) + processed_count += 1 + if progress_callback is not None: + progress_callback(processed_count, acc) + else: + # Update to earliest timestamp (conservative) + existing_record = assemblies[acc] + existing_date = datetime.strptime(existing_record.seq_rel_date, "%Y/%m/%d").replace( + tzinfo=UTC + ) + if last_modified < existing_date: + existing_record.seq_rel_date = obj_date_str + + except Exception as e: # noqa: BLE001 + logger.error("Error scanning store: %s", e) + raise + + logger.info("Scanned S3 store: found %d unique assemblies", len(assemblies)) + return assemblies + + # ── Checksum verification against S3 store ─────────────────────────────── diff 
--git a/tests/integration/test_manifest_e2e.py b/tests/integration/test_manifest_e2e.py index df25d410..4ab208f0 100644 --- a/tests/integration/test_manifest_e2e.py +++ b/tests/integration/test_manifest_e2e.py @@ -18,6 +18,7 @@ download_assembly_summary, filter_by_prefix_range, parse_assembly_summary, + scan_store_to_synthetic_summary, verify_transfer_candidates, write_diff_summary, write_removed_manifest, @@ -209,3 +210,89 @@ def test_prunes_existing_matching_md5( remaining_candidates = [c for c in candidates if c != acc] for c in remaining_candidates: assert c in result, f"Expected {c} to remain (not seeded)" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestScanStoreToSyntheticSummary: + """Test synthetic assembly summary generation from MinIO store.""" + + def test_builds_summary_from_minio_store( + self, + minio_s3_client: object, + test_bucket: str, + ) -> None: + """Verify synthetic summary captures assemblies from MinIO.""" + s3 = minio_s3_client + path_prefix = DEFAULT_LAKEHOUSE_KEY_PREFIX + + # Seed MinIO with a couple of assemblies + assemblies = { + "GCF_000001215.4_v1": ["_genomic.fna.gz", "_protein.faa.gz"], + "GCF_000005845.2_v2": ["_genomic.fna.gz"], + } + + for assembly_dir, files in assemblies.items(): + for fname in files: + key = f"{path_prefix}refseq/{assembly_dir}/{assembly_dir}{fname}" + s3.put_object( + Bucket=test_bucket, + Key=key, + Body=b"placeholder", + ) + + # Scan the store + result = scan_store_to_synthetic_summary(test_bucket, path_prefix) + + # Should have found both assemblies + assert "GCF_000001215.4" in result + assert "GCF_000005845.2" in result + + # Verify basic record structure + rec1 = result["GCF_000001215.4"] + assert rec1.accession == "GCF_000001215.4" + assert rec1.status == "latest" + assert rec1.assembly_dir == "GCF_000001215.4_v1" + + def test_synthetic_summary_diff_against_current( + self, + minio_s3_client: object, + test_bucket: str, + ) -> None: + """Verify synthetic summary can be used as 
baseline for diffing.""" + s3 = minio_s3_client + path_prefix = DEFAULT_LAKEHOUSE_KEY_PREFIX + + # Seed MinIO with one assembly + key1 = f"{path_prefix}refseq/GCF_000001215.4_old/GCF_000001215.4_old_genomic.fna.gz" + s3.put_object(Bucket=test_bucket, Key=key1, Body=b"data") + + # Build synthetic summary from store + synthetic = scan_store_to_synthetic_summary(test_bucket, path_prefix) + assert "GCF_000001215.4" in synthetic + + # Simulate current NCBI summary with one new and one existing + current = { + "GCF_000001215.4": AssemblyRecord( + accession="GCF_000001215.4", + status="latest", + seq_rel_date=synthetic["GCF_000001215.4"].seq_rel_date, + ftp_path="", + assembly_dir="GCF_000001215.4_old", + ), + "GCF_000005845.2": AssemblyRecord( + accession="GCF_000005845.2", + status="latest", + seq_rel_date="2024/04/20", + ftp_path="", + assembly_dir="GCF_000005845.2_new", + ), + } + + # Compute diff + diff = compute_diff(current, previous_assemblies=synthetic) + + # Should find one new and zero updated + assert "GCF_000005845.2" in diff.new + assert "GCF_000001215.4" not in diff.new # Already in store + assert len(diff.updated) == 0 # Same date, same dir diff --git a/tests/ncbi_ftp/test_manifest.py b/tests/ncbi_ftp/test_manifest.py index dac0400f..82ed14c0 100644 --- a/tests/ncbi_ftp/test_manifest.py +++ b/tests/ncbi_ftp/test_manifest.py @@ -6,12 +6,15 @@ from cdm_data_loaders.ncbi_ftp.manifest import ( DiffResult, + _extract_accession_from_s3_key, + _extract_assembly_dir_from_s3_key, _ftp_dir_from_url, accession_prefix, compute_diff, filter_by_prefix_range, get_latest_assembly_paths, parse_assembly_summary, + scan_store_to_synthetic_summary, verify_transfer_candidates, write_diff_summary, write_removed_manifest, @@ -559,3 +562,180 @@ def test_skips_ftp_when_folder_missing_from_store( assert result == ["GCF_000001215.4"] # FTP should never have been connected (lazy init) mock_connect.assert_not_called() + + +# ── Synthetic summary from S3 store scan 
──────────────────────────────── + + +class TestExtractAccessionFromS3Key: + """Test accession extraction from S3 paths.""" + + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") + def test_extracts_accession_from_path(self, _mock_s3: MagicMock) -> None: + """Verify accession is extracted correctly from S3 keys.""" + assert _extract_accession_from_s3_key( + "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz" + ) == "GCF_000001215.4" + assert _extract_accession_from_s3_key( + "some/path/GCA_999999999.1_whatever/data.txt" + ) == "GCA_999999999.1" + + def test_returns_none_for_invalid_path(self) -> None: + """Verify None is returned when no accession is found.""" + assert _extract_accession_from_s3_key("some/random/path") is None + assert _extract_accession_from_s3_key("") is None + + +class TestExtractAssemblyDirFromS3Key: + """Test assembly directory extraction from S3 paths.""" + + def test_extracts_assembly_dir(self) -> None: + """Verify assembly directory is extracted correctly from S3 keys.""" + assert _extract_assembly_dir_from_s3_key( + "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz" + ) == "GCF_000001215.4_Release_6_plus_ISO1_MT" + assert _extract_assembly_dir_from_s3_key( + "prefix/GCA_999999999.1_assembly_name/subdir/data.txt" + ) == "GCA_999999999.1_assembly_name" + + def test_returns_none_for_invalid_path(self) -> None: + """Verify None is returned when no assembly directory is found.""" + assert _extract_assembly_dir_from_s3_key("some/random/path") is None + assert _extract_assembly_dir_from_s3_key("") is None + + +class TestScanStoreToSyntheticSummary: + """Test synthetic assembly summary generation from S3 store scan.""" + + def _mock_s3_with_objects(self) -> MagicMock: + """Return a mock S3 client with assembly objects.""" + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() + 
mock.get_paginator.return_value = mock_paginator + + # Mock objects from two assemblies + page_contents = [ + { + "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6/file1.gz", + "LastModified": datetime(2024, 1, 15, tzinfo=timezone.utc), + }, + { + "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6/file2.gz", + "LastModified": datetime(2024, 1, 16, tzinfo=timezone.utc), + }, + { + "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000005845.2_Assembly/file.gz", + "LastModified": datetime(2024, 2, 20, tzinfo=timezone.utc), + }, + ] + mock_paginator.paginate.return_value = [{"Contents": page_contents}] + return mock + + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") + def test_builds_summary_from_store(self, mock_get_s3: MagicMock) -> None: + """Verify synthetic summary is built correctly from S3 objects.""" + mock_get_s3.return_value = self._mock_s3_with_objects() + + result = scan_store_to_synthetic_summary("test-bucket", "prefix/") + + assert len(result) == 2 + assert "GCF_000001215.4" in result + assert "GCF_000005845.2" in result + + # Should use earliest date (2024-01-15) + rec1 = result["GCF_000001215.4"] + assert rec1.accession == "GCF_000001215.4" + assert rec1.status == "latest" + assert rec1.seq_rel_date == "2024/01/15" + assert rec1.assembly_dir == "GCF_000001215.4_Release_6" + + # Other assembly uses its single date + rec2 = result["GCF_000005845.2"] + assert rec2.seq_rel_date == "2024/02/20" + + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") + def test_uses_earliest_date_per_assembly(self, mock_get_s3: MagicMock) -> None: + """Verify earliest LastModified is used when assembly has multiple files.""" + mock = MagicMock() + mock_paginator = MagicMock() + mock.get_paginator.return_value = mock_paginator + + from datetime import datetime, timezone + + # Files from same assembly with different dates + page_contents = [ + { + "Key": 
"prefix/GCF_000001215.4_v1/file_newer.gz", + "LastModified": datetime(2024, 3, 20, tzinfo=timezone.utc), + }, + { + "Key": "prefix/GCF_000001215.4_v1/file_older.gz", + "LastModified": datetime(2024, 1, 10, tzinfo=timezone.utc), + }, + ] + mock_paginator.paginate.return_value = [{"Contents": page_contents}] + mock_get_s3.return_value = mock + + result = scan_store_to_synthetic_summary("test-bucket", "prefix/") + + # Should use the earliest date + assert result["GCF_000001215.4"].seq_rel_date == "2024/01/10" + + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") + def test_invokes_progress_callback(self, mock_get_s3: MagicMock) -> None: + """Verify progress callback is called for each unique assembly.""" + mock_get_s3.return_value = self._mock_s3_with_objects() + callback_calls = [] + + def track_progress(count: int, acc: str) -> None: + callback_calls.append((count, acc)) + + scan_store_to_synthetic_summary("test-bucket", "prefix/", progress_callback=track_progress) + + # Should have 2 calls (one per assembly discovered) + assert len(callback_calls) == 2 + assert callback_calls[0][0] == 1 # first assembly + assert callback_calls[1][0] == 2 # second assembly + + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") + def test_handles_empty_store(self, mock_get_s3: MagicMock) -> None: + """Verify function handles empty store gracefully.""" + mock = MagicMock() + mock_paginator = MagicMock() + mock.get_paginator.return_value = mock_paginator + mock_paginator.paginate.return_value = [{"Contents": []}] + mock_get_s3.return_value = mock + + result = scan_store_to_synthetic_summary("test-bucket", "prefix/") + + assert result == {} + + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") + def test_skips_objects_without_accession(self, mock_get_s3: MagicMock) -> None: + """Verify objects without valid accessions are skipped.""" + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() + mock.get_paginator.return_value 
= mock_paginator + + page_contents = [ + { + "Key": "prefix/some/random/file.txt", # No accession + "LastModified": datetime(2024, 1, 1, tzinfo=timezone.utc), + }, + { + "Key": "prefix/GCF_000001215.4_Assembly/valid_file.gz", + "LastModified": datetime(2024, 2, 1, tzinfo=timezone.utc), + }, + ] + mock_paginator.paginate.return_value = [{"Contents": page_contents}] + mock_get_s3.return_value = mock + + result = scan_store_to_synthetic_summary("test-bucket", "prefix/") + + # Only one valid assembly should be found + assert len(result) == 1 + assert "GCF_000001215.4" in result From 4cf60d6f23805ecf0ebc7aa8aa364445188650b4 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 20 Apr 2026 15:48:51 -0700 Subject: [PATCH 24/76] debug synthetic manifest --- notebooks/ncbi_ftp_manifest.ipynb | 47 ++++++++++++------- src/cdm_data_loaders/ncbi_ftp/manifest.py | 11 +++-- tests/ncbi_ftp/test_manifest.py | 57 ++++++++++++++++++++++- 3 files changed, 92 insertions(+), 23 deletions(-) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 34e42355..1d822445 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -152,22 +152,25 @@ "~5 minutes instead of ~6 days of checksum verification.\n", "\"\"\"\n", "\n", - "SCAN_STORE = False # Set to True to scan your store\n", + "SCAN_STORE = True # Set to True to scan your store\n", "LOCAL_SYNTHETIC_SUMMARY = Path(\"output/synthetic_summary_from_store.txt\")\n", "\n", "if SCAN_STORE and STORE_BUCKET:\n", " from cdm_data_loaders.ncbi_ftp.manifest import scan_store_to_synthetic_summary\n", + " from IPython.display import display\n", " from tqdm.notebook import tqdm\n", "\n", " print(f\"Scanning s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} for existing assemblies ...\")\n", - " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\")\n", + " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\", leave=True)\n", + " display(progress)\n", "\n", " def _track_scan(count: int, 
acc: str) -> None:\n", - " progress.update(1)\n", + " progress.n = count\n", " progress.set_postfix(acc=acc, refresh=False)\n", + " progress.refresh()\n", "\n", " synthetic = scan_store_to_synthetic_summary(STORE_BUCKET, STORE_KEY_PREFIX, progress_callback=_track_scan)\n", - " progress.close()\n", + " progress.refresh()\n", "\n", " print(f\"Found {len(synthetic)} assemblies in store\")\n", "\n", @@ -259,32 +262,40 @@ "that genuinely need downloading.\n", "\"\"\"\n", "\n", - "# ── Verify against Lakehouse ──\n", + "# -- Verify against Lakehouse --\n", "if STORE_BUCKET:\n", " from cdm_data_loaders.ncbi_ftp.manifest import verify_transfer_candidates\n", + " from IPython.display import display\n", " from tqdm.notebook import tqdm\n", "\n", " candidates = diff.new + diff.updated\n", " total = len(candidates)\n", " print(f\"Verifying {total} candidates against s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} ...\")\n", "\n", - " progress = tqdm(total=total, unit=\"assembly\", desc=\"Verifying checksums\")\n", + " progress = tqdm(total=total, unit=\"assembly\", desc=\"Verifying checksums\", leave=True)\n", + " display(progress)\n", "\n", " def _update_progress(done: int, _total: int, acc: str) -> None:\n", - " progress.update(1)\n", + " progress.n = done\n", " progress.set_postfix(acc=acc, refresh=False)\n", + " progress.refresh()\n", "\n", - " confirmed = set(\n", - " verify_transfer_candidates(\n", - " candidates,\n", - " filtered,\n", - " STORE_BUCKET,\n", - " STORE_KEY_PREFIX,\n", - " ftp_host=FTP_HOST,\n", - " progress_callback=_update_progress,\n", + " if total == 0:\n", + " print(\"No candidates to verify; skipping checksum checks.\")\n", + " confirmed = set()\n", + " else:\n", + " confirmed = set(\n", + " verify_transfer_candidates(\n", + " candidates,\n", + " filtered,\n", + " STORE_BUCKET,\n", + " STORE_KEY_PREFIX,\n", + " ftp_host=FTP_HOST,\n", + " progress_callback=_update_progress,\n", + " )\n", " )\n", - " )\n", - " progress.close()\n", + "\n", + " 
progress.refresh()\n", "\n", " before = len(diff.new) + len(diff.updated)\n", " diff.new = [a for a in diff.new if a in confirmed]\n", @@ -294,7 +305,7 @@ "else:\n", " print(\"Skipping S3 verification (STORE_BUCKET not set)\")\n", "\n", - "# ── Apply LIMIT ──\n", + "# -- Apply LIMIT --\n", "if LIMIT is not None:\n", " original_new = len(diff.new)\n", " original_updated = len(diff.updated)\n", diff --git a/src/cdm_data_loaders/ncbi_ftp/manifest.py b/src/cdm_data_loaders/ncbi_ftp/manifest.py index ebce8dce..ca7596c1 100644 --- a/src/cdm_data_loaders/ncbi_ftp/manifest.py +++ b/src/cdm_data_loaders/ncbi_ftp/manifest.py @@ -229,7 +229,7 @@ def compute_diff( # noqa: PLR0912 diff.new.append(acc) elif previous_assemblies is not None: prev = previous_assemblies.get(acc) - if prev and (rec.seq_rel_date != prev.seq_rel_date or rec.assembly_dir != prev.assembly_dir): + if prev and (rec.seq_rel_date > prev.seq_rel_date or rec.assembly_dir != prev.assembly_dir): diff.updated.append(acc) # Accessions in previous but entirely absent from current (withdrawn) @@ -335,12 +335,17 @@ def scan_store_to_synthetic_summary( obj_date_str = last_modified.strftime("%Y/%m/%d") if acc not in assemblies: - # First object for this accession; store it + # First object for this accession; store it. + # Construct a fake FTP path that ends with assembly_dir so + # that round-tripping through parse_assembly_summary (which + # derives assembly_dir via ftp_path.rstrip("/").split("/")[-1]) + # yields the correct assembly_dir and therefore correct diffs. 
+ fake_ftp_path = f"https://ftp.ncbi.nlm.nih.gov/synthetic/{assembly_dir}" assemblies[acc] = AssemblyRecord( accession=acc, status="latest", seq_rel_date=obj_date_str, - ftp_path="", # synthesized; empty as it's not from FTP + ftp_path=fake_ftp_path, assembly_dir=assembly_dir, ) processed_count += 1 diff --git a/tests/ncbi_ftp/test_manifest.py b/tests/ncbi_ftp/test_manifest.py index 82ed14c0..af7f5eb1 100644 --- a/tests/ncbi_ftp/test_manifest.py +++ b/tests/ncbi_ftp/test_manifest.py @@ -154,14 +154,22 @@ def test_nothing_new_when_all_known(self) -> None: diff = compute_diff(current, previous_accessions=known) assert len(diff.new) == 0 - def test_detects_updated_seq_rel_date(self) -> None: - """Verify assemblies with changed seq_rel_date are marked updated.""" + def test_detects_updated_seq_rel_date_newer(self) -> None: + """Assemblies whose seq_rel_date moved forward are marked updated.""" current = parse_assembly_summary(SAMPLE_SUMMARY) previous = parse_assembly_summary(SAMPLE_SUMMARY) previous["GCF_000001215.4"].seq_rel_date = "2010/01/01" diff = compute_diff(current, previous_assemblies=previous) assert "GCF_000001215.4" in diff.updated + def test_does_not_flag_updated_when_seq_rel_date_older(self) -> None: + """Assemblies whose seq_rel_date in current is older (e.g. 
synthetic baseline) are not flagged.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + previous = parse_assembly_summary(SAMPLE_SUMMARY) + previous["GCF_000001215.4"].seq_rel_date = "2099/12/31" + diff = compute_diff(current, previous_assemblies=previous) + assert "GCF_000001215.4" not in diff.updated + def test_detects_replaced(self) -> None: """Verify assemblies with status 'replaced' are detected.""" current = parse_assembly_summary(SAMPLE_SUMMARY) @@ -739,3 +747,48 @@ def test_skips_objects_without_accession(self, mock_get_s3: MagicMock) -> None: # Only one valid assembly should be found assert len(result) == 1 assert "GCF_000001215.4" in result + + @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") + def test_assembly_dir_survives_file_round_trip(self, mock_get_s3: MagicMock, tmp_path: Path) -> None: + """Verify assembly_dir is preserved when saving to file and parsing back. + + Regression test: previously ftp_path was written as "" which caused + parse_assembly_summary to recover assembly_dir="" for all records, + making compute_diff flag every assembly as updated. 
+ """ + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() + page_contents = [ + { + "Key": "prefix/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz", + "LastModified": datetime(2024, 3, 10, tzinfo=timezone.utc), + }, + ] + mock_paginator.paginate.return_value = [{"Contents": page_contents}] + mock.get_paginator.return_value = mock_paginator + mock_get_s3.return_value = mock + + synthetic = scan_store_to_synthetic_summary("test-bucket", "prefix/") + + # Simulate the notebook's save logic + out_file = tmp_path / "synthetic_summary.txt" + with out_file.open("w") as f: + for acc in sorted(synthetic.keys()): + rec = synthetic[acc] + f.write( + f"{rec.accession}\t.\t.\t.\t.\t.\t.\t.\t.\t.\t{rec.status}\t.\t.\t.\t{rec.seq_rel_date}\t.\t.\t.\t.\t{rec.ftp_path}\t.\n" + ) + + # Parse the file back + reparsed = parse_assembly_summary(out_file) + + assert "GCF_000001215.4" in reparsed + reparsed_rec = reparsed["GCF_000001215.4"] + original_rec = synthetic["GCF_000001215.4"] + + # assembly_dir must survive the round-trip so diffs are accurate + assert reparsed_rec.assembly_dir == original_rec.assembly_dir + assert reparsed_rec.seq_rel_date == original_rec.seq_rel_date + assert reparsed_rec.status == original_rec.status From 9a1d1e55fcc61995f42e0661304ba98c4bde7077 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 21 Apr 2026 08:36:30 -0700 Subject: [PATCH 25/76] add custom release date to synthetic manifest generation --- notebooks/ncbi_ftp_manifest.ipynb | 10 ++++-- src/cdm_data_loaders/ncbi_ftp/manifest.py | 32 ++++++++------------ tests/integration/test_manifest_e2e.py | 4 +-- tests/ncbi_ftp/test_manifest.py | 37 ++++++++++++++--------- 4 files changed, 46 insertions(+), 37 deletions(-) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 1d822445..38b2c982 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -144,7 +144,7 @@ "\n", "Set SCAN_STORE=True below 
to enable. The scan will:\n", " 1. List all objects under STORE_BUCKET/STORE_KEY_PREFIX\n", - " 2. Extract accessions and use earliest LastModified as seq_rel_date (conservative)\n", + " 2. Extract accessions and apply user-provided SYNTHETIC_RELEASE_DATE to all records\n", " 3. Build AssemblyRecord for each assembly found\n", " 4. Save to LOCAL_SYNTHETIC_SUMMARY for re-use in future runs\n", "\n", @@ -153,6 +153,7 @@ "\"\"\"\n", "\n", "SCAN_STORE = True # Set to True to scan your store\n", + "SYNTHETIC_RELEASE_DATE = \"2025/10/31\" # YYYY/MM/DD applied to all synthetic records\n", "LOCAL_SYNTHETIC_SUMMARY = Path(\"output/synthetic_summary_from_store.txt\")\n", "\n", "if SCAN_STORE and STORE_BUCKET:\n", @@ -169,7 +170,12 @@ " progress.set_postfix(acc=acc, refresh=False)\n", " progress.refresh()\n", "\n", - " synthetic = scan_store_to_synthetic_summary(STORE_BUCKET, STORE_KEY_PREFIX, progress_callback=_track_scan)\n", + " synthetic = scan_store_to_synthetic_summary(\n", + " STORE_BUCKET,\n", + " STORE_KEY_PREFIX,\n", + " SYNTHETIC_RELEASE_DATE,\n", + " progress_callback=_track_scan,\n", + " )\n", " progress.refresh()\n", "\n", " print(f\"Found {len(synthetic)} assemblies in store\")\n", diff --git a/src/cdm_data_loaders/ncbi_ftp/manifest.py b/src/cdm_data_loaders/ncbi_ftp/manifest.py index ca7596c1..42fd99d2 100644 --- a/src/cdm_data_loaders/ncbi_ftp/manifest.py +++ b/src/cdm_data_loaders/ncbi_ftp/manifest.py @@ -289,6 +289,7 @@ def _extract_assembly_dir_from_s3_key(key: str) -> str | None: def scan_store_to_synthetic_summary( bucket: str, key_prefix: str, + release_date: str, progress_callback: Callable[[int, str], None] | None = None, ) -> dict[str, AssemblyRecord]: """Scan S3 store and build a synthetic assembly summary from existing objects. @@ -296,21 +297,29 @@ def scan_store_to_synthetic_summary( This function is useful when bootstrapping a diffs against an existing, pre-populated S3 store that lacks a baseline assembly summary. 
- For each assembly found in the store: + For each assembly found in the store: - Extracts the accession and assembly directory name from S3 paths - - Uses the earliest ``LastModified`` timestamp across all files in that - assembly as the synthetic ``seq_rel_date`` (conservative estimate) + - Applies the provided ``release_date`` as synthetic ``seq_rel_date`` for + all assemblies - Creates an ``AssemblyRecord`` with ``status="latest"`` The function paginates through S3 to handle large stores efficiently. :param bucket: S3 bucket name :param key_prefix: S3 key prefix (all objects under this prefix are scanned) + :param release_date: release date string in ``YYYY/MM/DD`` format used for + all synthetic records :param progress_callback: optional callable invoked after each accession is processed with ``(count, accession)`` where count is the running total of unique accessions found :return: dict mapping accession to ``AssemblyRecord`` """ + try: + datetime.strptime(release_date, "%Y/%m/%d") + except ValueError as exc: + msg = f"Invalid release_date '{release_date}'. Expected format YYYY/MM/DD." + raise ValueError(msg) from exc + s3 = get_s3_client() assemblies: dict[str, AssemblyRecord] = {} processed_count = 0 @@ -327,13 +336,6 @@ def scan_store_to_synthetic_summary( if not acc or not assembly_dir: continue - # Convert LastModified to NCBI date format (YYYY/MM/DD) - last_modified = obj["LastModified"] - # Handle both aware and naive datetimes - if last_modified.tzinfo is None: - last_modified = last_modified.replace(tzinfo=UTC) - obj_date_str = last_modified.strftime("%Y/%m/%d") - if acc not in assemblies: # First object for this accession; store it. 
# Construct a fake FTP path that ends with assembly_dir so @@ -344,21 +346,13 @@ def scan_store_to_synthetic_summary( assemblies[acc] = AssemblyRecord( accession=acc, status="latest", - seq_rel_date=obj_date_str, + seq_rel_date=release_date, ftp_path=fake_ftp_path, assembly_dir=assembly_dir, ) processed_count += 1 if progress_callback is not None: progress_callback(processed_count, acc) - else: - # Update to earliest timestamp (conservative) - existing_record = assemblies[acc] - existing_date = datetime.strptime(existing_record.seq_rel_date, "%Y/%m/%d").replace( - tzinfo=UTC - ) - if last_modified < existing_date: - existing_record.seq_rel_date = obj_date_str except Exception as e: # noqa: BLE001 logger.error("Error scanning store: %s", e) diff --git a/tests/integration/test_manifest_e2e.py b/tests/integration/test_manifest_e2e.py index 4ab208f0..a0357a51 100644 --- a/tests/integration/test_manifest_e2e.py +++ b/tests/integration/test_manifest_e2e.py @@ -242,7 +242,7 @@ def test_builds_summary_from_minio_store( ) # Scan the store - result = scan_store_to_synthetic_summary(test_bucket, path_prefix) + result = scan_store_to_synthetic_summary(test_bucket, path_prefix, "2024/04/01") # Should have found both assemblies assert "GCF_000001215.4" in result @@ -268,7 +268,7 @@ def test_synthetic_summary_diff_against_current( s3.put_object(Bucket=test_bucket, Key=key1, Body=b"data") # Build synthetic summary from store - synthetic = scan_store_to_synthetic_summary(test_bucket, path_prefix) + synthetic = scan_store_to_synthetic_summary(test_bucket, path_prefix, "2024/04/20") assert "GCF_000001215.4" in synthetic # Simulate current NCBI summary with one new and one existing diff --git a/tests/ncbi_ftp/test_manifest.py b/tests/ncbi_ftp/test_manifest.py index af7f5eb1..e3ade2e3 100644 --- a/tests/ncbi_ftp/test_manifest.py +++ b/tests/ncbi_ftp/test_manifest.py @@ -4,6 +4,8 @@ from pathlib import Path from unittest.mock import MagicMock, patch +import pytest + from 
cdm_data_loaders.ncbi_ftp.manifest import ( DiffResult, _extract_accession_from_s3_key, @@ -646,26 +648,26 @@ def test_builds_summary_from_store(self, mock_get_s3: MagicMock) -> None: """Verify synthetic summary is built correctly from S3 objects.""" mock_get_s3.return_value = self._mock_s3_with_objects() - result = scan_store_to_synthetic_summary("test-bucket", "prefix/") + result = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") assert len(result) == 2 assert "GCF_000001215.4" in result assert "GCF_000005845.2" in result - # Should use earliest date (2024-01-15) + # Should use provided release date for all records rec1 = result["GCF_000001215.4"] assert rec1.accession == "GCF_000001215.4" assert rec1.status == "latest" - assert rec1.seq_rel_date == "2024/01/15" + assert rec1.seq_rel_date == "2024/01/31" assert rec1.assembly_dir == "GCF_000001215.4_Release_6" - # Other assembly uses its single date + # Other assembly uses the same provided date rec2 = result["GCF_000005845.2"] - assert rec2.seq_rel_date == "2024/02/20" + assert rec2.seq_rel_date == "2024/01/31" @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") - def test_uses_earliest_date_per_assembly(self, mock_get_s3: MagicMock) -> None: - """Verify earliest LastModified is used when assembly has multiple files.""" + def test_applies_release_date_to_all_assemblies(self, mock_get_s3: MagicMock) -> None: + """Verify provided release_date is used for all assemblies.""" mock = MagicMock() mock_paginator = MagicMock() mock.get_paginator.return_value = mock_paginator @@ -686,10 +688,17 @@ def test_uses_earliest_date_per_assembly(self, mock_get_s3: MagicMock) -> None: mock_paginator.paginate.return_value = [{"Contents": page_contents}] mock_get_s3.return_value = mock - result = scan_store_to_synthetic_summary("test-bucket", "prefix/") + result = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/03/31") + + assert result["GCF_000001215.4"].seq_rel_date == "2024/03/31" + + 
@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") + def test_raises_for_invalid_release_date(self, mock_get_s3: MagicMock) -> None: + """Verify invalid release_date format is rejected.""" + mock_get_s3.return_value = self._mock_s3_with_objects() - # Should use the earliest date - assert result["GCF_000001215.4"].seq_rel_date == "2024/01/10" + with pytest.raises(ValueError, match="Invalid release_date"): + scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024-03-31") @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") def test_invokes_progress_callback(self, mock_get_s3: MagicMock) -> None: @@ -700,7 +709,7 @@ def test_invokes_progress_callback(self, mock_get_s3: MagicMock) -> None: def track_progress(count: int, acc: str) -> None: callback_calls.append((count, acc)) - scan_store_to_synthetic_summary("test-bucket", "prefix/", progress_callback=track_progress) + scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31", progress_callback=track_progress) # Should have 2 calls (one per assembly discovered) assert len(callback_calls) == 2 @@ -716,7 +725,7 @@ def test_handles_empty_store(self, mock_get_s3: MagicMock) -> None: mock_paginator.paginate.return_value = [{"Contents": []}] mock_get_s3.return_value = mock - result = scan_store_to_synthetic_summary("test-bucket", "prefix/") + result = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") assert result == {} @@ -742,7 +751,7 @@ def test_skips_objects_without_accession(self, mock_get_s3: MagicMock) -> None: mock_paginator.paginate.return_value = [{"Contents": page_contents}] mock_get_s3.return_value = mock - result = scan_store_to_synthetic_summary("test-bucket", "prefix/") + result = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") # Only one valid assembly should be found assert len(result) == 1 @@ -770,7 +779,7 @@ def test_assembly_dir_survives_file_round_trip(self, mock_get_s3: MagicMock, tmp mock.get_paginator.return_value = 
mock_paginator mock_get_s3.return_value = mock - synthetic = scan_store_to_synthetic_summary("test-bucket", "prefix/") + synthetic = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/03/10") # Simulate the notebook's save logic out_file = tmp_path / "synthetic_summary.txt" From a4ee1866a8cbb691deceab3ca641f4753866ebb6 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 21 Apr 2026 08:39:04 -0700 Subject: [PATCH 26/76] fix style --- notebooks/ncbi_ftp_manifest.ipynb | 6 +++--- tests/ncbi_ftp/test_manifest.py | 29 +++++++++++++++++------------ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 38b2c982..0c12060e 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -194,7 +194,7 @@ "else:\n", " if SCAN_STORE:\n", " print(\"SCAN_STORE=True but STORE_BUCKET not set. Skipping.\")\n", - " print(\"Skipping store scan (SCAN_STORE=False). Will load/use PREVIOUS_SUMMARY_URI instead.\")\n" + " print(\"Skipping store scan (SCAN_STORE=False). 
Will load/use PREVIOUS_SUMMARY_URI instead.\")" ] }, { @@ -221,7 +221,7 @@ " print(f\"Loaded {len(previous)} assemblies from previous snapshot\")\n", " else:\n", " print(\"No previous snapshot and SCAN_STORE=False — all current 'latest' assemblies will be marked as new\")\n", - " previous = None\n" + " previous = None" ] }, { @@ -321,7 +321,7 @@ " diff.new = [a for a in diff.new if a in limited_set]\n", " diff.updated = [a for a in diff.updated if a in limited_set]\n", " print(f\"After limit ({LIMIT}): {len(diff.new)} new, {len(diff.updated)} updated\")\n", - " print(f\" (was {original_new} new, {original_updated} updated)\")\n" + " print(f\" (was {original_new} new, {original_updated} updated)\")" ] }, { diff --git a/tests/ncbi_ftp/test_manifest.py b/tests/ncbi_ftp/test_manifest.py index e3ade2e3..9a3add97 100644 --- a/tests/ncbi_ftp/test_manifest.py +++ b/tests/ncbi_ftp/test_manifest.py @@ -583,12 +583,13 @@ class TestExtractAccessionFromS3Key: @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") def test_extracts_accession_from_path(self, _mock_s3: MagicMock) -> None: """Verify accession is extracted correctly from S3 keys.""" - assert _extract_accession_from_s3_key( - "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz" - ) == "GCF_000001215.4" - assert _extract_accession_from_s3_key( - "some/path/GCA_999999999.1_whatever/data.txt" - ) == "GCA_999999999.1" + assert ( + _extract_accession_from_s3_key( + "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz" + ) + == "GCF_000001215.4" + ) + assert _extract_accession_from_s3_key("some/path/GCA_999999999.1_whatever/data.txt") == "GCA_999999999.1" def test_returns_none_for_invalid_path(self) -> None: """Verify None is returned when no accession is found.""" @@ -601,12 +602,16 @@ class TestExtractAssemblyDirFromS3Key: def test_extracts_assembly_dir(self) -> None: """Verify assembly directory is extracted correctly from 
S3 keys.""" - assert _extract_assembly_dir_from_s3_key( - "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz" - ) == "GCF_000001215.4_Release_6_plus_ISO1_MT" - assert _extract_assembly_dir_from_s3_key( - "prefix/GCA_999999999.1_assembly_name/subdir/data.txt" - ) == "GCA_999999999.1_assembly_name" + assert ( + _extract_assembly_dir_from_s3_key( + "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz" + ) + == "GCF_000001215.4_Release_6_plus_ISO1_MT" + ) + assert ( + _extract_assembly_dir_from_s3_key("prefix/GCA_999999999.1_assembly_name/subdir/data.txt") + == "GCA_999999999.1_assembly_name" + ) def test_returns_none_for_invalid_path(self) -> None: """Verify None is returned when no assembly directory is found.""" From 6e458797642aac7e900442753cf3e888df7ada67 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 21 Apr 2026 12:22:32 -0700 Subject: [PATCH 27/76] update docs and progress bar --- docs/ncbi_ftp_e2e_walkthrough.md | 22 +++++++++++++++++++++- notebooks/ncbi_ftp_manifest.ipynb | 8 ++------ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/docs/ncbi_ftp_e2e_walkthrough.md b/docs/ncbi_ftp_e2e_walkthrough.md index ffce0e3e..b4420658 100644 --- a/docs/ncbi_ftp_e2e_walkthrough.md +++ b/docs/ncbi_ftp_e2e_walkthrough.md @@ -76,7 +76,11 @@ s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{as --- -## 1. Start MinIO +## 1. Setup + +### Local testing + +### Start MinIO ```sh docker run -d \ @@ -97,6 +101,22 @@ included `scripts/s3_local.py` helper (requires no extra installs — only uv run python scripts/s3_local.py mb s3://cdm-lake ``` +### Lakehouse + +#### Build `cdm-data-loaders` + +First, clone the `cdm-data-loaders` repo in your Lakehouse user space. Then, build the package +in a virtual environment and register it as a Jupyter kernel: +``` +cd cdm-data-loaders +uv sync +source .venv/bin/activate +uv pip install -e . 
+uv pip install ipykernel +uv run python -m ipykernel install --user --name cdm-data-loaders --display-name "cdm-data-loaders" +``` +Then, when you open the manifest or promote notebooks, choose the `cdm-data-loaders` kernel. + --- ## 2. Phase 1 — Generate manifests (notebook) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 0c12060e..22516ae2 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -158,12 +158,10 @@ "\n", "if SCAN_STORE and STORE_BUCKET:\n", " from cdm_data_loaders.ncbi_ftp.manifest import scan_store_to_synthetic_summary\n", - " from IPython.display import display\n", " from tqdm.notebook import tqdm\n", "\n", " print(f\"Scanning s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} for existing assemblies ...\")\n", " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\", leave=True)\n", - " display(progress)\n", "\n", " def _track_scan(count: int, acc: str) -> None:\n", " progress.n = count\n", @@ -194,7 +192,7 @@ "else:\n", " if SCAN_STORE:\n", " print(\"SCAN_STORE=True but STORE_BUCKET not set. Skipping.\")\n", - " print(\"Skipping store scan (SCAN_STORE=False). Will load/use PREVIOUS_SUMMARY_URI instead.\")" + " print(\"Skipping store scan (SCAN_STORE=False). 
Will load/use PREVIOUS_SUMMARY_URI instead.\")\n" ] }, { @@ -271,7 +269,6 @@ "# -- Verify against Lakehouse --\n", "if STORE_BUCKET:\n", " from cdm_data_loaders.ncbi_ftp.manifest import verify_transfer_candidates\n", - " from IPython.display import display\n", " from tqdm.notebook import tqdm\n", "\n", " candidates = diff.new + diff.updated\n", @@ -279,7 +276,6 @@ " print(f\"Verifying {total} candidates against s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} ...\")\n", "\n", " progress = tqdm(total=total, unit=\"assembly\", desc=\"Verifying checksums\", leave=True)\n", - " display(progress)\n", "\n", " def _update_progress(done: int, _total: int, acc: str) -> None:\n", " progress.n = done\n", @@ -321,7 +317,7 @@ " diff.new = [a for a in diff.new if a in limited_set]\n", " diff.updated = [a for a in diff.updated if a in limited_set]\n", " print(f\"After limit ({LIMIT}): {len(diff.new)} new, {len(diff.updated)} updated\")\n", - " print(f\" (was {original_new} new, {original_updated} updated)\")" + " print(f\" (was {original_new} new, {original_updated} updated)\")\n" ] }, { From adfda18ca5f5e241290195a4ce434cfdac9cd40e Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 21 Apr 2026 12:29:02 -0700 Subject: [PATCH 28/76] add checks for synthetic summary creation --- notebooks/ncbi_ftp_manifest.ipynb | 67 ++++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 22516ae2..e1b915d8 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -149,46 +149,55 @@ " 4. Save to LOCAL_SYNTHETIC_SUMMARY for re-use in future runs\n", "\n", "Typical use case: First run against 500K+ existing assemblies. 
Scanning takes\n", - "~5 minutes instead of ~6 days of checksum verification.\n", + "significant time (potentially 15-30+ min for large stores with many files per assembly).\n", + "On subsequent runs the saved file is loaded directly — set FORCE_RESCAN=True to override.\n", "\"\"\"\n", "\n", "SCAN_STORE = True # Set to True to scan your store\n", + "FORCE_RESCAN = False # Set to True to ignore an existing LOCAL_SYNTHETIC_SUMMARY and rescan\n", "SYNTHETIC_RELEASE_DATE = \"2025/10/31\" # YYYY/MM/DD applied to all synthetic records\n", "LOCAL_SYNTHETIC_SUMMARY = Path(\"output/synthetic_summary_from_store.txt\")\n", "\n", "if SCAN_STORE and STORE_BUCKET:\n", - " from cdm_data_loaders.ncbi_ftp.manifest import scan_store_to_synthetic_summary\n", - " from tqdm.notebook import tqdm\n", - "\n", - " print(f\"Scanning s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} for existing assemblies ...\")\n", - " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\", leave=True)\n", - "\n", - " def _track_scan(count: int, acc: str) -> None:\n", - " progress.n = count\n", - " progress.set_postfix(acc=acc, refresh=False)\n", + " if LOCAL_SYNTHETIC_SUMMARY.exists() and not FORCE_RESCAN:\n", + " print(f\"Loading existing synthetic summary from {LOCAL_SYNTHETIC_SUMMARY} (set FORCE_RESCAN=True to rescan)\")\n", + " previous = parse_assembly_summary(LOCAL_SYNTHETIC_SUMMARY)\n", + " print(f\"Loaded {len(previous)} assemblies from synthetic summary\")\n", + " else:\n", + " from cdm_data_loaders.ncbi_ftp.manifest import scan_store_to_synthetic_summary\n", + " from tqdm.notebook import tqdm\n", + "\n", + " print(f\"Scanning s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} for existing assemblies ...\")\n", + " print(\"Note: large stores (500K+ assemblies) may take 15-30+ minutes.\")\n", + " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\", leave=True)\n", + "\n", + " def _track_scan(count: int, acc: str) -> None:\n", + " progress.n = count\n", + " progress.set_postfix(acc=acc, refresh=False)\n", + " 
progress.refresh()\n", + "\n", + " synthetic = scan_store_to_synthetic_summary(\n", + " STORE_BUCKET,\n", + " STORE_KEY_PREFIX,\n", + " SYNTHETIC_RELEASE_DATE,\n", + " progress_callback=_track_scan,\n", + " )\n", " progress.refresh()\n", "\n", - " synthetic = scan_store_to_synthetic_summary(\n", - " STORE_BUCKET,\n", - " STORE_KEY_PREFIX,\n", - " SYNTHETIC_RELEASE_DATE,\n", - " progress_callback=_track_scan,\n", - " )\n", - " progress.refresh()\n", - "\n", - " print(f\"Found {len(synthetic)} assemblies in store\")\n", + " print(f\"Found {len(synthetic)} assemblies in store\")\n", "\n", - " # Save synthetic summary to file for future runs\n", - " with LOCAL_SYNTHETIC_SUMMARY.open(\"w\") as f:\n", - " for acc in sorted(synthetic.keys()):\n", - " rec = synthetic[acc]\n", - " f.write(\n", - " f\"{rec.accession}\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t{rec.status}\\t.\\t.\\t.\\t{rec.seq_rel_date}\\t.\\t.\\t.\\t.\\t{rec.ftp_path}\\t.\\n\"\n", - " )\n", - " print(f\"Saved synthetic summary to {LOCAL_SYNTHETIC_SUMMARY}\")\n", + " # Save synthetic summary to file for future runs\n", + " LOCAL_SYNTHETIC_SUMMARY.parent.mkdir(parents=True, exist_ok=True)\n", + " with LOCAL_SYNTHETIC_SUMMARY.open(\"w\") as f:\n", + " for acc in sorted(synthetic.keys()):\n", + " rec = synthetic[acc]\n", + " f.write(\n", + " f\"{rec.accession}\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t.\\t{rec.status}\\t.\\t.\\t.\\t{rec.seq_rel_date}\\t.\\t.\\t.\\t.\\t{rec.ftp_path}\\t.\\n\"\n", + " )\n", + " print(f\"Saved synthetic summary to {LOCAL_SYNTHETIC_SUMMARY}\")\n", "\n", - " # Use it as the previous baseline\n", - " previous = synthetic\n", + " # Use it as the previous baseline\n", + " previous = synthetic\n", "else:\n", " if SCAN_STORE:\n", " print(\"SCAN_STORE=True but STORE_BUCKET not set. 
Skipping.\")\n", From fdd67c4f002ea29495ba7ce028040cf230748609 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 21 Apr 2026 12:35:05 -0700 Subject: [PATCH 29/76] add connection check to S3 store --- notebooks/ncbi_ftp_manifest.ipynb | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index e1b915d8..c0807086 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -115,6 +115,45 @@ "print(f\"Output dir: {OUTPUT_DIR}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "be1fcf1c", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Validate S3 connectivity and bucket/prefix configuration.\"\"\"\n", + "\n", + "s3 = get_s3_client()\n", + "\n", + "# Check bucket is accessible\n", + "try:\n", + " s3.head_bucket(Bucket=\"cdm-lake\" if not STORE_BUCKET else STORE_BUCKET)\n", + " print(f\"✓ Bucket accessible: {STORE_BUCKET or 'cdm-lake'}\")\n", + "except Exception as e:\n", + " print(f\"✗ Bucket not accessible: {e}\")\n", + " raise\n", + "\n", + "# Check that objects exist under the key prefix (i.e. 
prefix is non-empty)\n", + "if STORE_BUCKET:\n", + " resp = s3.list_objects_v2(Bucket=STORE_BUCKET, Prefix=STORE_KEY_PREFIX, MaxKeys=1)\n", + " if resp.get(\"KeyCount\", 0) > 0:\n", + " print(f\"✓ Prefix has objects: s3://{STORE_BUCKET}/{STORE_KEY_PREFIX}\")\n", + " else:\n", + " print(f\"⚠ No objects found under s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} — check STORE_KEY_PREFIX\")\n", + "else:\n", + " print(\"Skipping prefix check (STORE_BUCKET not set)\")\n", + "\n", + "# Check STAGING_URI bucket if set\n", + "if \"STAGING_URI\" in dir() and STAGING_URI:\n", + " staging_bucket, _ = split_s3_path(STAGING_URI)\n", + " try:\n", + " s3.head_bucket(Bucket=staging_bucket)\n", + " print(f\"✓ Staging bucket accessible: {staging_bucket}\")\n", + " except Exception as e:\n", + " print(f\"✗ Staging bucket not accessible: {e}\")\n" + ] + }, { "cell_type": "code", "execution_count": null, From d4f868d817d5536275b33b1e1dd260c5822a3c30 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 21 Apr 2026 13:13:22 -0700 Subject: [PATCH 30/76] add s3 connection check --- docs/ncbi_ftp_e2e_walkthrough.md | 27 ++++++++++++++++- notebooks/ncbi_ftp_manifest.ipynb | 50 +++++++++++++++++-------------- 2 files changed, 54 insertions(+), 23 deletions(-) diff --git a/docs/ncbi_ftp_e2e_walkthrough.md b/docs/ncbi_ftp_e2e_walkthrough.md index b4420658..8b2f1ab5 100644 --- a/docs/ncbi_ftp_e2e_walkthrough.md +++ b/docs/ncbi_ftp_e2e_walkthrough.md @@ -107,7 +107,7 @@ uv run python scripts/s3_local.py mb s3://cdm-lake First, clone the `cdm-data-loaders` repo in your Lakehouse user space. Then, build the package in a virtual environment and register it as a Jupyter kernel: -``` +```bash cd cdm-data-loaders uv sync source .venv/bin/activate @@ -117,6 +117,31 @@ uv run python -m ipykernel install --user --name cdm-data-loaders --display-name ``` Then, when you open the manifest or promote notebooks, choose the `cdm-data-loaders` kernel. 
+#### Add the S3 Credentials to the Kernel + +Open a new Jupyter Notebook with the default kernel and run this in a new cell: +```python +import os +for k, v in sorted(os.environ.items()): + if "AWS" in k or "S3" in k or "MINIO" in k: + print(f"{k}={v}") +``` +Take the output and add the environment vars to the `kernel.json` for your new kernel (e.g., in `cdm-data-loaders/.venv/share/jupyter/kernels/python3/kernel.json`): +```json +{ + "argv": ["..."], + "display_name": "cdm-data-loaders", + "language": "python", + "env": { + "AWS_ACCESS_KEY_ID": "...", + "AWS_SECRET_ACCESS_KEY": "...", + "AWS_DEFAULT_REGION": "...", + ... + } +} +``` + + --- ## 2. Phase 1 — Generate manifests (notebook) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index c0807086..85f73b13 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -124,34 +124,40 @@ "source": [ "\"\"\"Validate S3 connectivity and bucket/prefix configuration.\"\"\"\n", "\n", - "s3 = get_s3_client()\n", + "import boto3\n", + "from botocore.exceptions import ClientError, NoCredentialsError\n", "\n", - "# Check bucket is accessible\n", + "s3 = boto3.client(\"s3\")\n", + "\n", + "# Check credentials are present\n", "try:\n", - " s3.head_bucket(Bucket=\"cdm-lake\" if not STORE_BUCKET else STORE_BUCKET)\n", - " print(f\"✓ Bucket accessible: {STORE_BUCKET or 'cdm-lake'}\")\n", - "except Exception as e:\n", - " print(f\"✗ Bucket not accessible: {e}\")\n", + " sts = boto3.client(\"sts\")\n", + " identity = sts.get_caller_identity()\n", + " print(f\"✓ Credentials valid — account: {identity['Account']}, arn: {identity['Arn']}\")\n", + "except NoCredentialsError:\n", + " print(\"✗ No AWS credentials found\")\n", " raise\n", + "except ClientError as e:\n", + " if e.response[\"Error\"][\"Code\"] == \"InvalidParameterValue\":\n", + " print(\"✓ Credentials present (STS GetCallerIdentity not supported on this endpoint — skipping identity check)\")\n", + " else:\n", 
+ " print(f\"✗ Credential check failed: {e}\")\n", + " raise\n", "\n", - "# Check that objects exist under the key prefix (i.e. prefix is non-empty)\n", - "if STORE_BUCKET:\n", - " resp = s3.list_objects_v2(Bucket=STORE_BUCKET, Prefix=STORE_KEY_PREFIX, MaxKeys=1)\n", + "# Check bucket access and prefix in one step — list_objects_v2 requires only\n", + "# s3:ListBucket on the prefix, which is less restrictive than HeadBucket.\n", + "_check_bucket = STORE_BUCKET if \"STORE_BUCKET\" in dir() and STORE_BUCKET else \"cdm-lake\"\n", + "_check_prefix = STORE_KEY_PREFIX if \"STORE_KEY_PREFIX\" in dir() else \"tenant-general-warehouse/kbase/datasets/ncbi/\"\n", + "try:\n", + " resp = s3.list_objects_v2(Bucket=_check_bucket, Prefix=_check_prefix, MaxKeys=1)\n", " if resp.get(\"KeyCount\", 0) > 0:\n", - " print(f\"✓ Prefix has objects: s3://{STORE_BUCKET}/{STORE_KEY_PREFIX}\")\n", + " print(f\"✓ Bucket accessible and prefix has objects: s3://{_check_bucket}/{_check_prefix}\")\n", " else:\n", - " print(f\"⚠ No objects found under s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} — check STORE_KEY_PREFIX\")\n", - "else:\n", - " print(\"Skipping prefix check (STORE_BUCKET not set)\")\n", - "\n", - "# Check STAGING_URI bucket if set\n", - "if \"STAGING_URI\" in dir() and STAGING_URI:\n", - " staging_bucket, _ = split_s3_path(STAGING_URI)\n", - " try:\n", - " s3.head_bucket(Bucket=staging_bucket)\n", - " print(f\"✓ Staging bucket accessible: {staging_bucket}\")\n", - " except Exception as e:\n", - " print(f\"✗ Staging bucket not accessible: {e}\")\n" + " print(f\"✓ Bucket accessible but no objects found under s3://{_check_bucket}/{_check_prefix} — check STORE_KEY_PREFIX\")\n", + "except ClientError as e:\n", + " code = e.response[\"Error\"][\"Code\"]\n", + " print(f\"✗ S3 access check failed (HTTP {code}): {e}\")\n", + " raise\n" ] }, { From 55cbf9e296df9dadb33bba52692f97fa20dc8c37 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 21 Apr 2026 13:21:54 -0700 Subject: [PATCH 31/76] reduce 
progress bar update rate --- notebooks/ncbi_ftp_manifest.ipynb | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 85f73b13..a25031d5 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -209,17 +209,24 @@ " previous = parse_assembly_summary(LOCAL_SYNTHETIC_SUMMARY)\n", " print(f\"Loaded {len(previous)} assemblies from synthetic summary\")\n", " else:\n", + " import time as _time\n", " from cdm_data_loaders.ncbi_ftp.manifest import scan_store_to_synthetic_summary\n", " from tqdm.notebook import tqdm\n", "\n", " print(f\"Scanning s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} for existing assemblies ...\")\n", " print(\"Note: large stores (500K+ assemblies) may take 15-30+ minutes.\")\n", - " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\", leave=True)\n", + " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\", leave=True, mininterval=2.0)\n", + "\n", + " _last_refresh = _time.monotonic()\n", + " _REFRESH_INTERVAL = 2.0 # seconds between display updates\n", "\n", " def _track_scan(count: int, acc: str) -> None:\n", + " global _last_refresh\n", " progress.n = count\n", - " progress.set_postfix(acc=acc, refresh=False)\n", - " progress.refresh()\n", + " now = _time.monotonic()\n", + " if now - _last_refresh >= _REFRESH_INTERVAL:\n", + " progress.set_postfix(acc=acc, refresh=True)\n", + " _last_refresh = now\n", "\n", " synthetic = scan_store_to_synthetic_summary(\n", " STORE_BUCKET,\n", @@ -227,6 +234,7 @@ " SYNTHETIC_RELEASE_DATE,\n", " progress_callback=_track_scan,\n", " )\n", + " progress.n = len(synthetic)\n", " progress.refresh()\n", "\n", " print(f\"Found {len(synthetic)} assemblies in store\")\n", From c1ee7e3e1afae2e8a2fbc9301c96555fc5dbe3e9 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 21 Apr 2026 13:42:06 -0700 Subject: [PATCH 32/76] select database for synthetic summary --- 
notebooks/ncbi_ftp_manifest.ipynb | 12 +++++++----- src/cdm_data_loaders/ncbi_ftp/manifest.py | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index a25031d5..4762f768 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -189,9 +189,10 @@ "\n", "Set SCAN_STORE=True below to enable. The scan will:\n", " 1. List all objects under STORE_BUCKET/STORE_KEY_PREFIX\n", - " 2. Extract accessions and apply user-provided SYNTHETIC_RELEASE_DATE to all records\n", - " 3. Build AssemblyRecord for each assembly found\n", - " 4. Save to LOCAL_SYNTHETIC_SUMMARY for re-use in future runs\n", + " 2. Extract accessions matching the DATABASE type (GCF_ for refseq, GCA_ for genbank)\n", + " 3. Apply user-provided SYNTHETIC_RELEASE_DATE to all records\n", + " 4. Build AssemblyRecord for each assembly found\n", + " 5. Save to LOCAL_SYNTHETIC_SUMMARY for re-use in future runs\n", "\n", "Typical use case: First run against 500K+ existing assemblies. 
Scanning takes\n", "significant time (potentially 15-30+ min for large stores with many files per assembly).\n", @@ -213,7 +214,7 @@ " from cdm_data_loaders.ncbi_ftp.manifest import scan_store_to_synthetic_summary\n", " from tqdm.notebook import tqdm\n", "\n", - " print(f\"Scanning s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} for existing assemblies ...\")\n", + " print(f\"Scanning s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} for existing {DATABASE} assemblies ...\")\n", " print(\"Note: large stores (500K+ assemblies) may take 15-30+ minutes.\")\n", " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\", leave=True, mininterval=2.0)\n", "\n", @@ -232,12 +233,13 @@ " STORE_BUCKET,\n", " STORE_KEY_PREFIX,\n", " SYNTHETIC_RELEASE_DATE,\n", + " database=DATABASE,\n", " progress_callback=_track_scan,\n", " )\n", " progress.n = len(synthetic)\n", " progress.refresh()\n", "\n", - " print(f\"Found {len(synthetic)} assemblies in store\")\n", + " print(f\"Found {len(synthetic)} {DATABASE} assemblies in store\")\n", "\n", " # Save synthetic summary to file for future runs\n", " LOCAL_SYNTHETIC_SUMMARY.parent.mkdir(parents=True, exist_ok=True)\n", diff --git a/src/cdm_data_loaders/ncbi_ftp/manifest.py b/src/cdm_data_loaders/ncbi_ftp/manifest.py index 42fd99d2..b14d2923 100644 --- a/src/cdm_data_loaders/ncbi_ftp/manifest.py +++ b/src/cdm_data_loaders/ncbi_ftp/manifest.py @@ -286,10 +286,17 @@ def _extract_assembly_dir_from_s3_key(key: str) -> str | None: return m.group(1) if m else None +_DATABASE_ACC_PREFIX: dict[str, str] = { + "refseq": "GCF_", + "genbank": "GCA_", +} + + def scan_store_to_synthetic_summary( bucket: str, key_prefix: str, release_date: str, + database: str = "refseq", progress_callback: Callable[[int, str], None] | None = None, ) -> dict[str, AssemblyRecord]: """Scan S3 store and build a synthetic assembly summary from existing objects. 
@@ -302,6 +309,8 @@ def scan_store_to_synthetic_summary( - Applies the provided ``release_date`` as synthetic ``seq_rel_date`` for all assemblies - Creates an ``AssemblyRecord`` with ``status="latest"`` + - Filters to accessions matching the expected prefix for ``database`` + (``GCF_`` for ``"refseq"``, ``GCA_`` for ``"genbank"``) The function paginates through S3 to handle large stores efficiently. @@ -309,6 +318,8 @@ def scan_store_to_synthetic_summary( :param key_prefix: S3 key prefix (all objects under this prefix are scanned) :param release_date: release date string in ``YYYY/MM/DD`` format used for all synthetic records + :param database: ``"refseq"`` or ``"genbank"`` — controls which accession + prefix is included (``GCF_`` or ``GCA_`` respectively) :param progress_callback: optional callable invoked after each accession is processed with ``(count, accession)`` where count is the running total of unique accessions found @@ -320,6 +331,11 @@ def scan_store_to_synthetic_summary( msg = f"Invalid release_date '{release_date}'. Expected format YYYY/MM/DD." raise ValueError(msg) from exc + acc_prefix = _DATABASE_ACC_PREFIX.get(database) + if acc_prefix is None: + msg = f"Unknown database: {database!r}. Expected 'refseq' or 'genbank'." 
+ raise ValueError(msg) + s3 = get_s3_client() assemblies: dict[str, AssemblyRecord] = {} processed_count = 0 @@ -331,6 +347,8 @@ def scan_store_to_synthetic_summary( for page in pages: for obj in page.get("Contents", []): acc = _extract_accession_from_s3_key(obj["Key"]) + if not acc or not acc.startswith(acc_prefix): + continue assembly_dir = _extract_assembly_dir_from_s3_key(obj["Key"]) if not acc or not assembly_dir: From 76349c1f48fc9ab45dea20c1dc3f627fbf182fb2 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 22 Apr 2026 07:59:12 -0700 Subject: [PATCH 33/76] formatting --- notebooks/ncbi_ftp_manifest.ipynb | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 4762f768..4d51c5b3 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -153,11 +153,13 @@ " if resp.get(\"KeyCount\", 0) > 0:\n", " print(f\"✓ Bucket accessible and prefix has objects: s3://{_check_bucket}/{_check_prefix}\")\n", " else:\n", - " print(f\"✓ Bucket accessible but no objects found under s3://{_check_bucket}/{_check_prefix} — check STORE_KEY_PREFIX\")\n", + " print(\n", + " f\"✓ Bucket accessible but no objects found under s3://{_check_bucket}/{_check_prefix} — check STORE_KEY_PREFIX\"\n", + " )\n", "except ClientError as e:\n", " code = e.response[\"Error\"][\"Code\"]\n", " print(f\"✗ S3 access check failed (HTTP {code}): {e}\")\n", - " raise\n" + " raise" ] }, { @@ -256,7 +258,7 @@ "else:\n", " if SCAN_STORE:\n", " print(\"SCAN_STORE=True but STORE_BUCKET not set. Skipping.\")\n", - " print(\"Skipping store scan (SCAN_STORE=False). Will load/use PREVIOUS_SUMMARY_URI instead.\")\n" + " print(\"Skipping store scan (SCAN_STORE=False). 
Will load/use PREVIOUS_SUMMARY_URI instead.\")" ] }, { @@ -381,7 +383,7 @@ " diff.new = [a for a in diff.new if a in limited_set]\n", " diff.updated = [a for a in diff.updated if a in limited_set]\n", " print(f\"After limit ({LIMIT}): {len(diff.new)} new, {len(diff.updated)} updated\")\n", - " print(f\" (was {original_new} new, {original_updated} updated)\")\n" + " print(f\" (was {original_new} new, {original_updated} updated)\")" ] }, { From 3369e11299cc9ebf19db604c648cb249e280aeda Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 22 Apr 2026 11:56:22 -0700 Subject: [PATCH 34/76] add frictionless descriptors --- docs/ncbi_ftp_e2e_walkthrough.md | 40 +++ notebooks/ncbi_ftp_promote.ipynb | 44 ++- src/cdm_data_loaders/ncbi_ftp/metadata.py | 259 +++++++++++++++ src/cdm_data_loaders/ncbi_ftp/promote.py | 138 ++++++-- tests/integration/test_promote_e2e.py | 173 ++++++++++ tests/ncbi_ftp/test_metadata.py | 369 ++++++++++++++++++++++ 6 files changed, 993 insertions(+), 30 deletions(-) create mode 100644 src/cdm_data_loaders/ncbi_ftp/metadata.py create mode 100644 tests/ncbi_ftp/test_metadata.py diff --git a/docs/ncbi_ftp_e2e_walkthrough.md b/docs/ncbi_ftp_e2e_walkthrough.md index 8b2f1ab5..d814b73b 100644 --- a/docs/ncbi_ftp_e2e_walkthrough.md +++ b/docs/ncbi_ftp_e2e_walkthrough.md @@ -428,6 +428,46 @@ uv run python scripts/s3_local.py head \ s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/raw_data/GCF/900/000/615/GCF_900000615.1_PRJEB7657_assembly/GCF_900000615.1_PRJEB7657_assembly_genomic.fna.gz ``` +### Frictionless metadata descriptors + +Each promoted assembly gets a [frictionless](https://framework.frictionlessdata.io/) data package descriptor stored at: + +``` +s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}metadata/{assembly_dir}_datapackage.json +``` + +For example: + +``` +s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/metadata/GCF_900000615.1_PRJEB7657_assembly_datapackage.json +``` + +The descriptor follows the KBase credit metadata 
schema (v1.0) and records: + +- **identifier** — `NCBI:{accession}`, e.g. `NCBI:GCF_900000615.1` +- **resource_type** — always `"dataset"` +- **resources** — list of promoted files with their final S3 key, byte size, + file format, and MD5 hash (when available) +- **contributors / publisher** — NCBI organizational metadata +- **meta.saved_by** — `"cdm-data-loaders-ncbi-ftp"` + +When an assembly is archived (updated or removed), its live descriptor is +copied to: + +``` +s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}archive/{release_tag}/metadata/{assembly_dir}_datapackage.json +``` + +Use the last cell of `notebooks/ncbi_ftp_promote.ipynb` to list and preview +all descriptors written in a promote run. + +To inspect a descriptor directly: + +```sh +uv run python scripts/s3_local.py cat \ + s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/metadata/GCF_900000615.1_PRJEB7657_assembly_datapackage.json +``` + --- ## 6. Incremental run (second sync) diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb index 1da31ab2..81239f39 100644 --- a/notebooks/ncbi_ftp_promote.ipynb +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -34,8 +34,8 @@ "| `_S3_KEY` | S3 object key (no scheme/bucket) | `staging/transfer_manifest.txt` |\n", "| `_PATH` | local filesystem path | `output/removed_manifest.txt` |\n", "\n", - "Lakehouse object: `s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/…/{filename}`\n", - "Staging object: `s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/…/{filename}`" + "Lakehouse object: `s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/\u2026/{filename}`\n", + "Staging object: `s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/\u2026/{filename}`" ] }, { @@ -100,7 +100,7 @@ "# format: S3 key prefix within STORE_BUCKET (no scheme, no bucket)\n", "LAKEHOUSE_KEY_PREFIX = DEFAULT_LAKEHOUSE_KEY_PREFIX\n", "\n", - "# Dry-run mode — log actions without making changes\n", + "# Dry-run mode \u2014 log actions without making changes\n", "DRY_RUN = 
False\n", "\n", "print(f\"Bucket: {STORE_BUCKET}\")\n", @@ -185,10 +185,42 @@ "print(f\"Timestamp: {report['timestamp']}\")\n", "\n", "if report[\"failed\"] > 0:\n", - " print(\"\\n⚠️ Some operations failed — check logs above for details.\")\n", + " print(\"\\n\u26a0\ufe0f Some operations failed \u2014 check logs above for details.\")\n", "\n", "if report[\"dry_run\"]:\n", - " print(\"\\n📋 This was a dry-run. Set DRY_RUN = False and re-run to apply changes.\")" + " print(\"\\n\ud83d\udccb This was a dry-run. Set DRY_RUN = False and re-run to apply changes.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Inspect frictionless descriptors written to metadata/.\"\"\"\n", + "\n", + "from cdm_data_loaders.ncbi_ftp.metadata import build_descriptor_key\n", + "\n", + "s3 = get_s3_client()\n", + "paginator = s3.get_paginator(\"list_objects_v2\")\n", + "\n", + "descriptor_keys: list[str] = []\n", + "for page in paginator.paginate(Bucket=STORE_BUCKET, Prefix=LAKEHOUSE_KEY_PREFIX + \"metadata/\"):\n", + " descriptor_keys.extend(obj[\"Key\"] for obj in page.get(\"Contents\", []))\n", + "\n", + "print(f\"Found {len(descriptor_keys)} descriptor(s) in metadata/\")\n", + "\n", + "for key in descriptor_keys[:5]: # preview first 5\n", + " obj = s3.get_object(Bucket=STORE_BUCKET, Key=key)\n", + " descriptor = json.loads(obj[\"Body\"].read())\n", + " print()\n", + " print(f\" Key: {key}\")\n", + " print(f\" Identifier: {descriptor.get('identifier')}\")\n", + " print(f\" Version: {descriptor.get('version')}\")\n", + " print(f\" Resources: {len(descriptor.get('resources', []))} file(s)\")\n", + "\n", + "if len(descriptor_keys) > 5:\n", + " print(f\" ... 
and {len(descriptor_keys) - 5} more\")" ] } ], @@ -213,4 +245,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/src/cdm_data_loaders/ncbi_ftp/metadata.py b/src/cdm_data_loaders/ncbi_ftp/metadata.py new file mode 100644 index 00000000..356ee0d6 --- /dev/null +++ b/src/cdm_data_loaders/ncbi_ftp/metadata.py @@ -0,0 +1,259 @@ +"""Frictionless data package descriptor creation for NCBI FTP assemblies. + +Creates KBase credit metadata descriptors for each promoted assembly, +matching the schema produced by ``kbase-transfers/scripts/ncbi/download_genomes.py``. + +Each descriptor is a frictionless ``Package``-compatible JSON document +describing the assembly's data files, stored at:: + + {key_prefix}metadata/{assembly_dir}_datapackage.json + +and archived alongside raw data at:: + + {key_prefix}archive/{release_tag}/metadata/{assembly_dir}_datapackage.json + +The descriptor ``resources`` list records the final Lakehouse S3 key, byte +size, file format, and MD5 hash of each promoted data file. 
+""" + +from __future__ import annotations + +import json +import tempfile +from datetime import UTC, datetime +from pathlib import Path +from typing import Any, TypedDict + +from frictionless import Package + +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger +from cdm_data_loaders.utils.s3 import copy_object_with_metadata, get_s3_client + +logger = get_cdm_logger() + +_NCBI_CONTRIBUTOR = { + "contributor_type": "Organization", + "name": "National Center for Biotechnology Information", + "contributor_id": "ROR:02meqm098", + "contributor_roles": "DataCurator", +} +_NCBI_PUBLISHER = { + "organization_name": "National Center for Biotechnology Information", + "organization_id": "ROR:02meqm098", +} +_SAVED_BY = "cdm-data-loaders-ncbi-ftp" +_SCHEMA_VERSION = "1.0" + + +class DescriptorResource(TypedDict, total=False): + """A single resource entry in the frictionless descriptor ``resources`` list.""" + + name: str + path: str + format: str + bytes: int | None + hash: str | None + + +# ── Public helpers ──────────────────────────────────────────────────────── + + +def build_descriptor_key(assembly_dir: str, key_prefix: str) -> str: + """Return the S3 key for the live descriptor of *assembly_dir*. + + :param assembly_dir: full assembly directory name, e.g. ``GCF_000001215.4_Release_6_plus_ISO1_MT`` + :param key_prefix: Lakehouse key prefix (trailing slash optional) + :return: S3 key, e.g. ``tenant-general-warehouse/.../ncbi/metadata/GCF_..._datapackage.json`` + """ + prefix = key_prefix.rstrip("/") + "/" + return f"{prefix}metadata/{assembly_dir}_datapackage.json" + + +def build_archive_descriptor_key(assembly_dir: str, release_tag: str, key_prefix: str) -> str: + """Return the S3 key for the archived descriptor of *assembly_dir*. + + :param assembly_dir: full assembly directory name + :param release_tag: NCBI release tag used in the archive path, e.g. 
``"2024-01"`` + :param key_prefix: Lakehouse key prefix + :return: S3 key under ``archive/{release_tag}/metadata/`` + """ + prefix = key_prefix.rstrip("/") + "/" + return f"{prefix}archive/{release_tag}/metadata/{assembly_dir}_datapackage.json" + + +def create_descriptor( + assembly_dir: str, + accession_full: str, + resources: list[DescriptorResource], + *, + timestamp: int | None = None, +) -> dict[str, Any]: + """Build a KBase credit metadata descriptor for an NCBI assembly. + + Matches the schema produced by + ``kbase-transfers/scripts/ncbi/download_genomes.py::create_frictionless_descriptor()``. + + Resource names are lowercased. Resources whose ``hash`` value is ``None`` + have the ``hash`` key removed entirely (frictionless does not accept null + hash values). + + :param assembly_dir: full assembly directory name (includes the accession + suffix, e.g. ``GCF_000001215.4_Release_6_plus_ISO1_MT``) + :param accession_full: accession without suffix, e.g. ``GCF_000001215.4`` + :param resources: list of :class:`DescriptorResource` dicts + :param timestamp: Unix timestamp to embed; defaults to ``datetime.now(UTC)`` + :return: descriptor dict ready for serialisation and frictionless validation + """ + ts = timestamp if timestamp is not None else int(datetime.now(UTC).timestamp()) + version = accession_full.rsplit(".", 1)[-1] # e.g. 
"4" from "GCF_000001215.4" + + # Normalise resources: lowercase name, drop null hash + normalised: list[dict[str, Any]] = [] + for res in resources: + entry: dict[str, Any] = { + "name": res["name"].lower(), + "path": res["path"], + "format": res.get("format", ""), + } + if res.get("bytes") is not None: + entry["bytes"] = res["bytes"] + if res.get("hash") is not None: + entry["hash"] = res["hash"] + normalised.append(entry) + + return { + "identifier": f"NCBI:{accession_full}", + "resource_type": "dataset", + "version": version, + "titles": [{"title": f"NCBI Genome Assembly {assembly_dir}"}], + "descriptions": [ + {"description_text": (f"Genome assembly files for {accession_full} downloaded from NCBI Datasets")} + ], + "url": f"https://www.ncbi.nlm.nih.gov/datasets/genome/{accession_full}/", + "contributors": [_NCBI_CONTRIBUTOR], + "publisher": _NCBI_PUBLISHER, + "license": {}, + "meta": { + "credit_metadata_schema_version": _SCHEMA_VERSION, + "credit_metadata_source": [ + { + "source_name": "NCBI Genomes FTP", + "source_url": "ftp.ncbi.nlm.nih.gov/genomes/all/", + "access_timestamp": ts, + } + ], + "saved_by": _SAVED_BY, + "timestamp": ts, + }, + "resources": normalised, + } + + +def validate_descriptor(descriptor: dict[str, Any], accession_full: str) -> None: + """Validate a descriptor with frictionless. 
+ + :param descriptor: descriptor dict from :func:`create_descriptor` + :param accession_full: accession (used only in error messages) + :raises ValueError: if frictionless reports any metadata errors + """ + errors = list(Package.metadata_validate(descriptor)) + if errors: + error_details = "; ".join(str(e) for e in errors) + msg = f"Frictionless validation failed for {accession_full}: {error_details}" + raise ValueError(msg) + logger.debug("Frictionless descriptor valid for %s", accession_full) + + +def upload_descriptor( + descriptor: dict[str, Any], + assembly_dir: str, + bucket: str, + key_prefix: str, + *, + dry_run: bool = False, +) -> str: + """Serialise and upload a descriptor to the live ``metadata/`` path. + + :param descriptor: descriptor dict from :func:`create_descriptor` + :param assembly_dir: full assembly directory name + :param bucket: S3 bucket name + :param key_prefix: Lakehouse key prefix + :param dry_run: if True, log without uploading + :return: S3 key the descriptor was (or would be) written to + """ + key = build_descriptor_key(assembly_dir, key_prefix) + + if dry_run: + logger.info("[dry-run] would upload descriptor: s3://%s/%s", bucket, key) + return key + + s3 = get_s3_client() + body = json.dumps(descriptor, indent=2).encode() + + with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp: + tmp_path = tmp.name + tmp.write(body) + + try: + s3.upload_file(Filename=tmp_path, Bucket=bucket, Key=key) + logger.info("Uploaded descriptor: s3://%s/%s", bucket, key) + finally: + Path(tmp_path).unlink() + + return key + + +def archive_descriptor( # noqa: PLR0913 + assembly_dir: str, + bucket: str, + key_prefix: str, + release_tag: str, + *, + archive_reason: str = "unknown", + dry_run: bool = False, +) -> bool: + """Copy the live descriptor to the archive path. + + If the live descriptor does not yet exist (e.g. archival is triggered + before the first promote), logs a warning and returns ``False``. 
+ + :param assembly_dir: full assembly directory name + :param bucket: S3 bucket name + :param key_prefix: Lakehouse key prefix + :param release_tag: NCBI release tag for the archive path + :param archive_reason: metadata value describing why archived (matches raw data metadata) + :param dry_run: if True, log without copying + :return: ``True`` if the descriptor was (or would be) archived; ``False`` if not found + """ + source_key = build_descriptor_key(assembly_dir, key_prefix) + archive_key = build_archive_descriptor_key(assembly_dir, release_tag, key_prefix) + + if dry_run: + logger.info("[dry-run] would archive descriptor: s3://%s/%s -> %s", bucket, source_key, archive_key) + return True + + s3 = get_s3_client() + try: + s3.head_object(Bucket=bucket, Key=source_key) + except s3.exceptions.NoSuchKey: + logger.warning("Descriptor not found, skipping archive: s3://%s/%s", bucket, source_key) + return False + except Exception as e: + # head_object raises ClientError with 404 when key is absent + if hasattr(e, "response") and e.response.get("Error", {}).get("Code") in ("404", "NoSuchKey"): # type: ignore[union-attr] + logger.warning("Descriptor not found, skipping archive: s3://%s/%s", bucket, source_key) + return False + raise + + datestamp = datetime.now(UTC).strftime("%Y-%m-%d") + copy_object_with_metadata( + f"{bucket}/{source_key}", + f"{bucket}/{archive_key}", + metadata={ + "ncbi_last_release": release_tag, + "archive_reason": archive_reason, + "archive_date": datestamp, + }, + ) + logger.debug("Archived descriptor: s3://%s/%s -> %s", bucket, source_key, archive_key) + return True diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 6faf8c79..7767ccb2 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -8,12 +8,19 @@ import re import tempfile +from collections import defaultdict from datetime import UTC, datetime from pathlib import Path, PurePosixPath from 
typing import Any import botocore.exceptions +from cdm_data_loaders.ncbi_ftp.metadata import ( + DescriptorResource, + archive_descriptor, + create_descriptor, + upload_descriptor, +) from cdm_data_loaders.utils.cdm_logger import get_cdm_logger from cdm_data_loaders.utils.s3 import ( copy_object_with_metadata, @@ -60,9 +67,6 @@ def promote_from_s3( # noqa: PLR0913 paginator = s3.get_paginator("list_objects_v2") normalized_staging_key_prefix = staging_key_prefix.rstrip("/") + "/" - promoted = 0 - failed = 0 - # Collect all objects under the staging prefix staged_objects: list[str] = [] for page in paginator.paginate(Bucket=bucket, Prefix=normalized_staging_key_prefix): @@ -91,13 +95,79 @@ def promote_from_s3( # noqa: PLR0913 dry_run=dry_run, ) + promoted, failed, promoted_accessions, assembly_resources = _promote_data_files( + data_files, + sidecars, + normalized_staging_key_prefix, + lakehouse_key_prefix, + bucket, + dry_run=dry_run, + ) + + # Trim manifest for resumability + if manifest_s3_key and promoted_accessions and not dry_run: + _trim_manifest(manifest_s3_key, bucket, promoted_accessions) + + # Upload frictionless descriptors for each promoted assembly + descriptors_written = 0 + for (adir, acc), resources in assembly_resources.items(): + if not resources: + continue + try: + descriptor = create_descriptor(adir, acc, resources) + upload_descriptor(descriptor, adir, bucket, lakehouse_key_prefix, dry_run=dry_run) + descriptors_written += 1 + except Exception: + logger.exception("Failed to write descriptor for %s", adir) + + if descriptors_written: + logger.info("Wrote %d frictionless descriptor(s)", descriptors_written) + + report: dict[str, Any] = { + "timestamp": datetime.now(UTC).isoformat(), + "promoted": promoted, + "archived": archived, + "failed": failed, + "dry_run": dry_run, + } + + logger.info( + "PROMOTE SUMMARY: %d promoted, %d archived, %d failed%s", + promoted, + archived, + failed, + " (dry-run)" if dry_run else "", + ) + return report + + +# 
── Promote data files (per-file loop) ────────────────────────────────── + + +def _promote_data_files( # noqa: PLR0913, PLR0915 + data_files: list[str], + sidecars: set[str], + normalized_staging_prefix: str, + lakehouse_key_prefix: str, + bucket: str, + *, + dry_run: bool, +) -> tuple[int, int, set[str], defaultdict[tuple[str, str], list[DescriptorResource]]]: + """Promote each data file from staging to the final Lakehouse path. + + :return: (promoted_count, failed_count, promoted_accessions, assembly_resources) + """ + s3 = get_s3_client() + promoted = 0 + failed = 0 promoted_accessions: set[str] = set() + assembly_resources: defaultdict[tuple[str, str], list[DescriptorResource]] = defaultdict(list) for staged_key in data_files: if staged_key.endswith("download_report.json"): continue - rel_path = staged_key[len(normalized_staging_key_prefix) :] + rel_path = staged_key[len(normalized_staging_prefix) :] if not rel_path.startswith("raw_data/"): continue final_key = lakehouse_key_prefix + rel_path @@ -139,32 +209,30 @@ def promote_from_s3( # noqa: PLR0913 if acc_match: promoted_accessions.add(acc_match.group(1)) + # Track resources for frictionless descriptor creation + adir_match = re.search(r"raw_data/GC[AF]/\d+/\d+/\d+/([^/]+)/", final_key) + if adir_match and acc_match: + adir = adir_match.group(1) + acc = acc_match.group(1) + fname = final_key_path.name + ext = fname.rsplit(".", 1)[-1] if "." 
in fname else "" + md5_hash = metadata.get("md5") + resource: DescriptorResource = { + "name": fname.lower(), + "path": final_key, + "format": ext, + "bytes": Path(tmp_path).stat().st_size, + "hash": md5_hash, + } + assembly_resources[(adir, acc)].append(resource) + finally: Path(tmp_path).unlink() except Exception: logger.exception("Failed to promote %s", staged_key) failed += 1 - # Trim manifest for resumability - if manifest_s3_key and promoted_accessions and not dry_run: - _trim_manifest(manifest_s3_key, bucket, promoted_accessions) - - report: dict[str, Any] = { - "timestamp": datetime.now(UTC).isoformat(), - "promoted": promoted, - "archived": archived, - "failed": failed, - "dry_run": dry_run, - } - - logger.info( - "PROMOTE SUMMARY: %d promoted, %d archived, %d failed%s", - promoted, - archived, - failed, - " (dry-run)" if dry_run else "", - ) - return report + return promoted, failed, promoted_accessions, assembly_resources # ── Archive assemblies ────────────────────────────────────────────────── @@ -223,6 +291,14 @@ def _archive_assemblies( # noqa: PLR0913 logger.debug("No objects found for %s, skipping archive", accession) continue + # Infer assembly_dir from key paths for descriptor archival + assembly_dir: str | None = None + for key in matching_keys: + adir_match = re.search(r"raw_data/GC[AF]/\d+/\d+/\d+/([^/]+)/", key) + if adir_match: + assembly_dir = adir_match.group(1) + break + for source_key in matching_keys: rel = source_key[len(lakehouse_key_prefix) :] archive_key = f"{lakehouse_key_prefix}archive/{release_tag}/{rel}" @@ -249,6 +325,20 @@ def _archive_assemblies( # noqa: PLR0913 except Exception: logger.exception("Failed to archive %s", source_key) + # Archive the frictionless descriptor alongside raw data + if assembly_dir: + try: + archive_descriptor( + assembly_dir, + bucket, + lakehouse_key_prefix, + release_tag, + archive_reason=archive_reason, + dry_run=dry_run, + ) + except Exception: + logger.exception("Failed to archive descriptor 
for %s", assembly_dir) + logger.info("Archived %d objects for %d accessions (%s)", archived, len(accessions), archive_reason) return archived diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index 69fa3999..b97aa542 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -11,11 +11,17 @@ from __future__ import annotations import hashlib +import json from typing import TYPE_CHECKING import pytest from cdm_data_loaders.ncbi_ftp.assembly import build_accession_path +from cdm_data_loaders.ncbi_ftp.metadata import ( + build_archive_descriptor_key, + build_descriptor_key, + create_descriptor, +) from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 from .conftest import get_object_metadata, list_all_keys, seed_lakehouse @@ -318,3 +324,170 @@ def test_incomplete_staging(self, minio_s3_client: object, test_bucket: str) -> # No objects at final path final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") assert len(final_keys) == 0 + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteCreatesDescriptor: + """Promote step writes a frictionless descriptor for each promoted assembly.""" + + def test_descriptor_created(self, minio_s3_client: object, test_bucket: str) -> None: + """After promote, a JSON descriptor exists under ``metadata/``.""" + s3 = minio_s3_client + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) + obj = s3.get_object(Bucket=test_bucket, Key=descriptor_key) + body = json.loads(obj["Body"].read()) + + assert body["identifier"] == f"NCBI:{ACCESSION_A}" + assert body["resource_type"] == "dataset" + + def test_descriptor_resources_include_promoted_files(self, minio_s3_client: object, test_bucket: str) -> None: + 
"""Descriptor's ``resources`` list references the final Lakehouse key.""" + s3 = minio_s3_client + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) + obj = s3.get_object(Bucket=test_bucket, Key=descriptor_key) + body = json.loads(obj["Body"].read()) + + resource_paths = [r["path"] for r in body["resources"]] + assert any(PATH_PREFIX + "raw_data/" in p for p in resource_paths) + + def test_descriptor_resources_have_md5(self, minio_s3_client: object, test_bucket: str) -> None: + """Resources with .md5 sidecars include the hash value.""" + s3 = minio_s3_client + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) + obj = s3.get_object(Bucket=test_bucket, Key=descriptor_key) + body = json.loads(obj["Body"].read()) + + # Both staged files have .md5 sidecars + for resource in body["resources"]: + assert "hash" in resource, f"Expected hash in resource: {resource}" + + def test_multiple_assemblies_get_separate_descriptors(self, minio_s3_client: object, test_bucket: str) -> None: + """Each assembly gets its own descriptor file.""" + s3 = minio_s3_client + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_B) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + for assembly_dir, accession in [(ASSEMBLY_DIR_A, ACCESSION_A), (ASSEMBLY_DIR_B, ACCESSION_B)]: + key = build_descriptor_key(assembly_dir, PATH_PREFIX) + obj = s3.get_object(Bucket=test_bucket, Key=key) + body = json.loads(obj["Body"].read()) + assert body["identifier"] == f"NCBI:{accession}" + + +@pytest.mark.integration 
+@pytest.mark.slow_test +class TestPromoteArchiveUpdatedIncludesDescriptor: + """Archiving updated assemblies also archives the descriptor.""" + + def test_archive_copies_descriptor(self, minio_s3_client: object, test_bucket: str, tmp_path: Path) -> None: + """After archiving an updated assembly, the descriptor appears under archive/.""" + s3 = minio_s3_client + + # Seed old version at Lakehouse path *including* a live descriptor + old_files = {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": "old content"} + seed_lakehouse(s3, test_bucket, ACCESSION_A, old_files, PATH_PREFIX, ASSEMBLY_DIR_A) + # Pre-upload a descriptor so archive_descriptor can find it + descriptor = create_descriptor(ASSEMBLY_DIR_A, ACCESSION_A, []) + # Upload directly to MinIO (not via promote) + descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) + s3.put_object(Bucket=test_bucket, Key=descriptor_key, Body=json.dumps(descriptor).encode()) + + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + bucket=test_bucket, + updated_manifest_path=str(updated_manifest), + ncbi_release="2024-01", + lakehouse_key_prefix=PATH_PREFIX, + ) + + archive_key = build_archive_descriptor_key(ASSEMBLY_DIR_A, "2024-01", PATH_PREFIX) + # Confirm the archive descriptor object exists + resp = s3.head_object(Bucket=test_bucket, Key=archive_key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteArchiveRemovedIncludesDescriptor: + """Archiving removed assemblies also archives the descriptor.""" + + def test_archive_removed_copies_descriptor(self, minio_s3_client: object, test_bucket: str, tmp_path: Path) -> None: + """After archiving a removed assembly, the descriptor is under archive/.""" + s3 = minio_s3_client + + # Seed the assembly at final Lakehouse path + files = 
{f"{ASSEMBLY_DIR_A}_genomic.fna.gz": "content"} + seed_lakehouse(s3, test_bucket, ACCESSION_A, files, PATH_PREFIX, ASSEMBLY_DIR_A) + # Pre-upload a descriptor + descriptor = create_descriptor(ASSEMBLY_DIR_A, ACCESSION_A, []) + descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) + s3.put_object(Bucket=test_bucket, Key=descriptor_key, Body=json.dumps(descriptor).encode()) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + bucket=test_bucket, + removed_manifest_path=str(removed_manifest), + ncbi_release="2024-01", + lakehouse_key_prefix=PATH_PREFIX, + ) + + archive_key = build_archive_descriptor_key(ASSEMBLY_DIR_A, "2024-01", PATH_PREFIX) + resp = s3.head_object(Bucket=test_bucket, Key=archive_key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteDryRunNoDescriptor: + """Dry-run must not write any descriptor files.""" + + def test_dry_run_no_descriptor(self, minio_s3_client: object, test_bucket: str) -> None: + """Dry-run does not upload a descriptor to the metadata/ prefix.""" + s3 = minio_s3_client + _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + dry_run=True, + ) + + metadata_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "metadata/") + assert len(metadata_keys) == 0, f"Dry-run should not create descriptor files, found: {metadata_keys}" diff --git a/tests/ncbi_ftp/test_metadata.py b/tests/ncbi_ftp/test_metadata.py new file mode 100644 index 00000000..821ac81d --- /dev/null +++ b/tests/ncbi_ftp/test_metadata.py @@ -0,0 +1,369 @@ +"""Unit tests for cdm_data_loaders.ncbi_ftp.metadata.""" + +from __future__ import annotations + +import json +import time +from typing import TYPE_CHECKING +from unittest.mock import MagicMock, patch + +import 
boto3 +import pytest +from moto import mock_aws + +if TYPE_CHECKING: + from collections.abc import Generator + + import botocore.client + +import cdm_data_loaders.ncbi_ftp.metadata as metadata_mod +import cdm_data_loaders.utils.s3 as s3_utils +from cdm_data_loaders.ncbi_ftp.metadata import ( + DescriptorResource, + archive_descriptor, + build_archive_descriptor_key, + build_descriptor_key, + create_descriptor, + upload_descriptor, + validate_descriptor, +) +from cdm_data_loaders.utils.s3 import reset_s3_client +from tests.ncbi_ftp.conftest import TEST_BUCKET + +AWS_REGION = "us-east-1" + +_ACCESSION = "GCF_000001215.4" +_ASSEMBLY_DIR = "GCF_000001215.4_Release_6_plus_ISO1_MT" +_RELEASE_TAG = "2024-01" +_KEY_PREFIX = "tenant-general-warehouse/kbase/datasets/ncbi/" +_TIMESTAMP = 1_700_000_000 + +_SAMPLE_RESOURCES: list[DescriptorResource] = [ + { + "name": "GCF_000001215.4_genomic.fna.gz", + "path": f"{_KEY_PREFIX}raw_data/GCF/000/001/215/{_ASSEMBLY_DIR}/GCF_000001215.4_genomic.fna.gz", + "format": "gz", + "bytes": 1024, + "hash": "abc123", + }, + { + "name": "GCF_000001215.4_assembly_report.txt", + "path": f"{_KEY_PREFIX}raw_data/GCF/000/001/215/{_ASSEMBLY_DIR}/GCF_000001215.4_assembly_report.txt", + "format": "txt", + "bytes": 512, + "hash": None, # no md5 sidecar for this one + }, +] + + +# ── build_descriptor_key ───────────────────────────────────────────────── + + +class TestBuildDescriptorKey: + """Tests for build_descriptor_key path helper.""" + + def test_produces_metadata_path(self) -> None: + """Key is located under metadata/ with _datapackage.json suffix.""" + key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + assert key == f"{_KEY_PREFIX}metadata/{_ASSEMBLY_DIR}_datapackage.json" + + def test_trailing_slash_normalised(self) -> None: + """Key is the same whether key_prefix ends with a slash or not.""" + key_no_slash = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX.rstrip("/")) + key_with_slash = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + 
assert key_no_slash == key_with_slash + + def test_no_double_slash(self) -> None: + """Key never contains a double slash.""" + key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + assert "//" not in key + + +# ── build_archive_descriptor_key ───────────────────────────────────────── + + +class TestBuildArchiveDescriptorKey: + """Tests for build_archive_descriptor_key path helper.""" + + def test_produces_archive_path(self) -> None: + """Key is located under archive/{release_tag}/metadata/.""" + key = build_archive_descriptor_key(_ASSEMBLY_DIR, _RELEASE_TAG, _KEY_PREFIX) + expected = f"{_KEY_PREFIX}archive/{_RELEASE_TAG}/metadata/{_ASSEMBLY_DIR}_datapackage.json" + assert key == expected + + def test_trailing_slash_normalised(self) -> None: + """Key is the same whether key_prefix ends with a slash or not.""" + a = build_archive_descriptor_key(_ASSEMBLY_DIR, _RELEASE_TAG, _KEY_PREFIX.rstrip("/")) + b = build_archive_descriptor_key(_ASSEMBLY_DIR, _RELEASE_TAG, _KEY_PREFIX) + assert a == b + + def test_release_tag_in_path(self) -> None: + """Release tag appears in the archive key path.""" + key = build_archive_descriptor_key(_ASSEMBLY_DIR, "2025-06", _KEY_PREFIX) + assert "2025-06" in key + + +# ── create_descriptor ──────────────────────────────────────────────────── + + +class TestCreateDescriptor: + """Tests for create_descriptor().""" + + def test_identifier(self) -> None: + """Identifier field is prefixed with NCBI:.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["identifier"] == f"NCBI:{_ACCESSION}" + + def test_version_extracted_from_accession(self) -> None: + """Version is the suffix after the last dot in the accession.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["version"] == "4" # last segment of GCF_000001215.4 + + def test_title_includes_assembly_dir(self) -> None: + """Title includes the full assembly directory name.""" + d = 
create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert _ASSEMBLY_DIR in d["titles"][0]["title"] + + def test_description_includes_accession(self) -> None: + """Description text includes the accession.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert _ACCESSION in d["descriptions"][0]["description_text"] + + def test_url_references_accession(self) -> None: + """URL points to the NCBI genome page for the accession.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert _ACCESSION in d["url"] + assert "ncbi.nlm.nih.gov" in d["url"] + + def test_ncbi_contributor(self) -> None: + """Contributor is NCBI with the correct ROR ID.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["contributors"][0]["name"] == "National Center for Biotechnology Information" + assert d["contributors"][0]["contributor_id"] == "ROR:02meqm098" + + def test_saved_by(self) -> None: + """meta.saved_by is the cdm-data-loaders identifier.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["meta"]["saved_by"] == "cdm-data-loaders-ncbi-ftp" + + def test_timestamp_propagated(self) -> None: + """Explicit timestamp is used for both meta.timestamp and access_timestamp.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["meta"]["timestamp"] == _TIMESTAMP + assert d["meta"]["credit_metadata_source"][0]["access_timestamp"] == _TIMESTAMP + + def test_default_timestamp_is_recent(self) -> None: + """Default timestamp is close to current time when not specified.""" + before = int(time.time()) + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES) + after = int(time.time()) + ts = d["meta"]["timestamp"] + assert before <= ts <= after + 1 + + def test_resource_names_lowercased(self) -> None: + 
"""Resource names are converted to lowercase.""" + resources: list[DescriptorResource] = [ + {"name": "FILE_UPPER.FNA.GZ", "path": "s3://bucket/a", "format": "gz", "bytes": 100, "hash": "x"}, + ] + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, resources, timestamp=_TIMESTAMP) + assert d["resources"][0]["name"] == "file_upper.fna.gz" + + def test_null_hash_omitted(self) -> None: + """Resources with hash=None must not include the 'hash' key.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + resources = d["resources"] + # Second resource has hash=None → key absent + assert "hash" not in resources[1] + + def test_non_null_hash_present(self) -> None: + """Non-null hash is retained in the resource entry.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["resources"][0]["hash"] == _SAMPLE_RESOURCES[0]["hash"] + + def test_resource_count(self) -> None: + """Resource list length matches the number of input resources.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert len(d["resources"]) == len(_SAMPLE_RESOURCES) + + def test_resource_bytes(self) -> None: + """Resource bytes matches the input bytes value.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["resources"][0]["bytes"] == _SAMPLE_RESOURCES[0]["bytes"] + + def test_resource_path(self) -> None: + """Resource path matches the input path value.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["resources"][0]["path"] == _SAMPLE_RESOURCES[0]["path"] + + def test_license_is_empty_dict(self) -> None: + """License field is an empty dict.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["license"] == {} + + def test_resource_type_is_dataset(self) -> None: + """resource_type is 'dataset'.""" + d = 
create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["resource_type"] == "dataset" + + def test_schema_version(self) -> None: + """credit_metadata_schema_version is '1.0'.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + assert d["meta"]["credit_metadata_schema_version"] == "1.0" + + def test_empty_resources_allowed(self) -> None: + """Empty resources list produces a valid descriptor.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, [], timestamp=_TIMESTAMP) + assert d["resources"] == [] + + def test_null_bytes_omitted(self) -> None: + """Resources with bytes=None have the 'bytes' key removed from the output.""" + resources: list[DescriptorResource] = [ + {"name": "f.txt", "path": "s3://b/f.txt", "format": "txt", "bytes": None, "hash": "x"}, + ] + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, resources, timestamp=_TIMESTAMP) + assert "bytes" not in d["resources"][0] + + +# ── validate_descriptor ────────────────────────────────────────────────── + + +class TestValidateDescriptor: + """Tests for validate_descriptor().""" + + def test_valid_descriptor_passes(self) -> None: + """Valid descriptor does not raise.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + # Should not raise + validate_descriptor(d, _ACCESSION) + + def test_empty_descriptor_raises(self) -> None: + """Empty dict fails frictionless validation and raises.""" + with pytest.raises((ValueError, Exception)): + validate_descriptor({}, _ACCESSION) + + +# ── upload_descriptor ──────────────────────────────────────────────────── + + +@pytest.mark.s3 +class TestUploadDescriptor: + """Tests for upload_descriptor() using moto-mocked S3.""" + + @pytest.fixture + def mock_s3(self) -> Generator[botocore.client.BaseClient]: + """Yield a mocked S3 client with the CDM Lake bucket pre-created.""" + with mock_aws(): + client = boto3.client("s3", region_name=AWS_REGION) + 
client.create_bucket(Bucket=TEST_BUCKET) + reset_s3_client() + with ( + patch.object(s3_utils, "get_s3_client", return_value=client), + patch.object(metadata_mod, "get_s3_client", return_value=client), + ): + yield client + reset_s3_client() + + def test_uploads_json(self, mock_s3: botocore.client.BaseClient) -> None: + """Uploaded object is valid JSON with the expected identifier.""" + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX) + assert key == build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + obj = mock_s3.get_object(Bucket=TEST_BUCKET, Key=key) + body = json.loads(obj["Body"].read()) + assert body["identifier"] == f"NCBI:{_ACCESSION}" + + def test_returns_expected_key(self, mock_s3: botocore.client.BaseClient) -> None: + """Return value is the metadata/ S3 key for the assembly.""" + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX) + assert key.startswith(_KEY_PREFIX) + assert key.endswith("_datapackage.json") + + def test_dry_run_skips_upload(self, mock_s3: botocore.client.BaseClient) -> None: + """Dry-run returns the key but does not create any S3 object.""" + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, dry_run=True) + # No object in S3 + objs = mock_s3.list_objects_v2(Bucket=TEST_BUCKET).get("Contents", []) + assert not any(o["Key"] == key for o in objs) + + def test_dry_run_returns_key(self, mock_s3: botocore.client.BaseClient) -> None: + """Dry-run returns the same key as a real upload would.""" + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, 
_KEY_PREFIX, dry_run=True) + assert key == build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + + +# ── archive_descriptor ─────────────────────────────────────────────────── + + +@pytest.mark.s3 +class TestArchiveDescriptor: + """Tests for archive_descriptor() using moto-mocked S3.""" + + @pytest.fixture + def mock_s3_with_descriptor(self) -> Generator[tuple[botocore.client.BaseClient, MagicMock]]: + """S3 with a live descriptor already uploaded.""" + with mock_aws(): + client = boto3.client("s3", region_name=AWS_REGION) + client.create_bucket(Bucket=TEST_BUCKET) + # Pre-upload a descriptor + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + live_key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + client.put_object( + Bucket=TEST_BUCKET, + Key=live_key, + Body=json.dumps(descriptor).encode(), + ) + reset_s3_client() + with ( + patch.object(s3_utils, "get_s3_client", return_value=client), + patch.object(metadata_mod, "get_s3_client", return_value=client), + patch.object(metadata_mod, "copy_object_with_metadata") as mock_copy, + ): + yield client, mock_copy + reset_s3_client() + + def test_returns_true_when_descriptor_exists( + self, mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock] + ) -> None: + """Returns True when the live descriptor object exists in S3.""" + _, _ = mock_s3_with_descriptor + result = archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG) + assert result is True + + def test_calls_copy_with_correct_keys( + self, mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock] + ) -> None: + """copy_object_with_metadata is called with the live and archive keys.""" + _, mock_copy = mock_s3_with_descriptor + archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG) + live_key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + archive_key = build_archive_descriptor_key(_ASSEMBLY_DIR, _RELEASE_TAG, _KEY_PREFIX) + 
mock_copy.assert_called_once() + args = mock_copy.call_args + assert f"{TEST_BUCKET}/{live_key}" in args[0] + assert f"{TEST_BUCKET}/{archive_key}" in args[0] + + def test_dry_run_returns_true_without_copy( + self, mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock] + ) -> None: + """Dry-run returns True but does not call copy_object_with_metadata.""" + _, mock_copy = mock_s3_with_descriptor + result = archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG, dry_run=True) + assert result is True + mock_copy.assert_not_called() + + def test_missing_descriptor_returns_false(self) -> None: + """Returns False when no descriptor exists at the live key.""" + with mock_aws(): + client = boto3.client("s3", region_name=AWS_REGION) + client.create_bucket(Bucket=TEST_BUCKET) + reset_s3_client() + with ( + patch.object(s3_utils, "get_s3_client", return_value=client), + patch.object(metadata_mod, "get_s3_client", return_value=client), + ): + result = archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG) + reset_s3_client() + assert result is False From 21dda1cf6f834f5db320e2b71f03591f3957c318 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 22 Apr 2026 11:59:52 -0700 Subject: [PATCH 35/76] Potential fix for pull request finding 'CodeQL / Incomplete URL substring sanitization' Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- tests/ncbi_ftp/test_metadata.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ncbi_ftp/test_metadata.py b/tests/ncbi_ftp/test_metadata.py index 821ac81d..446378f3 100644 --- a/tests/ncbi_ftp/test_metadata.py +++ b/tests/ncbi_ftp/test_metadata.py @@ -6,6 +6,7 @@ import time from typing import TYPE_CHECKING from unittest.mock import MagicMock, patch +from urllib.parse import urlparse import boto3 import pytest @@ -133,7 +134,8 @@ def test_url_references_accession(self) -> None: """URL points to the NCBI 
genome page for the accession.""" d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) assert _ACCESSION in d["url"] - assert "ncbi.nlm.nih.gov" in d["url"] + parsed = urlparse(d["url"]) + assert parsed.hostname == "ncbi.nlm.nih.gov" def test_ncbi_contributor(self) -> None: """Contributor is NCBI with the correct ROR ID.""" From 803d8c4989947f0cb8052277b261f7f6393b3f0e Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 22 Apr 2026 12:14:00 -0700 Subject: [PATCH 36/76] update url parsing test --- tests/ncbi_ftp/test_metadata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ncbi_ftp/test_metadata.py b/tests/ncbi_ftp/test_metadata.py index 446378f3..d9c307bb 100644 --- a/tests/ncbi_ftp/test_metadata.py +++ b/tests/ncbi_ftp/test_metadata.py @@ -135,7 +135,8 @@ def test_url_references_accession(self) -> None: d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) assert _ACCESSION in d["url"] parsed = urlparse(d["url"]) - assert parsed.hostname == "ncbi.nlm.nih.gov" + assert parsed.hostname is not None + assert parsed.hostname.endswith("ncbi.nlm.nih.gov") def test_ncbi_contributor(self) -> None: """Contributor is NCBI with the correct ROR ID.""" From 280bb635b6cbd83fce180bf4a1f765778b687ba2 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 22 Apr 2026 12:16:05 -0700 Subject: [PATCH 37/76] Potential fix for pull request finding 'CodeQL / Incomplete URL substring sanitization' Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- tests/ncbi_ftp/test_metadata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/ncbi_ftp/test_metadata.py b/tests/ncbi_ftp/test_metadata.py index d9c307bb..cc13c192 100644 --- a/tests/ncbi_ftp/test_metadata.py +++ b/tests/ncbi_ftp/test_metadata.py @@ -135,8 +135,9 @@ def test_url_references_accession(self) -> None: d = create_descriptor(_ASSEMBLY_DIR, 
_ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) assert _ACCESSION in d["url"] parsed = urlparse(d["url"]) - assert parsed.hostname is not None - assert parsed.hostname.endswith("ncbi.nlm.nih.gov") + host = parsed.hostname + assert host is not None + assert host == "ncbi.nlm.nih.gov" or host.endswith(".ncbi.nlm.nih.gov") def test_ncbi_contributor(self) -> None: """Contributor is NCBI with the correct ROR ID.""" From 09df49611c06624dcfcf3b8a6700a2e2578c2954 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 12:32:28 -0700 Subject: [PATCH 38/76] merge file upload functions Co-authored-by: Copilot --- src/cdm_data_loaders/ncbi_ftp/promote.py | 4 +- src/cdm_data_loaders/utils/s3.py | 73 ++++++------------------ tests/utils/test_s3.py | 20 +++---- 3 files changed, 28 insertions(+), 69 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 7767ccb2..6edd293c 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -26,7 +26,7 @@ copy_object_with_metadata, delete_object, get_s3_client, - upload_file_with_metadata, + upload_file, ) logger = get_cdm_logger() @@ -191,7 +191,7 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 metadata["md5"] = md5_obj["Body"].read().decode().strip() final_key_path = PurePosixPath(final_key) - upload_succeeded = upload_file_with_metadata( + upload_succeeded = upload_file( tmp_path, f"{bucket}/{final_key_path.parent}", metadata=metadata, diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index 56f47a37..f6f4f632 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -167,15 +167,23 @@ def upload_file( local_file_path: Path | str, destination_dir: str, object_name: str | None = None, + metadata: dict[str, str] | None = None, ) -> bool: """Upload an object to an S3 bucket. 
+ When *metadata* is supplied the file is always uploaded (no existence check) + and the dict is attached as S3 user metadata. When *metadata* is ``None`` + (the default) the existing behaviour is preserved: the upload is skipped if + the object is already present. + :param local_file_path: File to upload :type local_file_path: Path | str :param destination_dir: path to the destination directory on s3, INCLUDING the bucket name and EXCLUDING the file name :type destination_dir: str :param object_name: S3 object name. If not specified, the name of the file from local_file_path is used. :type object_name: str | None + :param metadata: user metadata key/value pairs to attach to the object; when provided the upload always runs + :type metadata: dict[str, str] | None :return: True if file was uploaded, else False :rtype: bool """ @@ -190,13 +198,17 @@ def upload_file( object_name = local_file_path.name s3_path = f"{destination_dir.removesuffix('/')}/{object_name}" - if object_exists(s3_path): - print(f"File already present: {s3_path}") # noqa: T201 - return True + + if metadata is None: + if object_exists(s3_path): + print(f"File already present: {s3_path}") # noqa: T201 + return True s3 = get_s3_client() (bucket, key) = split_s3_path(s3_path) + extra_args = {**DEFAULT_EXTRA_ARGS, **(({"Metadata": metadata}) if metadata is not None else {})} + # Upload the file file_size = local_file_path.stat().st_size with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: @@ -207,7 +219,7 @@ def upload_file( Bucket=bucket, Key=key, Callback=pbar.update, - ExtraArgs=DEFAULT_EXTRA_ARGS, + ExtraArgs=extra_args, ) except (botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError) as e: print(f"Error uploading to s3: {e!s}") # noqa: T201 @@ -421,59 +433,6 @@ def delete_object(s3_path: str) -> dict[str, Any]: return s3.delete_object(Bucket=bucket, Key=key) -def upload_file_with_metadata( - local_file_path: Path | str, - destination_dir: str, 
- metadata: dict[str, str], - object_name: str | None = None, -) -> bool: - """Upload a file to S3 with user-defined metadata and CRC64NVME checksum. - - Unlike :func:`upload_file`, this function always uploads (no existence check) - and attaches the supplied *metadata* dict as S3 user metadata. - - :param local_file_path: file to upload - :type local_file_path: Path | str - :param destination_dir: path to the destination directory on s3, INCLUDING the bucket name - :type destination_dir: str - :param metadata: user metadata key/value pairs to attach to the object - :type metadata: dict[str, str] - :param object_name: S3 object name; defaults to the local filename - :type object_name: str | None - :return: True if the upload succeeded, otherwise False - :rtype: bool - """ - if isinstance(local_file_path, str): - local_file_path = Path(local_file_path) - - if not destination_dir: - msg = "No destination directory supplied for the file" - raise ValueError(msg) - - if not object_name: - object_name = local_file_path.name - - s3_path = f"{destination_dir.removesuffix('/')}/{object_name}" - s3 = get_s3_client() - (bucket, key) = split_s3_path(s3_path) - - extra_args = {**DEFAULT_EXTRA_ARGS, "Metadata": metadata} - - file_size = local_file_path.stat().st_size - try: - with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: - s3.upload_file( - Filename=str(local_file_path), - Bucket=bucket, - Key=key, - Callback=pbar.update, - ExtraArgs=extra_args, - ) - except (botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError): - return False - return True - - def head_object(s3_path: str) -> dict[str, Any] | None: """Return metadata for an S3 object, or None if it does not exist. 
diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index 6bd00208..a2acc80f 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -31,7 +31,7 @@ stream_to_s3, upload_dir, upload_file, - upload_file_with_metadata, + ) AWS_REGION = "us-east-1" @@ -782,13 +782,13 @@ def test_delete_object_removes_object(mock_s3_client: Any, bucket: str, protocol assert resp.get("ResponseMetadata", {}).get("HTTPStatusCode") == HTTP_STATUS_NO_CONTENT -# upload_file_with_metadata +# upload_file with metadata @pytest.mark.parametrize("bucket", BUCKETS) @pytest.mark.s3 def test_upload_file_with_metadata_attaches_metadata(mock_s3_client: Any, sample_file: Path, bucket: str) -> None: - """Verify that upload_file_with_metadata stores user metadata on the uploaded object.""" + """Verify that upload_file with metadata stores user metadata on the uploaded object.""" metadata = {"md5": "abc123", "source": "ncbi"} - result = upload_file_with_metadata(sample_file, f"{bucket}/uploads", metadata=metadata) + result = upload_file(sample_file, f"{bucket}/uploads", metadata=metadata) assert result is True resp = mock_s3_client.head_object(Bucket=bucket, Key=f"uploads/{sample_file.name}") @@ -799,7 +799,7 @@ def test_upload_file_with_metadata_attaches_metadata(mock_s3_client: Any, sample @pytest.mark.s3 def test_upload_file_with_metadata_custom_object_name(mock_s3_client: Any, sample_file: Path) -> None: """Verify that the object_name parameter overrides the filename.""" - result = upload_file_with_metadata( + result = upload_file( sample_file, f"{CDM_LAKE_BUCKET}/uploads", metadata={"k": "v"}, object_name="renamed.txt" ) assert result is True @@ -809,9 +809,9 @@ def test_upload_file_with_metadata_custom_object_name(mock_s3_client: Any, sampl @pytest.mark.s3 def test_upload_file_with_metadata_overwrites_existing(mock_s3_client: Any, sample_file: Path) -> None: - """Verify that upload_file_with_metadata uploads even when the object already exists.""" + """Verify that upload_file 
with metadata uploads even when the object already exists.""" mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}", Body=b"old") - result = upload_file_with_metadata(sample_file, f"{CDM_LAKE_BUCKET}/uploads", metadata={"new": "true"}) + result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads", metadata={"new": "true"}) assert result is True obj = mock_s3_client.get_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}") assert obj["Body"].read() == b"hello s3" @@ -822,15 +822,15 @@ def test_upload_file_with_metadata_overwrites_existing(mock_s3_client: Any, samp def test_upload_file_with_metadata_raises_on_empty_destination(sample_file: Path) -> None: """Verify ValueError when destination_dir is empty.""" with pytest.raises(ValueError, match="No destination directory"): - upload_file_with_metadata(sample_file, "", metadata={"k": "v"}) + upload_file(sample_file, "", metadata={"k": "v"}) @pytest.mark.usefixtures("mock_s3_client") @pytest.mark.parametrize("path_type", [str, Path]) @pytest.mark.s3 def test_upload_file_with_metadata_accepts_str_and_path(sample_file: Path, path_type: type[str] | type[Path]) -> None: - """Verify that upload_file_with_metadata accepts both str and Path.""" - result = upload_file_with_metadata(path_type(sample_file), f"{CDM_LAKE_BUCKET}/uploads", metadata={}) + """Verify that upload_file with metadata accepts both str and Path.""" + result = upload_file(path_type(sample_file), f"{CDM_LAKE_BUCKET}/uploads", metadata={}) assert result is True From c56c3c9f22e1d12c56edfa81fd7e8c28d0c3d8df Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 12:35:20 -0700 Subject: [PATCH 39/76] formatting --- tests/utils/test_s3.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index a2acc80f..a96a5cab 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -31,7 +31,6 @@ stream_to_s3, upload_dir, upload_file, - ) 
AWS_REGION = "us-east-1" @@ -799,9 +798,7 @@ def test_upload_file_with_metadata_attaches_metadata(mock_s3_client: Any, sample @pytest.mark.s3 def test_upload_file_with_metadata_custom_object_name(mock_s3_client: Any, sample_file: Path) -> None: """Verify that the object_name parameter overrides the filename.""" - result = upload_file( - sample_file, f"{CDM_LAKE_BUCKET}/uploads", metadata={"k": "v"}, object_name="renamed.txt" - ) + result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads", metadata={"k": "v"}, object_name="renamed.txt") assert result is True obj = mock_s3_client.get_object(Bucket=CDM_LAKE_BUCKET, Key="uploads/renamed.txt") assert obj["Body"].read() == b"hello s3" From 6d6415beb64dfde2d238cb42e05f61074fa39a8f Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 12:49:21 -0700 Subject: [PATCH 40/76] merge copy object functions Co-authored-by: Copilot --- src/cdm_data_loaders/ncbi_ftp/metadata.py | 4 +- src/cdm_data_loaders/ncbi_ftp/promote.py | 4 +- src/cdm_data_loaders/utils/s3.py | 59 ++++++++--------------- tests/ncbi_ftp/test_metadata.py | 6 +-- tests/utils/test_s3.py | 10 ++-- 5 files changed, 33 insertions(+), 50 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/metadata.py b/src/cdm_data_loaders/ncbi_ftp/metadata.py index 356ee0d6..4ef60363 100644 --- a/src/cdm_data_loaders/ncbi_ftp/metadata.py +++ b/src/cdm_data_loaders/ncbi_ftp/metadata.py @@ -27,7 +27,7 @@ from frictionless import Package from cdm_data_loaders.utils.cdm_logger import get_cdm_logger -from cdm_data_loaders.utils.s3 import copy_object_with_metadata, get_s3_client +from cdm_data_loaders.utils.s3 import copy_object, get_s3_client logger = get_cdm_logger() @@ -246,7 +246,7 @@ def archive_descriptor( # noqa: PLR0913 raise datestamp = datetime.now(UTC).strftime("%Y-%m-%d") - copy_object_with_metadata( + copy_object( f"{bucket}/{source_key}", f"{bucket}/{archive_key}", metadata={ diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py 
b/src/cdm_data_loaders/ncbi_ftp/promote.py index 6edd293c..ab345ba3 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -23,7 +23,7 @@ ) from cdm_data_loaders.utils.cdm_logger import get_cdm_logger from cdm_data_loaders.utils.s3 import ( - copy_object_with_metadata, + copy_object, delete_object, get_s3_client, upload_file, @@ -309,7 +309,7 @@ def _archive_assemblies( # noqa: PLR0913 continue try: - copy_object_with_metadata( + copy_object( f"{bucket}/{source_key}", f"{bucket}/{archive_key}", metadata={ diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index f6f4f632..13a850bd 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -386,19 +386,29 @@ def upload_dir( return all_successful -def copy_object(current_s3_path: str, new_s3_path: str) -> dict[str, Any]: +def copy_object( + current_s3_path: str, + new_s3_path: str, + metadata: dict[str, str] | None = None, +) -> dict[str, Any]: """Copy an object from one place to another, adding in a CRC64NVME checksum. + When *metadata* is supplied the destination object carries exactly those + key/value pairs (``MetadataDirective='REPLACE'``). When *metadata* is + ``None`` (the default) the source metadata is inherited. + A successful copy operation will return a response where resp["ResponseMetadata"]["HTTPStatusCode"] == 200 Errors (e.g, buckets or keys not existing, wrong credentials, etc.) are passed directly to the user without being caught. 
- :param current_path: path to the file on s3, INCLUDING the bucket name - :type current_path: str - :param new_path: the desired new file path on s3, INCLUDING the bucket name - :type new_path: str + :param current_s3_path: path to the file on s3, INCLUDING the bucket name + :type current_s3_path: str + :param new_s3_path: the desired new file path on s3, INCLUDING the bucket name + :type new_s3_path: str + :param metadata: user metadata to set on the destination object; when provided the source metadata is replaced + :type metadata: dict[str, str] | None :return: dictionary containing response :rtype: dict[str, Any] """ @@ -406,10 +416,16 @@ def copy_object(current_s3_path: str, new_s3_path: str) -> dict[str, Any]: (current_s3_bucket, current_s3_key) = split_s3_path(current_s3_path) (new_s3_bucket, new_s3_key) = split_s3_path(new_s3_path) + extra: dict[str, Any] = {} + if metadata is not None: + extra["Metadata"] = metadata + extra["MetadataDirective"] = "REPLACE" + return s3.copy_object( CopySource={"Bucket": current_s3_bucket, "Key": current_s3_key}, Bucket=new_s3_bucket, Key=new_s3_key, + **extra, **DEFAULT_EXTRA_ARGS, ) @@ -461,37 +477,4 @@ def head_object(s3_path: str) -> dict[str, Any] | None: } -def copy_object_with_metadata( - current_s3_path: str, - new_s3_path: str, - metadata: dict[str, str], -) -> dict[str, Any]: - """Copy an S3 object to a new location, replacing its user metadata. - - Uses ``MetadataDirective='REPLACE'`` so the destination object carries - exactly the supplied *metadata* rather than inheriting the source's metadata. - - A successful copy returns a response where - ``resp["ResponseMetadata"]["HTTPStatusCode"] == 200``. 
- - :param current_s3_path: source path on s3, INCLUDING the bucket name - :type current_s3_path: str - :param new_s3_path: destination path on s3, INCLUDING the bucket name - :type new_s3_path: str - :param metadata: user metadata to set on the destination object - :type metadata: dict[str, str] - :return: dictionary containing response - :rtype: dict[str, Any] - """ - s3 = get_s3_client() - (current_bucket, current_key) = split_s3_path(current_s3_path) - (new_bucket, new_key) = split_s3_path(new_s3_path) - return s3.copy_object( - CopySource={"Bucket": current_bucket, "Key": current_key}, - Bucket=new_bucket, - Key=new_key, - Metadata=metadata, - MetadataDirective="REPLACE", - **DEFAULT_EXTRA_ARGS, - ) diff --git a/tests/ncbi_ftp/test_metadata.py b/tests/ncbi_ftp/test_metadata.py index cc13c192..bcdd825d 100644 --- a/tests/ncbi_ftp/test_metadata.py +++ b/tests/ncbi_ftp/test_metadata.py @@ -323,7 +323,7 @@ def mock_s3_with_descriptor(self) -> Generator[tuple[botocore.client.BaseClient, with ( patch.object(s3_utils, "get_s3_client", return_value=client), patch.object(metadata_mod, "get_s3_client", return_value=client), - patch.object(metadata_mod, "copy_object_with_metadata") as mock_copy, + patch.object(metadata_mod, "copy_object") as mock_copy, ): yield client, mock_copy reset_s3_client() @@ -339,7 +339,7 @@ def test_returns_true_when_descriptor_exists( def test_calls_copy_with_correct_keys( self, mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock] ) -> None: - """copy_object_with_metadata is called with the live and archive keys.""" + """copy_object is called with the live and archive keys.""" _, mock_copy = mock_s3_with_descriptor archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG) live_key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) @@ -352,7 +352,7 @@ def test_calls_copy_with_correct_keys( def test_dry_run_returns_true_without_copy( self, mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock] ) -> 
None: - """Dry-run returns True but does not call copy_object_with_metadata.""" + """Dry-run returns True but does not call copy_object.""" _, mock_copy = mock_s3_with_descriptor result = archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG, dry_run=True) assert result is True diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index a96a5cab..8924f95a 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -19,7 +19,7 @@ CDM_LAKE_BUCKET, DEFAULT_EXTRA_ARGS, copy_object, - copy_object_with_metadata, + delete_object, download_file, get_s3_client, @@ -862,16 +862,16 @@ def test_head_object_with_protocols(mock_s3_client: Any, protocol: str) -> None: assert result["size"] == SIZE_DATA -# copy_object_with_metadata +# copy_object with metadata @pytest.mark.parametrize("destination", BUCKETS) @pytest.mark.s3 def test_copy_object_with_metadata_replaces_metadata(mocked_s3_client_no_checksum: Any, destination: str) -> None: - """Verify that copy_object_with_metadata copies and replaces metadata.""" + """Verify that copy_object with metadata copies and replaces metadata.""" mocked_s3_client_no_checksum.put_object( Bucket=CDM_LAKE_BUCKET, Key="src/file.txt", Body=b"archive me", Metadata={"old_key": "old_val"} ) new_metadata = {"archive_reason": "replaced", "archive_date": "2026-04-16"} - response = copy_object_with_metadata( + response = copy_object( f"{CDM_LAKE_BUCKET}/src/file.txt", f"{destination}/archive/file.txt", metadata=new_metadata, @@ -892,7 +892,7 @@ def test_copy_object_with_metadata_replaces_metadata(mocked_s3_client_no_checksu def test_copy_object_with_metadata_preserves_content(mocked_s3_client_no_checksum: Any) -> None: """Verify that the content of the copied object matches the original.""" mocked_s3_client_no_checksum.put_object(Bucket=CDM_LAKE_BUCKET, Key="src/data.bin", Body=b"binary data") - copy_object_with_metadata( + copy_object( f"{CDM_LAKE_BUCKET}/src/data.bin", f"{CDM_LAKE_BUCKET}/dst/data.bin", 
metadata={"tag": "value"}, From bf0c01e0930da2b48103720f4bf314303d34215b Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 12:59:32 -0700 Subject: [PATCH 41/76] use tenacity Co-authored-by: Copilot --- src/cdm_data_loaders/utils/ftp_client.py | 63 ++++++++++++++---------- src/cdm_data_loaders/utils/s3.py | 3 -- tests/utils/test_s3.py | 1 - 3 files changed, 36 insertions(+), 31 deletions(-) diff --git a/src/cdm_data_loaders/utils/ftp_client.py b/src/cdm_data_loaders/utils/ftp_client.py index 0f8409e7..902779e5 100644 --- a/src/cdm_data_loaders/utils/ftp_client.py +++ b/src/cdm_data_loaders/utils/ftp_client.py @@ -6,12 +6,21 @@ """ import contextlib +import logging import socket import threading import time from ftplib import FTP, error_temp from pathlib import Path +from tenacity import ( + before_sleep_log, + retry, + retry_if_exception_type, + stop_after_attempt, + wait_fixed, +) + from cdm_data_loaders.utils.cdm_logger import get_cdm_logger logger = get_cdm_logger() @@ -73,19 +82,20 @@ def ftp_list_dir(ftp: FTP, path: str, retries: int = 3) -> list[str]: :return: list of filenames """ ftp.cwd(path) - for attempt in range(1, retries + 1): - try: - files: list[str] = [] - ftp.retrlines("NLST", files.append) - except error_temp as e: - if attempt < retries: - logger.warning("Transient FTP error listing %s (attempt %d/%d): %s", path, attempt, retries, e) - time.sleep(2) - else: - raise - else: - return files - return [] # unreachable, but keeps type checkers happy + + @retry( + retry=retry_if_exception_type(error_temp), + stop=stop_after_attempt(retries), + wait=wait_fixed(2), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def _list() -> list[str]: + files: list[str] = [] + ftp.retrlines("NLST", files.append) + return files + + return _list() def ftp_download_file(ftp: FTP, remote_path: str, local_path: str, retries: int = 3) -> None: @@ -96,20 +106,19 @@ def ftp_download_file(ftp: FTP, remote_path: str, local_path: 
str, retries: int :param local_path: local destination path :param retries: number of retry attempts """ - for attempt in range(1, retries + 1): - try: - with Path(local_path).open("wb") as f: - ftp.retrbinary(f"RETR {remote_path}", f.write) - except error_temp as e: - if attempt < retries: - logger.warning( - "Transient FTP error downloading %s (attempt %d/%d): %s", remote_path, attempt, retries, e - ) - time.sleep(2) - else: - raise - else: - return + + @retry( + retry=retry_if_exception_type(error_temp), + stop=stop_after_attempt(retries), + wait=wait_fixed(2), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def _download() -> None: + with Path(local_path).open("wb") as f: + ftp.retrbinary(f"RETR {remote_path}", f.write) + + _download() def ftp_retrieve_text(ftp: FTP, remote_path: str) -> str: diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index 13a850bd..9c3de749 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -475,6 +475,3 @@ def head_object(s3_path: str) -> dict[str, Any] | None: "metadata": resp.get("Metadata", {}), "checksum_crc64nvme": resp.get("ChecksumCRC64NVME"), } - - - diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index 8924f95a..f8bd9e53 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -19,7 +19,6 @@ CDM_LAKE_BUCKET, DEFAULT_EXTRA_ARGS, copy_object, - delete_object, download_file, get_s3_client, From de041dc902ffa1bab2d6124c10caaa7af2fd368b Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 13:19:21 -0700 Subject: [PATCH 42/76] address reviewer comments Co-authored-by: Copilot --- src/cdm_data_loaders/utils/s3.py | 31 ++++++++++++++++++------------- tests/utils/test_s3.py | 8 ++++---- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index 9c3de749..2c33bee7 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ 
b/src/cdm_data_loaders/utils/s3.py @@ -10,6 +10,8 @@ import tqdm from botocore.config import Config +from cdm_data_loaders.utils.cdm_logger import get_cdm_logger + CDM_LAKE_BUCKET = "cdm-lake" DEFAULT_EXTRA_ARGS = {"ChecksumAlgorithm": "CRC64NVME"} @@ -22,6 +24,9 @@ AWS_CLIENT_TOTAL_MAX_ATTEMPTS = 10 +logger = get_cdm_logger() + + _s3_client: botocore.client.BaseClient | None = None @@ -62,7 +67,7 @@ def get_s3_client(args: dict[str, str] | None = None) -> botocore.client.BaseCli "aws_secret_access_key": settings.MINIO_SECRET_KEY, } except (ModuleNotFoundError, ImportError, NameError) as e: - print(e) # noqa: T201 + logger.exception("Failed to load berdl settings: %s", e) raise required_args = ["endpoint_url", "aws_access_key_id", "aws_secret_access_key"] @@ -155,10 +160,10 @@ def object_exists(s3_path: str) -> bool: (bucket, key) = split_s3_path(s3_path) try: s3.head_object(Bucket=bucket, Key=key) - except botocore.exceptions.ClientError as e: + except Exception as e: # noqa: BLE001 error_string = str(e) if not error_string.startswith("An error occurred (404) when calling the HeadObject operation: Not Found"): - print(f"Error performing head operation on s3 object: {e!s}") # noqa: T201 + logger.error("Error performing head operation on s3 object: %s", e) return False return True @@ -201,7 +206,7 @@ def upload_file( if metadata is None: if object_exists(s3_path): - print(f"File already present: {s3_path}") # noqa: T201 + logger.info("File already present: %s", s3_path) return True s3 = get_s3_client() @@ -212,7 +217,7 @@ def upload_file( # Upload the file file_size = local_file_path.stat().st_size with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: - print(f"uploading {local_file_path!s} to {s3_path}") # noqa: T201 + logger.info("uploading %s to %s", local_file_path, s3_path) try: s3.upload_file( Filename=str(local_file_path), @@ -221,8 +226,8 @@ def upload_file( Callback=pbar.update, ExtraArgs=extra_args, ) - except 
(botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError) as e: - print(f"Error uploading to s3: {e!s}") # noqa: T201 + except Exception as e: # noqa: BLE001 + logger.error("Error uploading to s3: %s", e) return False return True @@ -277,7 +282,7 @@ def download_file(s3_path: str, local_file_path: str | Path, version_id: str | N try: parent_dir.mkdir(parents=True, exist_ok=False) except OSError as e: - print(f"Could not save s3 file to {local_file_path}: {e!s}") # noqa: T201 + logger.error("Could not save s3 file to %s: %s", local_file_path, e) raise s3 = get_s3_client() @@ -289,12 +294,12 @@ def download_file(s3_path: str, local_file_path: str | Path, version_id: str | N # Get the object size try: object_size = s3.head_object(**kwargs)["ContentLength"] - except botocore.exceptions.ClientError as e: + except Exception as e: # noqa: BLE001 error_string = str(e) if error_string.startswith("An error occurred (404) when calling the HeadObject operation: Not Found"): - print(f"File not found: {s3_path}") # noqa: T201 + logger.error("File not found: %s", s3_path) else: - print(f"Error downloading {s3_path}: {e!s}") # noqa: T201 + logger.error("Error downloading %s: %s", s3_path, e) raise extra_args = {"VersionId": version_id} if version_id is not None else None @@ -466,8 +471,8 @@ def head_object(s3_path: str) -> dict[str, Any] | None: (bucket, key) = split_s3_path(s3_path) try: resp = s3.head_object(Bucket=bucket, Key=key, ChecksumMode="ENABLED") - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "404": + except Exception as e: # noqa: BLE001 + if e.response["Error"]["Code"] == "404": # type: ignore[union-attr] return None raise return { diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index f8bd9e53..b459c467 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -392,13 +392,13 @@ def test_upload_file_uses_custom_object_name(mock_s3_client: Any, sample_file: P @pytest.mark.s3 def 
test_upload_file_skips_when_already_present( - mock_s3_client: Any, sample_file: Path, capsys: pytest.CaptureFixture + mock_s3_client: Any, sample_file: Path, caplog: pytest.LogCaptureFixture ) -> None: """Verify that uploading a file that already exists is skipped and returns True.""" mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}", Body=b"old") result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads") assert result is True - assert "File already present" in capsys.readouterr().out + assert "File already present" in caplog.text @pytest.mark.usefixtures("mock_s3_client") @@ -622,7 +622,7 @@ def test_download_file_does_not_clobber_existing_file_to_mkdir(mock_s3_client: A @pytest.mark.s3 @pytest.mark.usefixtures("mock_s3_client") -def test_download_file_does_not_exist(tmp_path: Path, capsys: pytest.CaptureFixture) -> None: +def test_download_file_does_not_exist(tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None: """Ensure that attempting to download a file that does not exist raises an error.""" bucket = BUCKETS[0] key = "to/the/door.txt" @@ -634,7 +634,7 @@ def test_download_file_does_not_exist(tmp_path: Path, capsys: pytest.CaptureFixt ): download_file(f"{bucket}/{key}", tmp_path / "file.txt") - assert "File not found" in capsys.readouterr().out + assert "File not found" in caplog.text # TODO: Missing tests From 0fefd290a8b140409b9cf235d17e19b782518ca6 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 13:35:41 -0700 Subject: [PATCH 43/76] remove defaults for minio test env vars Co-authored-by: Copilot --- tests/integration/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 95e60008..06fdb61e 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -28,9 +28,9 @@ # ── MinIO connection defaults ─────────────────────────────────────────── -MINIO_ENDPOINT_URL = 
os.environ.get("MINIO_ENDPOINT_URL", "http://localhost:9000") -MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin") -MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin") +MINIO_ENDPOINT_URL = os.environ["MINIO_ENDPOINT_URL"] +MINIO_ACCESS_KEY = os.environ["MINIO_ACCESS_KEY"] +MINIO_SECRET_KEY = os.environ["MINIO_SECRET_KEY"] # Maximum length of a bucket name per S3/DNS spec _MAX_BUCKET_LEN = 63 From f9bcaee01059dd69645fd93bc61557238b43ce90 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 13:50:49 -0700 Subject: [PATCH 44/76] cleanup --- notebooks/ncbi_ftp_manifest.ipynb | 2 -- notebooks/ncbi_ftp_promote.ipynb | 14 ++++++-------- scripts/entrypoint.sh | 2 +- scripts/s3_local.py | 2 -- src/cdm_data_loaders/ncbi_ftp/metadata.py | 2 -- tests/integration/conftest.py | 2 -- 6 files changed, 7 insertions(+), 17 deletions(-) diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 4d51c5b3..b4e9af5e 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -48,8 +48,6 @@ "source": [ "\"\"\"Imports and S3 client initialisation.\"\"\"\n", "\n", - "from __future__ import annotations\n", - "\n", "import json\n", "from pathlib import Path\n", "\n", diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb index 81239f39..a1aebcdc 100644 --- a/notebooks/ncbi_ftp_promote.ipynb +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -34,8 +34,8 @@ "| `_S3_KEY` | S3 object key (no scheme/bucket) | `staging/transfer_manifest.txt` |\n", "| `_PATH` | local filesystem path | `output/removed_manifest.txt` |\n", "\n", - "Lakehouse object: `s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/\u2026/{filename}`\n", - "Staging object: `s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/\u2026/{filename}`" + "Lakehouse object: `s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/…/{filename}`\n", + "Staging object: 
`s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/…/{filename}`" ] }, { @@ -47,8 +47,6 @@ "source": [ "\"\"\"Imports and S3 client initialisation.\"\"\"\n", "\n", - "from __future__ import annotations\n", - "\n", "import json\n", "\n", "from cdm_data_loaders.ncbi_ftp.promote import (\n", @@ -100,7 +98,7 @@ "# format: S3 key prefix within STORE_BUCKET (no scheme, no bucket)\n", "LAKEHOUSE_KEY_PREFIX = DEFAULT_LAKEHOUSE_KEY_PREFIX\n", "\n", - "# Dry-run mode \u2014 log actions without making changes\n", + "# Dry-run mode — log actions without making changes\n", "DRY_RUN = False\n", "\n", "print(f\"Bucket: {STORE_BUCKET}\")\n", @@ -185,10 +183,10 @@ "print(f\"Timestamp: {report['timestamp']}\")\n", "\n", "if report[\"failed\"] > 0:\n", - " print(\"\\n\u26a0\ufe0f Some operations failed \u2014 check logs above for details.\")\n", + " print(\"\\n⚠️ Some operations failed — check logs above for details.\")\n", "\n", "if report[\"dry_run\"]:\n", - " print(\"\\n\ud83d\udccb This was a dry-run. Set DRY_RUN = False and re-run to apply changes.\")" + " print(\"\\n📋 This was a dry-run. 
Set DRY_RUN = False and re-run to apply changes.\")" ] }, { @@ -245,4 +243,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index 68d13b49..fd03e3cd 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -supported_commands="xml_split|uniref|uniprot|ncbi_rest_api|ncbi_ftp_sync|test|bash" +supported_commands="all_the_bacteria|xml_split|uniref|uniprot|ncbi_rest_api|ncbi_ftp_sync|test|bash" # Ensure at least one argument is provided if [ "$#" -eq 0 ]; then diff --git a/scripts/s3_local.py b/scripts/s3_local.py index 5802a133..60bac49f 100755 --- a/scripts/s3_local.py +++ b/scripts/s3_local.py @@ -16,8 +16,6 @@ MINIO_SECRET_KEY minioadmin """ -from __future__ import annotations - import json import os import sys diff --git a/src/cdm_data_loaders/ncbi_ftp/metadata.py b/src/cdm_data_loaders/ncbi_ftp/metadata.py index 4ef60363..42cfe865 100644 --- a/src/cdm_data_loaders/ncbi_ftp/metadata.py +++ b/src/cdm_data_loaders/ncbi_ftp/metadata.py @@ -16,8 +16,6 @@ size, file format, and MD5 hash of each promoted data file. """ -from __future__ import annotations - import json import tempfile from datetime import UTC, datetime diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 06fdb61e..aa3cd5e3 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -6,8 +6,6 @@ the final state of the object store via the MinIO console. 
""" -from __future__ import annotations - import hashlib import os import re From f88df1d04df3fe2c3e231d39e8a6c8b2ded31576 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 15:14:50 -0700 Subject: [PATCH 45/76] add docker compose integration test option and draft new action Co-authored-by: Copilot --- .github/workflows/integration_tests.yaml | 45 ++++++++++++++++++++++++ README.md | 19 ++++++++++ docker-compose.yml | 43 ++++++++++++++++++++++ pyproject.toml | 6 ++-- scripts/entrypoint.sh | 6 +++- 5 files changed, 115 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/integration_tests.yaml create mode 100644 docker-compose.yml diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml new file mode 100644 index 00000000..a8029378 --- /dev/null +++ b/.github/workflows/integration_tests.yaml @@ -0,0 +1,45 @@ +name: Integration tests + +on: + workflow_call: + + push: + branches: + - main + pull_request: + types: + - opened + - reopened + - synchronize + - ready_for_review + +jobs: + integration_tests: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build integration test image + uses: docker/build-push-action@v6 + with: + context: . 
+ load: true + tags: cdm-data-loaders_integration-tests:latest + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Run integration tests + run: | + docker compose up \ + --no-build \ + --abort-on-container-exit \ + --exit-code-from integration-tests + + - name: Tear down + if: always() + run: docker compose down --volumes diff --git a/README.md b/README.md index 78aa4d74..95c9ac13 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Repo for CDM input data loading and wrangling - [Development](#development) - [Spark and other non-python dependencies](#spark-and-other-non-python-dependencies) - [Tests](#tests) + - [Integration tests (MinIO + NCBI FTP)](#integration-tests-minio--ncbi-ftp) - [Loading genomes, contigs, and features](#loading-genomes-contigs-and-features) - [Running bbmap stats and checkm2 on genome or contigset files](#running-bbmap-stats-and-checkm2-on-genome-or-contigset-files) - [Changelog](#changelog) @@ -140,6 +141,24 @@ End-to-end integration tests for the NCBI assembly pipeline live in `tests/integ - Docker (for MinIO) - Network access to `ftp.ncbi.nlm.nih.gov` +**Running with Docker Compose (easiest)** + +The [docker-compose.yml](docker-compose.yml) at the repo root defines both a MinIO service and the integration test runner. To build the image, start MinIO, and run the integration tests in one command: + +```sh +docker compose up --build --abort-on-container-exit +``` + +Compose will stream test output to the terminal and exit with the pytest exit code. To clean up afterwards: + +```sh +docker compose down --volumes +``` + +**Running manually** + +If you prefer to run the tests directly against a local MinIO instance (e.g. for faster iteration during development), follow the steps below. + **1. 
Start MinIO locally:** ```sh diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..cc1074eb --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,43 @@ +services: + minio: + image: quay.io/minio/minio:latest + command: server /data --console-address ":9001" + environment: + MINIO_ROOT_USER: minioadmin + MINIO_ROOT_PASSWORD: minioadmin + ports: + - "9000:9000" + - "9001:9001" + healthcheck: + test: [ "CMD", "mc", "ready", "local" ] + interval: 5s + timeout: 5s + retries: 5 + + integration-tests: + build: + context: . + depends_on: + minio: + condition: service_healthy + environment: + MINIO_ENDPOINT_URL: http://minio:9000 + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + entrypoint: + - /bin/sh + - -c + - | + attempts=0 + until python3 -c " + import urllib.request, os + urllib.request.urlopen(os.environ['MINIO_ENDPOINT_URL'] + '/minio/health/live', timeout=1) + " 2>/dev/null; do + attempts=$$((attempts + 1)) + if [ "$$attempts" -ge 30 ]; then + echo 'Timed out waiting for MinIO.' && exit 1 + fi + echo 'Waiting for MinIO...' 
&& sleep 1 + done + exec /app/scripts/entrypoint.sh integration-test + command: [] diff --git a/pyproject.toml b/pyproject.toml index 1826340c..84d74eb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -197,9 +197,9 @@ markers = ["requires_spark: must be run in an environment where spark is availab USER = "fake_user" KBASE_AUTH_TOKEN = "test-token-123" CDM_TASK_SERVICE_URL = "http://localhost:8080" -MINIO_ENDPOINT_URL = "http://localhost:9000" -MINIO_ACCESS_KEY = "minioadmin" -MINIO_SECRET_KEY = "minioadmin" +MINIO_ENDPOINT_URL = { value = "http://localhost:9000", skip_if_set = true } +MINIO_ACCESS_KEY = { value = "minioadmin", skip_if_set = true } +MINIO_SECRET_KEY = { value = "minioadmin", skip_if_set = true } MINIO_SECURE_FLAG = "false" BERDL_POD_IP = "192.168.1.100" SPARK_MASTER_URL = "spark://localhost:7077" diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index fd03e3cd..2838f9b1 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -supported_commands="all_the_bacteria|xml_split|uniref|uniprot|ncbi_rest_api|ncbi_ftp_sync|test|bash" +supported_commands="all_the_bacteria|xml_split|uniref|uniprot|ncbi_rest_api|ncbi_ftp_sync|test|integration-test|bash" # Ensure at least one argument is provided if [ "$#" -eq 0 ]; then @@ -41,6 +41,10 @@ case "$cmd" in # run the tests exec /usr/bin/tini -- uv run --no-sync pytest -m "not requires_spark" ;; + integration-test) + # run the integration tests (requires a running MinIO instance) + exec /usr/bin/tini -- uv run --no-sync pytest -m "integration" -v "$@" + ;; bash) exec /usr/bin/tini -- /bin/bash ;; From c9aa9ffa3a141724b830ec3fab71c2bea8b25f87 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 15:31:00 -0700 Subject: [PATCH 46/76] try again for the integration action Co-authored-by: Copilot --- .github/workflows/integration_tests.yaml | 2 +- docker-compose.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index a8029378..fce2870a 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -29,7 +29,7 @@ jobs: with: context: . load: true - tags: cdm-data-loaders_integration-tests:latest + tags: cdm-data-loaders-integration-tests:latest cache-from: type=gha cache-to: type=gha,mode=max diff --git a/docker-compose.yml b/docker-compose.yml index cc1074eb..ba488cce 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,6 +15,7 @@ services: retries: 5 integration-tests: + image: cdm-data-loaders-integration-tests:latest build: context: . depends_on: From 36369cac79cf2bd19f44a03f73130e3b63854989 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 15:31:31 -0700 Subject: [PATCH 47/76] Potential fix for pull request finding 'CodeQL / Workflow does not contain permissions' Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- .github/workflows/integration_tests.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index fce2870a..77962029 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -13,6 +13,9 @@ on: - synchronize - ready_for_review +permissions: + contents: read + jobs: integration_tests: runs-on: ubuntu-latest From f2f8a78dce6b0f204c28e3c0c427c8a6a88c3ac8 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 23 Apr 2026 16:22:45 -0700 Subject: [PATCH 48/76] add labels to tests the send requests to the NCBI FTP server Co-authored-by: Copilot --- pyproject.toml | 2 +- tests/integration/test_download_e2e.py | 2 ++ tests/integration/test_full_pipeline.py | 2 ++ tests/integration/test_manifest_e2e.py | 3 +++ 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 84d74eb2..5e817110 
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -190,7 +190,7 @@ log_cli = true log_cli_level = "INFO" log_level = "INFO" addopts = ["-v"] -markers = ["requires_spark: must be run in an environment where spark is available", "s3: tests that mock s3 interactions", "slow_test: does what it says on the tin", "integration: end-to-end tests requiring a running MinIO instance and network access"] +markers = ["requires_spark: must be run in an environment where spark is available", "s3: tests that mock s3 interactions", "slow_test: does what it says on the tin", "integration: end-to-end tests requiring a running MinIO instance and network access", "external_request: tests that make real network requests to external services (e.g. NCBI FTP)"] # environment settings for running tests [tool.pytest_env] diff --git a/tests/integration/test_download_e2e.py b/tests/integration/test_download_e2e.py index b527de96..452501d3 100644 --- a/tests/integration/test_download_e2e.py +++ b/tests/integration/test_download_e2e.py @@ -49,6 +49,7 @@ def _manifest_for_one_assembly(tmp_path: Path) -> tuple[Path, str]: @pytest.mark.integration @pytest.mark.slow_test +@pytest.mark.external_request class TestDownloadSmallBatch: """Download a single assembly from NCBI FTP and verify local output.""" @@ -91,6 +92,7 @@ def test_download_small_batch(self, tmp_path: Path) -> None: @pytest.mark.integration @pytest.mark.slow_test +@pytest.mark.external_request class TestDownloadResumeIncomplete: """Verify download handles re-runs when some files are already present.""" diff --git a/tests/integration/test_full_pipeline.py b/tests/integration/test_full_pipeline.py index 2d1faab7..2654215b 100644 --- a/tests/integration/test_full_pipeline.py +++ b/tests/integration/test_full_pipeline.py @@ -38,6 +38,7 @@ @pytest.mark.integration @pytest.mark.slow_test +@pytest.mark.external_request class TestFullPipelineSmallBatch: """Run the complete pipeline for a single assembly: diff → download → promote.""" @@ 
-103,6 +104,7 @@ def test_full_pipeline_small_batch( @pytest.mark.integration @pytest.mark.slow_test +@pytest.mark.external_request class TestFullPipelineIncrementalSync: """Run the pipeline twice to test incremental sync with archival.""" diff --git a/tests/integration/test_manifest_e2e.py b/tests/integration/test_manifest_e2e.py index a0357a51..fb46ad3a 100644 --- a/tests/integration/test_manifest_e2e.py +++ b/tests/integration/test_manifest_e2e.py @@ -55,6 +55,7 @@ def _download_and_filter() -> tuple[dict[str, AssemblyRecord], dict[str, Assembl @pytest.mark.integration @pytest.mark.slow_test +@pytest.mark.external_request class TestFreshSyncNoPrevious: """Phase 1 with no previous snapshot — everything is 'new'.""" @@ -92,6 +93,7 @@ def test_fresh_sync_no_previous(self, tmp_path: Path) -> None: @pytest.mark.integration @pytest.mark.slow_test +@pytest.mark.external_request class TestIncrementalDiffSyntheticPrevious: """Phase 1 incremental diff with a manufactured 'previous' snapshot.""" @@ -154,6 +156,7 @@ def test_incremental_diff(self, tmp_path: Path) -> None: @pytest.mark.integration @pytest.mark.slow_test +@pytest.mark.external_request class TestVerifyTransferCandidatesPrunes: """verify_transfer_candidates should prune assemblies already in the store.""" From 6d923cea4ec9bd8889b017011cbb95e1d3ccc2b7 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 27 Apr 2026 15:45:23 -0700 Subject: [PATCH 49/76] address reviewer comments Co-authored-by: Copilot --- README.md | 29 + docs/ncbi_ftp_e2e_walkthrough.md | 3 + pyproject.toml | 4 +- scripts/entrypoint.sh | 10 +- .../pipelines/ncbi_ftp_download.py | 56 +- src/cdm_data_loaders/utils/s3.py | 68 +- tests/integration/conftest.py | 2 +- tests/integration/test_download_e2e.py | 6 +- tests/integration/test_promote_e2e.py | 6 +- tests/ncbi_ftp/conftest.py | 15 +- tests/ncbi_ftp/test_assembly.py | 165 +-- tests/ncbi_ftp/test_manifest.py | 1272 ++++++++--------- tests/ncbi_ftp/test_metadata.py | 456 +++--- 
tests/ncbi_ftp/test_notebooks.py | 72 +- tests/ncbi_ftp/test_promote.py | 402 +++--- tests/s3_helpers.py | 25 + tests/utils/test_s3.py | 22 +- 17 files changed, 1154 insertions(+), 1459 deletions(-) create mode 100644 tests/s3_helpers.py diff --git a/README.md b/README.md index 839ba7f3..edb8198e 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,35 @@ uv run python -m ipykernel install --user --name cdm-data-loaders --display-name The `cdm-data-loaders` kernel should now be available from the dropdown list of kernels in the Jupyter notebook interface. +#### Jupyter Kernel Environment Variables + +Often you will need access to environment variables that are included in the default Lakehouse +Jupyter environment, but will not be automatically included in your custom Jupyter kernel. To address +this, first identify the needed variables and values, and add them to your new kernel configuration +with the following steps: + +Open a new Jupyter Notebook __with the default kernel__ and run this in a new cell: +```python +import os +for k, v in sorted(os.environ.items()): + if "AWS" in k or "S3" in k or "MINIO" in k: # replace with whatever keys you're interested in + print(f"{k}={v}") +``` +Take the output and add the environment vars to the `kernel.json` for your new kernel (e.g., in `cdm-data-loaders/.venv/share/jupyter/kernels/python3/kernel.json`): +```json +{ + "argv": ["..."], + "display_name": "cdm-data-loaders", + "language": "python", + "env": { + "AWS_ACCESS_KEY_ID": "...", + "AWS_SECRET_ACCESS_KEY": "...", + "AWS_DEFAULT_REGION": "...", + ... 
+ } +} +``` + ## Running import pipelines diff --git a/docs/ncbi_ftp_e2e_walkthrough.md b/docs/ncbi_ftp_e2e_walkthrough.md index d814b73b..4a710046 100644 --- a/docs/ncbi_ftp_e2e_walkthrough.md +++ b/docs/ncbi_ftp_e2e_walkthrough.md @@ -92,6 +92,9 @@ docker run -d \ minio/minio:RELEASE.2025-02-28T09-55-16Z server /data --console-address ":9001" ``` +(Note that a similar service is included in the `docker-compose` configuration file at the root of +this repository that is used in CI test workflows.) + Create a test bucket via the [MinIO console](http://localhost:9001) (login: `minioadmin` / `minioadmin`), or from the command line using the included `scripts/s3_local.py` helper (requires no extra installs — only diff --git a/pyproject.toml b/pyproject.toml index 5630b776..60785ba0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,10 +25,10 @@ dependencies = [ [project.scripts] all_the_bacteria = "cdm_data_loaders.pipelines.all_the_bacteria:cli" +ncbi_ftp_sync = "cdm_data_loaders.pipelines.ncbi_ftp_download:cli" +ncbi_rest_api = "cdm_data_loaders.pipelines.ncbi_rest_api:cli" uniprot = "cdm_data_loaders.pipelines.uniprot_kb:cli" uniref = "cdm_data_loaders.pipelines.uniref:cli" -ncbi_rest_api = "cdm_data_loaders.pipelines.ncbi_rest_api:cli" -ncbi_ftp_sync = "cdm_data_loaders.pipelines.ncbi_ftp_download:cli" [dependency-groups] dev = [ diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh index 9a1a7af5..c1c0b6d3 100755 --- a/scripts/entrypoint.sh +++ b/scripts/entrypoint.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -VALID_COMMANDS=(all_the_bacteria ncbi_rest_api uniprot uniref xml_split ncbi_ftp_sync test integration-test bash) +VALID_COMMANDS=(all_the_bacteria ncbi_ftp_sync ncbi_rest_api uniprot uniref xml_split test integration-test bash) usage() { local joined @@ -21,6 +21,10 @@ case "$cmd" in all_the_bacteria) exec /usr/bin/tini -- uv run --no-sync all_the_bacteria "$@" ;; + ncbi_ftp_sync) + # Run the NCBI FTP assembly download pipeline (Phase 
2) + exec /usr/bin/tini -- uv run --no-sync ncbi_ftp_sync "$@" + ;; ncbi_rest_api) exec /usr/bin/tini -- uv run --no-sync ncbi_rest_api "$@" ;; @@ -33,10 +37,6 @@ case "$cmd" in xml_split) exec /usr/bin/tini -- xml_file_splitter "$@" ;; - ncbi_ftp_sync) - # Run the NCBI FTP assembly download pipeline (Phase 2) - exec /usr/bin/tini -- uv run --no-sync ncbi_ftp_sync "$@" - ;; test) exec /usr/bin/tini -- uv run --no-sync pytest -m "not requires_spark" ;; diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 0f740ce9..20cd77e8 100644 --- a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -6,15 +6,16 @@ """ import json +import logging import threading -import time from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import UTC, datetime from ftplib import error_temp from pathlib import Path from typing import Any -from pydantic import AliasChoices, Field, field_validator +from pydantic import AliasChoices, Field +from tenacity import before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_fixed from pydantic_settings import BaseSettings, SettingsConfigDict from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST, download_assembly_to_local @@ -63,20 +64,6 @@ class DownloadSettings(BaseSettings): validation_alias=AliasChoices("l", "limit"), ) - @field_validator("threads") - @classmethod - def validate_threads(cls, v: int) -> int: - """Validate threads is within range. 
- - :param v: number of threads - :raises ValueError: if out of range - :return: validated thread count - """ - if v < 1 or v > 32: # noqa: PLR2004 - msg = f"threads must be between 1 and 32, got {v}" - raise ValueError(msg) - return v - # ── Batch download ─────────────────────────────────────────────────────── @@ -113,23 +100,26 @@ def download_batch( def _download_one(path: str) -> tuple[str, Exception | None]: nonlocal success_count - last_error: Exception | None = None - for attempt in range(1, 4): - try: - stats = download_assembly_to_local(path, output_dir, ftp_host=ftp_host, ftp=pool.get()) - except error_temp as e: - last_error = e - if attempt < 3: # noqa: PLR2004 - logger.warning("Transient FTP error for %s, retry %d/3: %s", path, attempt, e) - time.sleep(5) - except Exception as e: # noqa: BLE001 - return path, e - else: - with lock: - success_count += 1 - all_stats.append(stats) - return path, None - return path, last_error + + @retry( + retry=retry_if_exception_type(error_temp), + stop=stop_after_attempt(3), + wait=wait_fixed(5), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def _attempt() -> dict[str, Any]: + return download_assembly_to_local(path, output_dir, ftp_host=ftp_host, ftp=pool.get()) + + try: + stats = _attempt() + except Exception as e: # noqa: BLE001 + return path, e + else: + with lock: + success_count += 1 + all_stats.append(stats) + return path, None try: with ThreadPoolExecutor(max_workers=threads) as executor: diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index 2c33bee7..15fc5ecd 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -67,7 +67,7 @@ def get_s3_client(args: dict[str, str] | None = None) -> botocore.client.BaseCli "aws_secret_access_key": settings.MINIO_SECRET_KEY, } except (ModuleNotFoundError, ImportError, NameError) as e: - logger.exception("Failed to load berdl settings: %s", e) + logger.exception("Failed to load 
berdl settings") raise required_args = ["endpoint_url", "aws_access_key_id", "aws_secret_access_key"] @@ -147,6 +147,34 @@ def list_matching_objects(s3_path: str) -> list[dict[str, Any]]: return contents +def head_object(s3_path: str) -> dict[str, Any] | None: + """Return metadata for an S3 object, or None if it does not exist. + + The returned dict contains: + - ``size``: content length in bytes + - ``metadata``: user metadata dict + - ``checksum_crc64nvme``: CRC64NVME checksum string (if available) + + :param s3_path: path to the object on s3, INCLUDING the bucket name + :type s3_path: str + :return: dict with object info, or None if the object does not exist + :rtype: dict[str, Any] | None + """ + s3 = get_s3_client() + (bucket, key) = split_s3_path(s3_path) + try: + resp = s3.head_object(Bucket=bucket, Key=key, ChecksumMode="ENABLED") + except Exception as e: # noqa: BLE001 + if e.response["Error"]["Code"] == "404": # type: ignore[union-attr] + return None + raise + return { + "size": resp["ContentLength"], + "metadata": resp.get("Metadata", {}), + "checksum_crc64nvme": resp.get("ChecksumCRC64NVME"), + } + + def object_exists(s3_path: str) -> bool: """Check whether an object exists on s3. 
@@ -163,7 +191,7 @@ def object_exists(s3_path: str) -> bool: except Exception as e: # noqa: BLE001 error_string = str(e) if not error_string.startswith("An error occurred (404) when calling the HeadObject operation: Not Found"): - logger.error("Error performing head operation on s3 object: %s", e) + logger.exception("Error performing head operation on s3 object") return False return True @@ -227,7 +255,7 @@ def upload_file( ExtraArgs=extra_args, ) except Exception as e: # noqa: BLE001 - logger.error("Error uploading to s3: %s", e) + logger.exception("Error uploading to s3") return False return True @@ -282,7 +310,7 @@ def download_file(s3_path: str, local_file_path: str | Path, version_id: str | N try: parent_dir.mkdir(parents=True, exist_ok=False) except OSError as e: - logger.error("Could not save s3 file to %s: %s", local_file_path, e) + logger.exception("Could not save s3 file to %s", local_file_path) raise s3 = get_s3_client() @@ -297,9 +325,9 @@ def download_file(s3_path: str, local_file_path: str | Path, version_id: str | N except Exception as e: # noqa: BLE001 error_string = str(e) if error_string.startswith("An error occurred (404) when calling the HeadObject operation: Not Found"): - logger.error("File not found: %s", s3_path) + logger.exception("File not found: %s", s3_path) else: - logger.error("Error downloading %s: %s", s3_path, e) + logger.exception("Error downloading %s", s3_path) raise extra_args = {"VersionId": version_id} if version_id is not None else None @@ -452,31 +480,3 @@ def delete_object(s3_path: str) -> dict[str, Any]: s3 = get_s3_client() (bucket, key) = split_s3_path(s3_path) return s3.delete_object(Bucket=bucket, Key=key) - - -def head_object(s3_path: str) -> dict[str, Any] | None: - """Return metadata for an S3 object, or None if it does not exist. 
- - The returned dict contains: - - ``size``: content length in bytes - - ``metadata``: user metadata dict - - ``checksum_crc64nvme``: CRC64NVME checksum string (if available) - - :param s3_path: path to the object on s3, INCLUDING the bucket name - :type s3_path: str - :return: dict with object info, or None if the object does not exist - :rtype: dict[str, Any] | None - """ - s3 = get_s3_client() - (bucket, key) = split_s3_path(s3_path) - try: - resp = s3.head_object(Bucket=bucket, Key=key, ChecksumMode="ENABLED") - except Exception as e: # noqa: BLE001 - if e.response["Error"]["Code"] == "404": # type: ignore[union-attr] - return None - raise - return { - "size": resp["ContentLength"], - "metadata": resp.get("Metadata", {}), - "checksum_crc64nvme": resp.get("ChecksumCRC64NVME"), - } diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index aa3cd5e3..0fe7384a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -75,7 +75,7 @@ def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item # ── Fixtures ──────────────────────────────────────────────────────────── -@pytest.fixture(scope="session") +@pytest.fixture def minio_s3_client() -> botocore.client.BaseClient: """Session-scoped real boto3 S3 client pointed at the local MinIO instance. diff --git a/tests/integration/test_download_e2e.py b/tests/integration/test_download_e2e.py index 452501d3..2126bd81 100644 --- a/tests/integration/test_download_e2e.py +++ b/tests/integration/test_download_e2e.py @@ -5,15 +5,11 @@ unreachable. 
""" -from __future__ import annotations - import json -from typing import TYPE_CHECKING import pytest -if TYPE_CHECKING: - from pathlib import Path +from pathlib import Path from cdm_data_loaders.ncbi_ftp.manifest import ( compute_diff, diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index b97aa542..46324603 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -8,11 +8,8 @@ unreachable. Each test method gets its own bucket. """ -from __future__ import annotations - import hashlib import json -from typing import TYPE_CHECKING import pytest @@ -26,8 +23,7 @@ from .conftest import get_object_metadata, list_all_keys, seed_lakehouse -if TYPE_CHECKING: - from pathlib import Path +from pathlib import Path # Fake assembly details used across tests ACCESSION_A = "GCF_900000001.1" diff --git a/tests/ncbi_ftp/conftest.py b/tests/ncbi_ftp/conftest.py index 2c0923dd..07fff0c6 100644 --- a/tests/ncbi_ftp/conftest.py +++ b/tests/ncbi_ftp/conftest.py @@ -1,7 +1,6 @@ """Shared fixtures for ncbi_ftp tests.""" -import functools -from collections.abc import Callable, Generator +from collections.abc import Generator from unittest.mock import patch import boto3 @@ -11,6 +10,7 @@ import cdm_data_loaders.ncbi_ftp.promote as promote_mod import cdm_data_loaders.utils.s3 as s3_utils +from tests.s3_helpers import strip_checksum_algorithm from cdm_data_loaders.utils.s3 import CDM_LAKE_BUCKET, reset_s3_client AWS_REGION = "us-east-1" @@ -45,17 +45,6 @@ ) -def strip_checksum_algorithm(method: Callable) -> Callable: - """Wrap a boto3 S3 method to remove ChecksumAlgorithm (moto CRC64NVME workaround).""" - - @functools.wraps(method) - def wrapper(*args: object, **kwargs: object) -> object: - kwargs.pop("ChecksumAlgorithm", None) # type: ignore[arg-type] - return method(*args, **kwargs) - - return wrapper - - @pytest.fixture def mock_s3_client() -> Generator[botocore.client.BaseClient]: """Yield a mocked S3 
client with the CDM Lake bucket created.""" diff --git a/tests/ncbi_ftp/test_assembly.py b/tests/ncbi_ftp/test_assembly.py index 8aa24b1c..559788bf 100644 --- a/tests/ncbi_ftp/test_assembly.py +++ b/tests/ncbi_ftp/test_assembly.py @@ -9,106 +9,87 @@ parse_md5_checksums_file, ) -_EXPECTED_TWO_ENTRIES = 2 - # ── Path helpers ───────────────────────────────────────────────────────── -class TestBuildAccessionPath: - """Test output directory path construction from assembly names.""" - - def test_basic(self) -> None: - """Verify standard GCF accession path construction.""" - result = build_accession_path("GCF_000001215.4_Release_6_plus_ISO1_MT") - assert result == "raw_data/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/" - - def test_gca_prefix(self) -> None: - """Verify GCA prefix path construction.""" - result = build_accession_path("GCA_012345678.1_ASM1234v1") - assert result == "raw_data/GCA/012/345/678/GCA_012345678.1_ASM1234v1/" - - def test_invalid_raises(self) -> None: - """Verify ValueError on invalid assembly name.""" - with pytest.raises(ValueError, match="Cannot parse"): - build_accession_path("invalid_name") - - -class TestParseAssemblyPath: - """Test FTP path parsing.""" - - def test_basic(self) -> None: - """Verify db, assembly_dir, and accession are parsed correctly.""" - path = "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/" - _db, _assembly_dir, accession = parse_assembly_path(path) - assert _db == "GCF" - assert _assembly_dir == "GCF_000001215.4_Release_6_plus_ISO1_MT" - assert accession == "GCF_000001215.4" - - def test_without_trailing_slash(self) -> None: - """Verify parsing works without trailing slash.""" - path = "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT" - _db, _assembly_dir, accession = parse_assembly_path(path) - assert accession == "GCF_000001215.4" - - def test_invalid_raises(self) -> None: - """Verify ValueError on invalid path.""" - with pytest.raises(ValueError, match="Cannot 
parse"): - parse_assembly_path("/random/path/") - - -# ── FILE_FILTERS sanity ───────────────────────────────────────────────── - - -class TestFileFilters: - """Sanity checks for the file suffix filter list.""" - - def test_not_empty(self) -> None: - """Verify FILE_FILTERS is not empty.""" - assert len(FILE_FILTERS) > 0 - - def test_all_start_with_underscore(self) -> None: - """Verify all filter patterns start with an underscore.""" - for f in FILE_FILTERS: - assert f.startswith("_"), f"Filter should start with underscore: {f}" +@pytest.mark.parametrize( + ("assembly_dir", "expected"), + [ + pytest.param( + "GCF_000001215.4_Release_6_plus_ISO1_MT", + "raw_data/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/", + id="gcf", + ), + pytest.param( + "GCA_012345678.1_ASM1234v1", + "raw_data/GCA/012/345/678/GCA_012345678.1_ASM1234v1/", + id="gca", + ), + ], +) +def test_build_accession_path(assembly_dir: str, expected: str) -> None: + """Verify accession path construction for various inputs.""" + assert build_accession_path(assembly_dir) == expected + + +def test_build_accession_path_invalid() -> None: + """Verify ValueError on invalid assembly name.""" + with pytest.raises(ValueError, match="Cannot parse"): + build_accession_path("invalid_name") + + +@pytest.mark.parametrize( + ("path", "expected"), + [ + pytest.param( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/", + ("GCF", "GCF_000001215.4_Release_6_plus_ISO1_MT", "GCF_000001215.4"), + id="with_trailing_slash", + ), + pytest.param( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT", + ("GCF", "GCF_000001215.4_Release_6_plus_ISO1_MT", "GCF_000001215.4"), + id="without_trailing_slash", + ), + ], +) +def test_parse_assembly_path(path: str, expected: tuple[str, str, str]) -> None: + """Verify db, assembly_dir, and accession are parsed correctly.""" + assert parse_assembly_path(path) == expected - def test_genomic_fna_included(self) -> None: - """Verify _genomic.fna.gz 
is in the filter list.""" - assert "_genomic.fna.gz" in FILE_FILTERS - def test_assembly_report_included(self) -> None: - """Verify _assembly_report.txt is in the filter list.""" - assert "_assembly_report.txt" in FILE_FILTERS +def test_parse_assembly_path_invalid() -> None: + """Verify ValueError on invalid path.""" + with pytest.raises(ValueError, match="Cannot parse"): + parse_assembly_path("/random/path/") # ── parse_md5_checksums_file ───────────────────────────────────────────── -class TestParseMd5ChecksumsFile: - """Test NCBI md5checksums.txt parsing.""" - - def test_basic(self) -> None: - """Verify parsing of standard md5checksums.txt format.""" - text = "abc123 ./GCF_000001215.4_genomic.fna.gz\ndef456 ./GCF_000001215.4_genomic.gff.gz\n" - result = parse_md5_checksums_file(text) - assert result == { - "GCF_000001215.4_genomic.fna.gz": "abc123", - "GCF_000001215.4_genomic.gff.gz": "def456", - } - - def test_no_leading_dot_slash(self) -> None: - """Verify parsing works without leading ./ prefix.""" - text = "abc123 GCF_000001215.4_genomic.fna.gz\n" - result = parse_md5_checksums_file(text) - assert result == {"GCF_000001215.4_genomic.fna.gz": "abc123"} - - def test_empty(self) -> None: - """Verify empty or whitespace-only input returns empty dict.""" - assert parse_md5_checksums_file("") == {} - assert parse_md5_checksums_file(" \n \n") == {} - - def test_blank_lines_ignored(self) -> None: - """Verify blank lines between entries are skipped.""" - text = "abc123 file1.txt\n\n\ndef456 file2.txt\n" - result = parse_md5_checksums_file(text) - assert len(result) == _EXPECTED_TWO_ENTRIES +@pytest.mark.parametrize( + ("text", "expected"), + [ + pytest.param( + "abc123 ./GCF_000001215.4_genomic.fna.gz\ndef456 ./GCF_000001215.4_genomic.gff.gz\n", + {"GCF_000001215.4_genomic.fna.gz": "abc123", "GCF_000001215.4_genomic.gff.gz": "def456"}, + id="dot_slash_prefix", + ), + pytest.param( + "abc123 GCF_000001215.4_genomic.fna.gz\n", + {"GCF_000001215.4_genomic.fna.gz": 
"abc123"}, + id="no_dot_slash_prefix", + ), + pytest.param("", {}, id="empty_string"), + pytest.param(" \n \n", {}, id="whitespace_only"), + pytest.param( + "abc123 file1.txt\n\n\ndef456 file2.txt\n", + {"file1.txt": "abc123", "file2.txt": "def456"}, + id="blank_lines_ignored", + ), + ], +) +def test_parse_md5_checksums_file(text: str, expected: dict[str, str]) -> None: + """Verify parse_md5_checksums_file handles various input formats.""" + assert parse_md5_checksums_file(text) == expected diff --git a/tests/ncbi_ftp/test_manifest.py b/tests/ncbi_ftp/test_manifest.py index 9a3add97..5611e925 100644 --- a/tests/ncbi_ftp/test_manifest.py +++ b/tests/ncbi_ftp/test_manifest.py @@ -7,6 +7,7 @@ import pytest from cdm_data_loaders.ncbi_ftp.manifest import ( + AssemblyRecord, DiffResult, _extract_accession_from_s3_key, _extract_assembly_dir_from_s3_key, @@ -26,294 +27,245 @@ from .conftest import SAMPLE_SUMMARY -_EXPECTED_ENTRIES = 4 _EXPECTED_TWO = 2 -_EXPECTED_TOTAL_TRANSFER = 2 # ── parse_assembly_summary ─────────────────────────────────────────────── -class TestParseAssemblySummary: - """Test assembly summary parsing.""" - - def test_parse_basic(self) -> None: - """Verify basic parsing returns expected number of assemblies.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - assert len(assemblies) == _EXPECTED_ENTRIES - assert "GCF_000001215.4" in assemblies - assert "GCF_000005845.2" in assemblies - assert "GCF_000099999.1" not in assemblies # ftp_path == "na" - - def test_parse_status(self) -> None: - """Verify status field is parsed correctly.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - assert assemblies["GCF_000001215.4"].status == "latest" - assert assemblies["GCF_000005845.2"].status == "replaced" - assert assemblies["GCF_000009999.1"].status == "suppressed" - - def test_parse_seq_rel_date(self) -> None: - """Verify seq_rel_date field is parsed correctly.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - assert 
assemblies["GCF_000001215.4"].seq_rel_date == "2014/10/21" - - def test_parse_assembly_dir(self) -> None: - """Verify assembly_dir is extracted from the FTP path.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - assert assemblies["GCF_000001215.4"].assembly_dir == "GCF_000001215.4_Release_6_plus_ISO1_MT" - - def test_parse_ftp_path(self) -> None: - """Verify full FTP path is stored.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - assert assemblies["GCF_000001215.4"].ftp_path == ( - "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT" - ) - - def test_parse_empty(self) -> None: - """Verify empty or comment-only input returns empty dict.""" - assemblies = parse_assembly_summary("# comment only\n") - assert len(assemblies) == 0 - - def test_parse_skips_comments(self) -> None: - """Verify comment lines are not included in results.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - for acc in assemblies: - assert acc.startswith("GCF_") - - def test_parse_from_file(self, tmp_path: Path) -> None: - """Verify parsing from a file path object.""" +_EXPECTED_ASSEMBLIES = { + "GCF_000001215.4": AssemblyRecord( + accession="GCF_000001215.4", + status="latest", + seq_rel_date="2014/10/21", + ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT", + assembly_dir="GCF_000001215.4_Release_6_plus_ISO1_MT", + ), + "GCF_000001405.40": AssemblyRecord( + accession="GCF_000001405.40", + status="latest", + seq_rel_date="2022/02/03", + ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14", + assembly_dir="GCF_000001405.40_GRCh38.p14", + ), + "GCF_000005845.2": AssemblyRecord( + accession="GCF_000005845.2", + status="replaced", + seq_rel_date="2013/09/26", + ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2", + assembly_dir="GCF_000005845.2_ASM584v2", + ), + "GCF_000009999.1": 
AssemblyRecord( + accession="GCF_000009999.1", + status="suppressed", + seq_rel_date="2010/01/01", + ftp_path="https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/999/GCF_000009999.1_ASM999v1", + assembly_dir="GCF_000009999.1_ASM999v1", + ), + # GCF_000099999.1 is excluded because ftp_path == "na" +} + + +def test_parse_assembly_summary() -> None: + """SAMPLE_SUMMARY is parsed to the expected assemblies.""" + assert parse_assembly_summary(SAMPLE_SUMMARY) == _EXPECTED_ASSEMBLIES + + +def test_parse_assembly_summary_empty() -> None: + """Comment-only input returns empty dict.""" + assert parse_assembly_summary("# comment only\n") == {} + + +@pytest.mark.parametrize("source", ["file", "file_str", "list_of_lines"]) +def test_parse_assembly_summary_input_types(source: str, tmp_path: Path) -> None: + """Parsing works from a file path, string path, and list of lines.""" + if source == "list_of_lines": + arg = SAMPLE_SUMMARY.splitlines(keepends=True) + else: f = tmp_path / "summary.tsv" f.write_text(SAMPLE_SUMMARY) - assemblies = parse_assembly_summary(f) - assert len(assemblies) == _EXPECTED_ENTRIES - - def test_parse_from_file_str(self, tmp_path: Path) -> None: - """Verify parsing from a string file path.""" - f = tmp_path / "summary.tsv" - f.write_text(SAMPLE_SUMMARY) - assemblies = parse_assembly_summary(str(f)) - assert len(assemblies) == _EXPECTED_ENTRIES - - def test_parse_from_list_of_lines(self) -> None: - """Verify parsing from a list of lines.""" - lines = SAMPLE_SUMMARY.splitlines(keepends=True) - assemblies = parse_assembly_summary(lines) - assert len(assemblies) == _EXPECTED_ENTRIES + arg = f if source == "file" else str(f) + assert parse_assembly_summary(arg) == _EXPECTED_ASSEMBLIES # ── get_latest_assembly_paths ──────────────────────────────────────────── -class TestGetLatestAssemblyPaths: - """Test extraction of FTP paths for latest assemblies.""" +def test_get_latest_assembly_paths() -> None: + """Only 'latest' assemblies appear; paths are FTP 
directories with trailing slash.""" + assert dict(get_latest_assembly_paths(parse_assembly_summary(SAMPLE_SUMMARY))) == { + "GCF_000001215.4": "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/", + "GCF_000001405.40": "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/", + } - def test_only_latest(self) -> None: - """Verify only assemblies with status 'latest' are returned.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - paths = get_latest_assembly_paths(assemblies) - accessions = [acc for acc, _ in paths] - assert "GCF_000001215.4" in accessions - assert "GCF_000001405.40" in accessions - assert "GCF_000005845.2" not in accessions # replaced - assert "GCF_000009999.1" not in accessions # suppressed - def test_path_conversion(self) -> None: - """Verify HTTPS paths are converted to FTP-relative paths.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - paths = dict(get_latest_assembly_paths(assemblies)) - assert paths["GCF_000001215.4"] == "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/" - - def test_paths_end_with_slash(self) -> None: - """Verify all returned paths end with a trailing slash.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - for _, path in get_latest_assembly_paths(assemblies): - assert path.endswith("/") - - def test_empty(self) -> None: - """Verify empty input returns empty list.""" - assemblies = parse_assembly_summary("# empty\n") - assert get_latest_assembly_paths(assemblies) == [] +def test_get_latest_assembly_paths_empty() -> None: + """Empty input returns empty list.""" + assert get_latest_assembly_paths(parse_assembly_summary("# empty\n")) == [] # ── compute_diff ───────────────────────────────────────────────────────── -class TestComputeDiff: - """Test diff computation between current and previous assembly state.""" - - def test_all_new_no_previous(self) -> None: - """Verify all latest assemblies are marked new when no previous state.""" - current = 
parse_assembly_summary(SAMPLE_SUMMARY) - diff = compute_diff(current, previous_accessions=set()) - assert "GCF_000001215.4" in diff.new - assert "GCF_000001405.40" in diff.new - assert "GCF_000005845.2" not in diff.new # replaced - - def test_nothing_new_when_all_known(self) -> None: - """Verify no new assemblies when all are already known.""" - current = parse_assembly_summary(SAMPLE_SUMMARY) - known = {"GCF_000001215.4", "GCF_000001405.40"} - diff = compute_diff(current, previous_accessions=known) - assert len(diff.new) == 0 - - def test_detects_updated_seq_rel_date_newer(self) -> None: - """Assemblies whose seq_rel_date moved forward are marked updated.""" - current = parse_assembly_summary(SAMPLE_SUMMARY) - previous = parse_assembly_summary(SAMPLE_SUMMARY) - previous["GCF_000001215.4"].seq_rel_date = "2010/01/01" - diff = compute_diff(current, previous_assemblies=previous) - assert "GCF_000001215.4" in diff.updated - - def test_does_not_flag_updated_when_seq_rel_date_older(self) -> None: - """Assemblies whose seq_rel_date in current is older (e.g. 
synthetic baseline) are not flagged.""" - current = parse_assembly_summary(SAMPLE_SUMMARY) - previous = parse_assembly_summary(SAMPLE_SUMMARY) - previous["GCF_000001215.4"].seq_rel_date = "2099/12/31" - diff = compute_diff(current, previous_assemblies=previous) - assert "GCF_000001215.4" not in diff.updated - - def test_detects_replaced(self) -> None: - """Verify assemblies with status 'replaced' are detected.""" - current = parse_assembly_summary(SAMPLE_SUMMARY) - diff = compute_diff(current, previous_accessions={"GCF_000005845.2"}) - assert "GCF_000005845.2" in diff.replaced - - def test_detects_suppressed(self) -> None: - """Verify assemblies with status 'suppressed' are detected.""" - current = parse_assembly_summary(SAMPLE_SUMMARY) - diff = compute_diff(current, previous_accessions={"GCF_000009999.1"}) - assert "GCF_000009999.1" in diff.suppressed - - def test_detects_withdrawn(self) -> None: - """Accessions in previous but entirely absent from current.""" - current = parse_assembly_summary("# empty\n") - diff = compute_diff(current, previous_accessions={"GCF_000001215.4"}) - assert "GCF_000001215.4" in diff.suppressed - - def test_scan_store_fallback(self) -> None: - """Verify known accessions are not marked as new.""" - current = parse_assembly_summary(SAMPLE_SUMMARY) - diff = compute_diff(current, previous_accessions={"GCF_000001215.4"}) - assert "GCF_000001215.4" not in diff.new - assert "GCF_000001405.40" in diff.new - - def test_results_are_sorted(self) -> None: - """Verify diff results are sorted alphabetically.""" - current = parse_assembly_summary(SAMPLE_SUMMARY) - diff = compute_diff(current, previous_accessions=set()) - assert diff.new == sorted(diff.new) +def test_compute_diff_new() -> None: + """All latest assemblies are new with no previous state; result is sorted.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions=set()) + assert "GCF_000001215.4" in diff.new + assert "GCF_000001405.40" in 
diff.new + assert "GCF_000005845.2" not in diff.new # replaced + assert diff.new == sorted(diff.new) -# ── accession_prefix & filter_by_prefix_range ──────────────────────────── +def test_compute_diff_updated() -> None: + """seq_rel_date moving forward marks updated; moving backward does not.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + previous = parse_assembly_summary(SAMPLE_SUMMARY) + previous["GCF_000001215.4"].seq_rel_date = "2010/01/01" + assert "GCF_000001215.4" in compute_diff(current, previous_assemblies=previous).updated + previous2 = parse_assembly_summary(SAMPLE_SUMMARY) + previous2["GCF_000001215.4"].seq_rel_date = "2099/12/31" + assert "GCF_000001215.4" not in compute_diff(current, previous_assemblies=previous2).updated -class TestPrefixFiltering: - """Test prefix extraction and range filtering.""" - def test_accession_prefix(self) -> None: - """Verify 3-digit prefix extraction from accessions.""" - assert accession_prefix("GCF_000001215.4") == "000" - assert accession_prefix("GCF_123456789.1") == "123" - assert accession_prefix("invalid") is None +def test_compute_diff_removed() -> None: + """Replaced, suppressed, and entirely-absent accessions are classified correctly.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + assert "GCF_000005845.2" in compute_diff(current, previous_accessions={"GCF_000005845.2"}).replaced + assert "GCF_000009999.1" in compute_diff(current, previous_accessions={"GCF_000009999.1"}).suppressed + # Accession absent from current entirely → suppressed + assert ( + "GCF_000001215.4" + in compute_diff(parse_assembly_summary("# empty\n"), previous_accessions={"GCF_000001215.4"}).suppressed + ) - def test_filter_range_inclusive(self) -> None: - """Verify prefix range filter is inclusive.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - filtered = filter_by_prefix_range(assemblies, "000", "000") - assert len(filtered) == len(assemblies) - def test_filter_excludes_out_of_range(self) -> None: - """Verify 
assemblies outside the prefix range are excluded.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - filtered = filter_by_prefix_range(assemblies, "001", "999") - assert len(filtered) == 0 +def test_compute_diff_scan_store_fallback() -> None: + """Known accessions are not marked new; unknown ones are.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions={"GCF_000001215.4"}) + assert "GCF_000001215.4" not in diff.new + assert "GCF_000001405.40" in diff.new - def test_no_filter_returns_all(self) -> None: - """Verify no prefix range returns all assemblies.""" - assemblies = parse_assembly_summary(SAMPLE_SUMMARY) - filtered = filter_by_prefix_range(assemblies) - assert len(filtered) == len(assemblies) +# ── accession_prefix & filter_by_prefix_range ──────────────────────────── -# ── Manifest writing ──────────────────────────────────────────────────── +@pytest.mark.parametrize( + ("accession", "expected"), + [ + pytest.param("GCF_000001215.4", "000", id="three_zeros"), + pytest.param("GCF_123456789.1", "123", id="non_zero"), + pytest.param("invalid", None, id="invalid"), + ], +) +def test_accession_prefix(accession: str, expected: str | None) -> None: + """3-digit prefix is extracted from the accession; invalid input returns None.""" + assert accession_prefix(accession) == expected -class TestManifestWriting: - """Test manifest file writing.""" - - def test_write_transfer_manifest(self, tmp_path: Path) -> None: - """Verify transfer manifest file is written correctly.""" - current = parse_assembly_summary(SAMPLE_SUMMARY) - diff = compute_diff(current, previous_accessions=set()) - manifest_file = tmp_path / "transfer.txt" - paths = write_transfer_manifest(diff, current, manifest_file) - assert len(paths) > 0 - lines = [line.strip() for line in manifest_file.read_text().splitlines() if line.strip()] - assert len(lines) == len(paths) - for line in lines: - assert line.startswith("/genomes/") - assert 
line.endswith("/") - - def test_write_removed_manifest(self, tmp_path: Path) -> None: - """Verify removed manifest lists replaced and suppressed accessions.""" - current = parse_assembly_summary(SAMPLE_SUMMARY) - diff = compute_diff(current, previous_accessions={"GCF_000005845.2", "GCF_000009999.1"}) - removed_file = tmp_path / "removed.txt" - removed = write_removed_manifest(diff, removed_file) - assert len(removed) == _EXPECTED_TWO - lines = [line.strip() for line in removed_file.read_text().splitlines() if line.strip()] - assert len(lines) == _EXPECTED_TWO - - def test_write_updated_manifest(self, tmp_path: Path) -> None: - """Verify updated manifest lists only updated accessions.""" - diff = DiffResult(new=["GCF_000001215.4"], updated=["GCF_000005845.2", "GCF_000001405.40"]) - updated_file = tmp_path / "updated.txt" - updated = write_updated_manifest(diff, updated_file) - assert len(updated) == _EXPECTED_TWO - lines = [line.strip() for line in updated_file.read_text().splitlines() if line.strip()] - assert len(lines) == _EXPECTED_TWO - # Should be sorted - assert lines[0] == "GCF_000001405.40" - assert lines[1] == "GCF_000005845.2" - - def test_write_diff_summary(self, tmp_path: Path) -> None: - """Verify diff summary JSON is written with correct counts.""" - diff = DiffResult(new=["a"], updated=["b"], replaced=["c"], suppressed=[]) - summary_file = tmp_path / "summary.json" - summary = write_diff_summary(diff, summary_file, "refseq", "000", "003") - assert summary["counts"]["new"] == 1 - assert summary["counts"]["total_to_transfer"] == _EXPECTED_TOTAL_TRANSFER - assert summary["prefix_range"]["from"] == "000" - - loaded = json.loads(summary_file.read_text()) - assert loaded["database"] == "refseq" +def test_filter_by_prefix_range() -> None: + """Range filter is inclusive; out-of-range excluded; no range returns all.""" + assemblies = parse_assembly_summary(SAMPLE_SUMMARY) + assert len(filter_by_prefix_range(assemblies, "000", "000")) == len(assemblies) + 
assert len(filter_by_prefix_range(assemblies, "001", "999")) == 0 + assert len(filter_by_prefix_range(assemblies)) == len(assemblies) -# ── _ftp_dir_from_url ─────────────────────────────────────────────────── +# ── Manifest writing ──────────────────────────────────────────────────── -class TestFtpDirFromUrl: - """Test FTP URL to directory path conversion.""" - def test_https_url(self) -> None: - """Verify https:// URLs are converted to FTP paths.""" - url = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6" - assert _ftp_dir_from_url(url) == "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6" +def test_write_transfer_manifest(tmp_path: Path) -> None: + """Transfer manifest is written with FTP paths that start with /genomes/ and end with /.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions=set()) + manifest_file = tmp_path / "transfer.txt" + paths = write_transfer_manifest(diff, current, manifest_file) + assert len(paths) > 0 + lines = [line.strip() for line in manifest_file.read_text().splitlines() if line.strip()] + assert len(lines) == len(paths) + for line in lines: + assert line.startswith("/genomes/") + assert line.endswith("/") + + +def test_write_removed_manifest(tmp_path: Path) -> None: + """Removed manifest lists replaced and suppressed accessions.""" + current = parse_assembly_summary(SAMPLE_SUMMARY) + diff = compute_diff(current, previous_accessions={"GCF_000005845.2", "GCF_000009999.1"}) + removed_file = tmp_path / "removed.txt" + removed = write_removed_manifest(diff, removed_file) + assert len(removed) == _EXPECTED_TWO + lines = [line.strip() for line in removed_file.read_text().splitlines() if line.strip()] + assert len(lines) == _EXPECTED_TWO + + +def test_write_updated_manifest(tmp_path: Path) -> None: + """Updated manifest lists only updated accessions, sorted.""" + diff = DiffResult(new=["GCF_000001215.4"], updated=["GCF_000005845.2", 
"GCF_000001405.40"]) + updated_file = tmp_path / "updated.txt" + updated = write_updated_manifest(diff, updated_file) + assert len(updated) == _EXPECTED_TWO + lines = [line.strip() for line in updated_file.read_text().splitlines() if line.strip()] + assert lines == ["GCF_000001405.40", "GCF_000005845.2"] + + +def test_write_diff_summary(tmp_path: Path) -> None: + """Diff summary JSON is written with correct counts, prefix range, and database.""" + diff = DiffResult(new=["a"], updated=["b"], replaced=["c"], suppressed=[]) + summary_file = tmp_path / "summary.json" + summary = write_diff_summary(diff, summary_file, "refseq", "000", "003") + assert json.loads(summary_file.read_text()) == summary + assert {k: summary[k] for k in ("database", "counts", "prefix_range", "accessions")} == { + "database": "refseq", + "counts": { + "new": 1, + "updated": 1, + "replaced": 1, + "suppressed": 0, + "total_to_transfer": 2, + "total_to_remove": 1, + }, + "prefix_range": {"from": "000", "to": "003"}, + "accessions": {"new": ["a"], "updated": ["b"], "replaced": ["c"], "suppressed": []}, + } - def test_ftp_url(self) -> None: - """Verify ftp:// URLs are converted to FTP paths.""" - url = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6" - assert _ftp_dir_from_url(url) == "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6" - def test_bare_path(self) -> None: - """Verify bare paths are returned unchanged.""" - path = "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6" - assert _ftp_dir_from_url(path) == path +# ── _ftp_dir_from_url ─────────────────────────────────────────────────── + - def test_custom_ftp_host(self) -> None: - """Verify custom FTP host is stripped from ftp:// URLs.""" - url = "ftp://custom.host.example.com/genomes/all/GCF/000/001/215" - assert _ftp_dir_from_url(url, ftp_host="custom.host.example.com") == "/genomes/all/GCF/000/001/215" +@pytest.mark.parametrize( + ("url", "expected", "kwargs"), + [ + pytest.param( + 
"https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + {}, + id="https_url", + ), + pytest.param( + "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + {}, + id="ftp_url", + ), + pytest.param( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6", + {}, + id="bare_path", + ), + pytest.param( + "ftp://custom.host.example.com/genomes/all/GCF/000/001/215", + "/genomes/all/GCF/000/001/215", + {"ftp_host": "custom.host.example.com"}, + id="custom_ftp_host", + ), + ], +) +def test_ftp_dir_from_url(url: str, expected: str, kwargs: dict) -> None: + assert _ftp_dir_from_url(url, **kwargs) == expected # ── verify_transfer_candidates ─────────────────────────────────────────── @@ -341,468 +293,384 @@ def _mock_s3_empty() -> MagicMock: return client -class TestVerifyTransferCandidates: - """Test S3 checksum verification to prune transfer candidates.""" - - def _assemblies(self) -> dict: - return parse_assembly_summary(SAMPLE_SUMMARY) - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) - @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") - @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) - @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_prunes_when_all_match( - self, - mock_connect: MagicMock, - mock_retrieve: MagicMock, - mock_head: MagicMock, - mock_s3: MagicMock, - ) -> None: - """Assemblies where every file matches S3 are pruned from the list.""" - mock_connect.return_value = MagicMock() - - def head_side_effect(s3_path: str) -> dict | None: +_BUCKET = "cdm-lake" +_KEY_PREFIX = "tenant-general-warehouse/kbase/datasets/ncbi/" + + +def _assemblies() -> dict: + return parse_assembly_summary(SAMPLE_SUMMARY) + + 
+@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_prunes_when_all_match( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """Assemblies where every file matches S3 are pruned from the list.""" + mock_connect.return_value = MagicMock() + + def head_side_effect(s3_path: str) -> dict | None: + if "_genomic.fna.gz" in s3_path: + return {"size": 100, "metadata": {"md5": "d41d8cd98f00b204e9800998ecf8427e"}, "checksum_crc64nvme": None} + if "_protein.faa.gz" in s3_path: + return {"size": 100, "metadata": {"md5": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4"}, "checksum_crc64nvme": None} + if "_assembly_report.txt" in s3_path: + return {"size": 100, "metadata": {"md5": "ffffffffffffffffffffffffffffffff"}, "checksum_crc64nvme": None} + return None + + mock_head.side_effect = head_side_effect + assert verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) == [] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_keeps_when_md5_differs( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """Assembly is kept when at least one file has a different MD5.""" + mock_connect.return_value = MagicMock() + mock_head.return_value = {"size": 100, "metadata": {"md5": "WRONG"}, "checksum_crc64nvme": None} + assert 
verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) == ["GCF_000001215.4"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object", return_value=None) +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_keeps_when_s3_object_missing( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """Assembly is kept when at least one file doesn't exist in S3.""" + mock_connect.return_value = MagicMock() + assert verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) == ["GCF_000001215.4"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_keeps_when_no_md5_metadata( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """Assembly is kept when S3 object exists but has no md5 metadata.""" + mock_connect.return_value = MagicMock() + mock_head.return_value = {"size": 100, "metadata": {}, "checksum_crc64nvme": None} + assert verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) == ["GCF_000001215.4"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", side_effect=Exception("FTP error")) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_keeps_when_ftp_fails( + mock_connect: 
MagicMock, mock_retrieve: MagicMock, mock_s3: MagicMock +) -> None: + """Assembly is kept (conservative) when md5checksums.txt cannot be fetched.""" + mock_connect.return_value = MagicMock() + assert verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) == ["GCF_000001215.4"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_empty_input(mock_connect: MagicMock) -> None: + """Empty accession list returns empty result without connecting.""" + assert verify_transfer_candidates([], {}, _BUCKET, "prefix/") == [] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_unknown_accession_kept(mock_connect: MagicMock) -> None: + """Accessions not in assemblies dict are kept (conservative).""" + mock_connect.return_value = MagicMock() + assert verify_transfer_candidates(["GCF_999999999.1"], {}, _BUCKET, "prefix/") == ["GCF_999999999.1"] + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object", return_value=None) +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_short_circuits_on_first_mismatch( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """Verification stops checking after the first missing/mismatched file.""" + mock_connect.return_value = MagicMock() + verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) + assert mock_head.call_count == 1 + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") +@patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) 
+@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_mixed( + mock_connect: MagicMock, + mock_retrieve: MagicMock, + mock_head: MagicMock, + mock_s3: MagicMock, +) -> None: + """A mix of matching and non-matching assemblies: matched pruned, unmatched kept.""" + mock_connect.return_value = MagicMock() + + def head_side_effect(s3_path: str) -> dict | None: + if "GCF_000001215.4_Release_6_plus_ISO1_MT/" in s3_path: if "_genomic.fna.gz" in s3_path: - return { - "size": 100, - "metadata": {"md5": "d41d8cd98f00b204e9800998ecf8427e"}, - "checksum_crc64nvme": None, - } + return {"size": 1, "metadata": {"md5": "d41d8cd98f00b204e9800998ecf8427e"}, "checksum_crc64nvme": None} if "_protein.faa.gz" in s3_path: - return { - "size": 100, - "metadata": {"md5": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4"}, - "checksum_crc64nvme": None, - } + return {"size": 1, "metadata": {"md5": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4"}, "checksum_crc64nvme": None} if "_assembly_report.txt" in s3_path: - return { - "size": 100, - "metadata": {"md5": "ffffffffffffffffffffffffffffffff"}, - "checksum_crc64nvme": None, - } - return None - - mock_head.side_effect = head_side_effect - result = verify_transfer_candidates( - ["GCF_000001215.4"], - self._assemblies(), - "cdm-lake", - "tenant-general-warehouse/kbase/datasets/ncbi/", - ) - assert result == [] - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) - @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") - @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) - @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_keeps_when_md5_differs( - self, - mock_connect: MagicMock, - mock_retrieve: MagicMock, - mock_head: MagicMock, - mock_s3: MagicMock, - ) -> None: - """Assembly is kept when at least one file has a different MD5.""" - mock_connect.return_value = MagicMock() - mock_head.return_value = {"size": 100, 
"metadata": {"md5": "WRONG"}, "checksum_crc64nvme": None} - - result = verify_transfer_candidates( - ["GCF_000001215.4"], - self._assemblies(), - "cdm-lake", - "tenant-general-warehouse/kbase/datasets/ncbi/", - ) - assert result == ["GCF_000001215.4"] - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) - @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object", return_value=None) - @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) - @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_keeps_when_s3_object_missing( - self, - mock_connect: MagicMock, - mock_retrieve: MagicMock, - mock_head: MagicMock, - mock_s3: MagicMock, - ) -> None: - """Assembly is kept when at least one file doesn't exist in S3.""" - mock_connect.return_value = MagicMock() - - result = verify_transfer_candidates( - ["GCF_000001215.4"], - self._assemblies(), - "cdm-lake", - "tenant-general-warehouse/kbase/datasets/ncbi/", - ) - assert result == ["GCF_000001215.4"] - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) - @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") - @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) - @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_keeps_when_s3_has_no_md5_metadata( - self, - mock_connect: MagicMock, - mock_retrieve: MagicMock, - mock_head: MagicMock, - mock_s3: MagicMock, - ) -> None: - """Assembly is kept when S3 object exists but has no md5 metadata.""" - mock_connect.return_value = MagicMock() - mock_head.return_value = {"size": 100, "metadata": {}, "checksum_crc64nvme": None} - - result = verify_transfer_candidates( - ["GCF_000001215.4"], - self._assemblies(), - "cdm-lake", - "tenant-general-warehouse/kbase/datasets/ncbi/", - ) - assert result == ["GCF_000001215.4"] - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", 
return_value=_mock_s3_with_objects()) - @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", side_effect=Exception("FTP error")) - @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_keeps_when_ftp_fails(self, mock_connect: MagicMock, mock_retrieve: MagicMock, mock_s3: MagicMock) -> None: - """Assembly is kept (conservative) when md5checksums.txt cannot be fetched.""" - mock_connect.return_value = MagicMock() - - result = verify_transfer_candidates( - ["GCF_000001215.4"], - self._assemblies(), - "cdm-lake", - "tenant-general-warehouse/kbase/datasets/ncbi/", - ) - assert result == ["GCF_000001215.4"] - - @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_empty_input(self, mock_connect: MagicMock) -> None: - """Empty accession list returns empty result without connecting.""" - result = verify_transfer_candidates([], {}, "cdm-lake", "prefix/") - assert result == [] - - @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_unknown_accession_kept(self, mock_connect: MagicMock) -> None: - """Accessions not in assemblies dict are kept (conservative).""" - mock_connect.return_value = MagicMock() - result = verify_transfer_candidates(["GCF_999999999.1"], {}, "cdm-lake", "prefix/") - assert result == ["GCF_999999999.1"] - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) - @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object", return_value=None) - @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) - @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_short_circuits_on_first_mismatch( - self, - mock_connect: MagicMock, - mock_retrieve: MagicMock, - mock_head: MagicMock, - mock_s3: MagicMock, - ) -> None: - """Verification stops checking after the first missing/mismatched file.""" - mock_connect.return_value = MagicMock() - - verify_transfer_candidates( - ["GCF_000001215.4"], - self._assemblies(), - "cdm-lake", 
- "tenant-general-warehouse/kbase/datasets/ncbi/", - ) - assert mock_head.call_count == 1 - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_with_objects()) - @patch("cdm_data_loaders.ncbi_ftp.manifest.head_object") - @patch("cdm_data_loaders.ncbi_ftp.manifest.ftp_retrieve_text", return_value=_MD5_CHECKSUMS_TXT) - @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_mixed_candidates( - self, - mock_connect: MagicMock, - mock_retrieve: MagicMock, - mock_head: MagicMock, - mock_s3: MagicMock, - ) -> None: - """Verify a mix of matching and non-matching assemblies.""" - mock_connect.return_value = MagicMock() - - def head_side_effect(s3_path: str) -> dict | None: - # GCF_000001215.4 assembly dir → all match; GCF_000001405.40 → missing - if "GCF_000001215.4_Release_6_plus_ISO1_MT/" in s3_path: - if "_genomic.fna.gz" in s3_path: - return { - "size": 1, - "metadata": {"md5": "d41d8cd98f00b204e9800998ecf8427e"}, - "checksum_crc64nvme": None, - } - if "_protein.faa.gz" in s3_path: - return { - "size": 1, - "metadata": {"md5": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4"}, - "checksum_crc64nvme": None, - } - if "_assembly_report.txt" in s3_path: - return { - "size": 1, - "metadata": {"md5": "ffffffffffffffffffffffffffffffff"}, - "checksum_crc64nvme": None, - } - return None - - mock_head.side_effect = head_side_effect - result = verify_transfer_candidates( - ["GCF_000001215.4", "GCF_000001405.40"], - self._assemblies(), - "cdm-lake", - "tenant-general-warehouse/kbase/datasets/ncbi/", - ) - assert result == ["GCF_000001405.40"] - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_empty()) - @patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") - def test_skips_ftp_when_folder_missing_from_store( - self, - mock_connect: MagicMock, - mock_s3: MagicMock, - ) -> None: - """Accessions with no objects in S3 are confirmed without FTP round-trip.""" - result = verify_transfer_candidates( - ["GCF_000001215.4"], 
- self._assemblies(), - "cdm-lake", - "tenant-general-warehouse/kbase/datasets/ncbi/", - ) - assert result == ["GCF_000001215.4"] - # FTP should never have been connected (lazy init) - mock_connect.assert_not_called() + return {"size": 1, "metadata": {"md5": "ffffffffffffffffffffffffffffffff"}, "checksum_crc64nvme": None} + return None + mock_head.side_effect = head_side_effect + result = verify_transfer_candidates(["GCF_000001215.4", "GCF_000001405.40"], _assemblies(), _BUCKET, _KEY_PREFIX) + assert result == ["GCF_000001405.40"] -# ── Synthetic summary from S3 store scan ──────────────────────────────── +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client", return_value=_mock_s3_empty()) +@patch("cdm_data_loaders.ncbi_ftp.manifest.connect_ftp") +def test_verify_transfer_candidates_skips_ftp_when_folder_missing( + mock_connect: MagicMock, + mock_s3: MagicMock, +) -> None: + """Accessions with no objects in S3 are confirmed without FTP round-trip.""" + result = verify_transfer_candidates(["GCF_000001215.4"], _assemblies(), _BUCKET, _KEY_PREFIX) + assert result == ["GCF_000001215.4"] + mock_connect.assert_not_called() -class TestExtractAccessionFromS3Key: - """Test accession extraction from S3 paths.""" - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") - def test_extracts_accession_from_path(self, _mock_s3: MagicMock) -> None: - """Verify accession is extracted correctly from S3 keys.""" - assert ( - _extract_accession_from_s3_key( - "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz" - ) - == "GCF_000001215.4" - ) - assert _extract_accession_from_s3_key("some/path/GCA_999999999.1_whatever/data.txt") == "GCA_999999999.1" - - def test_returns_none_for_invalid_path(self) -> None: - """Verify None is returned when no accession is found.""" - assert _extract_accession_from_s3_key("some/random/path") is None - assert _extract_accession_from_s3_key("") is None +# ── Synthetic summary from S3 store scan 
──────────────────────────────── -class TestExtractAssemblyDirFromS3Key: - """Test assembly directory extraction from S3 paths.""" - def test_extracts_assembly_dir(self) -> None: - """Verify assembly directory is extracted correctly from S3 keys.""" - assert ( - _extract_assembly_dir_from_s3_key( - "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz" +@pytest.mark.parametrize( + ("key", "expected"), + [ + pytest.param( + "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz", + "GCF_000001215.4", + id="long_path", + ), + pytest.param("some/path/GCA_999999999.1_whatever/data.txt", "GCA_999999999.1", id="short_path"), + pytest.param("some/random/path", None, id="no_accession"), + pytest.param("", None, id="empty"), + ], +) +def test_extract_accession_from_s3_key(key: str, expected: str | None) -> None: + """Accession is extracted from S3 key paths; invalid/empty paths return None.""" + assert _extract_accession_from_s3_key(key) == expected + + +@pytest.mark.parametrize( + ("key", "expected"), + [ + pytest.param( + "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz", + "GCF_000001215.4_Release_6_plus_ISO1_MT", + id="long_path", + ), + pytest.param( + "prefix/GCA_999999999.1_assembly_name/subdir/data.txt", + "GCA_999999999.1_assembly_name", + id="subdir", + ), + pytest.param("some/random/path", None, id="no_assembly_dir"), + pytest.param("", None, id="empty"), + ], +) +def test_extract_assembly_dir_from_s3_key(key: str, expected: str | None) -> None: + """Assembly directory is extracted from S3 key paths; invalid/empty paths return None.""" + assert _extract_assembly_dir_from_s3_key(key) == expected + + +def _make_mock_s3_paginator() -> MagicMock: + """Return a mock S3 client with two assemblies (GCF_000001215.4, GCF_000005845.2).""" + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() 
+ mock.get_paginator.return_value = mock_paginator + mock_paginator.paginate.return_value = [ + { + "Contents": [ + { + "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6/file1.gz", + "LastModified": datetime(2024, 1, 15, tzinfo=timezone.utc), + }, + { + "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6/file2.gz", + "LastModified": datetime(2024, 1, 16, tzinfo=timezone.utc), + }, + { + "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000005845.2_Assembly/file.gz", + "LastModified": datetime(2024, 2, 20, tzinfo=timezone.utc), + }, + ] + } + ] + return mock + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_builds_summary(mock_get_s3: MagicMock) -> None: + """Synthetic summary is built correctly with provided release_date for all assemblies.""" + mock_get_s3.return_value = _make_mock_s3_paginator() + assert scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") == { + "GCF_000001215.4": AssemblyRecord( + accession="GCF_000001215.4", + status="latest", + seq_rel_date="2024/01/31", + ftp_path="https://ftp.ncbi.nlm.nih.gov/synthetic/GCF_000001215.4_Release_6", + assembly_dir="GCF_000001215.4_Release_6", + ), + "GCF_000005845.2": AssemblyRecord( + accession="GCF_000005845.2", + status="latest", + seq_rel_date="2024/01/31", + ftp_path="https://ftp.ncbi.nlm.nih.gov/synthetic/GCF_000005845.2_Assembly", + assembly_dir="GCF_000005845.2_Assembly", + ), + } + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_applies_release_date_to_all(mock_get_s3: MagicMock) -> None: + """Provided release_date is used even when files have different LastModified dates.""" + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() + mock.get_paginator.return_value = mock_paginator + mock_paginator.paginate.return_value = [ + { + "Contents": [ + { + "Key": 
"prefix/GCF_000001215.4_v1/file_newer.gz", + "LastModified": datetime(2024, 3, 20, tzinfo=timezone.utc), + }, + { + "Key": "prefix/GCF_000001215.4_v1/file_older.gz", + "LastModified": datetime(2024, 1, 10, tzinfo=timezone.utc), + }, + ] + } + ] + mock_get_s3.return_value = mock + assert ( + scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/03/31")["GCF_000001215.4"].seq_rel_date + == "2024/03/31" + ) + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_raises_for_invalid_release_date(mock_get_s3: MagicMock) -> None: + """Invalid release_date format is rejected.""" + mock_get_s3.return_value = _make_mock_s3_paginator() + with pytest.raises(ValueError, match="Invalid release_date"): + scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024-03-31") + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_invokes_progress_callback(mock_get_s3: MagicMock) -> None: + """Progress callback is called once per unique assembly discovered.""" + mock_get_s3.return_value = _make_mock_s3_paginator() + calls: list[tuple[int, str]] = [] + scan_store_to_synthetic_summary( + "test-bucket", "prefix/", "2024/01/31", progress_callback=lambda n, a: calls.append((n, a)) + ) + assert len(calls) == 2 + assert calls[0][0] == 1 + assert calls[1][0] == 2 + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_handles_empty_store(mock_get_s3: MagicMock) -> None: + """Empty store returns empty dict.""" + mock = MagicMock() + mock_paginator = MagicMock() + mock.get_paginator.return_value = mock_paginator + mock_paginator.paginate.return_value = [{"Contents": []}] + mock_get_s3.return_value = mock + assert scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") == {} + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_skips_objects_without_accession(mock_get_s3: MagicMock) -> None: + """Objects without valid accessions in the key are 
skipped.""" + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() + mock.get_paginator.return_value = mock_paginator + mock_paginator.paginate.return_value = [ + { + "Contents": [ + {"Key": "prefix/some/random/file.txt", "LastModified": datetime(2024, 1, 1, tzinfo=timezone.utc)}, + { + "Key": "prefix/GCF_000001215.4_Assembly/valid_file.gz", + "LastModified": datetime(2024, 2, 1, tzinfo=timezone.utc), + }, + ] + } + ] + mock_get_s3.return_value = mock + result = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") + assert len(result) == 1 + assert "GCF_000001215.4" in result + + +@patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") +def test_scan_store_assembly_dir_survives_round_trip(mock_get_s3: MagicMock, tmp_path: Path) -> None: + """assembly_dir is preserved after save-to-file / parse-back round-trip. + + Regression: previously ftp_path was written as "" causing assembly_dir="" + and compute_diff flagging every assembly as updated. 
+ """ + from datetime import datetime, timezone + + mock = MagicMock() + mock_paginator = MagicMock() + mock_paginator.paginate.return_value = [ + { + "Contents": [ + { + "Key": "prefix/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz", + "LastModified": datetime(2024, 3, 10, tzinfo=timezone.utc), + } + ] + } + ] + mock.get_paginator.return_value = mock_paginator + mock_get_s3.return_value = mock + + synthetic = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/03/10") + + out_file = tmp_path / "synthetic_summary.txt" + with out_file.open("w") as f: + for acc in sorted(synthetic.keys()): + rec = synthetic[acc] + f.write( + f"{rec.accession}\t.\t.\t.\t.\t.\t.\t.\t.\t.\t{rec.status}\t.\t.\t.\t{rec.seq_rel_date}\t.\t.\t.\t.\t{rec.ftp_path}\t.\n" ) - == "GCF_000001215.4_Release_6_plus_ISO1_MT" - ) - assert ( - _extract_assembly_dir_from_s3_key("prefix/GCA_999999999.1_assembly_name/subdir/data.txt") - == "GCA_999999999.1_assembly_name" - ) - - def test_returns_none_for_invalid_path(self) -> None: - """Verify None is returned when no assembly directory is found.""" - assert _extract_assembly_dir_from_s3_key("some/random/path") is None - assert _extract_assembly_dir_from_s3_key("") is None - - -class TestScanStoreToSyntheticSummary: - """Test synthetic assembly summary generation from S3 store scan.""" - - def _mock_s3_with_objects(self) -> MagicMock: - """Return a mock S3 client with assembly objects.""" - from datetime import datetime, timezone - - mock = MagicMock() - mock_paginator = MagicMock() - mock.get_paginator.return_value = mock_paginator - - # Mock objects from two assemblies - page_contents = [ - { - "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6/file1.gz", - "LastModified": datetime(2024, 1, 15, tzinfo=timezone.utc), - }, - { - "Key": "tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000001215.4_Release_6/file2.gz", - "LastModified": datetime(2024, 1, 16, tzinfo=timezone.utc), - }, - { - "Key": 
"tenant-general-warehouse/kbase/datasets/ncbi/refseq/GCF_000005845.2_Assembly/file.gz", - "LastModified": datetime(2024, 2, 20, tzinfo=timezone.utc), - }, - ] - mock_paginator.paginate.return_value = [{"Contents": page_contents}] - return mock - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") - def test_builds_summary_from_store(self, mock_get_s3: MagicMock) -> None: - """Verify synthetic summary is built correctly from S3 objects.""" - mock_get_s3.return_value = self._mock_s3_with_objects() - - result = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") - - assert len(result) == 2 - assert "GCF_000001215.4" in result - assert "GCF_000005845.2" in result - - # Should use provided release date for all records - rec1 = result["GCF_000001215.4"] - assert rec1.accession == "GCF_000001215.4" - assert rec1.status == "latest" - assert rec1.seq_rel_date == "2024/01/31" - assert rec1.assembly_dir == "GCF_000001215.4_Release_6" - - # Other assembly uses the same provided date - rec2 = result["GCF_000005845.2"] - assert rec2.seq_rel_date == "2024/01/31" - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") - def test_applies_release_date_to_all_assemblies(self, mock_get_s3: MagicMock) -> None: - """Verify provided release_date is used for all assemblies.""" - mock = MagicMock() - mock_paginator = MagicMock() - mock.get_paginator.return_value = mock_paginator - - from datetime import datetime, timezone - - # Files from same assembly with different dates - page_contents = [ - { - "Key": "prefix/GCF_000001215.4_v1/file_newer.gz", - "LastModified": datetime(2024, 3, 20, tzinfo=timezone.utc), - }, - { - "Key": "prefix/GCF_000001215.4_v1/file_older.gz", - "LastModified": datetime(2024, 1, 10, tzinfo=timezone.utc), - }, - ] - mock_paginator.paginate.return_value = [{"Contents": page_contents}] - mock_get_s3.return_value = mock - - result = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/03/31") - - assert 
result["GCF_000001215.4"].seq_rel_date == "2024/03/31" - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") - def test_raises_for_invalid_release_date(self, mock_get_s3: MagicMock) -> None: - """Verify invalid release_date format is rejected.""" - mock_get_s3.return_value = self._mock_s3_with_objects() - - with pytest.raises(ValueError, match="Invalid release_date"): - scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024-03-31") - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") - def test_invokes_progress_callback(self, mock_get_s3: MagicMock) -> None: - """Verify progress callback is called for each unique assembly.""" - mock_get_s3.return_value = self._mock_s3_with_objects() - callback_calls = [] - - def track_progress(count: int, acc: str) -> None: - callback_calls.append((count, acc)) - - scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31", progress_callback=track_progress) - - # Should have 2 calls (one per assembly discovered) - assert len(callback_calls) == 2 - assert callback_calls[0][0] == 1 # first assembly - assert callback_calls[1][0] == 2 # second assembly - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") - def test_handles_empty_store(self, mock_get_s3: MagicMock) -> None: - """Verify function handles empty store gracefully.""" - mock = MagicMock() - mock_paginator = MagicMock() - mock.get_paginator.return_value = mock_paginator - mock_paginator.paginate.return_value = [{"Contents": []}] - mock_get_s3.return_value = mock - - result = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") - - assert result == {} - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") - def test_skips_objects_without_accession(self, mock_get_s3: MagicMock) -> None: - """Verify objects without valid accessions are skipped.""" - from datetime import datetime, timezone - - mock = MagicMock() - mock_paginator = MagicMock() - mock.get_paginator.return_value = mock_paginator - - 
page_contents = [ - { - "Key": "prefix/some/random/file.txt", # No accession - "LastModified": datetime(2024, 1, 1, tzinfo=timezone.utc), - }, - { - "Key": "prefix/GCF_000001215.4_Assembly/valid_file.gz", - "LastModified": datetime(2024, 2, 1, tzinfo=timezone.utc), - }, - ] - mock_paginator.paginate.return_value = [{"Contents": page_contents}] - mock_get_s3.return_value = mock - - result = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/01/31") - - # Only one valid assembly should be found - assert len(result) == 1 - assert "GCF_000001215.4" in result - - @patch("cdm_data_loaders.ncbi_ftp.manifest.get_s3_client") - def test_assembly_dir_survives_file_round_trip(self, mock_get_s3: MagicMock, tmp_path: Path) -> None: - """Verify assembly_dir is preserved when saving to file and parsing back. - - Regression test: previously ftp_path was written as "" which caused - parse_assembly_summary to recover assembly_dir="" for all records, - making compute_diff flag every assembly as updated. 
- """ - from datetime import datetime, timezone - - mock = MagicMock() - mock_paginator = MagicMock() - page_contents = [ - { - "Key": "prefix/GCF_000001215.4_Release_6_plus_ISO1_MT/file.gz", - "LastModified": datetime(2024, 3, 10, tzinfo=timezone.utc), - }, - ] - mock_paginator.paginate.return_value = [{"Contents": page_contents}] - mock.get_paginator.return_value = mock_paginator - mock_get_s3.return_value = mock - - synthetic = scan_store_to_synthetic_summary("test-bucket", "prefix/", "2024/03/10") - - # Simulate the notebook's save logic - out_file = tmp_path / "synthetic_summary.txt" - with out_file.open("w") as f: - for acc in sorted(synthetic.keys()): - rec = synthetic[acc] - f.write( - f"{rec.accession}\t.\t.\t.\t.\t.\t.\t.\t.\t.\t{rec.status}\t.\t.\t.\t{rec.seq_rel_date}\t.\t.\t.\t.\t{rec.ftp_path}\t.\n" - ) - - # Parse the file back - reparsed = parse_assembly_summary(out_file) - - assert "GCF_000001215.4" in reparsed - reparsed_rec = reparsed["GCF_000001215.4"] - original_rec = synthetic["GCF_000001215.4"] - - # assembly_dir must survive the round-trip so diffs are accurate - assert reparsed_rec.assembly_dir == original_rec.assembly_dir - assert reparsed_rec.seq_rel_date == original_rec.seq_rel_date - assert reparsed_rec.status == original_rec.status + + reparsed = parse_assembly_summary(out_file) + assert "GCF_000001215.4" in reparsed + reparsed_rec = reparsed["GCF_000001215.4"] + original_rec = synthetic["GCF_000001215.4"] + assert reparsed_rec.assembly_dir == original_rec.assembly_dir + assert reparsed_rec.seq_rel_date == original_rec.seq_rel_date + assert reparsed_rec.status == original_rec.status diff --git a/tests/ncbi_ftp/test_metadata.py b/tests/ncbi_ftp/test_metadata.py index bcdd825d..0b471b48 100644 --- a/tests/ncbi_ftp/test_metadata.py +++ b/tests/ncbi_ftp/test_metadata.py @@ -57,317 +57,207 @@ ] -# ── build_descriptor_key ───────────────────────────────────────────────── +# ── build_descriptor_key / build_archive_descriptor_key 
───────────────── -class TestBuildDescriptorKey: - """Tests for build_descriptor_key path helper.""" +@pytest.mark.parametrize("prefix", [_KEY_PREFIX, _KEY_PREFIX.rstrip("/")]) +def test_build_descriptor_key(prefix: str) -> None: + """Key is under metadata/, ends with _datapackage.json, trailing slash on prefix is normalized.""" + key = build_descriptor_key(_ASSEMBLY_DIR, prefix) + assert key == f"{_KEY_PREFIX}metadata/{_ASSEMBLY_DIR}_datapackage.json" + assert "//" not in key - def test_produces_metadata_path(self) -> None: - """Key is located under metadata/ with _datapackage.json suffix.""" - key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) - assert key == f"{_KEY_PREFIX}metadata/{_ASSEMBLY_DIR}_datapackage.json" - def test_trailing_slash_normalised(self) -> None: - """Key is the same whether key_prefix ends with a slash or not.""" - key_no_slash = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX.rstrip("/")) - key_with_slash = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) - assert key_no_slash == key_with_slash +@pytest.mark.parametrize( + ("prefix", "tag"), + [ + pytest.param(_KEY_PREFIX, _RELEASE_TAG, id="trailing_slash"), + pytest.param(_KEY_PREFIX.rstrip("/"), _RELEASE_TAG, id="no_trailing_slash"), + pytest.param(_KEY_PREFIX, "2025-06", id="different_tag"), + ], +) +def test_build_archive_descriptor_key(prefix: str, tag: str) -> None: + """Archive key includes tag and has no double slash; trailing slash on prefix is normalized.""" + key = build_archive_descriptor_key(_ASSEMBLY_DIR, tag, prefix) + assert key == f"{_KEY_PREFIX}archive/{tag}/metadata/{_ASSEMBLY_DIR}_datapackage.json" + assert "//" not in key - def test_no_double_slash(self) -> None: - """Key never contains a double slash.""" - key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) - assert "//" not in key +# ── create_descriptor ──────────────────────────────────────────────────── -# ── build_archive_descriptor_key ───────────────────────────────────────── +def test_create_descriptor() 
-> None: + """create_descriptor produces a fully populated descriptor matching the expected structure.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + + # URL hostname is computed; can't express as equality + host = urlparse(d["url"]).hostname + assert host is not None and (host == "ncbi.nlm.nih.gov" or host.endswith(".ncbi.nlm.nih.gov")) + + # resource[1]: hash=None → key absent; bytes=512 → key present + r1 = d["resources"][1] + assert "hash" not in r1 + assert "bytes" in r1 + + assert {k: d[k] for k in ("identifier", "resource_type", "version", "license", "contributors")} == { + "identifier": f"NCBI:{_ACCESSION}", + "resource_type": "dataset", + "version": "4", + "license": {}, + "contributors": [ + { + "name": "National Center for Biotechnology Information", + "contributor_id": "ROR:02meqm098", + "contributor_type": "Organization", + "contributor_roles": "DataCurator", + } + ], + } + assert { + k: d["meta"][k] for k in ("saved_by", "credit_metadata_schema_version", "timestamp", "credit_metadata_source") + } == { + "saved_by": "cdm-data-loaders-ncbi-ftp", + "credit_metadata_schema_version": "1.0", + "timestamp": _TIMESTAMP, + "credit_metadata_source": [ + { + "access_timestamp": _TIMESTAMP, + "source_name": "NCBI Genomes FTP", + "source_url": "ftp.ncbi.nlm.nih.gov/genomes/all/", + } + ], + } + assert _ASSEMBLY_DIR in d["titles"][0]["title"] + assert _ACCESSION in d["descriptions"][0]["description_text"] + r0 = d["resources"][0] + assert {k: r0[k] for k in ("hash", "bytes", "path")} == { + "hash": "abc123", + "bytes": 1024, + "path": _SAMPLE_RESOURCES[0]["path"], + } -class TestBuildArchiveDescriptorKey: - """Tests for build_archive_descriptor_key path helper.""" - def test_produces_archive_path(self) -> None: - """Key is located under archive/{release_tag}/metadata/.""" - key = build_archive_descriptor_key(_ASSEMBLY_DIR, _RELEASE_TAG, _KEY_PREFIX) - expected = 
f"{_KEY_PREFIX}archive/{_RELEASE_TAG}/metadata/{_ASSEMBLY_DIR}_datapackage.json" - assert key == expected +def test_create_descriptor_default_timestamp_is_recent() -> None: + """Default timestamp is close to current time when not specified.""" + before = int(time.time()) + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES) + after = int(time.time()) + assert before <= d["meta"]["timestamp"] <= after + 1 - def test_trailing_slash_normalised(self) -> None: - """Key is the same whether key_prefix ends with a slash or not.""" - a = build_archive_descriptor_key(_ASSEMBLY_DIR, _RELEASE_TAG, _KEY_PREFIX.rstrip("/")) - b = build_archive_descriptor_key(_ASSEMBLY_DIR, _RELEASE_TAG, _KEY_PREFIX) - assert a == b - def test_release_tag_in_path(self) -> None: - """Release tag appears in the archive key path.""" - key = build_archive_descriptor_key(_ASSEMBLY_DIR, "2025-06", _KEY_PREFIX) - assert "2025-06" in key +def test_create_descriptor_resource_name_lowercased() -> None: + """Resource names are converted to lowercase.""" + resources: list[DescriptorResource] = [ + {"name": "FILE_UPPER.FNA.GZ", "path": "s3://bucket/a", "format": "gz", "bytes": 100, "hash": "x"}, + ] + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, resources, timestamp=_TIMESTAMP) + assert d["resources"][0]["name"] == "file_upper.fna.gz" -# ── create_descriptor ──────────────────────────────────────────────────── +def test_create_descriptor_null_bytes_omitted() -> None: + """Resources with bytes=None have the 'bytes' key removed from the output.""" + resources: list[DescriptorResource] = [ + {"name": "f.txt", "path": "s3://b/f.txt", "format": "txt", "bytes": None, "hash": "x"}, + ] + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, resources, timestamp=_TIMESTAMP) + assert "bytes" not in d["resources"][0] -class TestCreateDescriptor: - """Tests for create_descriptor().""" - - def test_identifier(self) -> None: - """Identifier field is prefixed with NCBI:.""" - d = 
create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert d["identifier"] == f"NCBI:{_ACCESSION}" - - def test_version_extracted_from_accession(self) -> None: - """Version is the suffix after the last dot in the accession.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert d["version"] == "4" # last segment of GCF_000001215.4 - - def test_title_includes_assembly_dir(self) -> None: - """Title includes the full assembly directory name.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert _ASSEMBLY_DIR in d["titles"][0]["title"] - - def test_description_includes_accession(self) -> None: - """Description text includes the accession.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert _ACCESSION in d["descriptions"][0]["description_text"] - - def test_url_references_accession(self) -> None: - """URL points to the NCBI genome page for the accession.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert _ACCESSION in d["url"] - parsed = urlparse(d["url"]) - host = parsed.hostname - assert host is not None - assert host == "ncbi.nlm.nih.gov" or host.endswith(".ncbi.nlm.nih.gov") - - def test_ncbi_contributor(self) -> None: - """Contributor is NCBI with the correct ROR ID.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert d["contributors"][0]["name"] == "National Center for Biotechnology Information" - assert d["contributors"][0]["contributor_id"] == "ROR:02meqm098" - - def test_saved_by(self) -> None: - """meta.saved_by is the cdm-data-loaders identifier.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert d["meta"]["saved_by"] == "cdm-data-loaders-ncbi-ftp" - - def test_timestamp_propagated(self) -> None: - """Explicit timestamp 
is used for both meta.timestamp and access_timestamp.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert d["meta"]["timestamp"] == _TIMESTAMP - assert d["meta"]["credit_metadata_source"][0]["access_timestamp"] == _TIMESTAMP - - def test_default_timestamp_is_recent(self) -> None: - """Default timestamp is close to current time when not specified.""" - before = int(time.time()) - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES) - after = int(time.time()) - ts = d["meta"]["timestamp"] - assert before <= ts <= after + 1 - - def test_resource_names_lowercased(self) -> None: - """Resource names are converted to lowercase.""" - resources: list[DescriptorResource] = [ - {"name": "FILE_UPPER.FNA.GZ", "path": "s3://bucket/a", "format": "gz", "bytes": 100, "hash": "x"}, - ] - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, resources, timestamp=_TIMESTAMP) - assert d["resources"][0]["name"] == "file_upper.fna.gz" - - def test_null_hash_omitted(self) -> None: - """Resources with hash=None must not include the 'hash' key.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - resources = d["resources"] - # Second resource has hash=None → key absent - assert "hash" not in resources[1] - - def test_non_null_hash_present(self) -> None: - """Non-null hash is retained in the resource entry.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert d["resources"][0]["hash"] == _SAMPLE_RESOURCES[0]["hash"] - - def test_resource_count(self) -> None: - """Resource list length matches the number of input resources.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert len(d["resources"]) == len(_SAMPLE_RESOURCES) - - def test_resource_bytes(self) -> None: - """Resource bytes matches the input bytes value.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, 
timestamp=_TIMESTAMP) - assert d["resources"][0]["bytes"] == _SAMPLE_RESOURCES[0]["bytes"] - - def test_resource_path(self) -> None: - """Resource path matches the input path value.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert d["resources"][0]["path"] == _SAMPLE_RESOURCES[0]["path"] - - def test_license_is_empty_dict(self) -> None: - """License field is an empty dict.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert d["license"] == {} - - def test_resource_type_is_dataset(self) -> None: - """resource_type is 'dataset'.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert d["resource_type"] == "dataset" - - def test_schema_version(self) -> None: - """credit_metadata_schema_version is '1.0'.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - assert d["meta"]["credit_metadata_schema_version"] == "1.0" - - def test_empty_resources_allowed(self) -> None: - """Empty resources list produces a valid descriptor.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, [], timestamp=_TIMESTAMP) - assert d["resources"] == [] - - def test_null_bytes_omitted(self) -> None: - """Resources with bytes=None have the 'bytes' key removed from the output.""" - resources: list[DescriptorResource] = [ - {"name": "f.txt", "path": "s3://b/f.txt", "format": "txt", "bytes": None, "hash": "x"}, - ] - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, resources, timestamp=_TIMESTAMP) - assert "bytes" not in d["resources"][0] +def test_create_descriptor_empty_resources() -> None: + """Empty resources list produces a valid descriptor.""" + d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, [], timestamp=_TIMESTAMP) + assert d["resources"] == [] # ── validate_descriptor ────────────────────────────────────────────────── -class TestValidateDescriptor: - """Tests for validate_descriptor().""" 
+def test_validate_descriptor_valid() -> None: + """Valid descriptor does not raise.""" + validate_descriptor( + create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP), + _ACCESSION, + ) + + +def test_validate_descriptor_empty_raises() -> None: + """Empty dict fails frictionless validation and raises.""" + with pytest.raises((ValueError, Exception)): + validate_descriptor({}, _ACCESSION) + + +# ── upload_descriptor / archive_descriptor ─────────────────────────────── - def test_valid_descriptor_passes(self) -> None: - """Valid descriptor does not raise.""" - d = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - # Should not raise - validate_descriptor(d, _ACCESSION) - def test_empty_descriptor_raises(self) -> None: - """Empty dict fails frictionless validation and raises.""" - with pytest.raises((ValueError, Exception)): - validate_descriptor({}, _ACCESSION) +@pytest.fixture +def mock_s3() -> Generator[botocore.client.BaseClient]: + """Mocked S3 client with the CDM Lake bucket pre-created.""" + with mock_aws(): + client = boto3.client("s3", region_name=AWS_REGION) + client.create_bucket(Bucket=TEST_BUCKET) + reset_s3_client() + with ( + patch.object(s3_utils, "get_s3_client", return_value=client), + patch.object(metadata_mod, "get_s3_client", return_value=client), + ): + yield client + reset_s3_client() -# ── upload_descriptor ──────────────────────────────────────────────────── +@pytest.fixture +def mock_s3_with_descriptor(mock_s3: botocore.client.BaseClient) -> tuple[botocore.client.BaseClient, MagicMock]: + """mock_s3 with a live descriptor pre-uploaded and copy_object patched.""" + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + live_key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + mock_s3.put_object(Bucket=TEST_BUCKET, Key=live_key, Body=json.dumps(descriptor).encode()) + with patch.object(metadata_mod, "copy_object") as mock_copy: 
+ yield mock_s3, mock_copy + + +@pytest.mark.s3 +def test_upload_descriptor(mock_s3: botocore.client.BaseClient) -> None: + """Uploaded object is valid JSON at the expected key with the expected identifier.""" + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX) + expected_key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + assert key == expected_key + assert key.startswith(_KEY_PREFIX) + assert key.endswith("_datapackage.json") + body = json.loads(mock_s3.get_object(Bucket=TEST_BUCKET, Key=key)["Body"].read()) + assert body["identifier"] == f"NCBI:{_ACCESSION}" + + +@pytest.mark.s3 +def test_upload_descriptor_dry_run(mock_s3: botocore.client.BaseClient) -> None: + """Dry-run returns the correct key but creates no S3 object.""" + descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) + key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, dry_run=True) + assert key == build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) + objs = mock_s3.list_objects_v2(Bucket=TEST_BUCKET).get("Contents", []) + assert not any(o["Key"] == key for o in objs) + + +@pytest.mark.s3 +def test_archive_descriptor(mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock]) -> None: + """archive_descriptor returns True and calls copy_object with the correct keys.""" + _, mock_copy = mock_s3_with_descriptor + result = archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG) + assert result is True + mock_copy.assert_called_once() + args = mock_copy.call_args[0] + assert f"{TEST_BUCKET}/{build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX)}" in args + assert f"{TEST_BUCKET}/{build_archive_descriptor_key(_ASSEMBLY_DIR, _RELEASE_TAG, _KEY_PREFIX)}" in args @pytest.mark.s3 -class TestUploadDescriptor: - """Tests for upload_descriptor() using moto-mocked S3.""" - - @pytest.fixture - 
def mock_s3(self) -> Generator[botocore.client.BaseClient]: - """Yield a mocked S3 client with the CDM Lake bucket pre-created.""" - with mock_aws(): - client = boto3.client("s3", region_name=AWS_REGION) - client.create_bucket(Bucket=TEST_BUCKET) - reset_s3_client() - with ( - patch.object(s3_utils, "get_s3_client", return_value=client), - patch.object(metadata_mod, "get_s3_client", return_value=client), - ): - yield client - reset_s3_client() - - def test_uploads_json(self, mock_s3: botocore.client.BaseClient) -> None: - """Uploaded object is valid JSON with the expected identifier.""" - descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX) - assert key == build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) - obj = mock_s3.get_object(Bucket=TEST_BUCKET, Key=key) - body = json.loads(obj["Body"].read()) - assert body["identifier"] == f"NCBI:{_ACCESSION}" - - def test_returns_expected_key(self, mock_s3: botocore.client.BaseClient) -> None: - """Return value is the metadata/ S3 key for the assembly.""" - descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX) - assert key.startswith(_KEY_PREFIX) - assert key.endswith("_datapackage.json") - - def test_dry_run_skips_upload(self, mock_s3: botocore.client.BaseClient) -> None: - """Dry-run returns the key but does not create any S3 object.""" - descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, dry_run=True) - # No object in S3 - objs = mock_s3.list_objects_v2(Bucket=TEST_BUCKET).get("Contents", []) - assert not any(o["Key"] == key for o in objs) - - def test_dry_run_returns_key(self, mock_s3: botocore.client.BaseClient) -> None: - """Dry-run returns the same 
key as a real upload would.""" - descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - key = upload_descriptor(descriptor, _ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, dry_run=True) - assert key == build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) - - -# ── archive_descriptor ─────────────────────────────────────────────────── +def test_archive_descriptor_dry_run(mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock]) -> None: + """Dry-run returns True but does not call copy_object.""" + _, mock_copy = mock_s3_with_descriptor + assert archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG, dry_run=True) is True + mock_copy.assert_not_called() @pytest.mark.s3 -class TestArchiveDescriptor: - """Tests for archive_descriptor() using moto-mocked S3.""" - - @pytest.fixture - def mock_s3_with_descriptor(self) -> Generator[tuple[botocore.client.BaseClient, MagicMock]]: - """S3 with a live descriptor already uploaded.""" - with mock_aws(): - client = boto3.client("s3", region_name=AWS_REGION) - client.create_bucket(Bucket=TEST_BUCKET) - # Pre-upload a descriptor - descriptor = create_descriptor(_ASSEMBLY_DIR, _ACCESSION, _SAMPLE_RESOURCES, timestamp=_TIMESTAMP) - live_key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) - client.put_object( - Bucket=TEST_BUCKET, - Key=live_key, - Body=json.dumps(descriptor).encode(), - ) - reset_s3_client() - with ( - patch.object(s3_utils, "get_s3_client", return_value=client), - patch.object(metadata_mod, "get_s3_client", return_value=client), - patch.object(metadata_mod, "copy_object") as mock_copy, - ): - yield client, mock_copy - reset_s3_client() - - def test_returns_true_when_descriptor_exists( - self, mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock] - ) -> None: - """Returns True when the live descriptor object exists in S3.""" - _, _ = mock_s3_with_descriptor - result = archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, 
_RELEASE_TAG) - assert result is True - - def test_calls_copy_with_correct_keys( - self, mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock] - ) -> None: - """copy_object is called with the live and archive keys.""" - _, mock_copy = mock_s3_with_descriptor - archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG) - live_key = build_descriptor_key(_ASSEMBLY_DIR, _KEY_PREFIX) - archive_key = build_archive_descriptor_key(_ASSEMBLY_DIR, _RELEASE_TAG, _KEY_PREFIX) - mock_copy.assert_called_once() - args = mock_copy.call_args - assert f"{TEST_BUCKET}/{live_key}" in args[0] - assert f"{TEST_BUCKET}/{archive_key}" in args[0] - - def test_dry_run_returns_true_without_copy( - self, mock_s3_with_descriptor: tuple[botocore.client.BaseClient, MagicMock] - ) -> None: - """Dry-run returns True but does not call copy_object.""" - _, mock_copy = mock_s3_with_descriptor - result = archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG, dry_run=True) - assert result is True - mock_copy.assert_not_called() - - def test_missing_descriptor_returns_false(self) -> None: - """Returns False when no descriptor exists at the live key.""" - with mock_aws(): - client = boto3.client("s3", region_name=AWS_REGION) - client.create_bucket(Bucket=TEST_BUCKET) - reset_s3_client() - with ( - patch.object(s3_utils, "get_s3_client", return_value=client), - patch.object(metadata_mod, "get_s3_client", return_value=client), - ): - result = archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG) - reset_s3_client() - assert result is False +def test_archive_descriptor_missing_returns_false(mock_s3: botocore.client.BaseClient) -> None: + """Returns False when no descriptor exists at the live key.""" + assert archive_descriptor(_ASSEMBLY_DIR, TEST_BUCKET, _KEY_PREFIX, _RELEASE_TAG) is False diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py index 6d2e3758..fc3fa793 100644 --- a/tests/ncbi_ftp/test_notebooks.py 
+++ b/tests/ncbi_ftp/test_notebooks.py @@ -44,47 +44,31 @@ def _extract_code_cells(notebook_path: Path) -> list[str]: @pytest.mark.parametrize("notebook", NCBI_NOTEBOOKS) -class TestNotebookSyntax: - """Validate that every code cell in each notebook is syntactically valid Python.""" - - def test_all_cells_parse(self, notebook: str) -> None: - """Verify every code cell compiles without SyntaxError.""" - path = NOTEBOOKS_DIR / notebook - assert path.exists(), f"Notebook not found: {path}" - cells = _extract_code_cells(path) - assert len(cells) > 0, f"No code cells found in {notebook}" - for i, source in enumerate(cells, 1): - try: - ast.parse(source, filename=f"{notebook}:cell{i}") - except SyntaxError as exc: - pytest.fail(f"{notebook} cell {i} has a syntax error: {exc}") - - def test_no_empty_code_cells(self, notebook: str) -> None: - """Verify no code cell is completely empty.""" - path = NOTEBOOKS_DIR / notebook - cells = _extract_code_cells(path) - for i, source in enumerate(cells, 1): - assert source.strip(), f"{notebook} cell {i} is empty" - - -class TestManifestNotebookImports: - """Verify that all imports in the manifest notebook resolve.""" - - def test_imports_resolve(self) -> None: - """All manifest notebook imports are verified at module load time above.""" - assert isinstance(FTP_HOST, str) - assert FTP_HOST - assert AssemblyRecord is not None - assert callable(download_assembly_summary) - assert callable(compute_diff) - assert callable(write_updated_manifest) - - -class TestPromoteNotebookImports: - """Verify that all imports in the promote notebook resolve.""" - - def test_imports_resolve(self) -> None: - """All promote notebook imports are verified at module load time above.""" - assert callable(promote_from_s3) - assert isinstance(DEFAULT_LAKEHOUSE_KEY_PREFIX, str) - assert callable(split_s3_path) +def test_notebook_syntax(notebook: str) -> None: + """Every code cell is syntactically valid Python and non-empty.""" + path = NOTEBOOKS_DIR / notebook + 
assert path.exists(), f"Notebook not found: {path}" + cells = _extract_code_cells(path) + assert len(cells) > 0, f"No code cells found in {notebook}" + for i, source in enumerate(cells, 1): + assert source.strip(), f"{notebook} cell {i} is empty" + try: + ast.parse(source, filename=f"{notebook}:cell{i}") + except SyntaxError as exc: + pytest.fail(f"{notebook} cell {i} has a syntax error: {exc}") + + +def test_manifest_notebook_imports() -> None: + """All manifest notebook imports are verified at module load time above.""" + assert isinstance(FTP_HOST, str) and FTP_HOST + assert AssemblyRecord is not None + assert callable(download_assembly_summary) + assert callable(compute_diff) + assert callable(write_updated_manifest) + + +def test_promote_notebook_imports() -> None: + """All promote notebook imports are verified at module load time above.""" + assert callable(promote_from_s3) + assert isinstance(DEFAULT_LAKEHOUSE_KEY_PREFIX, str) + assert callable(split_s3_path) diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index fa9898d0..fdb180db 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -14,215 +14,182 @@ from tests.ncbi_ftp.conftest import TEST_BUCKET -@pytest.mark.s3 -class TestPromoteFromS3: - """Test promote_from_s3 with moto-mocked S3.""" - - def _stage_files(self, s3_client: botocore.client.BaseClient, prefix: str) -> None: - """Upload sample staged files to mock S3.""" - for key in [ - f"{prefix}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz", - f"{prefix}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz.md5", - f"{prefix}download_report.json", - ]: - body = b"md5hash123" if key.endswith(".md5") else b"data" - s3_client.put_object(Bucket=TEST_BUCKET, Key=key, Body=body) - - def test_dry_run_no_writes(self, mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: - """Verify dry_run does not write any objects.""" - prefix = 
"staging/run1/" - self._stage_files(mock_s3_client_no_checksum, prefix) - - report = promote_from_s3( - staging_key_prefix=prefix, - bucket=TEST_BUCKET, - dry_run=True, - ) - assert report["promoted"] == 1 - assert report["dry_run"] is True - - # Final path should NOT exist - final_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" - resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=final_key) - assert resp.get("KeyCount", 0) == 0 +def _stage_files(s3_client: botocore.client.BaseClient, prefix: str) -> None: + """Upload sample staged files to mock S3.""" + for key in [ + f"{prefix}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz", + f"{prefix}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz.md5", + f"{prefix}download_report.json", + ]: + body = b"md5hash123" if key.endswith(".md5") else b"data" + s3_client.put_object(Bucket=TEST_BUCKET, Key=key, Body=body) - def test_promotes_with_metadata(self, mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: - """Verify objects are promoted with MD5 metadata attached.""" - prefix = "staging/run1/" - self._stage_files(mock_s3_client_no_checksum, prefix) - report = promote_from_s3( - staging_key_prefix=prefix, - bucket=TEST_BUCKET, - ) - assert report["promoted"] == 1 - assert report["failed"] == 0 - - # Check final object exists with metadata - final_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" - resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=final_key) - assert resp["Metadata"].get("md5") == "md5hash123" +@pytest.mark.s3 +def test_promote_dry_run_no_writes(mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: + """Verify dry_run does not write any objects.""" + prefix = "staging/run1/" + _stage_files(mock_s3_client_no_checksum, prefix) - def 
test_skips_download_report(self, mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: - """Verify download_report.json is not promoted.""" - prefix = "staging/run1/" - self._stage_files(mock_s3_client_no_checksum, prefix) + report = promote_from_s3(staging_key_prefix=prefix, bucket=TEST_BUCKET, dry_run=True) + assert report["promoted"] == 1 + assert report["dry_run"] is True - report = promote_from_s3(staging_key_prefix=prefix, bucket=TEST_BUCKET) - # Only the .fna.gz data file, not download_report.json - assert report["promoted"] == 1 + final_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=final_key).get("KeyCount", 0) == 0 @pytest.mark.s3 -class TestTrimManifest: - """Test _trim_manifest removes promoted accessions from S3 manifest.""" +def test_promote_with_metadata(mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: + """Objects are promoted with MD5 metadata; download_report.json is skipped.""" + prefix = "staging/run1/" + _stage_files(mock_s3_client_no_checksum, prefix) - def test_trims_promoted(self, mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: - """Verify promoted accessions are removed from manifest.""" - manifest_key = "manifests/transfer_manifest.txt" - manifest_body = ( - "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6/\n" - "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\n" - ) - mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=manifest_key, Body=manifest_body.encode()) - - _trim_manifest(manifest_key, TEST_BUCKET, {"GCF_000001215.4"}) + report = promote_from_s3(staging_key_prefix=prefix, bucket=TEST_BUCKET) + assert report["promoted"] == 1 # only .fna.gz, not download_report.json + assert report["failed"] == 0 - resp = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=manifest_key) - remaining = 
resp["Body"].read().decode() - assert "GCF_000001215.4" not in remaining - assert "GCF_000001405.40" in remaining + final_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/GCF_000001215.4_Release_6/GCF_000001215.4_genomic.fna.gz" + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=final_key) + assert resp["Metadata"].get("md5") == "md5hash123" - def test_trims_all(self, mock_s3_client_no_checksum: botocore.client.BaseClient) -> None: - """Verify all entries can be trimmed leaving an empty manifest.""" - manifest_key = "manifests/transfer_manifest.txt" - manifest_body = "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6/\n" - mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=manifest_key, Body=manifest_body.encode()) - _trim_manifest(manifest_key, TEST_BUCKET, {"GCF_000001215.4"}) - - resp = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=manifest_key) - remaining = resp["Body"].read().decode().strip() - assert remaining == "" +@pytest.mark.s3 +@pytest.mark.parametrize( + ("manifest_body", "promoted_set", "expected_present", "expected_absent"), + [ + pytest.param( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6/\n" + "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\n", + {"GCF_000001215.4"}, + ["GCF_000001405.40"], + ["GCF_000001215.4"], + id="partial", + ), + pytest.param( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6/\n", + {"GCF_000001215.4"}, + [], + ["GCF_000001215.4"], + id="all", + ), + ], +) +def test_trim_manifest( + mock_s3_client_no_checksum: botocore.client.BaseClient, + manifest_body: str, + promoted_set: set[str], + expected_present: list[str], + expected_absent: list[str], +) -> None: + """Promoted accessions are removed; others remain (partial) or the manifest empties (all).""" + manifest_key = "manifests/transfer_manifest.txt" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=manifest_key, Body=manifest_body.encode()) + 
_trim_manifest(manifest_key, TEST_BUCKET, promoted_set) + remaining = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=manifest_key)["Body"].read().decode() + for acc in expected_present: + assert acc in remaining + for acc in expected_absent: + assert acc not in remaining @pytest.mark.s3 -class TestArchiveAssemblies: - """Test _archive_assemblies with moto-mocked S3.""" +def test_archive_assemblies_removed(mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path) -> None: + """Removed accessions are archived and originals deleted.""" + accession = "GCF_000005845.2" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") - def test_archives_and_deletes_removed( - self, mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path - ) -> None: - """Verify removed accessions are archived and originals deleted.""" - accession = "GCF_000005845.2" - key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" - mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") - manifest = tmp_path / "removed.txt" - manifest.write_text(f"{accession}\n") - - count = _archive_assemblies( + assert ( + _archive_assemblies( str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="replaced_or_suppressed", delete_source=True, ) - assert count == 1 + == 1 + ) + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key).get("KeyCount", 0) == 0 - # Original should be deleted - resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key) - assert resp.get("KeyCount", 0) == 0 + archive_key = ( + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/" + 
f"raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + ) + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key).get("KeyCount", 0) == 1 - # Archived copy should exist - archive_key = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/" - f"raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" - ) - resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key) - assert resp.get("KeyCount", 0) == 1 - - def test_archives_updated_without_deleting( - self, mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path - ) -> None: - """Verify updated accessions are archived but originals remain.""" - accession = "GCF_000001215.4" - asm_dir = f"{accession}_Release_6" - key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"original-data") - - manifest = tmp_path / "updated.txt" - manifest.write_text(f"{accession}\n") - - count = _archive_assemblies( - str(manifest), - bucket=TEST_BUCKET, - ncbi_release="2024-06", - archive_reason="updated", - delete_source=False, + +@pytest.mark.s3 +def test_archive_assemblies_updated_no_delete( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Updated accessions are archived but originals remain.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"original-data") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + assert ( + _archive_assemblies( + str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated", delete_source=False ) - assert count == 1 + == 1 + ) + assert 
mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key).get("KeyCount", 0) == 1 - # Original still exists (promote will overwrite it) - resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key) - assert resp.get("KeyCount", 0) == 1 + archive_key = ( + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + ) + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=archive_key) + assert resp["Metadata"]["archive_reason"] == "updated" + assert resp["Metadata"]["ncbi_last_release"] == "2024-06" - # Archived copy exists with correct metadata - archive_key = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/" - f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - ) - resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=archive_key) - assert resp["Metadata"]["archive_reason"] == "updated" - assert resp["Metadata"]["ncbi_last_release"] == "2024-06" - - def test_multiple_releases_no_collision( - self, mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path - ) -> None: - """Verify archiving the same accession in different releases creates distinct folders.""" - accession = "GCF_000001215.4" - asm_dir = f"{accession}_Release_6" - key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v1-data") - - manifest = tmp_path / "updated.txt" - manifest.write_text(f"{accession}\n") - - # First archive: release 2024-01 - _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated") - - # Simulate promote overwriting source - mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v2-data") - - # Second archive: release 2024-06 - _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated") - - 
archive_key_1 = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/" - f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - ) - archive_key_2 = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/" - f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - ) - resp1 = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_1) - assert resp1["Body"].read() == b"v1-data" - resp2 = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_2) - assert resp2["Body"].read() == b"v2-data" +@pytest.mark.s3 +def test_archive_assemblies_multiple_releases_no_collision( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Archiving the same accession in different releases creates distinct folders.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v1-data") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated") + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v2-data") + _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated") + + archive_key_1 = ( + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + ) + archive_key_2 = ( + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + ) + assert mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_1)["Body"].read() == b"v1-data" + assert mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_2)["Body"].read() == b"v2-data" + - def test_dry_run_no_side_effects( - self, 
mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path - ) -> None: - """Verify dry_run does not copy or delete anything.""" - accession = "GCF_000005845.2" - key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" - mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") +@pytest.mark.s3 +def test_archive_assemblies_dry_run(mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path) -> None: + """dry_run does not copy or delete anything.""" + accession = "GCF_000005845.2" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") - manifest = tmp_path / "removed.txt" - manifest.write_text(f"{accession}\n") + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") - count = _archive_assemblies( + assert ( + _archive_assemblies( str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-01", @@ -230,45 +197,40 @@ def test_dry_run_no_side_effects( delete_source=True, dry_run=True, ) - assert count == 1 - - # Original still exists - resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key) - assert resp.get("KeyCount", 0) == 1 - - # No archive created - archive_prefix = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/" - resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_prefix) - assert resp.get("KeyCount", 0) == 0 - - def test_no_existing_objects_skips( - self, mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path - ) -> None: - """Verify accessions with no existing S3 objects are silently skipped.""" - manifest = tmp_path / "updated.txt" - manifest.write_text("GCF_000001215.4\n") - - count = _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-01") - assert count == 0 - - def test_unknown_release_fallback( - self, 
mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path - ) -> None: - """Verify ncbi_release=None falls back to 'unknown'.""" - accession = "GCF_000001215.4" - asm_dir = f"{accession}_Release_6" - key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") - - manifest = tmp_path / "updated.txt" - manifest.write_text(f"{accession}\n") - - count = _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release=None) - assert count == 1 - - archive_key = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/unknown/" - f"raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - ) - resp = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key) - assert resp.get("KeyCount", 0) == 1 + == 1 + ) + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key).get("KeyCount", 0) == 1 + + archive_prefix = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/" + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_prefix).get("KeyCount", 0) == 0 + + +@pytest.mark.s3 +def test_archive_assemblies_no_objects_skips( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Accessions with no existing S3 objects are silently skipped.""" + manifest = tmp_path / "updated.txt" + manifest.write_text("GCF_000001215.4\n") + assert _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-01") == 0 + + +@pytest.mark.s3 +def test_archive_assemblies_unknown_release_fallback( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """ncbi_release=None falls back to 'unknown' in the archive path.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + 
mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + assert _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release=None) == 1 + + archive_key = ( + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/unknown/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + ) + assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key).get("KeyCount", 0) == 1 diff --git a/tests/s3_helpers.py b/tests/s3_helpers.py new file mode 100644 index 00000000..cf98c3d5 --- /dev/null +++ b/tests/s3_helpers.py @@ -0,0 +1,25 @@ +"""Shared S3 test helpers. + +# NOTE: Moto currently does not support CRC64NVME; remove this helper when it does. +""" + +import functools +from collections.abc import Callable +from typing import Any + + +def strip_checksum_algorithm(method: Callable[..., Any]) -> Callable[..., Any]: + """Wrap a boto3 S3 method to remove the ChecksumAlgorithm argument before calling moto. + + Moto does not implement CRC64NVME checksums, so any call that includes + ChecksumAlgorithm='CRC64NVME' would fail. This wrapper silently drops the + argument so the rest of the call proceeds normally against the moto backend. 
+ """ + + @functools.wraps(method) + def wrapper(*args: Any, **kwargs: Any) -> Any: + """Remove the ChecksumAlgorithm argument from the call.""" + kwargs.pop("ChecksumAlgorithm", None) + return method(*args, **kwargs) + + return wrapper diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index b459c467..bebba52c 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -1,8 +1,7 @@ """Tests for s3_utils.py using moto to mock AWS S3.""" -import functools import io -from collections.abc import Callable, Generator +from collections.abc import Generator from pathlib import Path from typing import Any from unittest.mock import MagicMock, patch @@ -15,6 +14,7 @@ from requests.exceptions import HTTPError import cdm_data_loaders.utils.s3 as s3_utils +from tests.s3_helpers import strip_checksum_algorithm from cdm_data_loaders.utils.s3 import ( CDM_LAKE_BUCKET, DEFAULT_EXTRA_ARGS, @@ -683,24 +683,6 @@ def test_upload_dir_raises_on_empty_destination(sample_dir: Path) -> None: upload_dir(sample_dir, "") -# NOTE: Moto currently does not support CRC64NVME; remove this helper when it does. -def strip_checksum_algorithm(method: Callable[..., Any]) -> Callable[..., Any]: - """Wrap a boto3 S3 method to remove the ChecksumAlgorithm argument before calling moto. - - Moto does not implement CRC64NVME checksums, so any call that includes - ChecksumAlgorithm='CRC64NVME' would fail. This wrapper silently drops the - argument so the rest of the call proceeds normally against the moto backend. - """ - - @functools.wraps(method) - def wrapper(*args: Any, **kwargs: Any) -> Any: - """Remove the ChecksumAlgorithm argument from the call.""" - kwargs.pop("ChecksumAlgorithm", None) - return method(*args, **kwargs) - - return wrapper - - @pytest.fixture def mocked_s3_client_no_checksum(mock_s3_client: Any) -> Any: """Return the mocked S3 client with copy_object patched to strip ChecksumAlgorithm. 
From cc1042bd841ec03fd30226b25a5a9c0d7ff47cab Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Mon, 27 Apr 2026 16:01:13 -0700 Subject: [PATCH 50/76] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/ncbi_ftp/test_assembly.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ncbi_ftp/test_assembly.py b/tests/ncbi_ftp/test_assembly.py index 559788bf..261f4676 100644 --- a/tests/ncbi_ftp/test_assembly.py +++ b/tests/ncbi_ftp/test_assembly.py @@ -3,7 +3,6 @@ import pytest from cdm_data_loaders.ncbi_ftp.assembly import ( - FILE_FILTERS, build_accession_path, parse_assembly_path, parse_md5_checksums_file, From 32164035061bca850a6a9d6eb5fff6ff9d9678ca Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 28 Apr 2026 12:25:03 -0700 Subject: [PATCH 51/76] add notebook option for download phase Co-authored-by: Copilot --- notebooks/ncbi_ftp_download.ipynb | 199 +++++++++++ .../pipelines/ncbi_ftp_download.py | 103 +++++- tests/integration/test_download_e2e.py | 62 +++- tests/ncbi_ftp/test_notebooks.py | 12 + tests/pipelines/test_ncbi_ftp_download.py | 329 +++++++++++++++++- 5 files changed, 701 insertions(+), 4 deletions(-) create mode 100644 notebooks/ncbi_ftp_download.ipynb diff --git a/notebooks/ncbi_ftp_download.ipynb b/notebooks/ncbi_ftp_download.ipynb new file mode 100644 index 00000000..1a2af714 --- /dev/null +++ b/notebooks/ncbi_ftp_download.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "79505884", + "metadata": {}, + "source": [ + "# NCBI Assembly Download & Stage (Phase 2)\n", + "\n", + "Downloads NCBI assemblies listed in a transfer manifest from the NCBI FTP server\n", + "and uploads them to an S3 staging prefix for Phase 3 promotion.\n", + "\n", + "**When to use this notebook vs the CTS container:**\n", + "- Use the CTS container (`ncbi_ftp_sync`) for production runs — it has restart/retry\n", + " 
support and runs in the data-transfer environment.\n", + "- Use this notebook when CTS is unavailable (e.g. local development, debugging, or\n", + " one-off re-downloads of failed assemblies).\n", + "\n", + "Steps:\n", + "1. Configure bucket, manifest source, staging prefix, and thread count\n", + "2. Preview the first 10 manifest lines to verify before committing\n", + "3. Download assemblies from NCBI FTP and upload to staging\n", + "4. Review the download/stage report" + ] + }, + { + "cell_type": "markdown", + "id": "76d92c54", + "metadata": {}, + "source": [ + "## Path formats quick reference\n", + "\n", + "| Suffix in variable name | Format | Example |\n", + "|-------------------------|--------|---------|\n", + "| `_BUCKET` | bucket name only | `cdm-lake` |\n", + "| `_KEY_PREFIX` | S3 key prefix (no scheme/bucket) | `staging/run1/` |\n", + "| `_S3_KEY` | S3 object key (no scheme/bucket) | `staging/run1/input/transfer_manifest.txt` |\n", + "| `_PATH` | local filesystem path | `output/transfer_manifest.txt` |\n", + "\n", + "Staging object: `s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/…/{filename}`\n", + "Report: `s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}download_report.json`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38e7aa6d", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Imports and S3 client initialisation.\"\"\"\n", + "\n", + "import json\n", + "\n", + "from cdm_data_loaders.pipelines.ncbi_ftp_download import (\n", + " DEFAULT_STAGING_KEY_PREFIX,\n", + " download_and_stage,\n", + ")\n", + "from cdm_data_loaders.utils.s3 import get_s3_client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34a18261", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Configure parameters.\n", + "\n", + "Provide exactly one of MANIFEST_S3_KEY (read from S3) or MANIFEST_LOCAL_PATH (read from disk).\n", + "Set the other to None.\n", + "\n", + "Disk space note: ensure sufficient free space in the system 
temp directory before running.\n", + "A rough estimate is ~500 MB per 1000 assemblies; large genomes can exceed 1 GB each.\n", + "Set LIMIT to a small number (e.g. 5) to test the workflow before a full run.\n", + "\"\"\"\n", + "\n", + "# S3 bucket where the manifest lives and where staged files will be written\n", + "# format: bucket name (no s3:// scheme)\n", + "STORE_BUCKET = \"cdm-lake\"\n", + "\n", + "# S3 object key of the transfer manifest written by Phase 1\n", + "# format: S3 object key within STORE_BUCKET (no scheme, no bucket)\n", + "# Set to None to use MANIFEST_LOCAL_PATH instead\n", + "MANIFEST_S3_KEY: str | None = \"staging/run1/input/transfer_manifest.txt\"\n", + "\n", + "# Local path to the transfer manifest (alternative to MANIFEST_S3_KEY)\n", + "# format: local filesystem path\n", + "# Set to None to use MANIFEST_S3_KEY instead\n", + "MANIFEST_LOCAL_PATH: str | None = None\n", + "\n", + "# S3 key prefix for staged output files (must match what Phase 3 expects)\n", + "# format: S3 key prefix within STORE_BUCKET (no scheme, no bucket)\n", + "STAGING_KEY_PREFIX = DEFAULT_STAGING_KEY_PREFIX + \"run1/\"\n", + "\n", + "# Number of parallel download and upload threads\n", + "THREADS = 4\n", + "\n", + "# Limit to first N assemblies (None = process all)\n", + "LIMIT: int | None = None\n", + "\n", + "# Dry-run mode — download locally but skip S3 uploads\n", + "DRY_RUN = True\n", + "\n", + "print(f\"Bucket: {STORE_BUCKET}\")\n", + "print(f\"Manifest S3 key: {MANIFEST_S3_KEY}\")\n", + "print(f\"Manifest local: {MANIFEST_LOCAL_PATH}\")\n", + "print(f\"Staging prefix: {STAGING_KEY_PREFIX}\")\n", + "print(f\"Threads: {THREADS}\")\n", + "print(f\"Limit: {LIMIT}\")\n", + "print(f\"Dry-run: {DRY_RUN}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69beeaa9", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Preview the first 10 manifest lines before committing to the full run.\"\"\"\n", + "\n", + "if MANIFEST_S3_KEY is not 
None:\n", + " s3 = get_s3_client()\n", + " response = s3.get_object(Bucket=STORE_BUCKET, Key=MANIFEST_S3_KEY)\n", + " manifest_lines = response[\"Body\"].read().decode().splitlines()\n", + "else:\n", + " with open(MANIFEST_LOCAL_PATH) as f:\n", + " manifest_lines = f.read().splitlines()\n", + "\n", + "data_lines = [l for l in manifest_lines if l.strip() and not l.startswith(\"#\")]\n", + "\n", + "print(f\"Total entries: {len(data_lines)}\")\n", + "print(\"First 10:\")\n", + "for line in data_lines[:10]:\n", + " print(f\" {line}\")\n", + "if len(data_lines) > 10:\n", + " print(f\" ... and {len(data_lines) - 10} more\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b76d273", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Download assemblies from NCBI FTP and upload to S3 staging.\"\"\"\n", + "\n", + "report = download_and_stage(\n", + " bucket=STORE_BUCKET,\n", + " staging_key_prefix=STAGING_KEY_PREFIX,\n", + " manifest_s3_key=MANIFEST_S3_KEY,\n", + " manifest_local_path=MANIFEST_LOCAL_PATH,\n", + " threads=THREADS,\n", + " limit=LIMIT,\n", + " dry_run=DRY_RUN,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "192b9d34", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Display download and staging report.\"\"\"\n", + "\n", + "print(\"=\" * 50)\n", + "print(\"DOWNLOAD & STAGE REPORT\")\n", + "print(\"=\" * 50)\n", + "print(f\"Attempted: {report['total_attempted']}\")\n", + "print(f\"Succeeded: {report['succeeded']}\")\n", + "print(f\"Failed: {report['failed']}\")\n", + "print(f\"Staged objects: {report['staged_objects']}\")\n", + "print(f\"Staging prefix: {report['staging_key_prefix']}\")\n", + "print(f\"Dry-run: {report['dry_run']}\")\n", + "print(f\"Timestamp: {report['timestamp']}\")\n", + "\n", + "if report[\"failed\"] > 0:\n", + " print(\"\\nFailed assemblies:\")\n", + " for failure in report[\"failures\"]:\n", + " print(f\" {failure['path']}: {failure['error']}\")\n", + "\n", + "if 
report[\"dry_run\"]:\n", + " print(\"\\nThis was a dry-run. Set DRY_RUN = False and re-run to upload to S3.\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 20cd77e8..85581a57 100644 --- a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -7,6 +7,7 @@ import json import logging +import tempfile import threading from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import UTC, datetime @@ -23,11 +24,17 @@ from cdm_data_loaders.pipelines.cts_defaults import DEFAULT_SETTINGS_CONFIG_DICT, INPUT_MOUNT, OUTPUT_MOUNT from cdm_data_loaders.utils.cdm_logger import get_cdm_logger from cdm_data_loaders.utils.ftp_client import ThreadLocalFTP +from cdm_data_loaders.utils.s3 import get_s3_client, upload_file logger = get_cdm_logger() -# ── Settings ───────────────────────────────────────────────────────────── +# ── Constants ──────────────────────────────────────────────────────────── + +DEFAULT_STAGING_KEY_PREFIX = "staging/" + + + class DownloadSettings(BaseSettings): @@ -181,3 +188,97 @@ def run_download(config: DownloadSettings) -> None: def cli() -> None: """CLI entry point for ``ncbi_ftp_sync``.""" run_cli(DownloadSettings, run_download) + + +# ── Notebook / interactive entry point ────────────────────────────────── + + +def download_and_stage( + *, + bucket: str, + staging_key_prefix: str, + manifest_s3_key: str | None = None, + manifest_local_path: str | Path | None = None, + threads: int = 4, + ftp_host: str = FTP_HOST, + limit: int | None = None, + dry_run: bool = False, +) -> dict[str, Any]: + """Download assemblies from NCBI FTP and stage them to S3 (Phase 2). + + Exactly one of *manifest_s3_key* or *manifest_local_path* must be given. 
+ + :param bucket: destination S3 bucket name + :param staging_key_prefix: key prefix inside the bucket (e.g. ``"staging/run1/"``) + :param manifest_s3_key: S3 object key of the transfer manifest within *bucket* + :param manifest_local_path: local path to the transfer manifest file + :param threads: number of parallel download **and** upload threads + :param ftp_host: NCBI FTP hostname + :param limit: optional limit for testing (pass to :func:`download_batch`) + :param dry_run: when ``True``, download but skip all S3 uploads + :return: download report extended with ``staged_objects``, ``staging_key_prefix``, ``dry_run`` + """ + if manifest_s3_key is not None and manifest_local_path is not None: + msg = "Provide exactly one of manifest_s3_key or manifest_local_path, not both" + raise ValueError(msg) + if manifest_s3_key is None and manifest_local_path is None: + msg = "One of manifest_s3_key or manifest_local_path must be provided" + raise ValueError(msg) + + with tempfile.TemporaryDirectory() as _tmpdir: + tmp = Path(_tmpdir) + manifest_dest = tmp / "transfer_manifest.txt" + + if manifest_s3_key is not None: + s3 = get_s3_client() + response = s3.get_object(Bucket=bucket, Key=manifest_s3_key) + manifest_dest.write_bytes(response["Body"].read()) + logger.info("Manifest read from S3: s3://%s/%s", bucket, manifest_s3_key) + else: + manifest_dest.write_bytes(Path(manifest_local_path).read_bytes()) + logger.info("Manifest read from local path: %s", manifest_local_path) + + report = download_batch( + manifest_path=manifest_dest, + output_dir=tmp, + threads=threads, + ftp_host=ftp_host, + limit=limit, + ) + + staged_objects = 0 + + if not dry_run: + raw_data_dir = tmp / "raw_data" + report_json = tmp / "download_report.json" + + upload_tasks: list[tuple[Path, str]] = [] + + if raw_data_dir.exists(): + for local_file in sorted(raw_data_dir.rglob("*")): + if local_file.is_file(): + relative = local_file.relative_to(tmp) + dest_prefix = 
f"{bucket}/{staging_key_prefix.rstrip('/')}/{relative.parent}" + upload_tasks.append((local_file, dest_prefix)) + + if report_json.exists(): + upload_tasks.append((report_json, f"{bucket}/{staging_key_prefix.rstrip('/')}")) + + def _upload(task: tuple[Path, str]) -> None: + local_file, dest = task + upload_file(local_file, dest) + + with ThreadPoolExecutor(max_workers=threads) as executor: + futures = [executor.submit(_upload, t) for t in upload_tasks] + for future in as_completed(futures): + future.result() + staged_objects += 1 + + logger.info("Staged %d objects to s3://%s/%s", staged_objects, bucket, staging_key_prefix) + + return { + **report, + "staged_objects": staged_objects, + "staging_key_prefix": staging_key_prefix, + "dry_run": dry_run, + } diff --git a/tests/integration/test_download_e2e.py b/tests/integration/test_download_e2e.py index 2126bd81..37456e3e 100644 --- a/tests/integration/test_download_e2e.py +++ b/tests/integration/test_download_e2e.py @@ -10,7 +10,9 @@ import pytest from pathlib import Path +from unittest.mock import patch +import cdm_data_loaders.utils.s3 as s3_utils from cdm_data_loaders.ncbi_ftp.manifest import ( compute_diff, download_assembly_summary, @@ -18,7 +20,7 @@ parse_assembly_summary, write_transfer_manifest, ) -from cdm_data_loaders.pipelines.ncbi_ftp_download import download_batch +from cdm_data_loaders.pipelines.ncbi_ftp_download import download_batch, download_and_stage # Use same stable prefix as manifest tests STABLE_PREFIX = "900" @@ -125,3 +127,61 @@ def test_download_resume(self, tmp_path: Path) -> None: # All original files should still exist files_after_second = set(output_dir.rglob("*")) assert files_after_first.issubset(files_after_second) + + +@pytest.mark.integration +@pytest.mark.slow_test +@pytest.mark.external_request +def test_download_and_stage_e2e( + tmp_path: Path, + minio_s3_client, + test_bucket: str, +) -> None: + """Download one assembly and verify it is staged under the expected S3 prefix.""" + 
manifest_path, _acc = _manifest_for_one_assembly(tmp_path) + + staging_prefix = "staging/e2e-test/" + + # Seed the manifest in MinIO so download_and_stage can read it from S3 + manifest_s3_key = f"{staging_prefix}input/transfer_manifest.txt" + minio_s3_client.put_object( + Bucket=test_bucket, + Key=manifest_s3_key, + Body=manifest_path.read_bytes(), + ) + + with patch.object(s3_utils, "get_s3_client", return_value=minio_s3_client): + report = download_and_stage( + bucket=test_bucket, + staging_key_prefix=staging_prefix, + manifest_s3_key=manifest_s3_key, + threads=1, + limit=1, + dry_run=False, + ) + + assert report["succeeded"] >= 1 + assert report["failed"] == 0 + assert report["staged_objects"] > 0 + assert report["staging_key_prefix"] == staging_prefix + assert report["dry_run"] is False + + # Verify raw_data/ files and .md5 sidecars are staged + paginator = minio_s3_client.get_paginator("list_objects_v2") + staged_keys = [ + obj["Key"] + for page in paginator.paginate(Bucket=test_bucket, Prefix=f"{staging_prefix}raw_data/") + for obj in page.get("Contents", []) + ] + assert len(staged_keys) > 0, "Expected staged files under raw_data/" + + data_files = [k for k in staged_keys if not k.endswith(".md5")] + md5_files = [k for k in staged_keys if k.endswith(".md5")] + assert len(data_files) > 0, "Expected data files" + assert len(md5_files) > 0, "Expected .md5 sidecar files" + + # Verify download_report.json was also uploaded + report_key = f"{staging_prefix}download_report.json" + resp = minio_s3_client.get_object(Bucket=test_bucket, Key=report_key) + saved_report = json.loads(resp["Body"].read()) + assert saved_report["succeeded"] >= 1 diff --git a/tests/ncbi_ftp/test_notebooks.py b/tests/ncbi_ftp/test_notebooks.py index fc3fa793..345572db 100644 --- a/tests/ncbi_ftp/test_notebooks.py +++ b/tests/ncbi_ftp/test_notebooks.py @@ -29,6 +29,7 @@ NCBI_NOTEBOOKS = [ "ncbi_ftp_manifest.ipynb", "ncbi_ftp_promote.ipynb", + "ncbi_ftp_download.ipynb", ] @@ -72,3 +73,14 @@ 
def test_promote_notebook_imports() -> None: assert callable(promote_from_s3) assert isinstance(DEFAULT_LAKEHOUSE_KEY_PREFIX, str) assert callable(split_s3_path) + + +def test_download_notebook_imports() -> None: + """All download notebook imports resolve without error.""" + from cdm_data_loaders.pipelines.ncbi_ftp_download import ( # noqa: F401 + DEFAULT_STAGING_KEY_PREFIX, + download_and_stage, + ) + + assert callable(download_and_stage) + assert isinstance(DEFAULT_STAGING_KEY_PREFIX, str) diff --git a/tests/pipelines/test_ncbi_ftp_download.py b/tests/pipelines/test_ncbi_ftp_download.py index 90df299f..86d6f8e5 100644 --- a/tests/pipelines/test_ncbi_ftp_download.py +++ b/tests/pipelines/test_ncbi_ftp_download.py @@ -2,14 +2,22 @@ import json from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, call, patch +import boto3 import pytest +from moto import mock_aws from pydantic import ValidationError from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST from cdm_data_loaders.pipelines.cts_defaults import INPUT_MOUNT, OUTPUT_MOUNT -from cdm_data_loaders.pipelines.ncbi_ftp_download import DownloadSettings, download_batch +from cdm_data_loaders.pipelines.ncbi_ftp_download import ( + DEFAULT_STAGING_KEY_PREFIX, + DownloadSettings, + download_and_stage, + download_batch, +) +from cdm_data_loaders.utils.s3 import reset_s3_client _DEFAULT_THREADS = 4 _CUSTOM_THREADS = 8 @@ -228,3 +236,320 @@ def test_handles_download_failure(self, tmp_path: Path) -> None: assert report["failed"] == 1 assert report["succeeded"] == 0 + + +# ── Helpers shared by download_and_stage tests ─────────────────────────── + +_MANIFEST_CONTENT = ( + "/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n" + "/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/\n" +) +_TEST_BUCKET = "test-bucket" +_STAGING_PREFIX = "staging/run1/" + +_MOCK_REPORT = { + "timestamp": "2026-01-01T00:00:00+00:00", + "total_attempted": 2, + 
"succeeded": 2, + "failed": 0, + "failures": [], + "assembly_stats": [], +} + + +def _make_moto_s3(): + """Return a moto-backed S3 client with the test bucket created.""" + client = boto3.client("s3", region_name="us-east-1") + client.create_bucket(Bucket=_TEST_BUCKET) + return client + + +# ── download_and_stage — manifest source ──────────────────────────────── + + +@pytest.mark.parametrize( + ("manifest_s3_key", "use_local"), + [ + pytest.param("staging/input/transfer_manifest.txt", False, id="s3_source"), + pytest.param(None, True, id="local_source"), + ], +) +@mock_aws +def test_download_and_stage_manifest_source( + tmp_path: Path, + manifest_s3_key: str | None, + use_local: bool, +) -> None: + """Manifest lines are passed to download_batch regardless of source (S3 or local).""" + reset_s3_client() + s3 = _make_moto_s3() + + manifest_local: Path | None = None + if manifest_s3_key is not None: + s3.put_object(Bucket=_TEST_BUCKET, Key=manifest_s3_key, Body=_MANIFEST_CONTENT.encode()) + else: + manifest_local = tmp_path / "manifest.txt" + manifest_local.write_text(_MANIFEST_CONTENT) + + captured_content: list[str] = [] + + def _capturing_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 + captured_content.append(Path(manifest_path).read_text()) + return _MOCK_REPORT + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", side_effect=_capturing_batch), + ): + download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_s3_key=manifest_s3_key, + manifest_local_path=manifest_local, + dry_run=True, + ) + + assert captured_content == [_MANIFEST_CONTENT] + + reset_s3_client() + + +# ── download_and_stage — exactly one source required ──────────────────── + + 
+@pytest.mark.parametrize( + ("s3_key", "local_path", "should_raise"), + [ + pytest.param("s3/key", "local/path", True, id="both_provided_raises"), + pytest.param(None, None, True, id="neither_provided_raises"), + pytest.param("s3/key", None, False, id="s3_only_ok"), + pytest.param(None, "local/path", False, id="local_only_ok"), + ], +) +@mock_aws +def test_download_and_stage_exactly_one_source_required( + tmp_path: Path, + s3_key: str | None, + local_path: str | None, + should_raise: bool, +) -> None: + """ValueError is raised when both or neither manifest sources are given.""" + reset_s3_client() + + if should_raise: + with pytest.raises(ValueError, match="manifest"): + download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_s3_key=s3_key, + manifest_local_path=local_path, + ) + else: + s3 = _make_moto_s3() + # For s3_only: seed the object; for local_only: create the file + if s3_key is not None: + s3.put_object(Bucket=_TEST_BUCKET, Key=s3_key, Body=_MANIFEST_CONTENT.encode()) + if local_path is not None: + real_local = tmp_path / "manifest.txt" + real_local.write_text(_MANIFEST_CONTENT) + local_path = real_local + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", return_value=_MOCK_REPORT), + ): + result = download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_s3_key=s3_key, + manifest_local_path=local_path, + dry_run=True, + ) + assert result["succeeded"] == _MOCK_REPORT["succeeded"] + + reset_s3_client() + + +# ── download_and_stage — uploads to staging ────────────────────────────── + + +@mock_aws +def test_download_and_stage_uploads_to_staging(tmp_path: Path) -> None: + """Files produced in raw_data/ and download_report.json 
are all uploaded to staging.""" + reset_s3_client() + s3 = _make_moto_s3() + + manifest_local = tmp_path / "manifest.txt" + manifest_local.write_text(_MANIFEST_CONTENT) + + assembly_rel = "raw_data/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT" + + def _fake_download_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 + out = Path(output_dir) + asm_dir = out / assembly_rel + asm_dir.mkdir(parents=True) + (asm_dir / "genomic.fna.gz").write_bytes(b"fasta_data") + (asm_dir / "genomic.fna.gz.md5").write_bytes(b"abc123") + report_path = out / "download_report.json" + report_path.write_text(json.dumps(_MOCK_REPORT)) + return _MOCK_REPORT + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", side_effect=_fake_download_batch), + ): + report = download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_local_path=manifest_local, + dry_run=False, + threads=1, + ) + + paginator = s3.get_paginator("list_objects_v2") + uploaded_keys = { + obj["Key"] + for page in paginator.paginate(Bucket=_TEST_BUCKET) + for obj in page.get("Contents", []) + } + + expected_keys = { + f"{_STAGING_PREFIX}{assembly_rel}/genomic.fna.gz", + f"{_STAGING_PREFIX}{assembly_rel}/genomic.fna.gz.md5", + f"{_STAGING_PREFIX}download_report.json", + } + assert uploaded_keys == expected_keys + assert report["staged_objects"] == len(expected_keys) + + reset_s3_client() + + +# ── download_and_stage — dry_run skips upload ──────────────────────────── + + +@mock_aws +def test_download_and_stage_dry_run_skips_upload(tmp_path: Path) -> None: + """dry_run=True leaves S3 empty and returns staged_objects=0.""" + reset_s3_client() + s3 = _make_moto_s3() + + manifest_local = tmp_path / "manifest.txt" + 
manifest_local.write_text(_MANIFEST_CONTENT) + + def _fake_download_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 + asm_dir = Path(output_dir) / "raw_data/GCF/000/001/215/GCF_000001215.4" + asm_dir.mkdir(parents=True) + (asm_dir / "genomic.fna.gz").write_bytes(b"fasta") + return _MOCK_REPORT + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", side_effect=_fake_download_batch), + ): + report = download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_local_path=manifest_local, + dry_run=True, + threads=1, + ) + + listed = s3.list_objects_v2(Bucket=_TEST_BUCKET) + assert listed.get("KeyCount", 0) == 0 + assert report["staged_objects"] == 0 + assert report["dry_run"] is True + + reset_s3_client() + + +# ── download_and_stage — limit forwarded ──────────────────────────────── + + +@pytest.mark.parametrize( + "limit", + [ + pytest.param(1, id="limit_1"), + pytest.param(10, id="limit_10"), + ], +) +@mock_aws +def test_download_and_stage_limit_forwarded(tmp_path: Path, limit: int) -> None: + """The limit parameter is forwarded verbatim to download_batch.""" + reset_s3_client() + s3 = _make_moto_s3() + + manifest_local = tmp_path / "manifest.txt" + manifest_local.write_text(_MANIFEST_CONTENT) + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", return_value=_MOCK_REPORT) as mock_batch, + ): + download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + 
manifest_local_path=manifest_local, + limit=limit, + dry_run=True, + ) + + assert mock_batch.call_args.kwargs["limit"] == limit + + reset_s3_client() + + +# ── download_and_stage — report shape ─────────────────────────────────── + + +@mock_aws +def test_download_and_stage_report_shape(tmp_path: Path) -> None: + """Return value includes all download_batch keys plus staged_objects, staging_key_prefix, dry_run.""" + reset_s3_client() + s3 = _make_moto_s3() + + manifest_local = tmp_path / "manifest.txt" + manifest_local.write_text(_MANIFEST_CONTENT) + + import cdm_data_loaders.utils.s3 as s3_mod + + with ( + patch.object(s3_mod, "get_s3_client", return_value=s3), + patch.object(s3_mod, "_s3_client", s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", return_value=_MOCK_REPORT), + ): + report = download_and_stage( + bucket=_TEST_BUCKET, + staging_key_prefix=_STAGING_PREFIX, + manifest_local_path=manifest_local, + dry_run=True, + ) + + assert report == { + **_MOCK_REPORT, + "staged_objects": 0, + "staging_key_prefix": _STAGING_PREFIX, + "dry_run": True, + } + + reset_s3_client() From 395265571e53c87341efa3755d80430b15f545ff Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 28 Apr 2026 12:25:19 -0700 Subject: [PATCH 52/76] formatting --- src/cdm_data_loaders/pipelines/ncbi_ftp_download.py | 3 --- tests/pipelines/test_ncbi_ftp_download.py | 6 +----- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 85581a57..233c82b1 100644 --- a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -34,9 +34,6 @@ DEFAULT_STAGING_KEY_PREFIX = "staging/" - - - class DownloadSettings(BaseSettings): """Configuration for the NCBI FTP assembly download pipeline.""" diff --git 
a/tests/pipelines/test_ncbi_ftp_download.py b/tests/pipelines/test_ncbi_ftp_download.py index 86d6f8e5..f5046724 100644 --- a/tests/pipelines/test_ncbi_ftp_download.py +++ b/tests/pipelines/test_ncbi_ftp_download.py @@ -419,11 +419,7 @@ def _fake_download_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 ) paginator = s3.get_paginator("list_objects_v2") - uploaded_keys = { - obj["Key"] - for page in paginator.paginate(Bucket=_TEST_BUCKET) - for obj in page.get("Contents", []) - } + uploaded_keys = {obj["Key"] for page in paginator.paginate(Bucket=_TEST_BUCKET) for obj in page.get("Contents", [])} expected_keys = { f"{_STAGING_PREFIX}{assembly_rel}/genomic.fna.gz", From 991b40e50fdb0ac262f419384220a9bbb60126ee Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 28 Apr 2026 12:28:00 -0700 Subject: [PATCH 53/76] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/pipelines/test_ncbi_ftp_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/test_ncbi_ftp_download.py b/tests/pipelines/test_ncbi_ftp_download.py index f5046724..e56430c6 100644 --- a/tests/pipelines/test_ncbi_ftp_download.py +++ b/tests/pipelines/test_ncbi_ftp_download.py @@ -2,7 +2,7 @@ import json from pathlib import Path -from unittest.mock import MagicMock, call, patch +from unittest.mock import MagicMock, patch import boto3 import pytest From 359901f5d7058c3e7e3786c8d0d24c6d422a47b6 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Tue, 28 Apr 2026 12:28:18 -0700 Subject: [PATCH 54/76] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/pipelines/test_ncbi_ftp_download.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/pipelines/test_ncbi_ftp_download.py b/tests/pipelines/test_ncbi_ftp_download.py index 
e56430c6..cadfe559 100644 --- a/tests/pipelines/test_ncbi_ftp_download.py +++ b/tests/pipelines/test_ncbi_ftp_download.py @@ -12,7 +12,6 @@ from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST from cdm_data_loaders.pipelines.cts_defaults import INPUT_MOUNT, OUTPUT_MOUNT from cdm_data_loaders.pipelines.ncbi_ftp_download import ( - DEFAULT_STAGING_KEY_PREFIX, DownloadSettings, download_and_stage, download_batch, From 51a3680a98cb73b29ec1ef855ee26707973d8417 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 29 Apr 2026 09:51:29 -0700 Subject: [PATCH 55/76] update notebooks --- notebooks/ncbi_ftp_download.ipynb | 41 ++++++++++++++++-- notebooks/ncbi_ftp_manifest.ipynb | 69 ++++++++++--------------------- notebooks/ncbi_ftp_promote.ipynb | 20 +++++++++ 3 files changed, 78 insertions(+), 52 deletions(-) diff --git a/notebooks/ncbi_ftp_download.ipynb b/notebooks/ncbi_ftp_download.ipynb index 1a2af714..f2d25938 100644 --- a/notebooks/ncbi_ftp_download.ipynb +++ b/notebooks/ncbi_ftp_download.ipynb @@ -55,8 +55,7 @@ "from cdm_data_loaders.pipelines.ncbi_ftp_download import (\n", " DEFAULT_STAGING_KEY_PREFIX,\n", " download_and_stage,\n", - ")\n", - "from cdm_data_loaders.utils.s3 import get_s3_client" + ")" ] }, { @@ -101,7 +100,7 @@ "LIMIT: int | None = None\n", "\n", "# Dry-run mode — download locally but skip S3 uploads\n", - "DRY_RUN = True\n", + "DRY_RUN = False\n", "\n", "print(f\"Bucket: {STORE_BUCKET}\")\n", "print(f\"Manifest S3 key: {MANIFEST_S3_KEY}\")\n", @@ -112,6 +111,26 @@ "print(f\"Dry-run: {DRY_RUN}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "51a857f9", + "metadata": {}, + "outputs": [], + "source": [ + "from cdm_data_loaders.utils.s3 import get_s3_client, reset_s3_client\n", + "\n", + "# Provide S3 credentials (use for local testing against MinIO test container)\n", + "PROVIDE_CREDENTIALS = False # Set to False to rely on environment credentials (e.g. 
IAM role)\n", + "if PROVIDE_CREDENTIALS:\n", + " reset_s3_client() # Clear any existing client to ensure new credentials are used\n", + " get_s3_client({\n", + " \"endpoint_url\": \"http://localhost:9000\",\n", + " \"aws_access_key_id\": \"minioadmin\",\n", + " \"aws_secret_access_key\": \"minioadmin\",\n", + " })" + ] + }, { "cell_type": "code", "execution_count": null, @@ -190,8 +209,22 @@ } ], "metadata": { + "kernelspec": { + "display_name": "cdm-data-loaders (3.13.11)", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" } }, "nbformat": 4, diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index b4e9af5e..8083c6cb 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -66,6 +66,26 @@ "from cdm_data_loaders.utils.s3 import get_s3_client, split_s3_path" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "b196d5a3", + "metadata": {}, + "outputs": [], + "source": [ + "from cdm_data_loaders.utils.s3 import get_s3_client, reset_s3_client\n", + "\n", + "# Provide S3 credentials (use for local testing against MinIO test container)\n", + "PROVIDE_CREDENTIALS = False # Set to False to rely on environment credentials (e.g. 
IAM role)\n", + "if PROVIDE_CREDENTIALS:\n", + " reset_s3_client() # Clear any existing client to ensure new credentials are used\n", + " get_s3_client({\n", + " \"endpoint_url\": \"http://localhost:9000\",\n", + " \"aws_access_key_id\": \"minioadmin\",\n", + " \"aws_secret_access_key\": \"minioadmin\",\n", + " })" + ] + }, { "cell_type": "code", "execution_count": null, @@ -91,7 +111,7 @@ "\n", "# S3 location where the new snapshot will be uploaded after diffing\n", "# format: s3:// URI\n", - "SNAPSHOT_UPLOAD_URI: str | None = None\n", + "SNAPSHOT_UPLOAD_URI: str | None = \"s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/assembly_summary_refseq.txt\"\n", "\n", "# Verify candidates against the S3 Lakehouse — prune assemblies already present.\n", "# Set STORE_BUCKET to your bucket name to enable, or None to skip.\n", @@ -113,53 +133,6 @@ "print(f\"Output dir: {OUTPUT_DIR}\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "be1fcf1c", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"Validate S3 connectivity and bucket/prefix configuration.\"\"\"\n", - "\n", - "import boto3\n", - "from botocore.exceptions import ClientError, NoCredentialsError\n", - "\n", - "s3 = boto3.client(\"s3\")\n", - "\n", - "# Check credentials are present\n", - "try:\n", - " sts = boto3.client(\"sts\")\n", - " identity = sts.get_caller_identity()\n", - " print(f\"✓ Credentials valid — account: {identity['Account']}, arn: {identity['Arn']}\")\n", - "except NoCredentialsError:\n", - " print(\"✗ No AWS credentials found\")\n", - " raise\n", - "except ClientError as e:\n", - " if e.response[\"Error\"][\"Code\"] == \"InvalidParameterValue\":\n", - " print(\"✓ Credentials present (STS GetCallerIdentity not supported on this endpoint — skipping identity check)\")\n", - " else:\n", - " print(f\"✗ Credential check failed: {e}\")\n", - " raise\n", - "\n", - "# Check bucket access and prefix in one step — list_objects_v2 requires only\n", - "# s3:ListBucket on the 
prefix, which is less restrictive than HeadBucket.\n", - "_check_bucket = STORE_BUCKET if \"STORE_BUCKET\" in dir() and STORE_BUCKET else \"cdm-lake\"\n", - "_check_prefix = STORE_KEY_PREFIX if \"STORE_KEY_PREFIX\" in dir() else \"tenant-general-warehouse/kbase/datasets/ncbi/\"\n", - "try:\n", - " resp = s3.list_objects_v2(Bucket=_check_bucket, Prefix=_check_prefix, MaxKeys=1)\n", - " if resp.get(\"KeyCount\", 0) > 0:\n", - " print(f\"✓ Bucket accessible and prefix has objects: s3://{_check_bucket}/{_check_prefix}\")\n", - " else:\n", - " print(\n", - " f\"✓ Bucket accessible but no objects found under s3://{_check_bucket}/{_check_prefix} — check STORE_KEY_PREFIX\"\n", - " )\n", - "except ClientError as e:\n", - " code = e.response[\"Error\"][\"Code\"]\n", - " print(f\"✗ S3 access check failed (HTTP {code}): {e}\")\n", - " raise" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb index a1aebcdc..a23fc149 100644 --- a/notebooks/ncbi_ftp_promote.ipynb +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -110,6 +110,26 @@ "print(f\"Dry-run: {DRY_RUN}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccfd88d9", + "metadata": {}, + "outputs": [], + "source": [ + "from cdm_data_loaders.utils.s3 import get_s3_client, reset_s3_client\n", + "\n", + "# Provide S3 credentials (use for local testing against MinIO test container)\n", + "PROVIDE_CREDENTIALS = False # Set to False to rely on environment credentials (e.g. 
IAM role)\n", + "if PROVIDE_CREDENTIALS:\n", + " reset_s3_client() # Clear any existing client to ensure new credentials are used\n", + " get_s3_client({\n", + " \"endpoint_url\": \"http://localhost:9000\",\n", + " \"aws_access_key_id\": \"minioadmin\",\n", + " \"aws_secret_access_key\": \"minioadmin\",\n", + " })" + ] + }, { "cell_type": "code", "execution_count": null, From 8a411641c091e1c1162c8b4098163107190ab258 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 29 Apr 2026 11:19:36 -0700 Subject: [PATCH 56/76] allow separate staging and destination buckets --- docs/ncbi_ftp_e2e_walkthrough.md | 30 +++--- notebooks/ncbi_ftp_download.ipynb | 22 ++--- notebooks/ncbi_ftp_manifest.ipynb | 34 +++---- notebooks/ncbi_ftp_promote.ipynb | 46 +++++----- src/cdm_data_loaders/ncbi_ftp/promote.py | 52 ++++++----- tests/integration/conftest.py | 30 ++++++ tests/integration/test_full_pipeline.py | 27 +++--- tests/integration/test_promote_e2e.py | 111 +++++++++++++---------- tests/ncbi_ftp/test_promote.py | 18 ++-- 9 files changed, 213 insertions(+), 157 deletions(-) diff --git a/docs/ncbi_ftp_e2e_walkthrough.md b/docs/ncbi_ftp_e2e_walkthrough.md index 4a710046..83eb3e1e 100644 --- a/docs/ncbi_ftp_e2e_walkthrough.md +++ b/docs/ncbi_ftp_e2e_walkthrough.md @@ -49,8 +49,8 @@ Understanding this decomposition is the key to configuring the notebooks. 
### Lakehouse object (final location) ``` -s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{filename} - └── bucket ──┘ └── key prefix ──────┘└── build_accession_path() ────────────────────────┘ +s3://{LAKEHOUSE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{filename} + └── bucket ─────┘ └── key prefix ──────┘└── build_accession_path() ────────────────────────┘ ``` Example: @@ -61,8 +61,8 @@ s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/raw_data/GCF/900/000/ ### Staging object (Phase 2 output) ``` -s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{filename} - └── bucket ──┘ └── key prefix ────┘└── build_accession_path() ────────────────────────┘ +s3://{STAGING_BUCKET}/{STAGING_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{filename} + └── bucket ─────┘ └── key prefix ────┘└── build_accession_path() ────────────────────────┘ ``` ### Local output (Phase 1) @@ -102,6 +102,7 @@ included `scripts/s3_local.py` helper (requires no extra installs — only ```sh uv run python scripts/s3_local.py mb s3://cdm-lake +uv run python scripts/s3_local.py mb s3://cts ``` ### Lakehouse @@ -161,13 +162,13 @@ Open `notebooks/ncbi_ftp_manifest.ipynb` in JupyterLab or VS Code. 
| `LIMIT` | `10` | int | cap to 10 assemblies | | `PREVIOUS_SUMMARY_URI` | `None` | s3:// URI | first run — everything is "new" | | `SNAPSHOT_UPLOAD_URI` | `None` | s3:// URI | skip S3 upload for local testing | -| `STORE_BUCKET` | `"cdm-lake"` (or `None`) | bucket name | set to prune assemblies already in the Lakehouse | +| `LAKEHOUSE_BUCKET` | `"cdm-lake"` (or `None`) | bucket name | set to prune assemblies already in the Lakehouse | | `STORE_KEY_PREFIX` | `"tenant-general-warehouse/kbase/datasets/ncbi/"` | S3 key prefix | default Lakehouse path prefix | | `OUTPUT_DIR` | `Path("output")` | local path | keep as-is (local directory) | ### Initialise the S3 client for MinIO -If you set `PREVIOUS_SUMMARY_URI`, `SNAPSHOT_UPLOAD_URI`, `STORE_BUCKET`, +If you set `PREVIOUS_SUMMARY_URI`, `SNAPSHOT_UPLOAD_URI`, `LAKEHOUSE_BUCKET`, or `STAGING_URI` to point at your local MinIO, you must initialise the S3 client **before** running the cells that use them. Insert a new cell after Cell 1 (Imports) with: @@ -184,7 +185,7 @@ get_s3_client({ ``` If all three S3 variables are `None` (purely local testing), this cell can -be skipped — though on repeat runs you should set `STORE_BUCKET` so +be skipped — though on repeat runs you should set `LAKEHOUSE_BUCKET` so assemblies already promoted to the Lakehouse are pruned from the transfer manifest. @@ -202,7 +203,7 @@ checksums would take days. **How it works:** 1. Set `SCAN_STORE = True` in Cell 5 -2. The notebook scans all objects under `s3://{STORE_BUCKET}/{STORE_KEY_PREFIX}` +2. The notebook scans all objects under `s3://{LAKEHOUSE_BUCKET}/{STORE_KEY_PREFIX}` 3. For each unique assembly found, it extracts the accession and uses the earliest object `LastModified` as a conservative `seq_rel_date` 4. It saves the synthetic summary to `LOCAL_SYNTHETIC_SUMMARY` (default: @@ -213,7 +214,7 @@ checksums would take days. 
**Example (for a 500K-assembly store):** ```python SCAN_STORE = True -STORE_BUCKET = "cdm-lake" +LAKEHOUSE_BUCKET = "cdm-lake" STORE_KEY_PREFIX = "tenant-general-warehouse/kbase/datasets/ncbi/" LOCAL_SYNTHETIC_SUMMARY = Path("output/synthetic_summary_from_store.txt") @@ -350,13 +351,13 @@ The download step writes to the local filesystem. To feed Phase 3 we need to upload the staged files into MinIO under a staging prefix: ```sh -uv run python scripts/s3_local.py cp notebooks/staging/raw_data/ s3://cdm-lake/staging/run1/raw_data/ +uv run python scripts/s3_local.py cp notebooks/staging/raw_data/ s3://cts/staging/run1/raw_data/ ``` Verify the upload: ```sh -uv run python scripts/s3_local.py ls s3://cdm-lake/staging/run1/ +uv run python scripts/s3_local.py ls s3://cts/staging/run1/ ``` --- @@ -369,7 +370,8 @@ Open `notebooks/ncbi_ftp_promote.ipynb`. | Constant | Walkthrough value | Format | Why | |-------------------------|------------------------------------------------------|--------|---------------------------------------------| -| `STORE_BUCKET` | `"cdm-lake"` | bucket name | matches the bucket created in Step 1 | +| `STAGING_BUCKET` | `"cts"` | bucket name | CTS staging bucket (Phase 2 writes here) | +| `LAKEHOUSE_BUCKET` | `"cdm-lake"` | bucket name | final Lakehouse destination | | `STAGING_KEY_PREFIX` | `"staging/run1/"` | S3 key prefix | matches the upload prefix from Step 3d | | `REMOVED_MANIFEST_PATH` | `None` | local path | nothing to remove on first run | | `UPDATED_MANIFEST_PATH` | `None` | local path | nothing to archive on first run | @@ -436,7 +438,7 @@ uv run python scripts/s3_local.py head \ Each promoted assembly gets a [frictionless](https://framework.frictionlessdata.io/) data package descriptor stored at: ``` -s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}metadata/{assembly_dir}_datapackage.json +s3://{LAKEHOUSE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}metadata/{assembly_dir}_datapackage.json ``` For example: @@ -458,7 +460,7 @@ When an assembly is archived 
(updated or removed), its live descriptor is copied to: ``` -s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}archive/{release_tag}/metadata/{assembly_dir}_datapackage.json +s3://{LAKEHOUSE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}archive/{release_tag}/metadata/{assembly_dir}_datapackage.json ``` Use the last cell of `notebooks/ncbi_ftp_promote.ipynb` to list and preview diff --git a/notebooks/ncbi_ftp_download.ipynb b/notebooks/ncbi_ftp_download.ipynb index f2d25938..84ac4fbf 100644 --- a/notebooks/ncbi_ftp_download.ipynb +++ b/notebooks/ncbi_ftp_download.ipynb @@ -32,13 +32,13 @@ "\n", "| Suffix in variable name | Format | Example |\n", "|-------------------------|--------|---------|\n", - "| `_BUCKET` | bucket name only | `cdm-lake` |\n", + "| `_BUCKET` | bucket name only | `cts` |\n", "| `_KEY_PREFIX` | S3 key prefix (no scheme/bucket) | `staging/run1/` |\n", "| `_S3_KEY` | S3 object key (no scheme/bucket) | `staging/run1/input/transfer_manifest.txt` |\n", "| `_PATH` | local filesystem path | `output/transfer_manifest.txt` |\n", "\n", - "Staging object: `s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/…/{filename}`\n", - "Report: `s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}download_report.json`" + "Staging object: `s3://{STAGING_BUCKET}/{STAGING_KEY_PREFIX}raw_data/…/{filename}`\n", + "Report: `s3://{STAGING_BUCKET}/{STAGING_KEY_PREFIX}download_report.json`" ] }, { @@ -77,12 +77,12 @@ "\n", "# S3 bucket where the manifest lives and where staged files will be written\n", "# format: bucket name (no s3:// scheme)\n", - "STORE_BUCKET = \"cdm-lake\"\n", + "STAGING_BUCKET = \"cts\"\n", "\n", "# S3 object key of the transfer manifest written by Phase 1\n", - "# format: S3 object key within STORE_BUCKET (no scheme, no bucket)\n", + "# format: S3 object key within STAGING_BUCKET (no scheme, no bucket)\n", "# Set to None to use MANIFEST_LOCAL_PATH instead\n", - "MANIFEST_S3_KEY: str | None = \"staging/run1/input/transfer_manifest.txt\"\n", + "MANIFEST_S3_KEY: str | None = 
\"io/matt-cohere/staging/run1/input/transfer_manifest.txt\"\n", "\n", "# Local path to the transfer manifest (alternative to MANIFEST_S3_KEY)\n", "# format: local filesystem path\n", @@ -90,8 +90,8 @@ "MANIFEST_LOCAL_PATH: str | None = None\n", "\n", "# S3 key prefix for staged output files (must match what Phase 3 expects)\n", - "# format: S3 key prefix within STORE_BUCKET (no scheme, no bucket)\n", - "STAGING_KEY_PREFIX = DEFAULT_STAGING_KEY_PREFIX + \"run1/\"\n", + "# format: S3 key prefix within STAGING_BUCKET (no scheme, no bucket)\n", + "STAGING_KEY_PREFIX = \"io/matt-cohere/staging/run1/output/\"\n", "\n", "# Number of parallel download and upload threads\n", "THREADS = 4\n", @@ -102,7 +102,7 @@ "# Dry-run mode — download locally but skip S3 uploads\n", "DRY_RUN = False\n", "\n", - "print(f\"Bucket: {STORE_BUCKET}\")\n", + "print(f\"Bucket: {STAGING_BUCKET}\")\n", "print(f\"Manifest S3 key: {MANIFEST_S3_KEY}\")\n", "print(f\"Manifest local: {MANIFEST_LOCAL_PATH}\")\n", "print(f\"Staging prefix: {STAGING_KEY_PREFIX}\")\n", @@ -142,7 +142,7 @@ "\n", "if MANIFEST_S3_KEY is not None:\n", " s3 = get_s3_client()\n", - " response = s3.get_object(Bucket=STORE_BUCKET, Key=MANIFEST_S3_KEY)\n", + " response = s3.get_object(Bucket=STAGING_BUCKET, Key=MANIFEST_S3_KEY)\n", " manifest_lines = response[\"Body\"].read().decode().splitlines()\n", "else:\n", " with open(MANIFEST_LOCAL_PATH) as f:\n", @@ -168,7 +168,7 @@ "\"\"\"Download assemblies from NCBI FTP and upload to S3 staging.\"\"\"\n", "\n", "report = download_and_stage(\n", - " bucket=STORE_BUCKET,\n", + " bucket=STAGING_BUCKET,\n", " staging_key_prefix=STAGING_KEY_PREFIX,\n", " manifest_s3_key=MANIFEST_S3_KEY,\n", " manifest_local_path=MANIFEST_LOCAL_PATH,\n", diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 8083c6cb..0ade0917 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -17,7 +17,7 @@ "All filtering (prefix range, limit) is 
applied here so downstream phases\n", "receive a final, pre-filtered manifest.\n", "\n", - "Optionally verifies candidates against the S3 Lakehouse (`STORE_BUCKET`) so\n", + "Optionally verifies candidates against the S3 Lakehouse (`LAKEHOUSE_BUCKET`) so\n", "assemblies that were already downloaded and promoted are pruned from the\n", "transfer manifest." ] @@ -36,7 +36,7 @@ "| `_KEY_PREFIX` | S3 key prefix (no scheme/bucket) | `tenant-general-warehouse/kbase/datasets/ncbi/` |\n", "| `_DIR` / `_PATH` | local filesystem path | `output/removed_manifest.txt` |\n", "\n", - "Lakehouse object: `s3://{STORE_BUCKET}/{STORE_KEY_PREFIX}raw_data/…/{filename}`" + "Lakehouse object: `s3://{LAKEHOUSE_BUCKET}/{STORE_KEY_PREFIX}raw_data/…/{filename}`" ] }, { @@ -114,11 +114,11 @@ "SNAPSHOT_UPLOAD_URI: str | None = \"s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/assembly_summary_refseq.txt\"\n", "\n", "# Verify candidates against the S3 Lakehouse — prune assemblies already present.\n", - "# Set STORE_BUCKET to your bucket name to enable, or None to skip.\n", + "# Set LAKEHOUSE_BUCKET to your bucket name to enable, or None to skip.\n", "# STORE_KEY_PREFIX should point to the directory containing `raw_data/`.\n", "# format: bucket name (no s3:// scheme)\n", - "STORE_BUCKET: str | None = \"cdm-lake\"\n", - "# format: S3 key prefix within STORE_BUCKET (no scheme, no bucket)\n", + "LAKEHOUSE_BUCKET: str | None = \"cdm-lake\"\n", + "# format: S3 key prefix within LAKEHOUSE_BUCKET (no scheme, no bucket)\n", "STORE_KEY_PREFIX = \"tenant-general-warehouse/kbase/datasets/ncbi/\"\n", "\n", "# Local output directory for manifest files\n", @@ -129,7 +129,7 @@ "print(f\"Database: {DATABASE}\")\n", "print(f\"Prefix range: {PREFIX_FROM} -> {PREFIX_TO}\")\n", "print(f\"Limit: {LIMIT}\")\n", - "print(f\"Verify against S3: {STORE_BUCKET or 'disabled'}\")\n", + "print(f\"Verify against S3: {LAKEHOUSE_BUCKET or 'disabled'}\")\n", "print(f\"Output dir: {OUTPUT_DIR}\")" ] }, @@ -161,7 +161,7 
@@ "for the diff.\n", "\n", "Set SCAN_STORE=True below to enable. The scan will:\n", - " 1. List all objects under STORE_BUCKET/STORE_KEY_PREFIX\n", + " 1. List all objects under LAKEHOUSE_BUCKET/STORE_KEY_PREFIX\n", " 2. Extract accessions matching the DATABASE type (GCF_ for refseq, GCA_ for genbank)\n", " 3. Apply user-provided SYNTHETIC_RELEASE_DATE to all records\n", " 4. Build AssemblyRecord for each assembly found\n", @@ -177,7 +177,7 @@ "SYNTHETIC_RELEASE_DATE = \"2025/10/31\" # YYYY/MM/DD applied to all synthetic records\n", "LOCAL_SYNTHETIC_SUMMARY = Path(\"output/synthetic_summary_from_store.txt\")\n", "\n", - "if SCAN_STORE and STORE_BUCKET:\n", + "if SCAN_STORE and LAKEHOUSE_BUCKET:\n", " if LOCAL_SYNTHETIC_SUMMARY.exists() and not FORCE_RESCAN:\n", " print(f\"Loading existing synthetic summary from {LOCAL_SYNTHETIC_SUMMARY} (set FORCE_RESCAN=True to rescan)\")\n", " previous = parse_assembly_summary(LOCAL_SYNTHETIC_SUMMARY)\n", @@ -187,7 +187,7 @@ " from cdm_data_loaders.ncbi_ftp.manifest import scan_store_to_synthetic_summary\n", " from tqdm.notebook import tqdm\n", "\n", - " print(f\"Scanning s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} for existing {DATABASE} assemblies ...\")\n", + " print(f\"Scanning s3://{LAKEHOUSE_BUCKET}/{STORE_KEY_PREFIX} for existing {DATABASE} assemblies ...\")\n", " print(\"Note: large stores (500K+ assemblies) may take 15-30+ minutes.\")\n", " progress = tqdm(unit=\"assembly\", desc=\"Scanning store\", leave=True, mininterval=2.0)\n", "\n", @@ -203,7 +203,7 @@ " _last_refresh = now\n", "\n", " synthetic = scan_store_to_synthetic_summary(\n", - " STORE_BUCKET,\n", + " LAKEHOUSE_BUCKET,\n", " STORE_KEY_PREFIX,\n", " SYNTHETIC_RELEASE_DATE,\n", " database=DATABASE,\n", @@ -228,7 +228,7 @@ " previous = synthetic\n", "else:\n", " if SCAN_STORE:\n", - " print(\"SCAN_STORE=True but STORE_BUCKET not set. Skipping.\")\n", + " print(\"SCAN_STORE=True but LAKEHOUSE_BUCKET not set. 
Skipping.\")\n", " print(\"Skipping store scan (SCAN_STORE=False). Will load/use PREVIOUS_SUMMARY_URI instead.\")" ] }, @@ -304,13 +304,13 @@ "\"\"\"\n", "\n", "# -- Verify against Lakehouse --\n", - "if STORE_BUCKET:\n", + "if LAKEHOUSE_BUCKET:\n", " from cdm_data_loaders.ncbi_ftp.manifest import verify_transfer_candidates\n", " from tqdm.notebook import tqdm\n", "\n", " candidates = diff.new + diff.updated\n", " total = len(candidates)\n", - " print(f\"Verifying {total} candidates against s3://{STORE_BUCKET}/{STORE_KEY_PREFIX} ...\")\n", + " print(f\"Verifying {total} candidates against s3://{LAKEHOUSE_BUCKET}/{STORE_KEY_PREFIX} ...\")\n", "\n", " progress = tqdm(total=total, unit=\"assembly\", desc=\"Verifying checksums\", leave=True)\n", "\n", @@ -327,7 +327,7 @@ " verify_transfer_candidates(\n", " candidates,\n", " filtered,\n", - " STORE_BUCKET,\n", + " LAKEHOUSE_BUCKET,\n", " STORE_KEY_PREFIX,\n", " ftp_host=FTP_HOST,\n", " progress_callback=_update_progress,\n", @@ -342,7 +342,7 @@ " after = len(diff.new) + len(diff.updated)\n", " print(f\"Verified: {after} need downloading, {before - after} pruned (already in store)\")\n", "else:\n", - " print(\"Skipping S3 verification (STORE_BUCKET not set)\")\n", + " print(\"Skipping S3 verification (LAKEHOUSE_BUCKET not set)\")\n", "\n", "# -- Apply LIMIT --\n", "if LIMIT is not None:\n", @@ -407,7 +407,7 @@ "\"\"\"Upload manifests to S3 for CTS input staging (optional).\n", "\n", "Note: STAGING_URI is a full s3:// URI. The promote notebook splits this into\n", - "STORE_BUCKET + STAGING_KEY_PREFIX (separate bucket and key prefix parameters).\n", + "STAGING_BUCKET + STAGING_KEY_PREFIX (separate bucket and key prefix parameters).\n", "\n", "This is for local testing. The CTS will stage the container's input folder in production.\n", "\"\"\"\n", @@ -415,7 +415,7 @@ "# S3 location where CTS will read input files from.\n", "# Set to None to skip upload (local-only testing).\n", "# format: s3:// URI (e.g. 
\"s3://cdm-lake/staging/run1/\")\n", - "STAGING_URI: str | None = \"s3://cdm-lake/staging/run1/input/\"\n", + "STAGING_URI: str | None = \"s3://cts/io/matt-cohere/staging/run1/input/\"\n", "\n", "if STAGING_URI:\n", " s3 = get_s3_client()\n", diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb index a23fc149..cbf43a34 100644 --- a/notebooks/ncbi_ftp_promote.ipynb +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -34,8 +34,8 @@ "| `_S3_KEY` | S3 object key (no scheme/bucket) | `staging/transfer_manifest.txt` |\n", "| `_PATH` | local filesystem path | `output/removed_manifest.txt` |\n", "\n", - "Lakehouse object: `s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/…/{filename}`\n", - "Staging object: `s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/…/{filename}`" + "Lakehouse object: `s3://{LAKEHOUSE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/…/{filename}`\n", + "Staging object: `s3://{STAGING_BUCKET}/{STAGING_KEY_PREFIX}raw_data/…/{filename}`" ] }, { @@ -66,17 +66,21 @@ "\"\"\"Configure parameters.\n", "\n", "Path layout (how variables compose into a full S3 object path):\n", - " s3://{STORE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{file}\n", - " s3://{STORE_BUCKET}/{STAGING_KEY_PREFIX}raw_data/{assembly_dir}/{file}\n", + " s3://{LAKEHOUSE_BUCKET}/{LAKEHOUSE_KEY_PREFIX}raw_data/{GCF|GCA}/{nnn}/{nnn}/{nnn}/{assembly_dir}/{file}\n", + " s3://{STAGING_BUCKET}/{STAGING_KEY_PREFIX}raw_data/{assembly_dir}/{file}\n", "\"\"\"\n", "\n", - "# S3 bucket where staged files and final Lakehouse data live\n", + "# S3 bucket where CTS Phase 2 writes staged files\n", "# format: bucket name (no s3:// scheme)\n", - "STORE_BUCKET = \"cdm-lake\"\n", + "STAGING_BUCKET = \"cts\"\n", + "\n", + "# S3 bucket for the final Lakehouse destination\n", + "# format: bucket name (no s3:// scheme)\n", + "LAKEHOUSE_BUCKET = \"cdm-lake\"\n", "\n", "# Staging prefix written by CTS Phase 2\n", - "# format: S3 key prefix within STORE_BUCKET (no 
scheme, no bucket)\n", - "STAGING_KEY_PREFIX = \"staging/run1/\"\n", + "# format: S3 key prefix within STAGING_BUCKET (no scheme, no bucket)\n", + "STAGING_KEY_PREFIX = \"io/matt-cohere/staging/run1/output/\"\n", "\n", "# Local path to removed_manifest.txt from Phase 1 (or None to skip archiving)\n", "# format: local file path\n", @@ -91,17 +95,18 @@ "\n", "# S3 key of transfer_manifest.txt for trimming after promotion (or None to skip).\n", "# Only needed if the manifest was uploaded to S3 (e.g. via the staging cell in Phase 1).\n", - "# format: S3 object key within STORE_BUCKET (no scheme, no bucket)\n", - "MANIFEST_S3_KEY: str | None = \"staging/run1/input/transfer_manifest.txt\" # e.g. \"staging/transfer_manifest.txt\"\n", + "# format: S3 object key within STAGING_BUCKET (no scheme, no bucket)\n", + "MANIFEST_S3_KEY: str | None = \"io/matt-cohere/staging/run1/input/transfer_manifest.txt\" # e.g. \"staging/transfer_manifest.txt\"\n", "\n", "# Final Lakehouse path prefix\n", - "# format: S3 key prefix within STORE_BUCKET (no scheme, no bucket)\n", + "# format: S3 key prefix within LAKEHOUSE_BUCKET (no scheme, no bucket)\n", "LAKEHOUSE_KEY_PREFIX = DEFAULT_LAKEHOUSE_KEY_PREFIX\n", "\n", "# Dry-run mode — log actions without making changes\n", "DRY_RUN = False\n", "\n", - "print(f\"Bucket: {STORE_BUCKET}\")\n", + "print(f\"Staging bucket: {STAGING_BUCKET}\")\n", + "print(f\"Lakehouse bucket: {LAKEHOUSE_BUCKET}\")\n", "print(f\"Staging key prefix: {STAGING_KEY_PREFIX}\")\n", "print(f\"Updated manifest: {UPDATED_MANIFEST_PATH}\")\n", "print(f\"NCBI release: {NCBI_RELEASE}\")\n", @@ -143,7 +148,7 @@ "paginator = s3.get_paginator(\"list_objects_v2\")\n", "\n", "staged: list[str] = []\n", - "for page in paginator.paginate(Bucket=STORE_BUCKET, Prefix=STAGING_KEY_PREFIX):\n", + "for page in paginator.paginate(Bucket=STAGING_BUCKET, Prefix=STAGING_KEY_PREFIX):\n", " staged.extend(obj[\"Key\"] for obj in page.get(\"Contents\", []))\n", "\n", "sidecars = [k for k in staged 
if k.endswith((\".md5\", \".crc64nvme\"))]\n", @@ -172,7 +177,8 @@ "\n", "report = promote_from_s3(\n", " staging_key_prefix=STAGING_KEY_PREFIX,\n", - " bucket=STORE_BUCKET,\n", + " staging_bucket=STAGING_BUCKET,\n", + " lakehouse_bucket=LAKEHOUSE_BUCKET,\n", " removed_manifest_path=REMOVED_MANIFEST_PATH,\n", " updated_manifest_path=UPDATED_MANIFEST_PATH,\n", " ncbi_release=NCBI_RELEASE,\n", @@ -223,22 +229,16 @@ "paginator = s3.get_paginator(\"list_objects_v2\")\n", "\n", "descriptor_keys: list[str] = []\n", - "for page in paginator.paginate(Bucket=STORE_BUCKET, Prefix=LAKEHOUSE_KEY_PREFIX + \"metadata/\"):\n", + "for page in paginator.paginate(Bucket=LAKEHOUSE_BUCKET, Prefix=LAKEHOUSE_KEY_PREFIX + \"metadata/\"):\n", " descriptor_keys.extend(obj[\"Key\"] for obj in page.get(\"Contents\", []))\n", "\n", "print(f\"Found {len(descriptor_keys)} descriptor(s) in metadata/\")\n", "\n", "for key in descriptor_keys[:5]: # preview first 5\n", - " obj = s3.get_object(Bucket=STORE_BUCKET, Key=key)\n", + " obj = s3.get_object(Bucket=LAKEHOUSE_BUCKET, Key=key)\n", " descriptor = json.loads(obj[\"Body\"].read())\n", " print()\n", - " print(f\" Key: {key}\")\n", - " print(f\" Identifier: {descriptor.get('identifier')}\")\n", - " print(f\" Version: {descriptor.get('version')}\")\n", - " print(f\" Resources: {len(descriptor.get('resources', []))} file(s)\")\n", - "\n", - "if len(descriptor_keys) > 5:\n", - " print(f\" ... 
and {len(descriptor_keys) - 5} more\")" + " print(f\" Key: {key}\")" ] } ], diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index ab345ba3..2a127f19 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -39,7 +39,8 @@ def promote_from_s3( # noqa: PLR0913 staging_key_prefix: str, - bucket: str, + staging_bucket: str, + lakehouse_bucket: str, removed_manifest_path: str | Path | None = None, updated_manifest_path: str | Path | None = None, ncbi_release: str | None = None, @@ -54,7 +55,8 @@ def promote_from_s3( # noqa: PLR0913 with MD5 metadata from ``.md5`` sidecar files. :param staging_key_prefix: S3 key prefix where CTS output was written - :param bucket: S3 bucket name + :param staging_bucket: S3 bucket containing the staged files (e.g. ``"cts"``) + :param lakehouse_bucket: S3 bucket for the final Lakehouse destination (e.g. ``"cdm-lake"``) :param removed_manifest_path: local path to the removed_manifest file :param updated_manifest_path: local path to the updated_manifest file :param ncbi_release: NCBI release version tag for archiving @@ -69,7 +71,7 @@ def promote_from_s3( # noqa: PLR0913 # Collect all objects under the staging prefix staged_objects: list[str] = [] - for page in paginator.paginate(Bucket=bucket, Prefix=normalized_staging_key_prefix): + for page in paginator.paginate(Bucket=staging_bucket, Prefix=normalized_staging_key_prefix): staged_objects.extend(obj["Key"] for obj in page.get("Contents", [])) # Separate data files from sidecars @@ -87,7 +89,7 @@ def promote_from_s3( # noqa: PLR0913 if manifest_file and Path(str(manifest_file)).is_file(): archived += _archive_assemblies( str(manifest_file), - bucket=bucket, + lakehouse_bucket=lakehouse_bucket, ncbi_release=ncbi_release, lakehouse_key_prefix=lakehouse_key_prefix, archive_reason=reason, @@ -100,13 +102,14 @@ def promote_from_s3( # noqa: PLR0913 sidecars, normalized_staging_key_prefix, 
lakehouse_key_prefix, - bucket, + staging_bucket, + lakehouse_bucket, dry_run=dry_run, ) # Trim manifest for resumability if manifest_s3_key and promoted_accessions and not dry_run: - _trim_manifest(manifest_s3_key, bucket, promoted_accessions) + _trim_manifest(manifest_s3_key, staging_bucket, promoted_accessions) # Upload frictionless descriptors for each promoted assembly descriptors_written = 0 @@ -115,7 +118,7 @@ def promote_from_s3( # noqa: PLR0913 continue try: descriptor = create_descriptor(adir, acc, resources) - upload_descriptor(descriptor, adir, bucket, lakehouse_key_prefix, dry_run=dry_run) + upload_descriptor(descriptor, adir, lakehouse_bucket, lakehouse_key_prefix, dry_run=dry_run) descriptors_written += 1 except Exception: logger.exception("Failed to write descriptor for %s", adir) @@ -149,7 +152,8 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 sidecars: set[str], normalized_staging_prefix: str, lakehouse_key_prefix: str, - bucket: str, + staging_bucket: str, + lakehouse_bucket: str, *, dry_run: bool, ) -> tuple[int, int, set[str], defaultdict[tuple[str, str], list[DescriptorResource]]]: @@ -181,19 +185,19 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 with tempfile.NamedTemporaryFile(delete=False) as tmp: tmp_path = tmp.name try: - s3.download_file(Bucket=bucket, Key=staged_key, Filename=tmp_path) + s3.download_file(Bucket=staging_bucket, Key=staged_key, Filename=tmp_path) # Read MD5 from sidecar metadata: dict[str, str] = {} md5_key = staged_key + ".md5" if md5_key in sidecars: - md5_obj = s3.get_object(Bucket=bucket, Key=md5_key) + md5_obj = s3.get_object(Bucket=staging_bucket, Key=md5_key) metadata["md5"] = md5_obj["Body"].read().decode().strip() final_key_path = PurePosixPath(final_key) upload_succeeded = upload_file( tmp_path, - f"{bucket}/{final_key_path.parent}", + f"{lakehouse_bucket}/{final_key_path.parent}", metadata=metadata, object_name=final_key_path.name, ) @@ -240,7 +244,7 @@ def _promote_data_files( # noqa: PLR0913, 
PLR0915 def _archive_assemblies( # noqa: PLR0913 manifest_local_path: str, - bucket: str, + lakehouse_bucket: str, ncbi_release: str | None = None, lakehouse_key_prefix: str = DEFAULT_LAKEHOUSE_KEY_PREFIX, archive_reason: str = "unknown", @@ -256,7 +260,7 @@ def _archive_assemblies( # noqa: PLR0913 originals remain in place to be overwritten by the promote step. :param manifest_local_path: local path to a manifest file (one accession per line) - :param bucket: S3 bucket name + :param lakehouse_bucket: S3 bucket for the Lakehouse (source and archive destination) :param ncbi_release: release tag used in the archive path :param lakehouse_key_prefix: S3 key prefix for the Lakehouse dataset root :param archive_reason: metadata value describing why the object was archived @@ -284,7 +288,7 @@ def _archive_assemblies( # noqa: PLR0913 paginator = s3.get_paginator("list_objects_v2") matching_keys: list[str] = [] - for page in paginator.paginate(Bucket=bucket, Prefix=source_prefix): + for page in paginator.paginate(Bucket=lakehouse_bucket, Prefix=source_prefix): matching_keys.extend(obj["Key"] for obj in page.get("Contents", []) if accession in obj["Key"]) if not matching_keys: @@ -310,8 +314,8 @@ def _archive_assemblies( # noqa: PLR0913 try: copy_object( - f"{bucket}/{source_key}", - f"{bucket}/{archive_key}", + f"{lakehouse_bucket}/{source_key}", + f"{lakehouse_bucket}/{archive_key}", metadata={ "ncbi_last_release": release_tag, "archive_reason": archive_reason, @@ -319,7 +323,7 @@ def _archive_assemblies( # noqa: PLR0913 }, ) if delete_source: - delete_object(f"{bucket}/{source_key}") + delete_object(f"{lakehouse_bucket}/{source_key}") archived += 1 logger.debug(" Archived: %s -> %s", source_key, archive_key) except Exception: @@ -330,7 +334,7 @@ def _archive_assemblies( # noqa: PLR0913 try: archive_descriptor( assembly_dir, - bucket, + lakehouse_bucket, lakehouse_key_prefix, release_tag, archive_reason=archive_reason, @@ -346,11 +350,11 @@ def _archive_assemblies( # noqa: 
PLR0913 # ── Manifest trimming ─────────────────────────────────────────────────── -def _trim_manifest(manifest_s3_key: str, bucket: str, promoted_accessions: set[str]) -> None: +def _trim_manifest(manifest_s3_key: str, staging_bucket: str, promoted_accessions: set[str]) -> None: """Remove promoted accessions from the transfer manifest in S3. :param manifest_s3_key: S3 object key of the transfer_manifest.txt - :param bucket: S3 bucket name + :param staging_bucket: S3 bucket containing the transfer manifest :param promoted_accessions: set of accessions that were successfully promoted """ s3 = get_s3_client() @@ -360,13 +364,13 @@ def _trim_manifest(manifest_s3_key: str, bucket: str, promoted_accessions: set[s try: try: - s3.download_file(Bucket=bucket, Key=manifest_s3_key, Filename=tmp_path) + s3.download_file(Bucket=staging_bucket, Key=manifest_s3_key, Filename=tmp_path) except s3.exceptions.NoSuchKey: - logger.warning("Manifest not found in S3 (s3://%s/%s) — skipping trim", bucket, manifest_s3_key) + logger.warning("Manifest not found in S3 (s3://%s/%s) — skipping trim", staging_bucket, manifest_s3_key) return except botocore.exceptions.ClientError as e: if e.response["Error"]["Code"] == "404": - logger.warning("Manifest not found in S3 (s3://%s/%s) — skipping trim", bucket, manifest_s3_key) + logger.warning("Manifest not found in S3 (s3://%s/%s) — skipping trim", staging_bucket, manifest_s3_key) return raise @@ -378,7 +382,7 @@ def _trim_manifest(manifest_s3_key: str, bucket: str, promoted_accessions: set[s with Path(tmp_path).open("w") as f: f.writelines(remaining) - s3.upload_file(Filename=tmp_path, Bucket=bucket, Key=manifest_s3_key) + s3.upload_file(Filename=tmp_path, Bucket=staging_bucket, Key=manifest_s3_key) logger.info( "Trimmed manifest: %d -> %d entries (%d promoted)", len(lines), diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 0fe7384a..f27e4202 100644 --- a/tests/integration/conftest.py +++ 
b/tests/integration/conftest.py @@ -147,6 +147,36 @@ def test_bucket(minio_s3_client: botocore.client.BaseClient, request: pytest.Fix return bucket +@pytest.fixture +def staging_test_bucket(minio_s3_client: botocore.client.BaseClient, request: pytest.FixtureRequest) -> str: + """Create a per-test staging bucket in MinIO and return its name. + + Mirrors ``test_bucket`` but uses a ``staging-`` prefix so staging and + Lakehouse buckets are distinct within the same test. + """ + bucket = "staging-" + _bucket_name_from_node(request.node.nodeid) + if len(bucket) > _MAX_BUCKET_LEN: + suffix = hashlib.md5(bucket.encode()).hexdigest()[:6] # noqa: S324 + bucket = f"{bucket[: _MAX_BUCKET_LEN - 7]}-{suffix}" + s3 = minio_s3_client + + try: + s3.head_bucket(Bucket=bucket) + paginator = s3.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=bucket): + for obj in page.get("Contents", []): + s3.delete_object(Bucket=bucket, Key=obj["Key"]) + except s3.exceptions.NoSuchBucket: + s3.create_bucket(Bucket=bucket) + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("404", "NoSuchBucket"): + s3.create_bucket(Bucket=bucket) + else: + raise + + return bucket + + # ── Helpers ───────────────────────────────────────────────────────────── diff --git a/tests/integration/test_full_pipeline.py b/tests/integration/test_full_pipeline.py index 2654215b..e8f6e4a6 100644 --- a/tests/integration/test_full_pipeline.py +++ b/tests/integration/test_full_pipeline.py @@ -29,7 +29,7 @@ from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 from cdm_data_loaders.pipelines.ncbi_ftp_download import download_batch -from .conftest import get_object_metadata, list_all_keys, stage_files_to_minio +from .conftest import get_object_metadata, list_all_keys, stage_files_to_minio, staging_test_bucket # noqa: F401 STABLE_PREFIX = "900" STAGING_PREFIX = "staging/run1/" @@ -46,6 +46,7 @@ def test_full_pipeline_small_batch( self, 
minio_s3_client: object, test_bucket: str, + staging_test_bucket: str, tmp_path: Path, ) -> None: """Single assembly flows through all three phases into MinIO.""" @@ -76,13 +77,14 @@ def test_full_pipeline_small_batch( assert report["failed"] == 0 # ── Upload local output to MinIO staging ──────────────────────── - keys = stage_files_to_minio(s3, test_bucket, output_dir, STAGING_PREFIX) + keys = stage_files_to_minio(s3, staging_test_bucket, output_dir, STAGING_PREFIX) assert len(keys) > 0, "Expected files staged to MinIO" # ── Phase 3: Promote from staging to final path ───────────────── promote_report = promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, lakehouse_key_prefix=PATH_PREFIX, ) assert promote_report["promoted"] >= 1 @@ -112,6 +114,7 @@ def test_full_pipeline_incremental( self, minio_s3_client: object, test_bucket: str, + staging_test_bucket: str, tmp_path: Path, ) -> None: """Second sync archives the old version and promotes the new one.""" @@ -133,15 +136,16 @@ def test_full_pipeline_incremental( report1 = download_batch(str(manifest1), str(output1), threads=1, limit=1) assert report1["succeeded"] >= 1 - stage_files_to_minio(s3, test_bucket, output1, STAGING_PREFIX) + stage_files_to_minio(s3, staging_test_bucket, output1, STAGING_PREFIX) - # Upload manifest to MinIO for trimming + # Upload manifest to MinIO for trimming (manifest lives in staging bucket) manifest_key = "ncbi/transfer_manifest.txt" - s3.upload_file(Filename=str(manifest1), Bucket=test_bucket, Key=manifest_key) + s3.upload_file(Filename=str(manifest1), Bucket=staging_test_bucket, Key=manifest_key) promote1 = promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, manifest_s3_key=manifest_key, lakehouse_key_prefix=PATH_PREFIX, ) @@ -190,15 +194,16 @@ def test_full_pipeline_incremental( assert 
report2["succeeded"] >= 1 # Clean staging and re-stage - staging_keys = list_all_keys(s3, test_bucket, STAGING_PREFIX) + staging_keys = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) for key in staging_keys: - s3.delete_object(Bucket=test_bucket, Key=key) - stage_files_to_minio(s3, test_bucket, output2, STAGING_PREFIX) + s3.delete_object(Bucket=staging_test_bucket, Key=key) + stage_files_to_minio(s3, staging_test_bucket, output2, STAGING_PREFIX) # Phase 3 — promote with archival promote2 = promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, updated_manifest_path=str(updated_manifest), ncbi_release="test-incremental", lakehouse_key_prefix=PATH_PREFIX, diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index 46324603..67778c64 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -21,7 +21,7 @@ ) from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 -from .conftest import get_object_metadata, list_all_keys, seed_lakehouse +from .conftest import get_object_metadata, list_all_keys, seed_lakehouse, staging_test_bucket # noqa: F401 from pathlib import Path @@ -82,14 +82,15 @@ def _write_manifest(tmp_path: Path, accessions: list[str], name: str) -> Path: class TestPromoteFromStaging: """Promote staged files to final Lakehouse paths.""" - def test_promote_from_staging(self, minio_s3_client: object, test_bucket: str) -> None: + def test_promote_from_staging(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: """Staged files appear at the final Lakehouse path with MD5 metadata.""" s3 = minio_s3_client - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) report = promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + 
staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, lakehouse_key_prefix=PATH_PREFIX, ) @@ -112,21 +113,23 @@ def test_promote_from_staging(self, minio_s3_client: object, test_bucket: str) - class TestPromoteIdempotent: """Promoting the same staging data twice should succeed without errors.""" - def test_promote_idempotent(self, minio_s3_client: object, test_bucket: str) -> None: + def test_promote_idempotent(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: """Second promote succeeds and produces the same final state.""" s3 = minio_s3_client - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) report1 = promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, lakehouse_key_prefix=PATH_PREFIX, ) keys_after_first = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") report2 = promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, lakehouse_key_prefix=PATH_PREFIX, ) keys_after_second = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") @@ -142,7 +145,7 @@ def test_promote_idempotent(self, minio_s3_client: object, test_bucket: str) -> class TestPromoteArchiveUpdated: """Archive existing assemblies before overwriting with updated versions.""" - def test_archive_updated(self, minio_s3_client: object, test_bucket: str, tmp_path: Path) -> None: + def test_archive_updated(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path) -> None: """Updated assemblies are archived before being overwritten.""" s3 = minio_s3_client @@ -154,13 +157,14 @@ def test_archive_updated(self, minio_s3_client: object, test_bucket: str, tmp_pa seed_lakehouse(s3, test_bucket, ACCESSION_A, old_files, PATH_PREFIX, ASSEMBLY_DIR_A) # Stage "new" version - _stage_assembly(s3, 
test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") report = promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, updated_manifest_path=str(updated_manifest), ncbi_release="2024-01", lakehouse_key_prefix=PATH_PREFIX, @@ -186,7 +190,7 @@ def test_archive_updated(self, minio_s3_client: object, test_bucket: str, tmp_pa class TestPromoteArchiveRemoved: """Archive and delete replaced/suppressed assemblies.""" - def test_archive_removed(self, minio_s3_client: object, test_bucket: str, tmp_path: Path) -> None: + def test_archive_removed(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path) -> None: """Removed assemblies are archived and source objects are deleted.""" s3 = minio_s3_client @@ -201,7 +205,8 @@ def test_archive_removed(self, minio_s3_client: object, test_bucket: str, tmp_pa # Stage something (even empty staging is fine — promote won't find data files for this accession) report = promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, removed_manifest_path=str(removed_manifest), ncbi_release="2024-01", lakehouse_key_prefix=PATH_PREFIX, @@ -230,14 +235,15 @@ def test_archive_removed(self, minio_s3_client: object, test_bucket: str, tmp_pa class TestPromoteDryRun: """Dry-run mode should not create any objects.""" - def test_promote_dry_run(self, minio_s3_client: object, test_bucket: str) -> None: + def test_promote_dry_run(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: """Dry-run logs actions but creates no objects at the final path.""" s3 = minio_s3_client - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) report = promote_from_s3( 
staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, lakehouse_key_prefix=PATH_PREFIX, dry_run=True, ) @@ -255,34 +261,35 @@ def test_promote_dry_run(self, minio_s3_client: object, test_bucket: str) -> Non class TestPromoteTrimsManifest: """Manifest trimming removes promoted accessions.""" - def test_trims_manifest(self, minio_s3_client: object, test_bucket: str, tmp_path: Path) -> None: + def test_trims_manifest(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path) -> None: """Transfer manifest in MinIO is trimmed to exclude promoted accessions.""" s3 = minio_s3_client - # Upload a transfer manifest with 3 entries to MinIO + # Upload a transfer manifest with 3 entries to MinIO (manifest lives in staging) manifest_key = "ncbi/transfer_manifest.txt" manifest_lines = [ "/genomes/all/GCF/900/000/001/GCF_900000001.1_FakeAssemblyA/\n", "/genomes/all/GCF/900/000/002/GCF_900000002.1_FakeAssemblyB/\n", "/genomes/all/GCF/900/000/003/GCF_900000003.1_FakeAssemblyC/\n", ] - s3.put_object(Bucket=test_bucket, Key=manifest_key, Body="".join(manifest_lines).encode()) + s3.put_object(Bucket=staging_test_bucket, Key=manifest_key, Body="".join(manifest_lines).encode()) # Stage only assemblies A and B (not C) - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_B) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_B) report = promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, manifest_s3_key=manifest_key, lakehouse_key_prefix=PATH_PREFIX, ) assert report["failed"] == 0 - # Read back the manifest from MinIO - resp = s3.get_object(Bucket=test_bucket, Key=manifest_key) + # Read back the manifest from MinIO (it lives in staging) + resp = s3.get_object(Bucket=staging_test_bucket, 
Key=manifest_key) remaining = resp["Body"].read().decode() remaining_lines = [line.strip() for line in remaining.strip().splitlines() if line.strip()] @@ -296,7 +303,7 @@ def test_trims_manifest(self, minio_s3_client: object, test_bucket: str, tmp_pat class TestPromoteIncompleteStaging: """Incomplete staging (sidecar only, no data) should not promote anything.""" - def test_incomplete_staging(self, minio_s3_client: object, test_bucket: str) -> None: + def test_incomplete_staging(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: """Only .md5 sidecars staged → nothing promoted.""" s3 = minio_s3_client @@ -305,11 +312,12 @@ def test_incomplete_staging(self, minio_s3_client: object, test_bucket: str) -> base = f"{STAGING_PREFIX}{rel}" fname = f"{ASSEMBLY_DIR_A}_genomic.fna.gz" md5_key = f"{base}{fname}.md5" - s3.put_object(Bucket=test_bucket, Key=md5_key, Body=_md5(FAKE_GENOMIC).encode()) + s3.put_object(Bucket=staging_test_bucket, Key=md5_key, Body=_md5(FAKE_GENOMIC).encode()) report = promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, lakehouse_key_prefix=PATH_PREFIX, ) @@ -327,14 +335,15 @@ def test_incomplete_staging(self, minio_s3_client: object, test_bucket: str) -> class TestPromoteCreatesDescriptor: """Promote step writes a frictionless descriptor for each promoted assembly.""" - def test_descriptor_created(self, minio_s3_client: object, test_bucket: str) -> None: + def test_descriptor_created(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: """After promote, a JSON descriptor exists under ``metadata/``.""" s3 = minio_s3_client - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, 
lakehouse_key_prefix=PATH_PREFIX, ) @@ -345,14 +354,15 @@ def test_descriptor_created(self, minio_s3_client: object, test_bucket: str) -> assert body["identifier"] == f"NCBI:{ACCESSION_A}" assert body["resource_type"] == "dataset" - def test_descriptor_resources_include_promoted_files(self, minio_s3_client: object, test_bucket: str) -> None: + def test_descriptor_resources_include_promoted_files(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: """Descriptor's ``resources`` list references the final Lakehouse key.""" s3 = minio_s3_client - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, lakehouse_key_prefix=PATH_PREFIX, ) @@ -363,14 +373,15 @@ def test_descriptor_resources_include_promoted_files(self, minio_s3_client: obje resource_paths = [r["path"] for r in body["resources"]] assert any(PATH_PREFIX + "raw_data/" in p for p in resource_paths) - def test_descriptor_resources_have_md5(self, minio_s3_client: object, test_bucket: str) -> None: + def test_descriptor_resources_have_md5(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: """Resources with .md5 sidecars include the hash value.""" s3 = minio_s3_client - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, lakehouse_key_prefix=PATH_PREFIX, ) @@ -382,15 +393,16 @@ def test_descriptor_resources_have_md5(self, minio_s3_client: object, test_bucke for resource in body["resources"]: assert "hash" in resource, f"Expected hash in resource: {resource}" - def test_multiple_assemblies_get_separate_descriptors(self, minio_s3_client: object, test_bucket: str) 
-> None: + def test_multiple_assemblies_get_separate_descriptors(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: """Each assembly gets its own descriptor file.""" s3 = minio_s3_client - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_B) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_B) promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, lakehouse_key_prefix=PATH_PREFIX, ) @@ -406,7 +418,7 @@ def test_multiple_assemblies_get_separate_descriptors(self, minio_s3_client: obj class TestPromoteArchiveUpdatedIncludesDescriptor: """Archiving updated assemblies also archives the descriptor.""" - def test_archive_copies_descriptor(self, minio_s3_client: object, test_bucket: str, tmp_path: Path) -> None: + def test_archive_copies_descriptor(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path) -> None: """After archiving an updated assembly, the descriptor appears under archive/.""" s3 = minio_s3_client @@ -419,12 +431,13 @@ def test_archive_copies_descriptor(self, minio_s3_client: object, test_bucket: s descriptor_key = build_descriptor_key(ASSEMBLY_DIR_A, PATH_PREFIX) s3.put_object(Bucket=test_bucket, Key=descriptor_key, Body=json.dumps(descriptor).encode()) - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, updated_manifest_path=str(updated_manifest), ncbi_release="2024-01", lakehouse_key_prefix=PATH_PREFIX, @@ -441,7 +454,7 @@ def test_archive_copies_descriptor(self, minio_s3_client: object, test_bucket: s class 
TestPromoteArchiveRemovedIncludesDescriptor: """Archiving removed assemblies also archives the descriptor.""" - def test_archive_removed_copies_descriptor(self, minio_s3_client: object, test_bucket: str, tmp_path: Path) -> None: + def test_archive_removed_copies_descriptor(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path) -> None: """After archiving a removed assembly, the descriptor is under archive/.""" s3 = minio_s3_client @@ -457,7 +470,8 @@ def test_archive_removed_copies_descriptor(self, minio_s3_client: object, test_b promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, removed_manifest_path=str(removed_manifest), ncbi_release="2024-01", lakehouse_key_prefix=PATH_PREFIX, @@ -473,14 +487,15 @@ def test_archive_removed_copies_descriptor(self, minio_s3_client: object, test_b class TestPromoteDryRunNoDescriptor: """Dry-run must not write any descriptor files.""" - def test_dry_run_no_descriptor(self, minio_s3_client: object, test_bucket: str) -> None: + def test_dry_run_no_descriptor(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: """Dry-run does not upload a descriptor to the metadata/ prefix.""" s3 = minio_s3_client - _stage_assembly(s3, test_bucket, ASSEMBLY_DIR_A) + _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) promote_from_s3( staging_key_prefix=STAGING_PREFIX, - bucket=test_bucket, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, lakehouse_key_prefix=PATH_PREFIX, dry_run=True, ) diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index fdb180db..73777b9b 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -31,7 +31,7 @@ def test_promote_dry_run_no_writes(mock_s3_client_no_checksum: botocore.client.B prefix = "staging/run1/" _stage_files(mock_s3_client_no_checksum, prefix) - report = 
promote_from_s3(staging_key_prefix=prefix, bucket=TEST_BUCKET, dry_run=True) + report = promote_from_s3(staging_key_prefix=prefix, staging_bucket=TEST_BUCKET, lakehouse_bucket=TEST_BUCKET, dry_run=True) assert report["promoted"] == 1 assert report["dry_run"] is True @@ -45,7 +45,7 @@ def test_promote_with_metadata(mock_s3_client_no_checksum: botocore.client.BaseC prefix = "staging/run1/" _stage_files(mock_s3_client_no_checksum, prefix) - report = promote_from_s3(staging_key_prefix=prefix, bucket=TEST_BUCKET) + report = promote_from_s3(staging_key_prefix=prefix, staging_bucket=TEST_BUCKET, lakehouse_bucket=TEST_BUCKET) assert report["promoted"] == 1 # only .fna.gz, not download_report.json assert report["failed"] == 0 @@ -106,7 +106,7 @@ def test_archive_assemblies_removed(mock_s3_client_no_checksum: botocore.client. assert ( _archive_assemblies( str(manifest), - bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="replaced_or_suppressed", delete_source=True, @@ -137,7 +137,7 @@ def test_archive_assemblies_updated_no_delete( assert ( _archive_assemblies( - str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated", delete_source=False + str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated", delete_source=False ) == 1 ) @@ -164,9 +164,9 @@ def test_archive_assemblies_multiple_releases_no_collision( manifest = tmp_path / "updated.txt" manifest.write_text(f"{accession}\n") - _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated") + _archive_assemblies(str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated") mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v2-data") - _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated") + _archive_assemblies(str(manifest), lakehouse_bucket=TEST_BUCKET, 
ncbi_release="2024-06", archive_reason="updated") archive_key_1 = ( f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" @@ -191,7 +191,7 @@ def test_archive_assemblies_dry_run(mock_s3_client_no_checksum: botocore.client. assert ( _archive_assemblies( str(manifest), - bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="replaced_or_suppressed", delete_source=True, @@ -212,7 +212,7 @@ def test_archive_assemblies_no_objects_skips( """Accessions with no existing S3 objects are silently skipped.""" manifest = tmp_path / "updated.txt" manifest.write_text("GCF_000001215.4\n") - assert _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release="2024-01") == 0 + assert _archive_assemblies(str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01") == 0 @pytest.mark.s3 @@ -228,7 +228,7 @@ def test_archive_assemblies_unknown_release_fallback( manifest = tmp_path / "updated.txt" manifest.write_text(f"{accession}\n") - assert _archive_assemblies(str(manifest), bucket=TEST_BUCKET, ncbi_release=None) == 1 + assert _archive_assemblies(str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release=None) == 1 archive_key = ( f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/unknown/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" From 1da3d05629999c69cc7b1d1c9861a98e4bc5465b Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 29 Apr 2026 12:29:48 -0700 Subject: [PATCH 57/76] consolidate progress bars --- notebooks/ncbi_ftp_download.ipynb | 18 +++++--- notebooks/ncbi_ftp_manifest.ipynb | 16 ++++--- notebooks/ncbi_ftp_promote.ipynb | 17 ++++--- src/cdm_data_loaders/ncbi_ftp/assembly.py | 4 +- src/cdm_data_loaders/ncbi_ftp/metadata.py | 2 +- src/cdm_data_loaders/ncbi_ftp/promote.py | 13 +++--- .../pipelines/ncbi_ftp_download.py | 35 ++++++++------- src/cdm_data_loaders/utils/s3.py | 45 +++++++++++++------ tests/integration/test_promote_e2e.py | 32 +++++++++---- 
tests/ncbi_ftp/test_promote.py | 10 ++++- tests/utils/test_s3.py | 12 +++-- 11 files changed, 132 insertions(+), 72 deletions(-) diff --git a/notebooks/ncbi_ftp_download.ipynb b/notebooks/ncbi_ftp_download.ipynb index 84ac4fbf..c8e10603 100644 --- a/notebooks/ncbi_ftp_download.ipynb +++ b/notebooks/ncbi_ftp_download.ipynb @@ -124,11 +124,13 @@ "PROVIDE_CREDENTIALS = False # Set to False to rely on environment credentials (e.g. IAM role)\n", "if PROVIDE_CREDENTIALS:\n", " reset_s3_client() # Clear any existing client to ensure new credentials are used\n", - " get_s3_client({\n", - " \"endpoint_url\": \"http://localhost:9000\",\n", - " \"aws_access_key_id\": \"minioadmin\",\n", - " \"aws_secret_access_key\": \"minioadmin\",\n", - " })" + " get_s3_client(\n", + " {\n", + " \"endpoint_url\": \"http://localhost:9000\",\n", + " \"aws_access_key_id\": \"minioadmin\",\n", + " \"aws_secret_access_key\": \"minioadmin\",\n", + " }\n", + " )" ] }, { @@ -187,6 +189,8 @@ "source": [ "\"\"\"Display download and staging report.\"\"\"\n", "\n", + "FAILURE_PREVIEW = 10\n", + "\n", "print(\"=\" * 50)\n", "print(\"DOWNLOAD & STAGE REPORT\")\n", "print(\"=\" * 50)\n", @@ -200,8 +204,10 @@ "\n", "if report[\"failed\"] > 0:\n", " print(\"\\nFailed assemblies:\")\n", - " for failure in report[\"failures\"]:\n", + " for failure in report[\"failures\"][:FAILURE_PREVIEW]:\n", " print(f\" {failure['path']}: {failure['error']}\")\n", + " if report[\"failed\"] > FAILURE_PREVIEW:\n", + " print(f\" ... and {report['failed'] - FAILURE_PREVIEW} more\")\n", "\n", "if report[\"dry_run\"]:\n", " print(\"\\nThis was a dry-run. Set DRY_RUN = False and re-run to upload to S3.\")" diff --git a/notebooks/ncbi_ftp_manifest.ipynb b/notebooks/ncbi_ftp_manifest.ipynb index 0ade0917..49045123 100644 --- a/notebooks/ncbi_ftp_manifest.ipynb +++ b/notebooks/ncbi_ftp_manifest.ipynb @@ -79,11 +79,13 @@ "PROVIDE_CREDENTIALS = False # Set to False to rely on environment credentials (e.g. 
IAM role)\n", "if PROVIDE_CREDENTIALS:\n", " reset_s3_client() # Clear any existing client to ensure new credentials are used\n", - " get_s3_client({\n", - " \"endpoint_url\": \"http://localhost:9000\",\n", - " \"aws_access_key_id\": \"minioadmin\",\n", - " \"aws_secret_access_key\": \"minioadmin\",\n", - " })" + " get_s3_client(\n", + " {\n", + " \"endpoint_url\": \"http://localhost:9000\",\n", + " \"aws_access_key_id\": \"minioadmin\",\n", + " \"aws_secret_access_key\": \"minioadmin\",\n", + " }\n", + " )" ] }, { @@ -111,7 +113,9 @@ "\n", "# S3 location where the new snapshot will be uploaded after diffing\n", "# format: s3:// URI\n", - "SNAPSHOT_UPLOAD_URI: str | None = \"s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/assembly_summary_refseq.txt\"\n", + "SNAPSHOT_UPLOAD_URI: str | None = (\n", + " \"s3://cdm-lake/tenant-general-warehouse/kbase/datasets/ncbi/assembly_summary_refseq.txt\"\n", + ")\n", "\n", "# Verify candidates against the S3 Lakehouse — prune assemblies already present.\n", "# Set LAKEHOUSE_BUCKET to your bucket name to enable, or None to skip.\n", diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb index cbf43a34..1be65156 100644 --- a/notebooks/ncbi_ftp_promote.ipynb +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -96,7 +96,9 @@ "# S3 key of transfer_manifest.txt for trimming after promotion (or None to skip).\n", "# Only needed if the manifest was uploaded to S3 (e.g. via the staging cell in Phase 1).\n", "# format: S3 object key within STAGING_BUCKET (no scheme, no bucket)\n", - "MANIFEST_S3_KEY: str | None = \"io/matt-cohere/staging/run1/input/transfer_manifest.txt\" # e.g. \"staging/transfer_manifest.txt\"\n", + "MANIFEST_S3_KEY: str | None = (\n", + " \"io/matt-cohere/staging/run1/input/transfer_manifest.txt\" # e.g. 
\"staging/transfer_manifest.txt\"\n", + ")\n", "\n", "# Final Lakehouse path prefix\n", "# format: S3 key prefix within LAKEHOUSE_BUCKET (no scheme, no bucket)\n", @@ -128,11 +130,13 @@ "PROVIDE_CREDENTIALS = False # Set to False to rely on environment credentials (e.g. IAM role)\n", "if PROVIDE_CREDENTIALS:\n", " reset_s3_client() # Clear any existing client to ensure new credentials are used\n", - " get_s3_client({\n", - " \"endpoint_url\": \"http://localhost:9000\",\n", - " \"aws_access_key_id\": \"minioadmin\",\n", - " \"aws_secret_access_key\": \"minioadmin\",\n", - " })" + " get_s3_client(\n", + " {\n", + " \"endpoint_url\": \"http://localhost:9000\",\n", + " \"aws_access_key_id\": \"minioadmin\",\n", + " \"aws_secret_access_key\": \"minioadmin\",\n", + " }\n", + " )" ] }, { @@ -218,6 +222,7 @@ { "cell_type": "code", "execution_count": null, + "id": "7fb27b941602401d91542211134fc71a", "metadata": {}, "outputs": [], "source": [ diff --git a/src/cdm_data_loaders/ncbi_ftp/assembly.py b/src/cdm_data_loaders/ncbi_ftp/assembly.py index cd2981ef..b424702b 100644 --- a/src/cdm_data_loaders/ncbi_ftp/assembly.py +++ b/src/cdm_data_loaders/ncbi_ftp/assembly.py @@ -163,7 +163,7 @@ def download_assembly_to_local( dest_dir = Path(output_dir) / rel_path dest_dir.mkdir(parents=True, exist_ok=True) - logger.info("Downloading %s -> %s", accession, dest_dir) + logger.debug("Downloading %s -> %s", accession, dest_dir) owns_ftp = ftp is None if owns_ftp: @@ -196,7 +196,7 @@ def download_assembly_to_local( for filename in target_files: last_activity = _download_and_verify(ftp, filename, dest_dir, md5_checksums, stats, last_activity) - logger.info(" %s: %d files downloaded", accession, stats["files_downloaded"]) + logger.debug(" %s: %d files downloaded", accession, stats["files_downloaded"]) finally: if owns_ftp: diff --git a/src/cdm_data_loaders/ncbi_ftp/metadata.py b/src/cdm_data_loaders/ncbi_ftp/metadata.py index 42cfe865..43a9c04f 100644 --- 
a/src/cdm_data_loaders/ncbi_ftp/metadata.py +++ b/src/cdm_data_loaders/ncbi_ftp/metadata.py @@ -194,7 +194,7 @@ def upload_descriptor( try: s3.upload_file(Filename=tmp_path, Bucket=bucket, Key=key) - logger.info("Uploaded descriptor: s3://%s/%s", bucket, key) + logger.debug("Uploaded descriptor: s3://%s/%s", bucket, key) finally: Path(tmp_path).unlink() diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 2a127f19..f449ee7b 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -14,6 +14,7 @@ from typing import Any import botocore.exceptions +import tqdm from cdm_data_loaders.ncbi_ftp.metadata import ( DescriptorResource, @@ -113,9 +114,8 @@ def promote_from_s3( # noqa: PLR0913 # Upload frictionless descriptors for each promoted assembly descriptors_written = 0 - for (adir, acc), resources in assembly_resources.items(): - if not resources: - continue + non_empty = [(k, v) for k, v in assembly_resources.items() if v] + for (adir, acc), resources in tqdm.tqdm(non_empty, unit="descriptor", desc="Writing descriptors"): try: descriptor = create_descriptor(adir, acc, resources) upload_descriptor(descriptor, adir, lakehouse_bucket, lakehouse_key_prefix, dry_run=dry_run) @@ -167,7 +167,7 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 promoted_accessions: set[str] = set() assembly_resources: defaultdict[tuple[str, str], list[DescriptorResource]] = defaultdict(list) - for staged_key in data_files: + for staged_key in tqdm.tqdm(data_files, unit="file", desc="Promoting"): if staged_key.endswith("download_report.json"): continue @@ -177,7 +177,7 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 final_key = lakehouse_key_prefix + rel_path if dry_run: - logger.info("[dry-run] would promote: %s -> %s", staged_key, final_key) + logger.debug("[dry-run] would promote: %s -> %s", staged_key, final_key) promoted += 1 continue @@ -200,6 +200,7 @@ def _promote_data_files( # noqa: 
PLR0913, PLR0915 f"{lakehouse_bucket}/{final_key_path.parent}", metadata=metadata, object_name=final_key_path.name, + show_progress=False, ) if not upload_succeeded: logger.error("Failed to upload promoted file %s to %s", staged_key, final_key) @@ -276,7 +277,7 @@ def _archive_assemblies( # noqa: PLR0913 with Path(manifest_local_path).open() as f: accessions = [line.strip() for line in f if line.strip()] - for accession in accessions: + for accession in tqdm.tqdm(accessions, unit="accession", desc="Archiving"): m = re.match(r"(GC[AF])_(\d{3})(\d{3})(\d{3})\.\d+", accession) if not m: logger.warning("Cannot parse accession for archival: %s", accession) diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 233c82b1..9c628942 100644 --- a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -15,6 +15,7 @@ from pathlib import Path from typing import Any +import tqdm from pydantic import AliasChoices, Field from tenacity import before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_fixed from pydantic_settings import BaseSettings, SettingsConfigDict @@ -126,14 +127,16 @@ def _attempt() -> dict[str, Any]: return path, None try: - with ThreadPoolExecutor(max_workers=threads) as executor: - futures = {executor.submit(_download_one, p): p for p in assembly_paths} - for future in as_completed(futures): - path, error = future.result() - if error: - logger.error("FAILED: %s: %s", path, error) - with lock: - failed.append({"path": path, "error": str(error)}) + with tqdm.tqdm(total=len(assembly_paths), unit="assembly", desc="Downloading from NCBI FTP") as pbar: + with ThreadPoolExecutor(max_workers=threads) as executor: + futures = {executor.submit(_download_one, p): p for p in assembly_paths} + for future in as_completed(futures): + path, error = future.result() + if error: + logger.error("FAILED: %s: %s", path, error) + with 
lock: + failed.append({"path": path, "error": str(error)}) + pbar.update(1) finally: pool.close_all() @@ -263,13 +266,15 @@ def download_and_stage( def _upload(task: tuple[Path, str]) -> None: local_file, dest = task - upload_file(local_file, dest) - - with ThreadPoolExecutor(max_workers=threads) as executor: - futures = [executor.submit(_upload, t) for t in upload_tasks] - for future in as_completed(futures): - future.result() - staged_objects += 1 + upload_file(local_file, dest, show_progress=False) + + with tqdm.tqdm(total=len(upload_tasks), unit="file", desc="Staging to S3") as pbar: + with ThreadPoolExecutor(max_workers=threads) as executor: + futures = [executor.submit(_upload, t) for t in upload_tasks] + for future in as_completed(futures): + future.result() + staged_objects += 1 + pbar.update(1) logger.info("Staged %d objects to s3://%s/%s", staged_objects, bucket, staging_key_prefix) diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index c010404f..07c9a47d 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -201,6 +201,7 @@ def upload_file( destination_dir: str, object_name: str | None = None, metadata: dict[str, str] | None = None, + show_progress: bool = True, ) -> bool: """Upload an object to an S3 bucket. 
@@ -232,7 +233,7 @@ def upload_file( s3_path = f"{destination_dir.removesuffix('/')}/{object_name}" if metadata is None and object_exists(s3_path): - logger.info("File already present: %s", s3_path) + logger.debug("File already present: %s", s3_path) return True s3 = get_s3_client() @@ -241,21 +242,29 @@ def upload_file( extra_args = {**DEFAULT_EXTRA_ARGS, **(({"Metadata": metadata}) if metadata is not None else {})} # Upload the file - file_size = local_file_path.stat().st_size - with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: - logger.info("uploading %s to %s", str(local_file_path), s3_path) - try: + logger.debug("uploading %s to %s", str(local_file_path), s3_path) + try: + if show_progress: + file_size = local_file_path.stat().st_size + with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: + s3.upload_file( + Filename=str(local_file_path), + Bucket=bucket, + Key=key, + Callback=pbar.update, + ExtraArgs=extra_args, + ) + else: s3.upload_file( Filename=str(local_file_path), Bucket=bucket, Key=key, - Callback=pbar.update, ExtraArgs=extra_args, ) - except Exception as e: # noqa: BLE001 - logger.exception("Error uploading to s3") - return False - return True + except Exception as e: # noqa: BLE001 + logger.exception("Error uploading to s3") + return False + return True def stream_to_s3(url: str, s3_path: str, requests: ModuleType) -> str: @@ -287,7 +296,9 @@ def stream_to_s3(url: str, s3_path: str, requests: ModuleType) -> str: return f"{bucket}/{key}" -def download_file(s3_path: str, local_file_path: str | Path, version_id: str | None = None) -> None: +def download_file( + s3_path: str, local_file_path: str | Path, version_id: str | None = None, show_progress: bool = True +) -> None: """Download an object from s3. 
WARNING: will overwrite existing files but will not overwrite a file whilst trying to make a directory @@ -333,13 +344,21 @@ def download_file(s3_path: str, local_file_path: str | Path, version_id: str | N # set ``unit_scale=True`` so tqdm uses SI unit prefixes # ``unit="B"`` means it adds the string "B" as a suffix # progress is reported as (e.g.) "14.5kB/s". - with tqdm.tqdm(total=object_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: + if show_progress: + with tqdm.tqdm(total=object_size, unit="B", unit_scale=True, desc=str(local_file_path)) as pbar: + s3.download_file( + Bucket=bucket, + Key=key, + ExtraArgs=extra_args, + Filename=str(local_file_path), + Callback=pbar.update, + ) + else: s3.download_file( Bucket=bucket, Key=key, ExtraArgs=extra_args, Filename=str(local_file_path), - Callback=pbar.update, ) diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index 67778c64..7e28c796 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -145,7 +145,9 @@ def test_promote_idempotent(self, minio_s3_client: object, test_bucket: str, sta class TestPromoteArchiveUpdated: """Archive existing assemblies before overwriting with updated versions.""" - def test_archive_updated(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path) -> None: + def test_archive_updated( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: """Updated assemblies are archived before being overwritten.""" s3 = minio_s3_client @@ -190,7 +192,9 @@ def test_archive_updated(self, minio_s3_client: object, test_bucket: str, stagin class TestPromoteArchiveRemoved: """Archive and delete replaced/suppressed assemblies.""" - def test_archive_removed(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path) -> None: + def test_archive_removed( + self, minio_s3_client: object, 
test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: """Removed assemblies are archived and source objects are deleted.""" s3 = minio_s3_client @@ -261,7 +265,9 @@ def test_promote_dry_run(self, minio_s3_client: object, test_bucket: str, stagin class TestPromoteTrimsManifest: """Manifest trimming removes promoted accessions.""" - def test_trims_manifest(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path) -> None: + def test_trims_manifest( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: """Transfer manifest in MinIO is trimmed to exclude promoted accessions.""" s3 = minio_s3_client @@ -354,7 +360,9 @@ def test_descriptor_created(self, minio_s3_client: object, test_bucket: str, sta assert body["identifier"] == f"NCBI:{ACCESSION_A}" assert body["resource_type"] == "dataset" - def test_descriptor_resources_include_promoted_files(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: + def test_descriptor_resources_include_promoted_files( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: """Descriptor's ``resources`` list references the final Lakehouse key.""" s3 = minio_s3_client _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) @@ -373,7 +381,9 @@ def test_descriptor_resources_include_promoted_files(self, minio_s3_client: obje resource_paths = [r["path"] for r in body["resources"]] assert any(PATH_PREFIX + "raw_data/" in p for p in resource_paths) - def test_descriptor_resources_have_md5(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: + def test_descriptor_resources_have_md5( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: """Resources with .md5 sidecars include the hash value.""" s3 = minio_s3_client _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) @@ -393,7 +403,9 @@ def 
test_descriptor_resources_have_md5(self, minio_s3_client: object, test_bucke for resource in body["resources"]: assert "hash" in resource, f"Expected hash in resource: {resource}" - def test_multiple_assemblies_get_separate_descriptors(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: + def test_multiple_assemblies_get_separate_descriptors( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: """Each assembly gets its own descriptor file.""" s3 = minio_s3_client _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) @@ -418,7 +430,9 @@ def test_multiple_assemblies_get_separate_descriptors(self, minio_s3_client: obj class TestPromoteArchiveUpdatedIncludesDescriptor: """Archiving updated assemblies also archives the descriptor.""" - def test_archive_copies_descriptor(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path) -> None: + def test_archive_copies_descriptor( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: """After archiving an updated assembly, the descriptor appears under archive/.""" s3 = minio_s3_client @@ -454,7 +468,9 @@ def test_archive_copies_descriptor(self, minio_s3_client: object, test_bucket: s class TestPromoteArchiveRemovedIncludesDescriptor: """Archiving removed assemblies also archives the descriptor.""" - def test_archive_removed_copies_descriptor(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path) -> None: + def test_archive_removed_copies_descriptor( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: """After archiving a removed assembly, the descriptor is under archive/.""" s3 = minio_s3_client diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index 73777b9b..db19bcdb 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -31,7 
+31,9 @@ def test_promote_dry_run_no_writes(mock_s3_client_no_checksum: botocore.client.B prefix = "staging/run1/" _stage_files(mock_s3_client_no_checksum, prefix) - report = promote_from_s3(staging_key_prefix=prefix, staging_bucket=TEST_BUCKET, lakehouse_bucket=TEST_BUCKET, dry_run=True) + report = promote_from_s3( + staging_key_prefix=prefix, staging_bucket=TEST_BUCKET, lakehouse_bucket=TEST_BUCKET, dry_run=True + ) assert report["promoted"] == 1 assert report["dry_run"] is True @@ -137,7 +139,11 @@ def test_archive_assemblies_updated_no_delete( assert ( _archive_assemblies( - str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated", delete_source=False + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-06", + archive_reason="updated", + delete_source=False, ) == 1 ) diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index 59287cd5..7e68acc7 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -393,16 +393,14 @@ def test_upload_file_uses_custom_object_name(mock_s3_client: Any, sample_file: P @pytest.mark.s3 -def test_upload_file_skips_when_already_present( - mock_s3_client: Any, sample_file: Path, caplog: pytest.LogCaptureFixture -) -> None: - """Verify that uploading a file that already exists is skipped and returns True.""" +def test_upload_file_skips_when_already_present(mock_s3_client: Any, sample_file: Path) -> None: + """Verify that uploading a file that already exists is skipped, returns True, and leaves the object unchanged.""" mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}", Body=b"old") result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads") assert result is True - last_log_message = caplog.records[-1] - assert "File already present" in last_log_message.message - assert last_log_message.levelno == logging.INFO + # The existing object must not have been overwritten + obj = mock_s3_client.get_object(Bucket=CDM_LAKE_BUCKET, 
Key=f"uploads/{sample_file.name}") + assert obj["Body"].read() == b"old" @pytest.mark.usefixtures("mock_s3_client") From 0d3aa68d979ac589a4cdca2e1b102ea013fde4a7 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 29 Apr 2026 13:56:07 -0700 Subject: [PATCH 58/76] consolidate download with staging and promotion with deletion from staging and metadata generation Co-authored-by: Copilot --- src/cdm_data_loaders/ncbi_ftp/promote.py | 185 ++++++++++-------- .../pipelines/ncbi_ftp_download.py | 153 ++++++++++++--- tests/pipelines/test_ncbi_ftp_download.py | 108 ++++++---- 3 files changed, 303 insertions(+), 143 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index f449ee7b..10532cfd 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -98,7 +98,7 @@ def promote_from_s3( # noqa: PLR0913 dry_run=dry_run, ) - promoted, failed, promoted_accessions, assembly_resources = _promote_data_files( + promoted, failed, descriptors_written, promoted_accessions = _promote_data_files( data_files, sidecars, normalized_staging_key_prefix, @@ -112,17 +112,6 @@ def promote_from_s3( # noqa: PLR0913 if manifest_s3_key and promoted_accessions and not dry_run: _trim_manifest(manifest_s3_key, staging_bucket, promoted_accessions) - # Upload frictionless descriptors for each promoted assembly - descriptors_written = 0 - non_empty = [(k, v) for k, v in assembly_resources.items() if v] - for (adir, acc), resources in tqdm.tqdm(non_empty, unit="descriptor", desc="Writing descriptors"): - try: - descriptor = create_descriptor(adir, acc, resources) - upload_descriptor(descriptor, adir, lakehouse_bucket, lakehouse_key_prefix, dry_run=dry_run) - descriptors_written += 1 - except Exception: - logger.exception("Failed to write descriptor for %s", adir) - if descriptors_written: logger.info("Wrote %d frictionless descriptor(s)", descriptors_written) @@ -156,88 +145,128 @@ def 
_promote_data_files( # noqa: PLR0913, PLR0915 lakehouse_bucket: str, *, dry_run: bool, -) -> tuple[int, int, set[str], defaultdict[tuple[str, str], list[DescriptorResource]]]: +) -> tuple[int, int, int, set[str]]: """Promote each data file from staging to the final Lakehouse path. - :return: (promoted_count, failed_count, promoted_accessions, assembly_resources) + Files are grouped by assembly. When all files for an assembly are promoted + successfully, the frictionless descriptor is written immediately and the staged + files (including sidecars) are deleted from staging. This prevents staging + accumulation across runs and ensures partial runs leave descriptors for all + completed assemblies. + + :return: (promoted_count, failed_count, descriptors_written, promoted_accessions) """ s3 = get_s3_client() promoted = 0 failed = 0 + descriptors_written = 0 promoted_accessions: set[str] = set() - assembly_resources: defaultdict[tuple[str, str], list[DescriptorResource]] = defaultdict(list) - for staged_key in tqdm.tqdm(data_files, unit="file", desc="Promoting"): + # Group files by assembly; skip download_report.json and non-raw_data paths + assembly_files: defaultdict[tuple[str, str], list[str]] = defaultdict(list) + for staged_key in data_files: if staged_key.endswith("download_report.json"): continue - rel_path = staged_key[len(normalized_staging_prefix) :] if not rel_path.startswith("raw_data/"): continue - final_key = lakehouse_key_prefix + rel_path - - if dry_run: - logger.debug("[dry-run] would promote: %s -> %s", staged_key, final_key) - promoted += 1 - continue - - try: - with tempfile.NamedTemporaryFile(delete=False) as tmp: - tmp_path = tmp.name - try: - s3.download_file(Bucket=staging_bucket, Key=staged_key, Filename=tmp_path) - - # Read MD5 from sidecar - metadata: dict[str, str] = {} - md5_key = staged_key + ".md5" - if md5_key in sidecars: - md5_obj = s3.get_object(Bucket=staging_bucket, Key=md5_key) - metadata["md5"] = 
md5_obj["Body"].read().decode().strip() - + acc_match = re.search(r"(GC[AF]_\d{9}\.\d+)", staged_key) + adir_match = re.search(r"raw_data/GC[AF]/\d+/\d+/\d+/([^/]+)/", staged_key) + if acc_match and adir_match: + assembly_files[(adir_match.group(1), acc_match.group(1))].append(staged_key) + + total_files = sum(len(v) for v in assembly_files.values()) + with tqdm.tqdm(total=total_files, unit="file", desc="Promoting") as pbar: + for (adir, acc), files in assembly_files.items(): + assembly_failed = 0 + resources: list[DescriptorResource] = [] + promoted_keys: list[str] = [] + + for staged_key in files: + rel_path = staged_key[len(normalized_staging_prefix) :] + final_key = lakehouse_key_prefix + rel_path final_key_path = PurePosixPath(final_key) - upload_succeeded = upload_file( - tmp_path, - f"{lakehouse_bucket}/{final_key_path.parent}", - metadata=metadata, - object_name=final_key_path.name, - show_progress=False, - ) - if not upload_succeeded: - logger.error("Failed to upload promoted file %s to %s", staged_key, final_key) - failed += 1 + + if dry_run: + logger.debug("[dry-run] would promote: %s -> %s", staged_key, final_key) + promoted += 1 + pbar.update(1) continue - promoted += 1 - - # Track promoted accession for manifest trimming - acc_match = re.search(r"(GC[AF]_\d{9}\.\d+)", staged_key) - if acc_match: - promoted_accessions.add(acc_match.group(1)) - - # Track resources for frictionless descriptor creation - adir_match = re.search(r"raw_data/GC[AF]/\d+/\d+/\d+/([^/]+)/", final_key) - if adir_match and acc_match: - adir = adir_match.group(1) - acc = acc_match.group(1) - fname = final_key_path.name - ext = fname.rsplit(".", 1)[-1] if "." 
in fname else "" - md5_hash = metadata.get("md5") - resource: DescriptorResource = { - "name": fname.lower(), - "path": final_key, - "format": ext, - "bytes": Path(tmp_path).stat().st_size, - "hash": md5_hash, - } - assembly_resources[(adir, acc)].append(resource) - - finally: - Path(tmp_path).unlink() - except Exception: - logger.exception("Failed to promote %s", staged_key) - failed += 1 - - return promoted, failed, promoted_accessions, assembly_resources + file_promoted = False + try: + with tempfile.NamedTemporaryFile(delete=False) as tmp: + tmp_path = tmp.name + try: + s3.download_file(Bucket=staging_bucket, Key=staged_key, Filename=tmp_path) + + # Read MD5 from sidecar + metadata: dict[str, str] = {} + md5_key = staged_key + ".md5" + if md5_key in sidecars: + md5_obj = s3.get_object(Bucket=staging_bucket, Key=md5_key) + metadata["md5"] = md5_obj["Body"].read().decode().strip() + + upload_succeeded = upload_file( + tmp_path, + f"{lakehouse_bucket}/{final_key_path.parent}", + metadata=metadata, + object_name=final_key_path.name, + show_progress=False, + ) + if not upload_succeeded: + logger.error("Failed to upload promoted file %s to %s", staged_key, final_key) + else: + promoted += 1 + promoted_keys.append(staged_key) + promoted_accessions.add(acc) + file_promoted = True + + fname = final_key_path.name + ext = fname.rsplit(".", 1)[-1] if "." 
in fname else "" + resource: DescriptorResource = { + "name": fname.lower(), + "path": final_key, + "format": ext, + "bytes": Path(tmp_path).stat().st_size, + "hash": metadata.get("md5"), + } + resources.append(resource) + + finally: + Path(tmp_path).unlink() + except Exception: + logger.exception("Failed to promote %s", staged_key) + + if not file_promoted: + assembly_failed += 1 + pbar.update(1) + + failed += assembly_failed + + # Write descriptor and delete staged files immediately after a fully successful assembly + if assembly_failed == 0 and promoted_keys: + try: + descriptor = create_descriptor(adir, acc, resources) + upload_descriptor(descriptor, adir, lakehouse_bucket, lakehouse_key_prefix, dry_run=False) + descriptors_written += 1 + except Exception: + logger.exception("Failed to write descriptor for %s", adir) + + for staged_key in promoted_keys: + try: + delete_object(f"{staging_bucket}/{staged_key}") + except Exception: + logger.warning("Failed to delete staged file %s", staged_key) + for sidecar_ext in (".md5", ".crc64nvme"): + sidecar_key = staged_key + sidecar_ext + if sidecar_key in sidecars: + try: + delete_object(f"{staging_bucket}/{sidecar_key}") + except Exception: + logger.warning("Failed to delete staged sidecar %s", sidecar_key) + + return promoted, failed, descriptors_written, promoted_accessions # ── Archive assemblies ────────────────────────────────────────────────── diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 9c628942..57a73154 100644 --- a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -7,6 +7,7 @@ import json import logging +import shutil import tempfile import threading from concurrent.futures import ThreadPoolExecutor, as_completed @@ -20,7 +21,12 @@ from tenacity import before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_fixed from pydantic_settings import 
BaseSettings, SettingsConfigDict -from cdm_data_loaders.ncbi_ftp.assembly import FTP_HOST, download_assembly_to_local +from cdm_data_loaders.ncbi_ftp.assembly import ( + FTP_HOST, + build_accession_path, + download_assembly_to_local, + parse_assembly_path, +) from cdm_data_loaders.pipelines.core import run_cli from cdm_data_loaders.pipelines.cts_defaults import DEFAULT_SETTINGS_CONFIG_DICT, INPUT_MOUNT, OUTPUT_MOUNT from cdm_data_loaders.utils.cdm_logger import get_cdm_logger @@ -70,6 +76,41 @@ class DownloadSettings(BaseSettings): ) +# ── Private helpers ───────────────────────────────────────────────────── + + +def _upload_assembly_dir( + assembly_dir: Path, + tmp_root: Path, + bucket: str, + staging_key_prefix: str, +) -> int: + """Upload all files under *assembly_dir* to S3, deleting each file immediately after upload. + + Empty directories are removed after all files are uploaded. If the + directory does not exist (e.g. the assembly had no files) the function + returns zero without raising. + + :param assembly_dir: local directory for one assembly + :param tmp_root: root of the temp directory (used to compute relative S3 paths) + :param bucket: destination S3 bucket + :param staging_key_prefix: S3 key prefix within *bucket* + :return: number of files uploaded + """ + if not assembly_dir.exists(): + return 0 + count = 0 + for f in sorted(assembly_dir.rglob("*")): + if f.is_file(): + relative = f.relative_to(tmp_root) + dest_prefix = f"{bucket}/{staging_key_prefix.rstrip('/')}/{relative.parent}" + upload_file(f, dest_prefix, show_progress=False) + f.unlink() + count += 1 + shutil.rmtree(assembly_dir, ignore_errors=True) + return count + + # ── Batch download ─────────────────────────────────────────────────────── @@ -208,13 +249,18 @@ def download_and_stage( Exactly one of *manifest_s3_key* or *manifest_local_path* must be given. 
+ Downloads and uploads are pipelined per assembly: each worker downloads one + assembly, immediately uploads its files to S3, then deletes the local copies + before picking up the next assembly. At most *threads* assembly directories + exist on disk simultaneously, preventing disk exhaustion on large batches. + :param bucket: destination S3 bucket name :param staging_key_prefix: key prefix inside the bucket (e.g. ``"staging/run1/"``) :param manifest_s3_key: S3 object key of the transfer manifest within *bucket* :param manifest_local_path: local path to the transfer manifest file - :param threads: number of parallel download **and** upload threads + :param threads: number of parallel download-and-upload workers :param ftp_host: NCBI FTP hostname - :param limit: optional limit for testing (pass to :func:`download_batch`) + :param limit: optional limit for testing :param dry_run: when ``True``, download but skip all S3 uploads :return: download report extended with ``staged_objects``, ``staging_key_prefix``, ``dry_run`` """ @@ -238,46 +284,89 @@ def download_and_stage( manifest_dest.write_bytes(Path(manifest_local_path).read_bytes()) logger.info("Manifest read from local path: %s", manifest_local_path) - report = download_batch( - manifest_path=manifest_dest, - output_dir=tmp, - threads=threads, - ftp_host=ftp_host, - limit=limit, - ) + with manifest_dest.open() as f: + assembly_paths = [line.strip() for line in f if line.strip() and not line.startswith("#")] - staged_objects = 0 + if limit: + assembly_paths = assembly_paths[:limit] - if not dry_run: - raw_data_dir = tmp / "raw_data" - report_json = tmp / "download_report.json" + logger.info("Starting download & stage of %d assemblies with %d threads", len(assembly_paths), threads) - upload_tasks: list[tuple[Path, str]] = [] - - if raw_data_dir.exists(): - for local_file in sorted(raw_data_dir.rglob("*")): - if local_file.is_file(): - relative = local_file.relative_to(tmp) - dest_prefix = 
f"{bucket}/{staging_key_prefix.rstrip('/')}/{relative.parent}" - upload_tasks.append((local_file, dest_prefix)) - - if report_json.exists(): - upload_tasks.append((report_json, f"{bucket}/{staging_key_prefix.rstrip('/')}")) + pool = ThreadLocalFTP(ftp_host) + lock = threading.Lock() + success_count = 0 + staged_objects = 0 + failed: list[dict[str, str]] = [] + all_stats: list[dict[str, Any]] = [] + + def _download_upload_one(path: str) -> tuple[str, Exception | None]: + nonlocal success_count, staged_objects + + @retry( + retry=retry_if_exception_type(error_temp), + stop=stop_after_attempt(3), + wait=wait_fixed(5), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def _attempt() -> dict[str, Any]: + return download_assembly_to_local(path, tmp, ftp_host=ftp_host, ftp=pool.get()) + + try: + stats = _attempt() + except Exception as e: # noqa: BLE001 + return path, e + + if not dry_run: + _db, assembly_dir_name, _accession = parse_assembly_path(path) + assembly_local_dir = tmp / build_accession_path(assembly_dir_name) + count = _upload_assembly_dir(assembly_local_dir, tmp, bucket, staging_key_prefix) + with lock: + staged_objects += count - def _upload(task: tuple[Path, str]) -> None: - local_file, dest = task - upload_file(local_file, dest, show_progress=False) + with lock: + success_count += 1 + all_stats.append(stats) + return path, None - with tqdm.tqdm(total=len(upload_tasks), unit="file", desc="Staging to S3") as pbar: + try: + with tqdm.tqdm(total=len(assembly_paths), unit="assembly", desc="Downloading & staging") as pbar: with ThreadPoolExecutor(max_workers=threads) as executor: - futures = [executor.submit(_upload, t) for t in upload_tasks] + futures = {executor.submit(_download_upload_one, p): p for p in assembly_paths} for future in as_completed(futures): - future.result() - staged_objects += 1 + path, error = future.result() + if error: + logger.error("FAILED: %s: %s", path, error) + with lock: + failed.append({"path": path, 
"error": str(error)}) pbar.update(1) + finally: + pool.close_all() + + report: dict[str, Any] = { + "timestamp": datetime.now(UTC).isoformat(), + "total_attempted": len(assembly_paths), + "succeeded": success_count, + "failed": len(failed), + "failures": failed, + "assembly_stats": all_stats, + } + if not dry_run: + report_path = tmp / "download_report.json" + with report_path.open("w") as f: + json.dump(report, f, indent=2) + upload_file(report_path, f"{bucket}/{staging_key_prefix.rstrip('/')}", show_progress=False) + staged_objects += 1 logger.info("Staged %d objects to s3://%s/%s", staged_objects, bucket, staging_key_prefix) + logger.info( + "SUMMARY: %d attempted, %d succeeded, %d failed", + len(assembly_paths), + success_count, + len(failed), + ) + return { **report, "staged_objects": staged_objects, diff --git a/tests/pipelines/test_ncbi_ftp_download.py b/tests/pipelines/test_ncbi_ftp_download.py index cadfe559..7c45b476 100644 --- a/tests/pipelines/test_ncbi_ftp_download.py +++ b/tests/pipelines/test_ncbi_ftp_download.py @@ -18,6 +18,21 @@ ) from cdm_data_loaders.utils.s3 import reset_s3_client +_MOCK_STATS = { + "accession": "GCF_000001215.4", + "assembly_dir": "GCF_000001215.4_Release_6_plus_ISO1_MT", + "files_downloaded": 0, + "files_skipped_checksum_mismatch": 0, + "files_without_checksum": 0, +} +_MOCK_STATS_2 = { + "accession": "GCF_000001405.40", + "assembly_dir": "GCF_000001405.40_GRCh38.p14", + "files_downloaded": 0, + "files_skipped_checksum_mismatch": 0, + "files_without_checksum": 0, +} + _DEFAULT_THREADS = 4 _CUSTOM_THREADS = 8 _ALIAS_THREADS = 16 @@ -279,7 +294,7 @@ def test_download_and_stage_manifest_source( manifest_s3_key: str | None, use_local: bool, ) -> None: - """Manifest lines are passed to download_batch regardless of source (S3 or local).""" + """Assembly paths from the manifest are processed regardless of source (S3 or local).""" reset_s3_client() s3 = _make_moto_s3() @@ -290,11 +305,11 @@ def 
test_download_and_stage_manifest_source( manifest_local = tmp_path / "manifest.txt" manifest_local.write_text(_MANIFEST_CONTENT) - captured_content: list[str] = [] + called_paths: list[str] = [] - def _capturing_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 - captured_content.append(Path(manifest_path).read_text()) - return _MOCK_REPORT + def _fake_download(path, output_dir, **kwargs): # noqa: ARG001 + called_paths.append(path) + return _MOCK_STATS import cdm_data_loaders.utils.s3 as s3_mod @@ -302,7 +317,11 @@ def _capturing_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 patch.object(s3_mod, "get_s3_client", return_value=s3), patch.object(s3_mod, "_s3_client", s3), patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), - patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", side_effect=_capturing_batch), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + side_effect=_fake_download, + ), ): download_and_stage( bucket=_TEST_BUCKET, @@ -310,9 +329,11 @@ def _capturing_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 manifest_s3_key=manifest_s3_key, manifest_local_path=manifest_local, dry_run=True, + threads=1, ) - assert captured_content == [_MANIFEST_CONTENT] + expected_paths = [l for l in _MANIFEST_CONTENT.splitlines() if l.strip()] + assert sorted(called_paths) == sorted(expected_paths) reset_s3_client() @@ -363,7 +384,11 @@ def test_download_and_stage_exactly_one_source_required( patch.object(s3_mod, "get_s3_client", return_value=s3), patch.object(s3_mod, "_s3_client", s3), patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), - patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", return_value=_MOCK_REPORT), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + 
"cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + return_value=_MOCK_STATS, + ), ): result = download_and_stage( bucket=_TEST_BUCKET, @@ -372,7 +397,7 @@ def test_download_and_stage_exactly_one_source_required( manifest_local_path=local_path, dry_run=True, ) - assert result["succeeded"] == _MOCK_REPORT["succeeded"] + assert result["succeeded"] == _EXPECTED_ATTEMPTED reset_s3_client() @@ -382,24 +407,22 @@ def test_download_and_stage_exactly_one_source_required( @mock_aws def test_download_and_stage_uploads_to_staging(tmp_path: Path) -> None: - """Files produced in raw_data/ and download_report.json are all uploaded to staging.""" + """Files produced by download_assembly_to_local and download_report.json are all staged to S3.""" reset_s3_client() s3 = _make_moto_s3() manifest_local = tmp_path / "manifest.txt" - manifest_local.write_text(_MANIFEST_CONTENT) + # Single assembly so the fake download writes exactly the files we expect + manifest_local.write_text("/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/\n") assembly_rel = "raw_data/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT" - def _fake_download_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 - out = Path(output_dir) - asm_dir = out / assembly_rel + def _fake_download(path, output_dir, **kwargs): # noqa: ARG001 + asm_dir = Path(output_dir) / assembly_rel asm_dir.mkdir(parents=True) (asm_dir / "genomic.fna.gz").write_bytes(b"fasta_data") (asm_dir / "genomic.fna.gz.md5").write_bytes(b"abc123") - report_path = out / "download_report.json" - report_path.write_text(json.dumps(_MOCK_REPORT)) - return _MOCK_REPORT + return {**_MOCK_STATS, "files_downloaded": 2} import cdm_data_loaders.utils.s3 as s3_mod @@ -407,7 +430,11 @@ def _fake_download_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 patch.object(s3_mod, "get_s3_client", return_value=s3), patch.object(s3_mod, "_s3_client", s3), 
patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), - patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", side_effect=_fake_download_batch), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + side_effect=_fake_download, + ), ): report = download_and_stage( bucket=_TEST_BUCKET, @@ -443,11 +470,11 @@ def test_download_and_stage_dry_run_skips_upload(tmp_path: Path) -> None: manifest_local = tmp_path / "manifest.txt" manifest_local.write_text(_MANIFEST_CONTENT) - def _fake_download_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 + def _fake_download(path, output_dir, **kwargs): # noqa: ARG001 asm_dir = Path(output_dir) / "raw_data/GCF/000/001/215/GCF_000001215.4" - asm_dir.mkdir(parents=True) + asm_dir.mkdir(parents=True, exist_ok=True) (asm_dir / "genomic.fna.gz").write_bytes(b"fasta") - return _MOCK_REPORT + return _MOCK_STATS import cdm_data_loaders.utils.s3 as s3_mod @@ -455,7 +482,11 @@ def _fake_download_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 patch.object(s3_mod, "get_s3_client", return_value=s3), patch.object(s3_mod, "_s3_client", s3), patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), - patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", side_effect=_fake_download_batch), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + side_effect=_fake_download, + ), ): report = download_and_stage( bucket=_TEST_BUCKET, @@ -485,7 +516,7 @@ def _fake_download_batch(manifest_path, output_dir, **kwargs): # noqa: ARG001 ) @mock_aws def test_download_and_stage_limit_forwarded(tmp_path: Path, limit: int) -> None: - """The limit parameter is forwarded verbatim to download_batch.""" + """The limit parameter truncates the number of 
assemblies processed.""" reset_s3_client() s3 = _make_moto_s3() @@ -498,7 +529,11 @@ def test_download_and_stage_limit_forwarded(tmp_path: Path, limit: int) -> None: patch.object(s3_mod, "get_s3_client", return_value=s3), patch.object(s3_mod, "_s3_client", s3), patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), - patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", return_value=_MOCK_REPORT) as mock_batch, + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + "cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + return_value=_MOCK_STATS, + ) as mock_dl, ): download_and_stage( bucket=_TEST_BUCKET, @@ -508,7 +543,9 @@ def test_download_and_stage_limit_forwarded(tmp_path: Path, limit: int) -> None: dry_run=True, ) - assert mock_batch.call_args.kwargs["limit"] == limit + # The manifest has 2 entries; limit caps how many were processed + expected_calls = min(limit, _EXPECTED_ATTEMPTED) + assert mock_dl.call_count == expected_calls reset_s3_client() @@ -518,7 +555,7 @@ def test_download_and_stage_limit_forwarded(tmp_path: Path, limit: int) -> None: @mock_aws def test_download_and_stage_report_shape(tmp_path: Path) -> None: - """Return value includes all download_batch keys plus staged_objects, staging_key_prefix, dry_run.""" + """Return value contains all expected keys including staged_objects, staging_key_prefix, dry_run.""" reset_s3_client() s3 = _make_moto_s3() @@ -531,7 +568,11 @@ def test_download_and_stage_report_shape(tmp_path: Path) -> None: patch.object(s3_mod, "get_s3_client", return_value=s3), patch.object(s3_mod, "_s3_client", s3), patch("cdm_data_loaders.pipelines.ncbi_ftp_download.get_s3_client", return_value=s3), - patch("cdm_data_loaders.pipelines.ncbi_ftp_download.download_batch", return_value=_MOCK_REPORT), + patch("cdm_data_loaders.pipelines.ncbi_ftp_download.ThreadLocalFTP"), + patch( + 
"cdm_data_loaders.pipelines.ncbi_ftp_download.download_assembly_to_local", + return_value=_MOCK_STATS, + ), ): report = download_and_stage( bucket=_TEST_BUCKET, @@ -540,11 +581,12 @@ def test_download_and_stage_report_shape(tmp_path: Path) -> None: dry_run=True, ) - assert report == { - **_MOCK_REPORT, - "staged_objects": 0, - "staging_key_prefix": _STAGING_PREFIX, - "dry_run": True, - } + for key in ("timestamp", "total_attempted", "succeeded", "failed", "failures", "assembly_stats"): + assert key in report + assert report["staged_objects"] == 0 + assert report["staging_key_prefix"] == _STAGING_PREFIX + assert report["dry_run"] is True + assert report["total_attempted"] == _EXPECTED_ATTEMPTED + assert report["succeeded"] == _EXPECTED_ATTEMPTED reset_s3_client() From c823283ffb3ec94c499f766c754eb35080e42de9 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 29 Apr 2026 13:58:56 -0700 Subject: [PATCH 59/76] Potential fix for pull request finding 'Unused global variable' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/pipelines/test_ncbi_ftp_download.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/pipelines/test_ncbi_ftp_download.py b/tests/pipelines/test_ncbi_ftp_download.py index 7c45b476..6c016804 100644 --- a/tests/pipelines/test_ncbi_ftp_download.py +++ b/tests/pipelines/test_ncbi_ftp_download.py @@ -25,13 +25,6 @@ "files_skipped_checksum_mismatch": 0, "files_without_checksum": 0, } -_MOCK_STATS_2 = { - "accession": "GCF_000001405.40", - "assembly_dir": "GCF_000001405.40_GRCh38.p14", - "files_downloaded": 0, - "files_skipped_checksum_mismatch": 0, - "files_without_checksum": 0, -} _DEFAULT_THREADS = 4 _CUSTOM_THREADS = 8 From cf930134c16f616e9386eb7ef660fb4c28b63c85 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 29 Apr 2026 13:59:12 -0700 Subject: [PATCH 60/76] Potential fix for pull request finding 'Unused global variable' Co-authored-by: Copilot Autofix powered by 
AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/pipelines/test_ncbi_ftp_download.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/pipelines/test_ncbi_ftp_download.py b/tests/pipelines/test_ncbi_ftp_download.py index 6c016804..99a9ebfd 100644 --- a/tests/pipelines/test_ncbi_ftp_download.py +++ b/tests/pipelines/test_ncbi_ftp_download.py @@ -254,15 +254,6 @@ def test_handles_download_failure(self, tmp_path: Path) -> None: _TEST_BUCKET = "test-bucket" _STAGING_PREFIX = "staging/run1/" -_MOCK_REPORT = { - "timestamp": "2026-01-01T00:00:00+00:00", - "total_attempted": 2, - "succeeded": 2, - "failed": 0, - "failures": [], - "assembly_stats": [], -} - def _make_moto_s3(): """Return a moto-backed S3 client with the test bucket created.""" From 2b45416831ff05e4b7e1ba75f62bc21098f2eea6 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 29 Apr 2026 14:13:18 -0700 Subject: [PATCH 61/76] fix tests --- src/cdm_data_loaders/ncbi_ftp/promote.py | 12 +++++++++--- tests/integration/test_promote_e2e.py | 10 ++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 10532cfd..3be925fe 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -19,6 +19,7 @@ from cdm_data_loaders.ncbi_ftp.metadata import ( DescriptorResource, archive_descriptor, + build_descriptor_key, create_descriptor, upload_descriptor, ) @@ -27,6 +28,7 @@ copy_object, delete_object, get_s3_client, + object_exists, upload_file, ) @@ -247,9 +249,13 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 # Write descriptor and delete staged files immediately after a fully successful assembly if assembly_failed == 0 and promoted_keys: try: - descriptor = create_descriptor(adir, acc, resources) - upload_descriptor(descriptor, adir, lakehouse_bucket, lakehouse_key_prefix, dry_run=False) - descriptors_written += 1 + descriptor_key 
= build_descriptor_key(adir, lakehouse_key_prefix) + if object_exists(f"{lakehouse_bucket}/{descriptor_key}"): + logger.debug("Descriptor already exists, skipping: %s", descriptor_key) + else: + descriptor = create_descriptor(adir, acc, resources) + upload_descriptor(descriptor, adir, lakehouse_bucket, lakehouse_key_prefix, dry_run=False) + descriptors_written += 1 except Exception: logger.exception("Failed to write descriptor for %s", adir) diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index 7e28c796..ed3f5dd1 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -114,7 +114,12 @@ class TestPromoteIdempotent: """Promoting the same staging data twice should succeed without errors.""" def test_promote_idempotent(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: - """Second promote succeeds and produces the same final state.""" + """Second promote on empty staging succeeds and leaves the lakehouse unchanged. + + After the first promote, staged files are deleted. A second run therefore + finds nothing to promote — which is correct and expected. The lakehouse + contents must be identical after both runs. 
+ """ s3 = minio_s3_client _stage_assembly(s3, staging_test_bucket, ASSEMBLY_DIR_A) @@ -135,8 +140,9 @@ def test_promote_idempotent(self, minio_s3_client: object, test_bucket: str, sta keys_after_second = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") assert report1["failed"] == 0 + assert report1["promoted"] >= 1 assert report2["failed"] == 0 - assert report2["promoted"] >= 1 + assert report2["promoted"] == 0 # staging was cleared by the first run assert keys_after_first == keys_after_second From f3c09842d13c2294367688ca549d3788cd7e6918 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 29 Apr 2026 15:16:12 -0700 Subject: [PATCH 62/76] keep ftp connection alive Co-authored-by: Copilot --- .../pipelines/ncbi_ftp_download.py | 14 ++++++------- src/cdm_data_loaders/utils/ftp_client.py | 20 ++++++++++++++++++- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 57a73154..4a2597df 100644 --- a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -18,7 +18,7 @@ import tqdm from pydantic import AliasChoices, Field -from tenacity import before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_fixed +from tenacity import before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_fixed, wait_exponential from pydantic_settings import BaseSettings, SettingsConfigDict from cdm_data_loaders.ncbi_ftp.assembly import ( @@ -148,9 +148,9 @@ def _download_one(path: str) -> tuple[str, Exception | None]: nonlocal success_count @retry( - retry=retry_if_exception_type(error_temp), - stop=stop_after_attempt(3), - wait=wait_fixed(5), + retry=retry_if_exception_type((error_temp, BrokenPipeError, ConnectionResetError, EOFError)), + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=5, max=60), reraise=True, before_sleep=before_sleep_log(logger, 
logging.WARNING), ) @@ -303,9 +303,9 @@ def _download_upload_one(path: str) -> tuple[str, Exception | None]: nonlocal success_count, staged_objects @retry( - retry=retry_if_exception_type(error_temp), - stop=stop_after_attempt(3), - wait=wait_fixed(5), + retry=retry_if_exception_type((error_temp, BrokenPipeError, ConnectionResetError, EOFError)), + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=5, max=60), reraise=True, before_sleep=before_sleep_log(logger, logging.WARNING), ) diff --git a/src/cdm_data_loaders/utils/ftp_client.py b/src/cdm_data_loaders/utils/ftp_client.py index 902779e5..bd372924 100644 --- a/src/cdm_data_loaders/utils/ftp_client.py +++ b/src/cdm_data_loaders/utils/ftp_client.py @@ -153,8 +153,26 @@ def __init__(self, host: str, timeout: int = DEFAULT_TIMEOUT) -> None: self._connections: list[FTP] = [] def get(self) -> FTP: - """Return the FTP connection for the current thread, creating one if needed.""" + """Return the FTP connection for the current thread, reconnecting if stale. + + Sends a NOOP to verify the connection is still alive before returning it. + If the server has closed the connection (e.g. after an idle timeout or + session limit), the dead socket is discarded and a fresh connection is + established transparently. 
+ """ ftp = getattr(self._local, "ftp", None) + if ftp is not None: + try: + ftp.voidcmd("NOOP") + except Exception: + # Connection is stale — discard it and reconnect below + with contextlib.suppress(Exception): + ftp.quit() + with self._lock: + with contextlib.suppress(ValueError): + self._connections.remove(ftp) + ftp = None + self._local.ftp = None if ftp is None: ftp = connect_ftp(self._host, self._timeout) self._local.ftp = ftp From b6f0bddcc3edb1a2f401ceff14ee326db027980c Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 29 Apr 2026 15:21:45 -0700 Subject: [PATCH 63/76] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- src/cdm_data_loaders/pipelines/ncbi_ftp_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 4a2597df..2cd6a0f4 100644 --- a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -18,7 +18,7 @@ import tqdm from pydantic import AliasChoices, Field -from tenacity import before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_fixed, wait_exponential +from tenacity import before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_exponential from pydantic_settings import BaseSettings, SettingsConfigDict from cdm_data_loaders.ncbi_ftp.assembly import ( From 5fcada9b48241fe9ae6caec1381908d0d2f64168 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Wed, 29 Apr 2026 15:41:14 -0700 Subject: [PATCH 64/76] smooth progress bars --- src/cdm_data_loaders/pipelines/ncbi_ftp_download.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 2cd6a0f4..062575f6 100644 --- 
a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -168,7 +168,9 @@ def _attempt() -> dict[str, Any]: return path, None try: - with tqdm.tqdm(total=len(assembly_paths), unit="assembly", desc="Downloading from NCBI FTP") as pbar: + with tqdm.tqdm( + total=len(assembly_paths), unit="assembly", desc="Downloading from NCBI FTP", smoothing=0.01 + ) as pbar: with ThreadPoolExecutor(max_workers=threads) as executor: futures = {executor.submit(_download_one, p): p for p in assembly_paths} for future in as_completed(futures): @@ -330,7 +332,9 @@ def _attempt() -> dict[str, Any]: return path, None try: - with tqdm.tqdm(total=len(assembly_paths), unit="assembly", desc="Downloading & staging") as pbar: + with tqdm.tqdm( + total=len(assembly_paths), unit="assembly", desc="Downloading & staging", smoothing=0.01 + ) as pbar: with ThreadPoolExecutor(max_workers=threads) as executor: futures = {executor.submit(_download_upload_one, p): p for p in assembly_paths} for future in as_completed(futures): From 6b78cb01832b51c4ec57a6b4cf698edbea52e97d Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 08:25:55 -0700 Subject: [PATCH 65/76] add promote progress bar and reduce logger output on dry runs Co-authored-by: Copilot --- notebooks/ncbi_ftp_promote.ipynb | 43 +++++++++++++++++++++-- src/cdm_data_loaders/ncbi_ftp/metadata.py | 4 +-- src/cdm_data_loaders/ncbi_ftp/promote.py | 14 ++++++-- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb index 1be65156..2691251f 100644 --- a/notebooks/ncbi_ftp_promote.ipynb +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -100,6 +100,11 @@ " \"io/matt-cohere/staging/run1/input/transfer_manifest.txt\" # e.g. 
\"staging/transfer_manifest.txt\"\n", ")\n", "\n", + "# Local path to transfer_manifest.txt (used when the manifest has not been uploaded to S3).\n", + "# Used only for the object-count estimate in the scan step; set to None to skip.\n", + "# format: local file path\n", + "MANIFEST_LOCAL_PATH: str | None = None # e.g. \"output/transfer_manifest.txt\"\n", + "\n", "# Final Lakehouse path prefix\n", "# format: S3 key prefix within LAKEHOUSE_BUCKET (no scheme, no bucket)\n", "LAKEHOUSE_KEY_PREFIX = DEFAULT_LAKEHOUSE_KEY_PREFIX\n", @@ -113,6 +118,7 @@ "print(f\"Updated manifest: {UPDATED_MANIFEST_PATH}\")\n", "print(f\"NCBI release: {NCBI_RELEASE}\")\n", "print(f\"Manifest S3 key: {MANIFEST_S3_KEY}\")\n", + "print(f\"Manifest local path: {MANIFEST_LOCAL_PATH}\")\n", "print(f\"Lakehouse prefix: {LAKEHOUSE_KEY_PREFIX}\")\n", "print(f\"Dry-run: {DRY_RUN}\")" ] @@ -148,12 +154,45 @@ "source": [ "\"\"\"Scan staged files and display summary.\"\"\"\n", "\n", + "import tqdm\n", + "\n", "s3 = get_s3_client()\n", "paginator = s3.get_paginator(\"list_objects_v2\")\n", "\n", + "# Estimate total objects from manifest so tqdm can show a percentage.\n", + "# Each assembly typically produces ~11 data files + ~10 .md5 sidecars = ~21 objects.\n", + "_FILES_PER_ASSEMBLY_EST = 21\n", + "estimated_total = None\n", + "if MANIFEST_S3_KEY:\n", + " try:\n", + " _resp = s3.get_object(Bucket=STAGING_BUCKET, Key=MANIFEST_S3_KEY)\n", + " _lines = [\n", + " ln.strip() for ln in _resp[\"Body\"].read().decode().splitlines() if ln.strip() and not ln.startswith(\"#\")\n", + " ]\n", + " estimated_total = len(_lines) * _FILES_PER_ASSEMBLY_EST\n", + " print(f\"Manifest (S3) has {len(_lines)} assemblies → estimated ~{estimated_total} staged objects\")\n", + " except Exception as e:\n", + " print(f\"Could not read S3 manifest for estimate: {e}\")\n", + "elif MANIFEST_LOCAL_PATH:\n", + " try:\n", + " from pathlib import Path\n", + "\n", + " _lines = [\n", + " ln.strip()\n", + " for ln in 
Path(MANIFEST_LOCAL_PATH).read_text().splitlines()\n", + " if ln.strip() and not ln.startswith(\"#\")\n", + " ]\n", + " estimated_total = len(_lines) * _FILES_PER_ASSEMBLY_EST\n", + " print(f\"Manifest (local) has {len(_lines)} assemblies → estimated ~{estimated_total} staged objects\")\n", + " except Exception as e:\n", + " print(f\"Could not read local manifest for estimate: {e}\")\n", + "\n", "staged: list[str] = []\n", - "for page in paginator.paginate(Bucket=STAGING_BUCKET, Prefix=STAGING_KEY_PREFIX):\n", - " staged.extend(obj[\"Key\"] for obj in page.get(\"Contents\", []))\n", + "with tqdm.tqdm(total=estimated_total, unit=\"obj\", desc=\"Scanning staging prefix\", dynamic_ncols=True) as pbar:\n", + " for page in paginator.paginate(Bucket=STAGING_BUCKET, Prefix=STAGING_KEY_PREFIX):\n", + " keys = [obj[\"Key\"] for obj in page.get(\"Contents\", [])]\n", + " staged.extend(keys)\n", + " pbar.update(len(keys))\n", "\n", "sidecars = [k for k in staged if k.endswith((\".md5\", \".crc64nvme\"))]\n", "data_files = [k for k in staged if k not in set(sidecars)]\n", diff --git a/src/cdm_data_loaders/ncbi_ftp/metadata.py b/src/cdm_data_loaders/ncbi_ftp/metadata.py index 43a9c04f..a10b3c76 100644 --- a/src/cdm_data_loaders/ncbi_ftp/metadata.py +++ b/src/cdm_data_loaders/ncbi_ftp/metadata.py @@ -182,7 +182,7 @@ def upload_descriptor( key = build_descriptor_key(assembly_dir, key_prefix) if dry_run: - logger.info("[dry-run] would upload descriptor: s3://%s/%s", bucket, key) + logger.debug("[dry-run] would upload descriptor: s3://%s/%s", bucket, key) return key s3 = get_s3_client() @@ -227,7 +227,7 @@ def archive_descriptor( # noqa: PLR0913 archive_key = build_archive_descriptor_key(assembly_dir, release_tag, key_prefix) if dry_run: - logger.info("[dry-run] would archive descriptor: s3://%s/%s -> %s", bucket, source_key, archive_key) + logger.debug("[dry-run] would archive descriptor: s3://%s/%s -> %s", bucket, source_key, archive_key) return True s3 = get_s3_client() diff 
--git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 3be925fe..3f6ceebf 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -178,6 +178,7 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 assembly_files[(adir_match.group(1), acc_match.group(1))].append(staged_key) total_files = sum(len(v) for v in assembly_files.values()) + _dry_run_log_count = 0 with tqdm.tqdm(total=total_files, unit="file", desc="Promoting") as pbar: for (adir, acc), files in assembly_files.items(): assembly_failed = 0 @@ -190,7 +191,11 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 final_key_path = PurePosixPath(final_key) if dry_run: - logger.debug("[dry-run] would promote: %s -> %s", staged_key, final_key) + if _dry_run_log_count < 10: + logger.info("[dry-run] would promote: %s -> %s", staged_key, final_key) + else: + logger.debug("[dry-run] would promote: %s -> %s", staged_key, final_key) + _dry_run_log_count += 1 promoted += 1 pbar.update(1) continue @@ -312,6 +317,7 @@ def _archive_assemblies( # noqa: PLR0913 with Path(manifest_local_path).open() as f: accessions = [line.strip() for line in f if line.strip()] + _dry_run_log_count = 0 for accession in tqdm.tqdm(accessions, unit="accession", desc="Archiving"): m = re.match(r"(GC[AF])_(\d{3})(\d{3})(\d{3})\.\d+", accession) if not m: @@ -344,7 +350,11 @@ def _archive_assemblies( # noqa: PLR0913 archive_key = f"{lakehouse_key_prefix}archive/{release_tag}/{rel}" if dry_run: - logger.info("[dry-run] would archive: %s -> %s", source_key, archive_key) + if _dry_run_log_count < 10: + logger.info("[dry-run] would archive: %s -> %s", source_key, archive_key) + else: + logger.debug("[dry-run] would archive: %s -> %s", source_key, archive_key) + _dry_run_log_count += 1 archived += 1 continue From afcae5c6fc4419edc0083b1866371a816b7c070e Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 08:45:42 -0700 Subject: [PATCH 66/76] print 
removed manifest path --- notebooks/ncbi_ftp_promote.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb index 2691251f..e7995dc8 100644 --- a/notebooks/ncbi_ftp_promote.ipynb +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -115,6 +115,7 @@ "print(f\"Staging bucket: {STAGING_BUCKET}\")\n", "print(f\"Lakehouse bucket: {LAKEHOUSE_BUCKET}\")\n", "print(f\"Staging key prefix: {STAGING_KEY_PREFIX}\")\n", + "print(f\"Removed manifest: {REMOVED_MANIFEST_PATH}\")\n", "print(f\"Updated manifest: {UPDATED_MANIFEST_PATH}\")\n", "print(f\"NCBI release: {NCBI_RELEASE}\")\n", "print(f\"Manifest S3 key: {MANIFEST_S3_KEY}\")\n", From 2e8da0ce23b17624a5306344de7c399d36788a83 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 08:52:21 -0700 Subject: [PATCH 67/76] optimize array creation --- notebooks/ncbi_ftp_promote.ipynb | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/notebooks/ncbi_ftp_promote.ipynb b/notebooks/ncbi_ftp_promote.ipynb index e7995dc8..6e16e1a4 100644 --- a/notebooks/ncbi_ftp_promote.ipynb +++ b/notebooks/ncbi_ftp_promote.ipynb @@ -195,8 +195,13 @@ " staged.extend(keys)\n", " pbar.update(len(keys))\n", "\n", - "sidecars = [k for k in staged if k.endswith((\".md5\", \".crc64nvme\"))]\n", - "data_files = [k for k in staged if k not in set(sidecars)]\n", + "sidecars = []\n", + "data_files = []\n", + "for k in staged:\n", + " if k.endswith((\".md5\", \".crc64nvme\")):\n", + " sidecars.append(k)\n", + " else:\n", + " data_files.append(k)\n", "\n", "print(f\"Staged objects: {len(staged)}\")\n", "print(f\" Data files: {len(data_files)}\")\n", From ae929164cf41200d905a212d1b80d6c5514afeca Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 09:47:59 -0700 Subject: [PATCH 68/76] remove checksum type from copy headers --- src/cdm_data_loaders/utils/s3.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cdm_data_loaders/utils/s3.py 
b/src/cdm_data_loaders/utils/s3.py index 07c9a47d..105f8d73 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -476,7 +476,6 @@ def copy_object( Bucket=new_s3_bucket, Key=new_s3_key, **extra, - **DEFAULT_EXTRA_ARGS, ) @@ -529,7 +528,6 @@ def copy_directory(current_s3_path: str, new_s3_path: str) -> tuple[dict[str, st CopySource={"Bucket": current_s3_bucket, "Key": current_key}, Bucket=new_s3_bucket, Key=new_key, - **DEFAULT_EXTRA_ARGS, ) if resp["ResponseMetadata"]["HTTPStatusCode"] == SUCCESS_RESPONSE: successes[source_path] = dest_path From 1a8f54a00b25b782dab533f6b0f7c4bbdd1bcde2 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 11:24:07 -0700 Subject: [PATCH 69/76] use tags for metadata on copy to avoid boto errors Co-authored-by: Copilot --- src/cdm_data_loaders/ncbi_ftp/metadata.py | 2 +- src/cdm_data_loaders/ncbi_ftp/promote.py | 4 +-- src/cdm_data_loaders/utils/s3.py | 44 +++++++++++++---------- tests/integration/conftest.py | 20 +++++++++-- tests/integration/test_full_pipeline.py | 6 ++-- tests/integration/test_promote_e2e.py | 12 +++---- tests/ncbi_ftp/test_promote.py | 7 ++-- tests/utils/test_s3.py | 30 +++++++++------- 8 files changed, 78 insertions(+), 47 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/metadata.py b/src/cdm_data_loaders/ncbi_ftp/metadata.py index a10b3c76..8098e9c4 100644 --- a/src/cdm_data_loaders/ncbi_ftp/metadata.py +++ b/src/cdm_data_loaders/ncbi_ftp/metadata.py @@ -247,7 +247,7 @@ def archive_descriptor( # noqa: PLR0913 copy_object( f"{bucket}/{source_key}", f"{bucket}/{archive_key}", - metadata={ + tags={ "ncbi_last_release": release_tag, "archive_reason": archive_reason, "archive_date": datestamp, diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 3f6ceebf..7920d7b7 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -217,7 +217,7 @@ def _promote_data_files( # noqa: 
PLR0913, PLR0915 upload_succeeded = upload_file( tmp_path, f"{lakehouse_bucket}/{final_key_path.parent}", - metadata=metadata, + tags=metadata, object_name=final_key_path.name, show_progress=False, ) @@ -362,7 +362,7 @@ def _archive_assemblies( # noqa: PLR0913 copy_object( f"{lakehouse_bucket}/{source_key}", f"{lakehouse_bucket}/{archive_key}", - metadata={ + tags={ "ncbi_last_release": release_tag, "archive_reason": archive_reason, "archive_date": datestamp, diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index 105f8d73..0766f248 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -200,7 +200,7 @@ def upload_file( local_file_path: Path | str, destination_dir: str, object_name: str | None = None, - metadata: dict[str, str] | None = None, + tags: dict[str, str] | None = None, show_progress: bool = True, ) -> bool: """Upload an object to an S3 bucket. @@ -232,14 +232,14 @@ def upload_file( object_name = local_file_path.name s3_path = f"{destination_dir.removesuffix('/')}/{object_name}" - if metadata is None and object_exists(s3_path): + if tags is None and object_exists(s3_path): logger.debug("File already present: %s", s3_path) return True s3 = get_s3_client() (bucket, key) = split_s3_path(s3_path) - extra_args = {**DEFAULT_EXTRA_ARGS, **(({"Metadata": metadata}) if metadata is not None else {})} + extra_args = {**DEFAULT_EXTRA_ARGS, **(({"Metadata": tags}) if tags is not None else {})} # Upload the file logger.debug("uploading %s to %s", str(local_file_path), s3_path) @@ -439,13 +439,18 @@ def upload_dir( def copy_object( current_s3_path: str, new_s3_path: str, - metadata: dict[str, str] | None = None, + tags: dict[str, str] | None = None, ) -> dict[str, Any]: - """Copy an object from one place to another, adding in a CRC64NVME checksum. + """Copy an object from one place to another, inheriting the source user metadata. 
- When *metadata* is supplied the destination object carries exactly those - key/value pairs (``MetadataDirective='REPLACE'``). When *metadata* is - ``None`` (the default) the source metadata is inherited. + Source user metadata (e.g. ``md5``) is preserved on the destination because + ``MetadataDirective`` is omitted, which defaults to ``COPY``. + + When *metadata* is supplied it is applied as S3 object **tags** via a + separate ``put_object_tagging`` call rather than as user metadata. This + avoids passing ``MetadataDirective=REPLACE`` to ``CopyObject``, which causes + botocore to inject an unsigned checksum header that AWS rejects with + AccessDenied. A successful copy operation will return a response where resp["ResponseMetadata"]["HTTPStatusCode"] == 200 @@ -457,27 +462,30 @@ def copy_object( :type current_s3_path: str :param new_s3_path: the desired new file path on s3, INCLUDING the bucket name :type new_s3_path: str - :param metadata: user metadata to set on the destination object; when provided the source metadata is replaced - :type metadata: dict[str, str] | None - :return: dictionary containing response + :param tags: key-value pairs to set as S3 object tags on the destination + :type tags: dict[str, str] | None + :return: dictionary containing response from the copy operation :rtype: dict[str, Any] """ s3 = get_s3_client() (current_s3_bucket, current_s3_key) = split_s3_path(current_s3_path) (new_s3_bucket, new_s3_key) = split_s3_path(new_s3_path) - extra: dict[str, Any] = {} - if metadata is not None: - extra["Metadata"] = metadata - extra["MetadataDirective"] = "REPLACE" - - return s3.copy_object( + resp = s3.copy_object( CopySource={"Bucket": current_s3_bucket, "Key": current_s3_key}, Bucket=new_s3_bucket, Key=new_s3_key, - **extra, ) + if tags: + s3.put_object_tagging( + Bucket=new_s3_bucket, + Key=new_s3_key, + Tagging={"TagSet": [{"Key": k, "Value": v} for k, v in tags.items()]}, + ) + + return resp + def copy_directory(current_s3_path: str, 
new_s3_path: str) -> tuple[dict[str, str], dict[str, Any]]: """Copy all objects under a given S3 prefix to a new prefix. diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index f27e4202..08e9fe00 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -252,12 +252,28 @@ def list_all_keys(s3: botocore.client.BaseClient, bucket: str, prefix: str = "") def get_object_metadata(s3: botocore.client.BaseClient, bucket: str, key: str) -> dict[str, Any]: - """Return the user metadata dict for an S3 object. + """Return the S3 user metadata dict for an S3 object (from HeadObject). :param s3: boto3 S3 client :param bucket: bucket name :param key: object key - :return: metadata dict + :return: user metadata dict """ resp = s3.head_object(Bucket=bucket, Key=key) return resp.get("Metadata", {}) + + +def get_object_tags(s3: botocore.client.BaseClient, bucket: str, key: str) -> dict[str, Any]: + """Return the S3 object tags dict for an S3 object (from GetObjectTagging). + + Archive attributes (e.g. ``archive_reason``) are stored as S3 object tags + rather than user metadata to avoid the botocore unsigned-header bug with + ``MetadataDirective=REPLACE``. 
+ + :param s3: boto3 S3 client + :param bucket: bucket name + :param key: object key + :return: tags dict + """ + resp = s3.get_object_tagging(Bucket=bucket, Key=key) + return {t["Key"]: t["Value"] for t in resp.get("TagSet", [])} diff --git a/tests/integration/test_full_pipeline.py b/tests/integration/test_full_pipeline.py index e8f6e4a6..af8f164b 100644 --- a/tests/integration/test_full_pipeline.py +++ b/tests/integration/test_full_pipeline.py @@ -29,7 +29,7 @@ from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 from cdm_data_loaders.pipelines.ncbi_ftp_download import download_batch -from .conftest import get_object_metadata, list_all_keys, stage_files_to_minio, staging_test_bucket # noqa: F401 +from .conftest import get_object_metadata, get_object_tags, list_all_keys, stage_files_to_minio, staging_test_bucket # noqa: F401 STABLE_PREFIX = "900" STAGING_PREFIX = "staging/run1/" @@ -215,8 +215,8 @@ def test_full_pipeline_incremental( if promote2["archived"] > 0: assert len(archive_keys) >= 1 for key in archive_keys: - meta = get_object_metadata(s3, test_bucket, key) - assert meta.get("archive_reason") == "updated" + tags = get_object_tags(s3, test_bucket, key) + assert tags.get("archive_reason") == "updated" # Final Lakehouse path should still have files final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index ed3f5dd1..7b9b8d48 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -21,7 +21,7 @@ ) from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 -from .conftest import get_object_metadata, list_all_keys, seed_lakehouse, staging_test_bucket # noqa: F401 +from .conftest import get_object_metadata, get_object_tags, list_all_keys, seed_lakehouse, staging_test_bucket # noqa: F401 from pathlib import Path @@ -188,9 +188,9 @@ def 
test_archive_updated( # Verify archive metadata for key in archive_keys: - meta = get_object_metadata(s3, test_bucket, key) - assert meta.get("archive_reason") == "updated" - assert meta.get("ncbi_last_release") == "2024-01" + tags = get_object_tags(s3, test_bucket, key) + assert tags.get("archive_reason") == "updated" + assert tags.get("ncbi_last_release") == "2024-01" @pytest.mark.integration @@ -231,8 +231,8 @@ def test_archive_removed( # Verify archive metadata for key in archive_keys: - meta = get_object_metadata(s3, test_bucket, key) - assert meta.get("archive_reason") == "replaced_or_suppressed" + tags = get_object_tags(s3, test_bucket, key) + assert tags.get("archive_reason") == "replaced_or_suppressed" # Verify source objects are deleted rel = build_accession_path(ASSEMBLY_DIR_A) diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index db19bcdb..9c61ca6e 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -153,8 +153,11 @@ def test_archive_assemblies_updated_no_delete( f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" ) resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=archive_key) - assert resp["Metadata"]["archive_reason"] == "updated" - assert resp["Metadata"]["ncbi_last_release"] == "2024-06" + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + tag_resp = mock_s3_client_no_checksum.get_object_tagging(Bucket=TEST_BUCKET, Key=archive_key) + tags = {t["Key"]: t["Value"] for t in tag_resp["TagSet"]} + assert tags["archive_reason"] == "updated" + assert tags["ncbi_last_release"] == "2024-06" @pytest.mark.s3 diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index 7e68acc7..c88bbadd 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -899,7 +899,7 @@ def test_delete_object_removes_object(mock_s3_client: Any, bucket: str, protocol def 
test_upload_file_with_metadata_attaches_metadata(mock_s3_client: Any, sample_file: Path, bucket: str) -> None: """Verify that upload_file with metadata stores user metadata on the uploaded object.""" metadata = {"md5": "abc123", "source": "ncbi"} - result = upload_file(sample_file, f"{bucket}/uploads", metadata=metadata) + result = upload_file(sample_file, f"{bucket}/uploads", tags=metadata) assert result is True resp = mock_s3_client.head_object(Bucket=bucket, Key=f"uploads/{sample_file.name}") @@ -910,7 +910,7 @@ def test_upload_file_with_metadata_attaches_metadata(mock_s3_client: Any, sample @pytest.mark.s3 def test_upload_file_with_metadata_custom_object_name(mock_s3_client: Any, sample_file: Path) -> None: """Verify that the object_name parameter overrides the filename.""" - result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads", metadata={"k": "v"}, object_name="renamed.txt") + result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads", tags={"k": "v"}, object_name="renamed.txt") assert result is True obj = mock_s3_client.get_object(Bucket=CDM_LAKE_BUCKET, Key="uploads/renamed.txt") assert obj["Body"].read() == b"hello s3" @@ -920,7 +920,7 @@ def test_upload_file_with_metadata_custom_object_name(mock_s3_client: Any, sampl def test_upload_file_with_metadata_overwrites_existing(mock_s3_client: Any, sample_file: Path) -> None: """Verify that upload_file with metadata uploads even when the object already exists.""" mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}", Body=b"old") - result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads", metadata={"new": "true"}) + result = upload_file(sample_file, f"{CDM_LAKE_BUCKET}/uploads", tags={"new": "true"}) assert result is True obj = mock_s3_client.get_object(Bucket=CDM_LAKE_BUCKET, Key=f"uploads/{sample_file.name}") assert obj["Body"].read() == b"hello s3" @@ -931,7 +931,7 @@ def test_upload_file_with_metadata_overwrites_existing(mock_s3_client: Any, samp def 
test_upload_file_with_metadata_raises_on_empty_destination(sample_file: Path) -> None: """Verify ValueError when destination_dir is empty.""" with pytest.raises(ValueError, match="No destination directory"): - upload_file(sample_file, "", metadata={"k": "v"}) + upload_file(sample_file, "", tags={"k": "v"}) @pytest.mark.usefixtures("mock_s3_client") @@ -939,7 +939,7 @@ def test_upload_file_with_metadata_raises_on_empty_destination(sample_file: Path @pytest.mark.s3 def test_upload_file_with_metadata_accepts_str_and_path(sample_file: Path, path_type: type[str] | type[Path]) -> None: """Verify that upload_file with metadata accepts both str and Path.""" - result = upload_file(path_type(sample_file), f"{CDM_LAKE_BUCKET}/uploads", metadata={}) + result = upload_file(path_type(sample_file), f"{CDM_LAKE_BUCKET}/uploads", tags={}) assert result is True @@ -978,23 +978,27 @@ def test_head_object_with_protocols(mock_s3_client: Any, protocol: str) -> None: @pytest.mark.parametrize("destination", BUCKETS) @pytest.mark.s3 def test_copy_object_with_metadata_replaces_metadata(mocked_s3_client_no_checksum: Any, destination: str) -> None: - """Verify that copy_object with metadata copies and replaces metadata.""" + """Verify that copy_object with tags copies the object and sets S3 object tags on the destination.""" mocked_s3_client_no_checksum.put_object( Bucket=CDM_LAKE_BUCKET, Key="src/file.txt", Body=b"archive me", Metadata={"old_key": "old_val"} ) - new_metadata = {"archive_reason": "replaced", "archive_date": "2026-04-16"} + new_tags = {"archive_reason": "replaced", "archive_date": "2026-04-16"} response = copy_object( f"{CDM_LAKE_BUCKET}/src/file.txt", f"{destination}/archive/file.txt", - metadata=new_metadata, + tags=new_tags, ) assert response["ResponseMetadata"]["HTTPStatusCode"] == HTTP_STATUS_OK - # verify the destination has the new metadata, not the old + # archive fields are stored as S3 tags, not user metadata + tag_resp = 
mocked_s3_client_no_checksum.get_object_tagging(Bucket=destination, Key="archive/file.txt") + tag_dict = {t["Key"]: t["Value"] for t in tag_resp["TagSet"]} + assert tag_dict["archive_reason"] == "replaced" + assert tag_dict["archive_date"] == "2026-04-16" + + # source user metadata is preserved (MetadataDirective=COPY) resp = mocked_s3_client_no_checksum.head_object(Bucket=destination, Key="archive/file.txt") - assert resp["Metadata"]["archive_reason"] == "replaced" - assert resp["Metadata"]["archive_date"] == "2026-04-16" - assert "old_key" not in resp["Metadata"] + assert resp["Metadata"].get("old_key") == "old_val" # verify source still exists assert object_exists(f"{CDM_LAKE_BUCKET}/src/file.txt") @@ -1007,7 +1011,7 @@ def test_copy_object_with_metadata_preserves_content(mocked_s3_client_no_checksu copy_object( f"{CDM_LAKE_BUCKET}/src/data.bin", f"{CDM_LAKE_BUCKET}/dst/data.bin", - metadata={"tag": "value"}, + tags={"tag": "value"}, ) obj = mocked_s3_client_no_checksum.get_object(Bucket=CDM_LAKE_BUCKET, Key="dst/data.bin") assert obj["Body"].read() == b"binary data" From fa3df736c93281831f44cc8db119a8de455c0814 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 11:53:48 -0700 Subject: [PATCH 70/76] remove tagging option on s3 copy Co-authored-by: Copilot --- src/cdm_data_loaders/ncbi_ftp/metadata.py | 17 +++++++---------- src/cdm_data_loaders/ncbi_ftp/promote.py | 8 +------- src/cdm_data_loaders/utils/s3.py | 20 +------------------- tests/integration/conftest.py | 16 ---------------- tests/integration/test_full_pipeline.py | 5 ++--- tests/integration/test_promote_e2e.py | 14 ++++++-------- tests/ncbi_ftp/test_metadata.py | 6 +++--- tests/ncbi_ftp/test_promote.py | 22 +++++----------------- tests/utils/test_s3.py | 21 ++++++--------------- 9 files changed, 31 insertions(+), 98 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/metadata.py b/src/cdm_data_loaders/ncbi_ftp/metadata.py index 8098e9c4..e616c9b5 100644 --- 
a/src/cdm_data_loaders/ncbi_ftp/metadata.py +++ b/src/cdm_data_loaders/ncbi_ftp/metadata.py @@ -67,16 +67,19 @@ def build_descriptor_key(assembly_dir: str, key_prefix: str) -> str: return f"{prefix}metadata/{assembly_dir}_datapackage.json" -def build_archive_descriptor_key(assembly_dir: str, release_tag: str, key_prefix: str) -> str: +def build_archive_descriptor_key( + assembly_dir: str, release_tag: str, key_prefix: str, archive_reason: str = "unknown" +) -> str: """Return the S3 key for the archived descriptor of *assembly_dir*. :param assembly_dir: full assembly directory name :param release_tag: NCBI release tag used in the archive path, e.g. ``"2024-01"`` :param key_prefix: Lakehouse key prefix - :return: S3 key under ``archive/{release_tag}/metadata/`` + :param archive_reason: reason for archival, encoded as a path segment + :return: S3 key under ``archive/{release_tag}/{archive_reason}/metadata/`` """ prefix = key_prefix.rstrip("/") + "/" - return f"{prefix}archive/{release_tag}/metadata/{assembly_dir}_datapackage.json" + return f"{prefix}archive/{release_tag}/{archive_reason}/metadata/{assembly_dir}_datapackage.json" def create_descriptor( @@ -224,7 +227,7 @@ def archive_descriptor( # noqa: PLR0913 :return: ``True`` if the descriptor was (or would be) archived; ``False`` if not found """ source_key = build_descriptor_key(assembly_dir, key_prefix) - archive_key = build_archive_descriptor_key(assembly_dir, release_tag, key_prefix) + archive_key = build_archive_descriptor_key(assembly_dir, release_tag, key_prefix, archive_reason) if dry_run: logger.debug("[dry-run] would archive descriptor: s3://%s/%s -> %s", bucket, source_key, archive_key) @@ -243,15 +246,9 @@ def archive_descriptor( # noqa: PLR0913 return False raise - datestamp = datetime.now(UTC).strftime("%Y-%m-%d") copy_object( f"{bucket}/{source_key}", f"{bucket}/{archive_key}", - tags={ - "ncbi_last_release": release_tag, - "archive_reason": archive_reason, - "archive_date": datestamp, - }, ) 
logger.debug("Archived descriptor: s3://%s/%s -> %s", bucket, source_key, archive_key) return True diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 7920d7b7..cb5fa6c9 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -311,7 +311,6 @@ def _archive_assemblies( # noqa: PLR0913 """ s3 = get_s3_client() release_tag = ncbi_release or "unknown" - datestamp = datetime.now(UTC).strftime("%Y-%m-%d") archived = 0 with Path(manifest_local_path).open() as f: @@ -347,7 +346,7 @@ def _archive_assemblies( # noqa: PLR0913 for source_key in matching_keys: rel = source_key[len(lakehouse_key_prefix) :] - archive_key = f"{lakehouse_key_prefix}archive/{release_tag}/{rel}" + archive_key = f"{lakehouse_key_prefix}archive/{release_tag}/{archive_reason}/{rel}" if dry_run: if _dry_run_log_count < 10: @@ -362,11 +361,6 @@ def _archive_assemblies( # noqa: PLR0913 copy_object( f"{lakehouse_bucket}/{source_key}", f"{lakehouse_bucket}/{archive_key}", - tags={ - "ncbi_last_release": release_tag, - "archive_reason": archive_reason, - "archive_date": datestamp, - }, ) if delete_source: delete_object(f"{lakehouse_bucket}/{source_key}") diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index 0766f248..a6d85976 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -439,19 +439,12 @@ def upload_dir( def copy_object( current_s3_path: str, new_s3_path: str, - tags: dict[str, str] | None = None, ) -> dict[str, Any]: """Copy an object from one place to another, inheriting the source user metadata. Source user metadata (e.g. ``md5``) is preserved on the destination because ``MetadataDirective`` is omitted, which defaults to ``COPY``. - When *metadata* is supplied it is applied as S3 object **tags** via a - separate ``put_object_tagging`` call rather than as user metadata. 
This - avoids passing ``MetadataDirective=REPLACE`` to ``CopyObject``, which causes - botocore to inject an unsigned checksum header that AWS rejects with - AccessDenied. - A successful copy operation will return a response where resp["ResponseMetadata"]["HTTPStatusCode"] == 200 @@ -462,8 +455,6 @@ def copy_object( :type current_s3_path: str :param new_s3_path: the desired new file path on s3, INCLUDING the bucket name :type new_s3_path: str - :param tags: key-value pairs to set as S3 object tags on the destination - :type tags: dict[str, str] | None :return: dictionary containing response from the copy operation :rtype: dict[str, Any] """ @@ -471,21 +462,12 @@ def copy_object( (current_s3_bucket, current_s3_key) = split_s3_path(current_s3_path) (new_s3_bucket, new_s3_key) = split_s3_path(new_s3_path) - resp = s3.copy_object( + return s3.copy_object( CopySource={"Bucket": current_s3_bucket, "Key": current_s3_key}, Bucket=new_s3_bucket, Key=new_s3_key, ) - if tags: - s3.put_object_tagging( - Bucket=new_s3_bucket, - Key=new_s3_key, - Tagging={"TagSet": [{"Key": k, "Value": v} for k, v in tags.items()]}, - ) - - return resp - def copy_directory(current_s3_path: str, new_s3_path: str) -> tuple[dict[str, str], dict[str, Any]]: """Copy all objects under a given S3 prefix to a new prefix. diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 08e9fe00..b8bdf9ba 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -261,19 +261,3 @@ def get_object_metadata(s3: botocore.client.BaseClient, bucket: str, key: str) - """ resp = s3.head_object(Bucket=bucket, Key=key) return resp.get("Metadata", {}) - - -def get_object_tags(s3: botocore.client.BaseClient, bucket: str, key: str) -> dict[str, Any]: - """Return the S3 object tags dict for an S3 object (from GetObjectTagging). - - Archive attributes (e.g. 
``archive_reason``) are stored as S3 object tags - rather than user metadata to avoid the botocore unsigned-header bug with - ``MetadataDirective=REPLACE``. - - :param s3: boto3 S3 client - :param bucket: bucket name - :param key: object key - :return: tags dict - """ - resp = s3.get_object_tagging(Bucket=bucket, Key=key) - return {t["Key"]: t["Value"] for t in resp.get("TagSet", [])} diff --git a/tests/integration/test_full_pipeline.py b/tests/integration/test_full_pipeline.py index af8f164b..0c948764 100644 --- a/tests/integration/test_full_pipeline.py +++ b/tests/integration/test_full_pipeline.py @@ -29,7 +29,7 @@ from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 from cdm_data_loaders.pipelines.ncbi_ftp_download import download_batch -from .conftest import get_object_metadata, get_object_tags, list_all_keys, stage_files_to_minio, staging_test_bucket # noqa: F401 +from .conftest import get_object_metadata, list_all_keys, stage_files_to_minio, staging_test_bucket # noqa: F401 STABLE_PREFIX = "900" STAGING_PREFIX = "staging/run1/" @@ -215,8 +215,7 @@ def test_full_pipeline_incremental( if promote2["archived"] > 0: assert len(archive_keys) >= 1 for key in archive_keys: - tags = get_object_tags(s3, test_bucket, key) - assert tags.get("archive_reason") == "updated" + assert "/updated/" in key # Final Lakehouse path should still have files final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index 7b9b8d48..b4aabe6c 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -21,7 +21,7 @@ ) from cdm_data_loaders.ncbi_ftp.promote import DEFAULT_LAKEHOUSE_KEY_PREFIX, promote_from_s3 -from .conftest import get_object_metadata, get_object_tags, list_all_keys, seed_lakehouse, staging_test_bucket # noqa: F401 +from .conftest import get_object_metadata, list_all_keys, seed_lakehouse, 
staging_test_bucket # noqa: F401 from pathlib import Path @@ -188,9 +188,8 @@ def test_archive_updated( # Verify archive metadata for key in archive_keys: - tags = get_object_tags(s3, test_bucket, key) - assert tags.get("archive_reason") == "updated" - assert tags.get("ncbi_last_release") == "2024-01" + assert "/updated/" in key + assert "/2024-01/" in key @pytest.mark.integration @@ -231,8 +230,7 @@ def test_archive_removed( # Verify archive metadata for key in archive_keys: - tags = get_object_tags(s3, test_bucket, key) - assert tags.get("archive_reason") == "replaced_or_suppressed" + assert "/replaced_or_suppressed/" in key # Verify source objects are deleted rel = build_accession_path(ASSEMBLY_DIR_A) @@ -463,7 +461,7 @@ def test_archive_copies_descriptor( lakehouse_key_prefix=PATH_PREFIX, ) - archive_key = build_archive_descriptor_key(ASSEMBLY_DIR_A, "2024-01", PATH_PREFIX) + archive_key = build_archive_descriptor_key(ASSEMBLY_DIR_A, "2024-01", PATH_PREFIX, "updated") # Confirm the archive descriptor object exists resp = s3.head_object(Bucket=test_bucket, Key=archive_key) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 @@ -499,7 +497,7 @@ def test_archive_removed_copies_descriptor( lakehouse_key_prefix=PATH_PREFIX, ) - archive_key = build_archive_descriptor_key(ASSEMBLY_DIR_A, "2024-01", PATH_PREFIX) + archive_key = build_archive_descriptor_key(ASSEMBLY_DIR_A, "2024-01", PATH_PREFIX, "replaced_or_suppressed") resp = s3.head_object(Bucket=test_bucket, Key=archive_key) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 diff --git a/tests/ncbi_ftp/test_metadata.py b/tests/ncbi_ftp/test_metadata.py index 0b471b48..df1df775 100644 --- a/tests/ncbi_ftp/test_metadata.py +++ b/tests/ncbi_ftp/test_metadata.py @@ -77,9 +77,9 @@ def test_build_descriptor_key(prefix: str) -> None: ], ) def test_build_archive_descriptor_key(prefix: str, tag: str) -> None: - """Archive key includes tag and has no double slash; trailing slash on 
prefix is normalized.""" - key = build_archive_descriptor_key(_ASSEMBLY_DIR, tag, prefix) - assert key == f"{_KEY_PREFIX}archive/{tag}/metadata/{_ASSEMBLY_DIR}_datapackage.json" + """Archive key includes tag and reason segment; no double slash; prefix trailing slash normalized.""" + key = build_archive_descriptor_key(_ASSEMBLY_DIR, tag, prefix, "updated") + assert key == f"{_KEY_PREFIX}archive/{tag}/updated/metadata/{_ASSEMBLY_DIR}_datapackage.json" assert "//" not in key diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index 9c61ca6e..3c7bc05e 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -118,7 +118,7 @@ def test_archive_assemblies_removed(mock_s3_client_no_checksum: botocore.client. assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key).get("KeyCount", 0) == 0 archive_key = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/" + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/replaced_or_suppressed/" f"raw_data/GCF/000/005/845/{accession}_ASM584v2/{accession}_genomic.fna.gz" ) assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key).get("KeyCount", 0) == 1 @@ -149,15 +149,9 @@ def test_archive_assemblies_updated_no_delete( ) assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key).get("KeyCount", 0) == 1 - archive_key = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - ) + archive_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/updated/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=archive_key) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 - tag_resp = mock_s3_client_no_checksum.get_object_tagging(Bucket=TEST_BUCKET, Key=archive_key) - tags = {t["Key"]: t["Value"] for t in tag_resp["TagSet"]} - assert tags["archive_reason"] == 
"updated" - assert tags["ncbi_last_release"] == "2024-06" @pytest.mark.s3 @@ -177,12 +171,8 @@ def test_archive_assemblies_multiple_releases_no_collision( mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"v2-data") _archive_assemblies(str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-06", archive_reason="updated") - archive_key_1 = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - ) - archive_key_2 = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - ) + archive_key_1 = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/updated/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + archive_key_2 = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-06/updated/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" assert mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_1)["Body"].read() == b"v1-data" assert mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=archive_key_2)["Body"].read() == b"v2-data" @@ -239,7 +229,5 @@ def test_archive_assemblies_unknown_release_fallback( assert _archive_assemblies(str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release=None) == 1 - archive_key = ( - f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/unknown/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" - ) + archive_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/unknown/unknown/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key).get("KeyCount", 0) == 1 diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index c88bbadd..36dd9205 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -974,44 +974,35 @@ def test_head_object_with_protocols(mock_s3_client: Any, protocol: str) -> None: assert result["size"] == SIZE_DATA -# copy_object with metadata 
+# copy_object @pytest.mark.parametrize("destination", BUCKETS) @pytest.mark.s3 -def test_copy_object_with_metadata_replaces_metadata(mocked_s3_client_no_checksum: Any, destination: str) -> None: - """Verify that copy_object with tags copies the object and sets S3 object tags on the destination.""" +def test_copy_object_preserves_user_metadata(mocked_s3_client_no_checksum: Any, destination: str) -> None: + """copy_object preserves source user metadata (MetadataDirective=COPY default).""" mocked_s3_client_no_checksum.put_object( - Bucket=CDM_LAKE_BUCKET, Key="src/file.txt", Body=b"archive me", Metadata={"old_key": "old_val"} + Bucket=CDM_LAKE_BUCKET, Key="src/file.txt", Body=b"archive me", Metadata={"md5": "abc123"} ) - new_tags = {"archive_reason": "replaced", "archive_date": "2026-04-16"} response = copy_object( f"{CDM_LAKE_BUCKET}/src/file.txt", f"{destination}/archive/file.txt", - tags=new_tags, ) assert response["ResponseMetadata"]["HTTPStatusCode"] == HTTP_STATUS_OK - # archive fields are stored as S3 tags, not user metadata - tag_resp = mocked_s3_client_no_checksum.get_object_tagging(Bucket=destination, Key="archive/file.txt") - tag_dict = {t["Key"]: t["Value"] for t in tag_resp["TagSet"]} - assert tag_dict["archive_reason"] == "replaced" - assert tag_dict["archive_date"] == "2026-04-16" - # source user metadata is preserved (MetadataDirective=COPY) resp = mocked_s3_client_no_checksum.head_object(Bucket=destination, Key="archive/file.txt") - assert resp["Metadata"].get("old_key") == "old_val" + assert resp["Metadata"].get("md5") == "abc123" # verify source still exists assert object_exists(f"{CDM_LAKE_BUCKET}/src/file.txt") @pytest.mark.s3 -def test_copy_object_with_metadata_preserves_content(mocked_s3_client_no_checksum: Any) -> None: +def test_copy_object_preserves_content(mocked_s3_client_no_checksum: Any) -> None: """Verify that the content of the copied object matches the original.""" mocked_s3_client_no_checksum.put_object(Bucket=CDM_LAKE_BUCKET, 
Key="src/data.bin", Body=b"binary data") copy_object( f"{CDM_LAKE_BUCKET}/src/data.bin", f"{CDM_LAKE_BUCKET}/dst/data.bin", - tags={"tag": "value"}, ) obj = mocked_s3_client_no_checksum.get_object(Bucket=CDM_LAKE_BUCKET, Key="dst/data.bin") assert obj["Body"].read() == b"binary data" From 4c9c1c8c057e444777780945468fcc8a4b06e80b Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 12:26:48 -0700 Subject: [PATCH 71/76] parallelize copies and batch deletes Co-authored-by: Copilot --- src/cdm_data_loaders/ncbi_ftp/promote.py | 58 +++- src/cdm_data_loaders/utils/s3.py | 25 ++ tests/integration/test_promote_e2e.py | 422 +++++++++++++++++++++++ tests/ncbi_ftp/test_promote.py | 321 +++++++++++++++++ tests/utils/test_s3.py | 32 ++ 5 files changed, 841 insertions(+), 17 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index cb5fa6c9..52a71612 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -9,6 +9,7 @@ import re import tempfile from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import UTC, datetime from pathlib import Path, PurePosixPath from typing import Any @@ -27,6 +28,7 @@ from cdm_data_loaders.utils.s3 import ( copy_object, delete_object, + delete_objects, get_s3_client, object_exists, upload_file, @@ -344,30 +346,52 @@ def _archive_assemblies( # noqa: PLR0913 assembly_dir = adir_match.group(1) break - for source_key in matching_keys: - rel = source_key[len(lakehouse_key_prefix) :] - archive_key = f"{lakehouse_key_prefix}archive/{release_tag}/{archive_reason}/{rel}" + key_pairs = [ + ( + source_key, + f"{lakehouse_key_prefix}archive/{release_tag}/{archive_reason}/{source_key[len(lakehouse_key_prefix) :]}", + ) + for source_key in matching_keys + ] - if dry_run: + if dry_run: + for source_key, archive_key in key_pairs: if _dry_run_log_count < 10: logger.info("[dry-run] would 
archive: %s -> %s", source_key, archive_key) else: logger.debug("[dry-run] would archive: %s -> %s", source_key, archive_key) _dry_run_log_count += 1 - archived += 1 - continue + archived += len(key_pairs) + continue - try: - copy_object( - f"{lakehouse_bucket}/{source_key}", - f"{lakehouse_bucket}/{archive_key}", - ) - if delete_source: - delete_object(f"{lakehouse_bucket}/{source_key}") - archived += 1 - logger.debug(" Archived: %s -> %s", source_key, archive_key) - except Exception: - logger.exception("Failed to archive %s", source_key) + # Copy all files for this accession concurrently + keys_to_delete: list[str] = [] + n_workers = min(32, len(key_pairs)) + with ThreadPoolExecutor(max_workers=n_workers) as executor: + futures = { + executor.submit( + copy_object, + f"{lakehouse_bucket}/{src}", + f"{lakehouse_bucket}/{arch}", + ): src + for src, arch in key_pairs + } + for future in as_completed(futures): + src = futures[future] + try: + future.result() + archived += 1 + if delete_source: + keys_to_delete.append(src) + logger.debug(" Archived: %s", src) + except Exception: + logger.exception("Failed to archive %s", src) + + # Batch-delete source keys in a single API call + if keys_to_delete: + del_errors = delete_objects(lakehouse_bucket, keys_to_delete) + for err in del_errors: + logger.warning("Failed to delete %s: %s", err.get("Key"), err.get("Message")) # Archive the frictionless descriptor alongside raw data if assembly_dir: diff --git a/src/cdm_data_loaders/utils/s3.py b/src/cdm_data_loaders/utils/s3.py index a6d85976..738309e8 100644 --- a/src/cdm_data_loaders/utils/s3.py +++ b/src/cdm_data_loaders/utils/s3.py @@ -547,3 +547,28 @@ def delete_object(s3_path: str) -> dict[str, Any]: s3 = get_s3_client() (bucket, key) = split_s3_path(s3_path) return s3.delete_object(Bucket=bucket, Key=key) + + +def delete_objects(bucket: str, keys: list[str]) -> list[dict[str, Any]]: + """Delete multiple objects from an S3 bucket in a single API call. 
+ + Splits into batches of 1000 (the S3 API maximum per request). + + :param bucket: S3 bucket name (no protocol prefix) + :param keys: list of S3 keys to delete + :return: list of per-key error dicts returned by S3 (empty if all succeeded) + :rtype: list[dict[str, Any]] + """ + if not keys: + return [] + + s3 = get_s3_client() + errors: list[dict[str, Any]] = [] + for i in range(0, len(keys), 1000): + batch = keys[i : i + 1000] + resp = s3.delete_objects( + Bucket=bucket, + Delete={"Objects": [{"Key": k} for k in batch], "Quiet": False}, + ) + errors.extend(resp.get("Errors", [])) + return errors diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index b4aabe6c..b805f03c 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -522,3 +522,425 @@ def test_dry_run_no_descriptor(self, minio_s3_client: object, test_bucket: str, metadata_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "metadata/") assert len(metadata_keys) == 0, f"Dry-run should not create descriptor files, found: {metadata_keys}" + + +# ── Parallel archiving tests ───────────────────────────────────────────── + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestArchiveMultiFileConcurrent: + """Verify parallel copy archives all files correctly with correct content.""" + + def test_all_files_archived_with_correct_content( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Every file is archived with byte-identical content when copied concurrently.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + + # Seed many files for assembly A at final Lakehouse path + many_files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"GENOMIC_CONTENT", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"PROTEIN_CONTENT", + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"RNA_CONTENT", + f"{ASSEMBLY_DIR_A}_assembly_report.txt": b"ASSEMBLY_REPORT", + 
f"{ASSEMBLY_DIR_A}_assembly_stats.txt": b"ASSEMBLY_STATS", + f"{ASSEMBLY_DIR_A}_cds_from_genomic.fna.gz": b"CDS_CONTENT", + } + seed_lakehouse(s3, test_bucket, ACCESSION_A, many_files, PATH_PREFIX, ASSEMBLY_DIR_A) + + updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") + + archived = _archive_assemblies( + str(updated_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="updated", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + assert archived == len(many_files) + + # Verify every archived file has correct content + rel = build_accession_path(ASSEMBLY_DIR_A) + for fname, expected_body in many_files.items(): + archive_key = f"{PATH_PREFIX}archive/2024-01/updated/{rel}{fname}" + obj = s3.get_object(Bucket=test_bucket, Key=archive_key) + actual_body = obj["Body"].read() + assert actual_body == expected_body, f"Content mismatch for {fname}" + + def test_archive_key_paths_are_correct( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Archived keys follow the exact ``archive/{release}/{reason}/{rel_path}`` pattern.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + files = {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": b"content"} + seed_lakehouse(s3, test_bucket, ACCESSION_B, files, PATH_PREFIX, ASSEMBLY_DIR_B) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_B], "removed_manifest.txt") + _archive_assemblies( + str(removed_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-02", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + rel = build_accession_path(ASSEMBLY_DIR_B) + expected_key = f"{PATH_PREFIX}archive/2024-02/replaced_or_suppressed/{rel}{ASSEMBLY_DIR_B}_genomic.fna.gz" + resp = s3.head_object(Bucket=test_bucket, Key=expected_key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 
+ + +@pytest.mark.integration +@pytest.mark.slow_test +class TestArchiveDeleteSourceBatch: + """Verify batch delete removes all source objects after concurrent copy.""" + + def test_all_sources_deleted_after_archive( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """After archive with delete_source=True, no source objects remain.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + many_files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"protein", + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"rna", + f"{ASSEMBLY_DIR_A}_assembly_report.txt": b"report", + } + source_keys = seed_lakehouse(s3, test_bucket, ACCESSION_A, many_files, PATH_PREFIX, ASSEMBLY_DIR_A) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + archived = _archive_assemblies( + str(removed_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=True, + ) + + assert archived == len(many_files) + # Source keys must all be gone + for key in source_keys: + remaining = list_all_keys(s3, test_bucket, key) + assert len(remaining) == 0, f"Source not deleted: {key}" + + def test_archive_present_source_gone( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Archive destinations exist AND sources are gone after replaced_or_suppressed archive.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"protein", + } + seed_lakehouse(s3, test_bucket, ACCESSION_A, files, PATH_PREFIX, ASSEMBLY_DIR_A) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + _archive_assemblies( + str(removed_manifest), + 
lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=True, + ) + + rel = build_accession_path(ASSEMBLY_DIR_A) + # Archive keys present + archive_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}archive/2024-01/replaced_or_suppressed/") + assert len(archive_keys) == len(files), f"Expected {len(files)} archive keys, got: {archive_keys}" + # Source keys absent + source_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel}") + assert len(source_keys) == 0, f"Source objects remain: {source_keys}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPartialArchiveResume: + """Corner case: a prior archive run was interrupted mid-way. + + Re-running must complete cleanly without errors, leave all archive keys + present with current content, and (when delete_source=True) remove all + source keys regardless of which files were processed in the prior run. + """ + + def test_partial_updated_archive_resumes( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Re-running after a partial updated archive overwrites stale copies and archives missing files. + + Scenario: 3 files, file_a was archived in a prior run (stale content), + file_b and file_c were not. Re-run should overwrite file_a with current + content and archive file_b, file_c. 
+ """ + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + rel = build_accession_path(ASSEMBLY_DIR_A) + + file_a = f"{ASSEMBLY_DIR_A}_genomic.fna.gz" + file_b = f"{ASSEMBLY_DIR_A}_protein.faa.gz" + file_c = f"{ASSEMBLY_DIR_A}_rna.fna.gz" + + current_content = {file_a: b"current-genomic", file_b: b"current-protein", file_c: b"current-rna"} + seed_lakehouse(s3, test_bucket, ACCESSION_A, current_content, PATH_PREFIX, ASSEMBLY_DIR_A) + + # Pre-seed a stale archive copy for file_a (simulating prior partial run) + archive_prefix = f"{PATH_PREFIX}archive/2024-01/updated/{rel}" + s3.put_object(Bucket=test_bucket, Key=f"{archive_prefix}{file_a}", Body=b"stale-genomic") + + updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") + archived = _archive_assemblies( + str(updated_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="updated", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + # All 3 files counted + assert archived == 3 # noqa: PLR2004 + # file_a overwritten with current content + obj_a = s3.get_object(Bucket=test_bucket, Key=f"{archive_prefix}{file_a}") + assert obj_a["Body"].read() == b"current-genomic", "file_a archive should be overwritten" + # file_b and file_c now archived + for fname in (file_b, file_c): + resp = s3.head_object(Bucket=test_bucket, Key=f"{archive_prefix}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + # Sources untouched (delete_source=False) + source_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel}") + assert len(source_keys) == len(current_content) + + def test_partial_replaced_archive_resumes_and_deletes( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Re-running replaced_or_suppressed archive after partial run completes and deletes all sources. 
+ + Scenario: file_a was copied+deleted in prior run (no longer at source), + file_b was copied but NOT deleted (still at source), file_c was untouched. + Re-run processes file_b and file_c, deletes both. Result: no sources remain. + """ + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + rel = build_accession_path(ASSEMBLY_DIR_A) + + file_a = f"{ASSEMBLY_DIR_A}_genomic.fna.gz" + file_b = f"{ASSEMBLY_DIR_A}_protein.faa.gz" + file_c = f"{ASSEMBLY_DIR_A}_rna.fna.gz" + archive_prefix = f"{PATH_PREFIX}archive/2024-01/replaced_or_suppressed/{rel}" + + # Only file_b and file_c remain at source (file_a already gone) + s3.put_object( + Bucket=test_bucket, + Key=f"{PATH_PREFIX}{rel}{file_b}", + Body=b"protein", + Metadata={"md5": hashlib.md5(b"protein").hexdigest()}, # noqa: S324 + ) + s3.put_object( + Bucket=test_bucket, + Key=f"{PATH_PREFIX}{rel}{file_c}", + Body=b"rna", + Metadata={"md5": hashlib.md5(b"rna").hexdigest()}, # noqa: S324 + ) + # file_a already at archive destination + s3.put_object(Bucket=test_bucket, Key=f"{archive_prefix}{file_a}", Body=b"genomic") + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + archived = _archive_assemblies( + str(removed_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=True, + ) + + # 2 newly archived (file_b and file_c) + assert archived == 2 # noqa: PLR2004 + # file_b and file_c archive keys exist + for fname in (file_b, file_c): + resp = s3.head_object(Bucket=test_bucket, Key=f"{archive_prefix}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + # No source keys remain (file_b and file_c were deleted) + source_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel}") + assert len(source_keys) == 0, f"Source objects remain: {source_keys}" + # file_a archive key is still intact + resp_a = 
s3.head_object(Bucket=test_bucket, Key=f"{archive_prefix}{file_a}") + assert resp_a["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + def test_full_rerun_after_complete_archive_is_idempotent( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Running archive again when all files already exist at archive paths is safe (no errors).""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"protein", + } + seed_lakehouse(s3, test_bucket, ACCESSION_A, files, PATH_PREFIX, ASSEMBLY_DIR_A) + + updated_manifest = _write_manifest(tmp_path, [ACCESSION_A], "updated_manifest.txt") + + # First run — archives all files + archived_1 = _archive_assemblies( + str(updated_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="updated", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + # Second run — same manifest, same source files still present + archived_2 = _archive_assemblies( + str(updated_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="updated", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + assert archived_1 == len(files) + assert archived_2 == len(files) + # Archive keys still present with correct content + rel = build_accession_path(ASSEMBLY_DIR_A) + for fname, expected_body in files.items(): + key = f"{PATH_PREFIX}archive/2024-01/updated/{rel}{fname}" + obj = s3.get_object(Bucket=test_bucket, Key=key) + assert obj["Body"].read() == expected_body + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestArchiveMultiAccessionManifest: + """Multiple accessions in a single manifest are all archived.""" + + def test_two_accessions_both_archived( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + 
"""Both accessions are archived with correct keys when listed in one manifest.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + + files_a = {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic-A"} + files_b = {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": b"genomic-B"} + seed_lakehouse(s3, test_bucket, ACCESSION_A, files_a, PATH_PREFIX, ASSEMBLY_DIR_A) + seed_lakehouse(s3, test_bucket, ACCESSION_B, files_b, PATH_PREFIX, ASSEMBLY_DIR_B) + + manifest = _write_manifest(tmp_path, [ACCESSION_A, ACCESSION_B], "removed_manifest.txt") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=True, + ) + + assert archived == 2 # noqa: PLR2004 + rel_a = build_accession_path(ASSEMBLY_DIR_A) + rel_b = build_accession_path(ASSEMBLY_DIR_B) + key_a = f"{PATH_PREFIX}archive/2024-01/replaced_or_suppressed/{rel_a}{ASSEMBLY_DIR_A}_genomic.fna.gz" + key_b = f"{PATH_PREFIX}archive/2024-01/replaced_or_suppressed/{rel_b}{ASSEMBLY_DIR_B}_genomic.fna.gz" + assert s3.head_object(Bucket=test_bucket, Key=key_a)["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + assert s3.head_object(Bucket=test_bucket, Key=key_b)["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + # Sources deleted + assert len(list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel_a}")) == 0 + assert len(list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel_b}")) == 0 + + def test_three_accessions_correct_archive_reason_segment( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Archive keys for all three accessions include the archive_reason segment.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + accessions_and_dirs = [ + (ACCESSION_A, ASSEMBLY_DIR_A), + (ACCESSION_B, ASSEMBLY_DIR_B), + (ACCESSION_C, ASSEMBLY_DIR_C), + ] + 
for accession, assembly_dir in accessions_and_dirs: + seed_lakehouse( + s3, + test_bucket, + accession, + {f"{assembly_dir}_genomic.fna.gz": b"data"}, + PATH_PREFIX, + assembly_dir, + ) + + manifest = _write_manifest(tmp_path, [acc for acc, _ in accessions_and_dirs], "removed_manifest.txt") + _archive_assemblies( + str(manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-03", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=False, + ) + + all_archive_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}archive/2024-03/") + assert len(all_archive_keys) == 3 # noqa: PLR2004 + for key in all_archive_keys: + assert "/replaced_or_suppressed/" in key, f"Archive key missing reason segment: {key}" + assert "/2024-03/" in key, f"Archive key missing release segment: {key}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestArchiveDryRunParallel: + """Dry-run with many files leaves everything unchanged.""" + + def test_dry_run_no_copies_no_deletes( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str, tmp_path: Path + ) -> None: + """Dry-run with multiple files per accession creates no archive keys and keeps sources.""" + from cdm_data_loaders.ncbi_ftp.promote import _archive_assemblies + + s3 = minio_s3_client + many_files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"protein", + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"rna", + } + source_keys = seed_lakehouse(s3, test_bucket, ACCESSION_A, many_files, PATH_PREFIX, ASSEMBLY_DIR_A) + + removed_manifest = _write_manifest(tmp_path, [ACCESSION_A], "removed_manifest.txt") + archived = _archive_assemblies( + str(removed_manifest), + lakehouse_bucket=test_bucket, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + lakehouse_key_prefix=PATH_PREFIX, + delete_source=True, + dry_run=True, + ) + + assert archived == len(many_files) + # No archive keys + archive_keys = list_all_keys(s3, 
test_bucket, f"{PATH_PREFIX}archive/") + assert len(archive_keys) == 0, f"Dry-run created archive keys: {archive_keys}" + # All sources still present + for key in source_keys: + remaining = list_all_keys(s3, test_bucket, key) + assert len(remaining) == 1, f"Source missing after dry-run: {key}" diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index 3c7bc05e..976d06e0 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -231,3 +231,324 @@ def test_archive_assemblies_unknown_release_fallback( archive_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/unknown/unknown/raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" assert mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key).get("KeyCount", 0) == 1 + + +# ── Concurrent / multi-file archive (new behaviour) ───────────────────── + + +@pytest.mark.s3 +def test_archive_assemblies_multi_file_all_copied( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """All files for an accession are copied concurrently — none missed.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/" + file_names = [ + f"{accession}_genomic.fna.gz", + f"{accession}_protein.faa.gz", + f"{accession}_rna.fna.gz", + f"{accession}_assembly_report.txt", + f"{accession}_assembly_stats.txt", + ] + for fname in file_names: + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}", Body=fname.encode()) + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="updated", + delete_source=False, + ) + + assert archived == len(file_names) + archive_base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/updated/raw_data/GCF/000/001/215/{asm_dir}/" + for fname in 
file_names: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_archive_assemblies_multi_file_content_preserved( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Archive copies preserve byte-for-byte content of each file.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/" + files = { + f"{accession}_genomic.fna.gz": b"\x1f\x8bGENOMIC", + f"{accession}_protein.faa.gz": b"\x1f\x8bPROTEIN", + } + for fname, body in files.items(): + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}", Body=body) + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="updated", + delete_source=False, + ) + + archive_base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/updated/raw_data/GCF/000/001/215/{asm_dir}/" + for fname, original_body in files.items(): + obj = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{fname}") + assert obj["Body"].read() == original_body, f"Content mismatch for {fname}" + + +@pytest.mark.s3 +def test_archive_assemblies_multi_file_delete_all( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Batch delete removes ALL source files when delete_source=True.""" + accession = "GCF_000005845.2" + asm_dir = f"{accession}_ASM584v2" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{asm_dir}/" + file_names = [ + f"{accession}_genomic.fna.gz", + f"{accession}_protein.faa.gz", + f"{accession}_assembly_report.txt", + ] + for fname in file_names: + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}", 
Body=b"data") + + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-03", + archive_reason="replaced_or_suppressed", + delete_source=True, + ) + + assert archived == len(file_names) + # All sources deleted + for fname in file_names: + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=f"{base}{fname}") + assert result.get("KeyCount", 0) == 0, f"Source not deleted: {fname}" + # All archives present + archive_base = ( + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-03/replaced_or_suppressed/raw_data/GCF/000/005/845/{asm_dir}/" + ) + for fname in file_names: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +# ── Partial-archive idempotency ────────────────────────────────────────── + + +@pytest.mark.s3 +def test_archive_assemblies_partial_already_archived_overwritten( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Re-running archive after a partial run overwrites the already-archived files. + + Simulates a partial failure: file_a was archived, file_b was not. + The second run should archive both file_a (overwrite) and file_b. 
+ """ + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/" + archive_base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/updated/raw_data/GCF/000/001/215/{asm_dir}/" + + file_a = f"{accession}_genomic.fna.gz" + file_b = f"{accession}_protein.faa.gz" + + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{file_a}", Body=b"new-genomic") + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{file_b}", Body=b"new-protein") + # Simulate partial prior run: file_a already archived with stale content + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{file_a}", Body=b"stale-genomic") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="updated", + delete_source=False, + ) + + assert archived == 2 # noqa: PLR2004 + # file_a should now have the current content (overwritten) + obj_a = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{file_a}") + assert obj_a["Body"].read() == b"new-genomic", "Re-run should overwrite stale archive" + # file_b should now be archived + obj_b = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{file_b}") + assert obj_b["Body"].read() == b"new-protein" + + +@pytest.mark.s3 +def test_archive_assemblies_partial_delete_resumes( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Re-running replaced_or_suppressed archive after partial copy+delete is safe. + + Simulates: file_a was copied+deleted, file_b was copied but NOT deleted, + file_c was not touched. The re-run finds only file_b and file_c present + (file_a is gone), archives both, and deletes both. 
+ """ + accession = "GCF_000005845.2" + asm_dir = f"{accession}_ASM584v2" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{asm_dir}/" + archive_base = ( + f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-03/replaced_or_suppressed/raw_data/GCF/000/005/845/{asm_dir}/" + ) + + file_b = f"{accession}_protein.faa.gz" + file_c = f"{accession}_assembly_report.txt" + + # file_a already gone (deleted in first partial run) + # file_b present at source (not yet deleted from first partial run) + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{file_b}", Body=b"protein") + # file_c present at source (not touched at all) + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{file_c}", Body=b"report") + # file_a already at archive destination + mock_s3_client_no_checksum.put_object( + Bucket=TEST_BUCKET, Key=f"{archive_base}{accession}_genomic.fna.gz", Body=b"genomic" + ) + + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-03", + archive_reason="replaced_or_suppressed", + delete_source=True, + ) + + # Only the 2 remaining source files were archived + assert archived == 2 # noqa: PLR2004 + # Both now gone from source + for fname in (file_b, file_c): + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=f"{base}{fname}") + assert result.get("KeyCount", 0) == 0, f"Expected {fname} deleted" + # file_a archive still intact (not touched by re-run) + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{archive_base}{accession}_genomic.fna.gz") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_archive_assemblies_idempotent_updated_reruns_cleanly( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Running updated archive twice on the same data produces the same 
result.""" + accession = "GCF_000001215.4" + asm_dir = f"{accession}_Release_6" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/" + file_names = [f"{accession}_genomic.fna.gz", f"{accession}_protein.faa.gz"] + for fname in file_names: + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}", Body=b"content") + + manifest = tmp_path / "updated.txt" + manifest.write_text(f"{accession}\n") + + archived_1 = _archive_assemblies( + str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated" + ) + archived_2 = _archive_assemblies( + str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated" + ) + + assert archived_1 == len(file_names) + assert archived_2 == len(file_names) + # Sources still present after both runs (delete_source=False) + for fname in file_names: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_archive_assemblies_multi_accession_manifest( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Multiple accessions in a single manifest are all archived.""" + accessions = [ + ("GCF_000001215.4", "GCF_000001215.4_Release_6", "GCF/000/001/215"), + ("GCF_000005845.2", "GCF_000005845.2_ASM584v2", "GCF/000/005/845"), + ] + for accession, asm_dir, path in accessions: + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/{path}/{asm_dir}/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "updated.txt" + manifest.write_text("\n".join(acc for acc, _, _ in accessions) + "\n") + + archived = _archive_assemblies( + str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated" + ) + + assert archived == len(accessions) + for accession, asm_dir, path in accessions: 
+ archive_key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/2024-01/updated/raw_data/{path}/{asm_dir}/{accession}_genomic.fna.gz" + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_key) + assert result.get("KeyCount", 0) == 1, f"Archive missing for {accession}" + + +@pytest.mark.s3 +def test_archive_assemblies_dry_run_multi_file( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """dry_run with multiple files per accession makes no copies and no deletes.""" + accession = "GCF_000005845.2" + asm_dir = f"{accession}_ASM584v2" + base = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{asm_dir}/" + file_names = [f"{accession}_genomic.fna.gz", f"{accession}_protein.faa.gz", f"{accession}_rna.fna.gz"] + for fname in file_names: + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}", Body=b"data") + + manifest = tmp_path / "removed.txt" + manifest.write_text(f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), + lakehouse_bucket=TEST_BUCKET, + ncbi_release="2024-01", + archive_reason="replaced_or_suppressed", + delete_source=True, + dry_run=True, + ) + + # Reported count matches + assert archived == len(file_names) + # No actual archive keys created + archive_prefix = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}archive/" + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=archive_prefix) + assert result.get("KeyCount", 0) == 0 + # Sources untouched + for fname in file_names: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{base}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_archive_assemblies_invalid_accession_skipped( + mock_s3_client_no_checksum: botocore.client.BaseClient, tmp_path: Path +) -> None: + """Malformed accession lines are skipped; valid ones still archived.""" + accession = "GCF_000001215.4" + asm_dir = 
f"{accession}_Release_6" + key = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{asm_dir}/{accession}_genomic.fna.gz" + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=key, Body=b"data") + + manifest = tmp_path / "mixed.txt" + manifest.write_text("NOT_AN_ACCESSION\n\n \n" + f"{accession}\n") + + archived = _archive_assemblies( + str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated" + ) + assert archived == 1 diff --git a/tests/utils/test_s3.py b/tests/utils/test_s3.py index 36dd9205..8c6c2f9b 100644 --- a/tests/utils/test_s3.py +++ b/tests/utils/test_s3.py @@ -22,6 +22,7 @@ copy_directory, copy_object, delete_object, + delete_objects, download_file, get_s3_client, head_object, @@ -1017,3 +1018,34 @@ def test_delete_object_no_such_bucket() -> None: assert object_exists(s3_path) is False with pytest.raises(Exception, match="The specified bucket does not exist"): delete_object(s3_path) + + +# delete_objects +@pytest.mark.s3 +def test_delete_objects_removes_all(mock_s3_client: Any) -> None: + """delete_objects removes every listed key in a single call.""" + keys = ["bulk/a.txt", "bulk/b.txt", "bulk/c.txt"] + for k in keys: + mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key=k, Body=b"data") + + errors = delete_objects(CDM_LAKE_BUCKET, keys) + + assert errors == [] + for k in keys: + assert object_exists(f"{CDM_LAKE_BUCKET}/{k}") is False + + +@pytest.mark.s3 +def test_delete_objects_empty_list_is_noop(mock_s3_client: Any) -> None: + """delete_objects with an empty list makes no API call and returns no errors.""" + mock_s3_client.put_object(Bucket=CDM_LAKE_BUCKET, Key="keep/me.txt", Body=b"safe") + errors = delete_objects(CDM_LAKE_BUCKET, []) + assert errors == [] + assert object_exists(f"{CDM_LAKE_BUCKET}/keep/me.txt") is True + + +@pytest.mark.s3 +def test_delete_objects_nonexistent_keys_no_error(mock_s3_client: Any) -> None: + """Deleting keys that don't exist returns no errors (S3 delete is 
idempotent).""" + errors = delete_objects(CDM_LAKE_BUCKET, ["ghost/a.txt", "ghost/b.txt"]) + assert errors == [] From 409626d423ddd9d76237c5fc924ad2130d7f07ea Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 12:47:44 -0700 Subject: [PATCH 72/76] optimize promotion step Co-authored-by: Copilot --- src/cdm_data_loaders/ncbi_ftp/promote.py | 140 +++---- tests/integration/test_promote_e2e.py | 370 ++++++++++++++++++ tests/ncbi_ftp/test_promote.py | 453 +++++++++++++++++++++++ 3 files changed, 896 insertions(+), 67 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 52a71612..0c50118d 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -27,7 +27,6 @@ from cdm_data_loaders.utils.cdm_logger import get_cdm_logger from cdm_data_loaders.utils.s3 import ( copy_object, - delete_object, delete_objects, get_s3_client, object_exists, @@ -179,6 +178,50 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 if acc_match and adir_match: assembly_files[(adir_match.group(1), acc_match.group(1))].append(staged_key) + def _promote_one(staged_key: str) -> tuple[DescriptorResource, str]: + """Download one staged file, re-upload to Lakehouse with MD5 metadata. + + :return: ``(resource_dict, staged_key)`` on success; raises on failure. 
+ """ + rel_path = staged_key[len(normalized_staging_prefix) :] + final_key = lakehouse_key_prefix + rel_path + final_key_path = PurePosixPath(final_key) + + with tempfile.NamedTemporaryFile(delete=False) as tmp: + tmp_path = tmp.name + try: + s3.download_file(Bucket=staging_bucket, Key=staged_key, Filename=tmp_path) + + metadata: dict[str, str] = {} + md5_key = staged_key + ".md5" + if md5_key in sidecars: + md5_obj = s3.get_object(Bucket=staging_bucket, Key=md5_key) + metadata["md5"] = md5_obj["Body"].read().decode().strip() + + upload_succeeded = upload_file( + tmp_path, + f"{lakehouse_bucket}/{final_key_path.parent}", + tags=metadata, + object_name=final_key_path.name, + show_progress=False, + ) + if not upload_succeeded: + msg = f"upload_file returned False for {staged_key}" + raise RuntimeError(msg) + + fname = final_key_path.name + ext = fname.rsplit(".", 1)[-1] if "." in fname else "" + resource: DescriptorResource = { + "name": fname.lower(), + "path": final_key, + "format": ext, + "bytes": Path(tmp_path).stat().st_size, + "hash": metadata.get("md5"), + } + return resource, staged_key + finally: + Path(tmp_path).unlink() + total_files = sum(len(v) for v in assembly_files.values()) _dry_run_log_count = 0 with tqdm.tqdm(total=total_files, unit="file", desc="Promoting") as pbar: @@ -187,12 +230,10 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 resources: list[DescriptorResource] = [] promoted_keys: list[str] = [] - for staged_key in files: - rel_path = staged_key[len(normalized_staging_prefix) :] - final_key = lakehouse_key_prefix + rel_path - final_key_path = PurePosixPath(final_key) - - if dry_run: + if dry_run: + for staged_key in files: + rel_path = staged_key[len(normalized_staging_prefix) :] + final_key = lakehouse_key_prefix + rel_path if _dry_run_log_count < 10: logger.info("[dry-run] would promote: %s -> %s", staged_key, final_key) else: @@ -200,56 +241,24 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 _dry_run_log_count += 1 promoted += 
1 pbar.update(1) - continue - - file_promoted = False - try: - with tempfile.NamedTemporaryFile(delete=False) as tmp: - tmp_path = tmp.name + continue + + # Download and re-upload all files for this assembly concurrently + n_workers = min(32, len(files)) + with ThreadPoolExecutor(max_workers=n_workers) as executor: + futures = {executor.submit(_promote_one, key): key for key in files} + for future in as_completed(futures): + staged_key = futures[future] try: - s3.download_file(Bucket=staging_bucket, Key=staged_key, Filename=tmp_path) - - # Read MD5 from sidecar - metadata: dict[str, str] = {} - md5_key = staged_key + ".md5" - if md5_key in sidecars: - md5_obj = s3.get_object(Bucket=staging_bucket, Key=md5_key) - metadata["md5"] = md5_obj["Body"].read().decode().strip() - - upload_succeeded = upload_file( - tmp_path, - f"{lakehouse_bucket}/{final_key_path.parent}", - tags=metadata, - object_name=final_key_path.name, - show_progress=False, - ) - if not upload_succeeded: - logger.error("Failed to upload promoted file %s to %s", staged_key, final_key) - else: - promoted += 1 - promoted_keys.append(staged_key) - promoted_accessions.add(acc) - file_promoted = True - - fname = final_key_path.name - ext = fname.rsplit(".", 1)[-1] if "." 
in fname else "" - resource: DescriptorResource = { - "name": fname.lower(), - "path": final_key, - "format": ext, - "bytes": Path(tmp_path).stat().st_size, - "hash": metadata.get("md5"), - } - resources.append(resource) - - finally: - Path(tmp_path).unlink() - except Exception: - logger.exception("Failed to promote %s", staged_key) - - if not file_promoted: - assembly_failed += 1 - pbar.update(1) + resource, _ = future.result() + resources.append(resource) + promoted_keys.append(staged_key) + promoted += 1 + promoted_accessions.add(acc) + except Exception: + logger.exception("Failed to promote %s", staged_key) + assembly_failed += 1 + pbar.update(1) failed += assembly_failed @@ -266,18 +275,15 @@ def _promote_data_files( # noqa: PLR0913, PLR0915 except Exception: logger.exception("Failed to write descriptor for %s", adir) - for staged_key in promoted_keys: - try: - delete_object(f"{staging_bucket}/{staged_key}") - except Exception: - logger.warning("Failed to delete staged file %s", staged_key) + # Batch-delete all staged data files and their sidecars in one API call + keys_to_delete = list(promoted_keys) + for key in promoted_keys: for sidecar_ext in (".md5", ".crc64nvme"): - sidecar_key = staged_key + sidecar_ext - if sidecar_key in sidecars: - try: - delete_object(f"{staging_bucket}/{sidecar_key}") - except Exception: - logger.warning("Failed to delete staged sidecar %s", sidecar_key) + if key + sidecar_ext in sidecars: + keys_to_delete.append(key + sidecar_ext) + del_errors = delete_objects(staging_bucket, keys_to_delete) + for err in del_errors: + logger.warning("Failed to delete staged file %s: %s", err.get("Key"), err.get("Message")) return promoted, failed, descriptors_written, promoted_accessions diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index b805f03c..d412ea27 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -944,3 +944,373 @@ def 
test_dry_run_no_copies_no_deletes( for key in source_keys: remaining = list_all_keys(s3, test_bucket, key) assert len(remaining) == 1, f"Source missing after dry-run: {key}" + + +# ── Concurrent promotion tests ──────────────────────────────────────────── + + +def _stage_many( + s3: object, + bucket: str, + assembly_dir: str, + files: dict[str, bytes], + *, + with_md5: bool = True, +) -> None: + """Stage *files* with optional .md5 sidecars under the standard staging prefix.""" + rel = build_accession_path(assembly_dir) + base = f"{STAGING_PREFIX}{rel}" + for fname, content in files.items(): + key = f"{base}{fname}" + s3.put_object(Bucket=bucket, Key=key, Body=content) + if with_md5: + s3.put_object(Bucket=bucket, Key=f"{key}.md5", Body=_md5(content).encode()) + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteMultiFileConcurrent: + """Verify concurrent promotion lands all files with correct content and MD5.""" + + def test_six_files_all_promoted_with_correct_content( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Every staged file arrives at the correct final key with byte-identical content.""" + s3 = minio_s3_client + many_files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"GENOMIC", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"PROTEIN", + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"RNA", + f"{ASSEMBLY_DIR_A}_assembly_report.txt": b"REPORT", + f"{ASSEMBLY_DIR_A}_assembly_stats.txt": b"STATS", + f"{ASSEMBLY_DIR_A}_cds_from_genomic.fna.gz": b"CDS", + } + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, many_files) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + assert report["promoted"] == len(many_files) + assert report["failed"] == 0 + + rel = build_accession_path(ASSEMBLY_DIR_A) + for fname, expected_body in many_files.items(): + key = f"{PATH_PREFIX}{rel}{fname}" + obj = 
s3.get_object(Bucket=test_bucket, Key=key) + assert obj["Body"].read() == expected_body, f"Content mismatch: {fname}" + + def test_md5_metadata_correct_per_file( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Each promoted file carries MD5 metadata matching its own content, not another file's.""" + s3 = minio_s3_client + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"GENOMIC_UNIQUE", + f"{ASSEMBLY_DIR_A}_protein.faa.gz": b"PROTEIN_UNIQUE", + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"RNA_UNIQUE", + } + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, files, with_md5=True) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + rel = build_accession_path(ASSEMBLY_DIR_A) + for fname, content in files.items(): + key = f"{PATH_PREFIX}{rel}{fname}" + meta = get_object_metadata(s3, test_bucket, key) + assert meta.get("md5") == _md5(content), f"Wrong MD5 metadata on {fname}" + + def test_file_without_sidecar_has_no_md5_metadata( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """A file staged without a .md5 sidecar is promoted but has no md5 metadata key.""" + s3 = minio_s3_client + fname = f"{ASSEMBLY_DIR_A}_genomic.fna.gz" + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, {fname: FAKE_GENOMIC}, with_md5=False) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + rel = build_accession_path(ASSEMBLY_DIR_A) + meta = get_object_metadata(s3, test_bucket, f"{PATH_PREFIX}{rel}{fname}") + assert "md5" not in meta, f"Expected no md5 metadata, got: {meta}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteStagingCleanup: + """After a fully successful promote, all staged files and sidecars are deleted.""" + + def 
test_staged_data_files_deleted( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Data files are removed from staging after a successful assembly promote.""" + s3 = minio_s3_client + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC, + f"{ASSEMBLY_DIR_A}_protein.faa.gz": FAKE_PROTEIN, + } + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, files, with_md5=False) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + remaining_staging = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + assert len(remaining_staging) == 0, f"Staging not cleaned: {remaining_staging}" + + def test_md5_sidecars_deleted(self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str) -> None: + """Both data files and .md5 sidecars are removed from staging after promote.""" + s3 = minio_s3_client + files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC, + f"{ASSEMBLY_DIR_A}_protein.faa.gz": FAKE_PROTEIN, + } + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, files, with_md5=True) + + # Verify sidecars exist before promote + rel = build_accession_path(ASSEMBLY_DIR_A) + before_keys = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + assert any(k.endswith(".md5") for k in before_keys), "Test setup: expected .md5 sidecars" + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + after_keys = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + assert len(after_keys) == 0, f"Staging not fully cleaned (including sidecars): {after_keys}" + + def test_two_assemblies_staging_both_cleaned( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Staging for both assemblies is fully cleaned when both assemblies succeed.""" + s3 = 
minio_s3_client + _stage_many( + s3, + staging_test_bucket, + ASSEMBLY_DIR_A, + {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC}, + with_md5=True, + ) + _stage_many( + s3, + staging_test_bucket, + ASSEMBLY_DIR_B, + {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": FAKE_GENOMIC}, + with_md5=True, + ) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + assert report["promoted"] == 2 # noqa: PLR2004 + assert report["failed"] == 0 + remaining = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + assert len(remaining) == 0, f"Staging not fully cleaned after two-assembly promote: {remaining}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteTwoAssembliesBothLand: + """Both assemblies staged together are both promoted to correct Lakehouse paths.""" + + def test_both_assemblies_at_correct_final_paths( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Each assembly's files appear at distinct, correctly-routed final Lakehouse paths.""" + s3 = minio_s3_client + files_a = {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"genomic-A"} + files_b = {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": b"genomic-B"} + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, files_a) + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_B, files_b) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + assert report["promoted"] == 2 # noqa: PLR2004 + assert report["failed"] == 0 + + rel_a = build_accession_path(ASSEMBLY_DIR_A) + rel_b = build_accession_path(ASSEMBLY_DIR_B) + obj_a = s3.get_object(Bucket=test_bucket, Key=f"{PATH_PREFIX}{rel_a}{ASSEMBLY_DIR_A}_genomic.fna.gz") + obj_b = s3.get_object(Bucket=test_bucket, Key=f"{PATH_PREFIX}{rel_b}{ASSEMBLY_DIR_B}_genomic.fna.gz") + assert obj_a["Body"].read() 
== b"genomic-A" + assert obj_b["Body"].read() == b"genomic-B" + + def test_final_path_keys_do_not_overlap( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Files for assembly A and assembly B land at distinct paths — no key collision.""" + s3 = minio_s3_client + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": b"a"}) + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_B, {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": b"b"}) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + + rel_a = build_accession_path(ASSEMBLY_DIR_A) + rel_b = build_accession_path(ASSEMBLY_DIR_B) + keys_a = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel_a}") + keys_b = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel_b}") + assert len(keys_a) == 1 + assert len(keys_b) == 1 + assert keys_a[0] != keys_b[0] + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteDryRunMultiFile: + """dry_run leaves staging untouched and writes nothing to the Lakehouse.""" + + def test_dry_run_many_files_staging_untouched( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """All staged files (data + .md5) survive a dry-run promote unchanged.""" + s3 = minio_s3_client + many_files = { + f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC, + f"{ASSEMBLY_DIR_A}_protein.faa.gz": FAKE_PROTEIN, + f"{ASSEMBLY_DIR_A}_rna.fna.gz": b"RNA", + } + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, many_files, with_md5=True) + staging_before = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + + report = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + dry_run=True, + ) + + assert report["promoted"] == len(many_files) + assert report["dry_run"] is True + + # 
Staging unchanged + staging_after = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) + assert staging_after == staging_before, "Dry-run should not alter staging" + + # Nothing at final path + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) == 0, f"Dry-run created Lakehouse objects: {final_keys}" + + def test_dry_run_two_assemblies_nothing_written( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Dry-run with two staged assemblies creates no Lakehouse objects.""" + s3 = minio_s3_client + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC}) + _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_B, {f"{ASSEMBLY_DIR_B}_genomic.fna.gz": FAKE_GENOMIC}) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + dry_run=True, + ) + + final_keys = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + assert len(final_keys) == 0, f"Dry-run created objects: {final_keys}" + + +@pytest.mark.integration +@pytest.mark.slow_test +class TestPromoteSecondRunOnEmptyStaging: + """After staging is cleaned, a second promote run promotes 0 files without error.""" + + def test_second_run_promoted_zero( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Re-running promote on already-cleaned staging succeeds with promoted=0.""" + s3 = minio_s3_client + _stage_many( + s3, + staging_test_bucket, + ASSEMBLY_DIR_A, + {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC}, + with_md5=True, + ) + + report1 = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + report2 = promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, 
+ lakehouse_key_prefix=PATH_PREFIX, + ) + + assert report1["promoted"] == 1 + assert report2["promoted"] == 0 + assert report2["failed"] == 0 + + # Final key still present after second run + rel = build_accession_path(ASSEMBLY_DIR_A) + final_keys = list_all_keys(s3, test_bucket, f"{PATH_PREFIX}{rel}") + assert len(final_keys) == 1 + + def test_lakehouse_unchanged_on_second_run( + self, minio_s3_client: object, test_bucket: str, staging_test_bucket: str + ) -> None: + """Lakehouse contents are identical before and after a second (no-op) promote run.""" + s3 = minio_s3_client + _stage_many( + s3, + staging_test_bucket, + ASSEMBLY_DIR_A, + {f"{ASSEMBLY_DIR_A}_genomic.fna.gz": FAKE_GENOMIC, f"{ASSEMBLY_DIR_A}_protein.faa.gz": FAKE_PROTEIN}, + with_md5=True, + ) + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + keys_after_first = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + + promote_from_s3( + staging_key_prefix=STAGING_PREFIX, + staging_bucket=staging_test_bucket, + lakehouse_bucket=test_bucket, + lakehouse_key_prefix=PATH_PREFIX, + ) + keys_after_second = list_all_keys(s3, test_bucket, PATH_PREFIX + "raw_data/") + + assert keys_after_first == keys_after_second diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index 976d06e0..ab1184df 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -1,10 +1,13 @@ """Tests for ncbi_ftp.promote module — S3 promote, archive, manifest trimming.""" +import hashlib from pathlib import Path +from unittest.mock import patch import botocore.client import pytest +import cdm_data_loaders.ncbi_ftp.promote as promote_mod from cdm_data_loaders.ncbi_ftp.promote import ( DEFAULT_LAKEHOUSE_KEY_PREFIX, _archive_assemblies, @@ -14,6 +17,51 @@ from tests.ncbi_ftp.conftest import TEST_BUCKET +# ── Promotion test constants 
───────────────────────────────────────────── + +_STAGE_PREFIX = "staging/run1/" + +# Assembly 1 +_ACC1 = "GCF_000001215.4" +_DIR1 = "GCF_000001215.4_Release_6" +_STG1 = f"{_STAGE_PREFIX}raw_data/GCF/000/001/215/{_DIR1}/" +_LKH1 = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/001/215/{_DIR1}/" + +# Assembly 2 +_ACC2 = "GCF_000005845.2" +_DIR2 = "GCF_000005845.2_ASM584v2" +_STG2 = f"{_STAGE_PREFIX}raw_data/GCF/000/005/845/{_DIR2}/" +_LKH2 = f"{DEFAULT_LAKEHOUSE_KEY_PREFIX}raw_data/GCF/000/005/845/{_DIR2}/" + + +def _stage( + s3: botocore.client.BaseClient, + staging_base: str, + files: dict[str, bytes], + *, + with_md5: bool = True, + with_crc64: bool = False, +) -> list[str]: + """Stage files at *staging_base*, optionally adding .md5 / .crc64nvme sidecars. + + Returns list of all staged keys (data files only, not sidecars). + """ + keys = [] + for fname, content in files.items(): + key = f"{staging_base}{fname}" + s3.put_object(Bucket=TEST_BUCKET, Key=key, Body=content) + keys.append(key) + if with_md5: + s3.put_object( + Bucket=TEST_BUCKET, + Key=f"{key}.md5", + Body=hashlib.md5(content).hexdigest().encode(), # noqa: S324 + ) + if with_crc64: + s3.put_object(Bucket=TEST_BUCKET, Key=f"{key}.crc64nvme", Body=b"fake-crc") + return keys + + def _stage_files(s3_client: botocore.client.BaseClient, prefix: str) -> None: """Upload sample staged files to mock S3.""" for key in [ @@ -552,3 +600,408 @@ def test_archive_assemblies_invalid_accession_skipped( str(manifest), lakehouse_bucket=TEST_BUCKET, ncbi_release="2024-01", archive_reason="updated" ) assert archived == 1 + + +# ── Concurrent / multi-file promotion (new behaviour) ──────────────────── + + +@pytest.mark.s3 +def test_promote_multi_file_all_land_at_final_path( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """All files for an assembly are promoted concurrently — none missed.""" + file_names = [ + f"{_ACC1}_genomic.fna.gz", + f"{_ACC1}_protein.faa.gz", + f"{_ACC1}_rna.fna.gz", + 
f"{_ACC1}_assembly_report.txt", + f"{_ACC1}_assembly_stats.txt", + ] + _stage(mock_s3_client_no_checksum, _STG1, {f: f.encode() for f in file_names}) + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["promoted"] == len(file_names) + assert report["failed"] == 0 + for fname in file_names: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_multi_file_content_preserved( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Content at the final key is byte-identical to the staged content.""" + files = { + f"{_ACC1}_genomic.fna.gz": b"\x1f\x8bGENOMIC", + f"{_ACC1}_protein.faa.gz": b"\x1f\x8bPROTEIN", + f"{_ACC1}_rna.fna.gz": b"\x1f\x8bRNA", + } + _stage(mock_s3_client_no_checksum, _STG1, files) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + for fname, expected in files.items(): + obj = mock_s3_client_no_checksum.get_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{fname}") + assert obj["Body"].read() == expected, f"Content mismatch for {fname}" + + +@pytest.mark.s3 +def test_promote_md5_metadata_set_from_sidecar( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """MD5 metadata on the promoted object matches the .md5 sidecar value.""" + content = b"\x1f\x8bGENOMIC" + fname = f"{_ACC1}_genomic.fna.gz" + _stage(mock_s3_client_no_checksum, _STG1, {fname: content}, with_md5=True) + expected_md5 = hashlib.md5(content).hexdigest() # noqa: S324 + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{fname}") + assert resp["Metadata"].get("md5") == expected_md5 + + 
+@pytest.mark.s3 +def test_promote_no_sidecar_no_md5_metadata( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """A file staged without a .md5 sidecar is promoted but carries no md5 metadata.""" + fname = f"{_ACC1}_genomic.fna.gz" + _stage(mock_s3_client_no_checksum, _STG1, {fname: b"data"}, with_md5=False) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{fname}") + assert resp["Metadata"].get("md5") is None + + +@pytest.mark.s3 +def test_promote_staging_data_files_deleted_after_promote( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Staged data files are deleted from staging after a fully successful assembly promote.""" + files = { + f"{_ACC1}_genomic.fna.gz": b"genomic", + f"{_ACC1}_protein.faa.gz": b"protein", + } + staged_keys = _stage(mock_s3_client_no_checksum, _STG1, files) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + for key in staged_keys: + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=key) + assert result.get("KeyCount", 0) == 0, f"Staged data file not deleted: {key}" + + +@pytest.mark.s3 +def test_promote_md5_sidecars_deleted_after_promote( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Staged .md5 sidecar files are deleted from staging after a successful promote.""" + files = { + f"{_ACC1}_genomic.fna.gz": b"genomic", + f"{_ACC1}_protein.faa.gz": b"protein", + } + staged_keys = _stage(mock_s3_client_no_checksum, _STG1, files, with_md5=True) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + for key in staged_keys: + for sidecar_key in (f"{key}.md5", f"{key}.crc64nvme"): + result = 
mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=sidecar_key) + assert result.get("KeyCount", 0) == 0, f"Sidecar not deleted: {sidecar_key}" + + +@pytest.mark.s3 +def test_promote_crc64nvme_sidecars_deleted_after_promote( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Staged .crc64nvme sidecar files are also batch-deleted after a successful promote.""" + fname = f"{_ACC1}_genomic.fna.gz" + _stage(mock_s3_client_no_checksum, _STG1, {fname: b"data"}, with_md5=True, with_crc64=True) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + staged_key = f"{_STG1}{fname}" + for sidecar_key in (f"{staged_key}.md5", f"{staged_key}.crc64nvme"): + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=sidecar_key) + assert result.get("KeyCount", 0) == 0, f"Sidecar not deleted: {sidecar_key}" + + +@pytest.mark.s3 +def test_promote_partial_failure_staging_not_cleaned( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """When one file in an assembly fails, NO staged files for that assembly are deleted. + + Preserving staging on partial failure lets an operator re-run without + re-staging and without losing the partially-promoted state. 
+ """ + file_ok = f"{_ACC1}_genomic.fna.gz" + file_fail = f"{_ACC1}_protein.faa.gz" + _stage(mock_s3_client_no_checksum, _STG1, {file_ok: b"ok", file_fail: b"fail"}) + staged_ok = f"{_STG1}{file_ok}" + staged_fail = f"{_STG1}{file_fail}" + + # Make download_file raise for exactly the failing key + original_download = mock_s3_client_no_checksum.download_file + + def _download_one_fail(Bucket: str, Key: str, Filename: str, **kw: object) -> None: # noqa: N803 + if Key == staged_fail: + msg = "simulated download failure" + raise RuntimeError(msg) + return original_download(Bucket=Bucket, Key=Key, Filename=Filename, **kw) + + mock_s3_client_no_checksum.download_file = _download_one_fail + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["failed"] == 1 + # Staging files must still be present (cleanup skipped due to failure) + for key in (staged_ok, staged_fail): + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200, ( + f"Expected staged file to survive partial failure: {key}" + ) # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_partial_failure_failed_count( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """report[\"failed\"] reflects the number of files that could not be promoted.""" + file_names = [f"{_ACC1}_genomic.fna.gz", f"{_ACC1}_protein.faa.gz", f"{_ACC1}_rna.fna.gz"] + _stage(mock_s3_client_no_checksum, _STG1, {f: b"data" for f in file_names}) + + failing_key = f"{_STG1}{file_names[1]}" + original_download = mock_s3_client_no_checksum.download_file + + def _download_middle_fail(Bucket: str, Key: str, Filename: str, **kw: object) -> None: # noqa: N803 + if Key == failing_key: + msg = "simulated failure" + raise RuntimeError(msg) + return original_download(Bucket=Bucket, Key=Key, Filename=Filename, **kw) + + mock_s3_client_no_checksum.download_file = 
_download_middle_fail + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["failed"] == 1 + assert report["promoted"] == 2 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_two_assemblies_independent_cleanup( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """A fully successful assembly cleans up its staging even when another assembly partially fails. + + Assembly 1 fully succeeds → staging cleared. + Assembly 2 has one failing file → staging NOT cleared. + """ + # Assembly 1: two files, both succeed + _stage( + mock_s3_client_no_checksum, + _STG1, + {f"{_ACC1}_genomic.fna.gz": b"g1", f"{_ACC1}_protein.faa.gz": b"p1"}, + ) + # Assembly 2: two files, one will fail + _stage( + mock_s3_client_no_checksum, + _STG2, + {f"{_ACC2}_genomic.fna.gz": b"g2", f"{_ACC2}_protein.faa.gz": b"p2"}, + ) + failing_key = f"{_STG2}{_ACC2}_protein.faa.gz" + original_download = mock_s3_client_no_checksum.download_file + + def _patched(Bucket: str, Key: str, Filename: str, **kw: object) -> None: # noqa: N803 + if Key == failing_key: + msg = "simulated failure" + raise RuntimeError(msg) + return original_download(Bucket=Bucket, Key=Key, Filename=Filename, **kw) + + mock_s3_client_no_checksum.download_file = _patched + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["failed"] == 1 + + # Assembly 1 staging must be gone + for fname in (f"{_ACC1}_genomic.fna.gz", f"{_ACC1}_protein.faa.gz"): + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=f"{_STG1}{fname}") + assert result.get("KeyCount", 0) == 0, f"Assembly 1 staging should be cleaned: {fname}" + + # Assembly 2 staging must remain (partial failure) + for fname in (f"{_ACC2}_genomic.fna.gz", f"{_ACC2}_protein.faa.gz"): + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, 
Key=f"{_STG2}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200, ( + f"Assembly 2 staging must survive partial failure: {fname}" + ) # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_multi_assembly_all_succeed_all_cleaned( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Two assemblies both fully succeed → all staged files removed for both.""" + _stage(mock_s3_client_no_checksum, _STG1, {f"{_ACC1}_genomic.fna.gz": b"g1"}) + _stage(mock_s3_client_no_checksum, _STG2, {f"{_ACC2}_genomic.fna.gz": b"g2"}) + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["promoted"] == 2 # noqa: PLR2004 + assert report["failed"] == 0 + + for stg, fname, lkh in ( + (_STG1, f"{_ACC1}_genomic.fna.gz", _LKH1), + (_STG2, f"{_ACC2}_genomic.fna.gz", _LKH2), + ): + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, Prefix=f"{stg}{fname}") + assert result.get("KeyCount", 0) == 0, f"Staging not cleaned: {fname}" + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{lkh}{fname}") + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_dry_run_multi_file_no_writes_no_cleanup( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """dry_run with multiple files writes nothing to final path and does not delete staging.""" + file_names = [f"{_ACC1}_genomic.fna.gz", f"{_ACC1}_protein.faa.gz", f"{_ACC1}_rna.fna.gz"] + staged_keys = _stage(mock_s3_client_no_checksum, _STG1, {f: f.encode() for f in file_names}) + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + dry_run=True, + ) + + assert report["promoted"] == len(file_names) + assert report["dry_run"] is True + + # Final path must be empty + result = mock_s3_client_no_checksum.list_objects_v2(Bucket=TEST_BUCKET, 
Prefix=_LKH1) + assert result.get("KeyCount", 0) == 0 + + # Staging keys must survive + for key in staged_keys: + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=key) + assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200, f"Staging deleted during dry-run: {key}" # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_skips_non_raw_data_paths( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Files outside raw_data/ (e.g. download_report.json) are silently skipped.""" + # Stage a real data file alongside non-promotable files + _stage(mock_s3_client_no_checksum, _STG1, {f"{_ACC1}_genomic.fna.gz": b"data"}) + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{_STAGE_PREFIX}download_report.json", Body=b"{}") + mock_s3_client_no_checksum.put_object(Bucket=TEST_BUCKET, Key=f"{_STAGE_PREFIX}logs/run.log", Body=b"logs") + + report = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report["promoted"] == 1 # only the .fna.gz + assert report["failed"] == 0 + + +@pytest.mark.s3 +def test_promote_idempotent_second_run_on_empty_staging( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Second promote run after staging has been cleaned promotes 0 files without error.""" + _stage(mock_s3_client_no_checksum, _STG1, {f"{_ACC1}_genomic.fna.gz": b"data"}) + + report1 = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + report2 = promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + assert report1["promoted"] == 1 + assert report2["promoted"] == 0 + assert report2["failed"] == 0 + + # Final key still present after second run + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{_ACC1}_genomic.fna.gz") + assert resp["ResponseMetadata"]["HTTPStatusCode"] 
== 200 # noqa: PLR2004 + + +@pytest.mark.s3 +def test_promote_multi_file_md5_per_file( + mock_s3_client_no_checksum: botocore.client.BaseClient, +) -> None: + """Each promoted file carries the MD5 matching its own content, not another file's.""" + files = { + f"{_ACC1}_genomic.fna.gz": b"GENOMIC_UNIQUE", + f"{_ACC1}_protein.faa.gz": b"PROTEIN_UNIQUE", + f"{_ACC1}_rna.fna.gz": b"RNA_UNIQUE", + } + _stage(mock_s3_client_no_checksum, _STG1, files, with_md5=True) + + promote_from_s3( + staging_key_prefix=_STAGE_PREFIX, + staging_bucket=TEST_BUCKET, + lakehouse_bucket=TEST_BUCKET, + ) + + for fname, content in files.items(): + expected_md5 = hashlib.md5(content).hexdigest() # noqa: S324 + resp = mock_s3_client_no_checksum.head_object(Bucket=TEST_BUCKET, Key=f"{_LKH1}{fname}") + assert resp["Metadata"].get("md5") == expected_md5, f"Wrong MD5 on {fname}" From fdb702646dc72e0367898a433caef4ee9af2fdb5 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 12:50:14 -0700 Subject: [PATCH 73/76] Potential fix for pull request finding 'Unused local variable' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/integration/test_promote_e2e.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_promote_e2e.py b/tests/integration/test_promote_e2e.py index d412ea27..20c72022 100644 --- a/tests/integration/test_promote_e2e.py +++ b/tests/integration/test_promote_e2e.py @@ -1084,7 +1084,6 @@ def test_md5_sidecars_deleted(self, minio_s3_client: object, test_bucket: str, s _stage_many(s3, staging_test_bucket, ASSEMBLY_DIR_A, files, with_md5=True) # Verify sidecars exist before promote - rel = build_accession_path(ASSEMBLY_DIR_A) before_keys = list_all_keys(s3, staging_test_bucket, STAGING_PREFIX) assert any(k.endswith(".md5") for k in before_keys), "Test setup: expected .md5 sidecars" From 4d690a8af1c3a0b030e2d45e0fad633521bfa646 Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 
2026 12:50:24 -0700 Subject: [PATCH 74/76] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/ncbi_ftp/test_promote.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index ab1184df..bb04d080 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -2,7 +2,6 @@ import hashlib from pathlib import Path -from unittest.mock import patch import botocore.client import pytest From e305d74affa8837c85d87602498abd946ff1a67e Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 12:50:34 -0700 Subject: [PATCH 75/76] Potential fix for pull request finding 'Unused import' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- tests/ncbi_ftp/test_promote.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ncbi_ftp/test_promote.py b/tests/ncbi_ftp/test_promote.py index bb04d080..ae0ff887 100644 --- a/tests/ncbi_ftp/test_promote.py +++ b/tests/ncbi_ftp/test_promote.py @@ -6,7 +6,6 @@ import botocore.client import pytest -import cdm_data_loaders.ncbi_ftp.promote as promote_mod from cdm_data_loaders.ncbi_ftp.promote import ( DEFAULT_LAKEHOUSE_KEY_PREFIX, _archive_assemblies, From c01dbdca283d748454b0678809ddd11982b1673a Mon Sep 17 00:00:00 2001 From: Matt Dawson Date: Thu, 30 Apr 2026 14:51:20 -0700 Subject: [PATCH 76/76] capture missing return values Co-authored-by: Copilot --- src/cdm_data_loaders/ncbi_ftp/metadata.py | 2 +- src/cdm_data_loaders/ncbi_ftp/promote.py | 9 +++++++-- src/cdm_data_loaders/pipelines/ncbi_ftp_download.py | 12 ++++++++---- tests/integration/test_full_pipeline.py | 10 +++++----- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/cdm_data_loaders/ncbi_ftp/metadata.py b/src/cdm_data_loaders/ncbi_ftp/metadata.py index e616c9b5..b5b6175e 100644 --- 
a/src/cdm_data_loaders/ncbi_ftp/metadata.py +++ b/src/cdm_data_loaders/ncbi_ftp/metadata.py @@ -246,7 +246,7 @@ def archive_descriptor( # noqa: PLR0913 return False raise - copy_object( + _ = copy_object( f"{bucket}/{source_key}", f"{bucket}/{archive_key}", ) diff --git a/src/cdm_data_loaders/ncbi_ftp/promote.py b/src/cdm_data_loaders/ncbi_ftp/promote.py index 0c50118d..4fbe6707 100644 --- a/src/cdm_data_loaders/ncbi_ftp/promote.py +++ b/src/cdm_data_loaders/ncbi_ftp/promote.py @@ -270,7 +270,10 @@ def _promote_one(staged_key: str) -> tuple[DescriptorResource, str]: logger.debug("Descriptor already exists, skipping: %s", descriptor_key) else: descriptor = create_descriptor(adir, acc, resources) - upload_descriptor(descriptor, adir, lakehouse_bucket, lakehouse_key_prefix, dry_run=False) + descriptor_key = upload_descriptor( + descriptor, adir, lakehouse_bucket, lakehouse_key_prefix, dry_run=False + ) + logger.debug("Uploaded descriptor: %s", descriptor_key) descriptors_written += 1 except Exception: logger.exception("Failed to write descriptor for %s", adir) @@ -402,7 +405,7 @@ def _archive_assemblies( # noqa: PLR0913 # Archive the frictionless descriptor alongside raw data if assembly_dir: try: - archive_descriptor( + archived_desc = archive_descriptor( assembly_dir, lakehouse_bucket, lakehouse_key_prefix, @@ -410,6 +413,8 @@ def _archive_assemblies( # noqa: PLR0913 archive_reason=archive_reason, dry_run=dry_run, ) + if not archived_desc: + logger.debug("No descriptor found to archive for %s", assembly_dir) except Exception: logger.exception("Failed to archive descriptor for %s", assembly_dir) diff --git a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py index 062575f6..9eff6af3 100644 --- a/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py +++ b/src/cdm_data_loaders/pipelines/ncbi_ftp_download.py @@ -104,9 +104,11 @@ def _upload_assembly_dir( if f.is_file(): relative = f.relative_to(tmp_root) dest_prefix = 
f"{bucket}/{staging_key_prefix.rstrip('/')}/{relative.parent}" - upload_file(f, dest_prefix, show_progress=False) + if upload_file(f, dest_prefix, show_progress=False): + count += 1 + else: + logger.warning("Failed to upload %s to %s", f, dest_prefix) f.unlink() - count += 1 shutil.rmtree(assembly_dir, ignore_errors=True) return count @@ -360,8 +362,10 @@ def _attempt() -> dict[str, Any]: report_path = tmp / "download_report.json" with report_path.open("w") as f: json.dump(report, f, indent=2) - upload_file(report_path, f"{bucket}/{staging_key_prefix.rstrip('/')}", show_progress=False) - staged_objects += 1 + if upload_file(report_path, f"{bucket}/{staging_key_prefix.rstrip('/')}", show_progress=False): + staged_objects += 1 + else: + logger.warning("Failed to upload download report to s3://%s/%s", bucket, staging_key_prefix) logger.info("Staged %d objects to s3://%s/%s", staged_objects, bucket, staging_key_prefix) logger.info( diff --git a/tests/integration/test_full_pipeline.py b/tests/integration/test_full_pipeline.py index 0c948764..63776ae5 100644 --- a/tests/integration/test_full_pipeline.py +++ b/tests/integration/test_full_pipeline.py @@ -61,7 +61,7 @@ def test_full_pipeline_small_batch( assert len(diff.new) > 0, f"No new assemblies in prefix {STABLE_PREFIX}" manifest_path = tmp_path / "transfer_manifest.txt" - write_transfer_manifest(diff, filtered, manifest_path) + _ = write_transfer_manifest(diff, filtered, manifest_path) # ── Phase 2: Download one assembly from real FTP ──────────────── output_dir = tmp_path / "output" @@ -129,7 +129,7 @@ def test_full_pipeline_incremental( assert len(diff1.new) > 0, f"No new assemblies in prefix {STABLE_PREFIX}" manifest1 = tmp_path / "transfer_manifest_1.txt" - write_transfer_manifest(diff1, filtered, manifest1) + _ = write_transfer_manifest(diff1, filtered, manifest1) output1 = tmp_path / "output1" output1.mkdir() @@ -179,13 +179,13 @@ def test_full_pipeline_incremental( assert downloaded_acc in diff2.updated, 
f"Expected {downloaded_acc} in updated list" manifest2 = tmp_path / "transfer_manifest_2.txt" - write_transfer_manifest(diff2, filtered, manifest2) + _ = write_transfer_manifest(diff2, filtered, manifest2) updated_manifest = tmp_path / "updated_manifest.txt" - write_updated_manifest(diff2, updated_manifest) + _ = write_updated_manifest(diff2, updated_manifest) removed_manifest = tmp_path / "removed_manifest.txt" - write_removed_manifest(diff2, removed_manifest) + _ = write_removed_manifest(diff2, removed_manifest) # Phase 2 — re-download the updated assembly output2 = tmp_path / "output2"